author     Tony Gutierrez <anthony.gutierrez@amd.com>  2016-01-19 14:28:22 -0500
committer  Tony Gutierrez <anthony.gutierrez@amd.com>  2016-01-19 14:28:22 -0500
commit     1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree       867510a147cd095f19499d26b7c02d27de4cae9d /src
parent     28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
download   gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src')
-rwxr-xr-x  src/SConscript  55
-rw-r--r--  src/arch/SConscript  8
-rw-r--r--  src/arch/hsail/Brig.h  67
-rw-r--r--  src/arch/hsail/Brig_new.hpp  1587
-rw-r--r--  src/arch/hsail/SConscript  54
-rw-r--r--  src/arch/hsail/SConsopts  40
-rwxr-xr-x  src/arch/hsail/gen.py  806
-rw-r--r--  src/arch/hsail/generic_types.cc  47
-rw-r--r--  src/arch/hsail/generic_types.hh  16
-rw-r--r--  src/arch/hsail/gpu_decoder.hh  77
-rw-r--r--  src/arch/hsail/gpu_types.hh  69
-rw-r--r--  src/arch/hsail/insts/branch.cc  86
-rw-r--r--  src/arch/hsail/insts/branch.hh  442
-rw-r--r--  src/arch/hsail/insts/decl.hh  1106
-rw-r--r--  src/arch/hsail/insts/gpu_static_inst.cc  64
-rw-r--r--  src/arch/hsail/insts/gpu_static_inst.hh  65
-rw-r--r--  src/arch/hsail/insts/main.cc  208
-rw-r--r--  src/arch/hsail/insts/mem.cc  139
-rw-r--r--  src/arch/hsail/insts/mem.hh  1629
-rw-r--r--  src/arch/hsail/insts/mem_impl.hh  660
-rw-r--r--  src/arch/hsail/insts/pseudo_inst.cc  787
-rw-r--r--  src/arch/hsail/operand.cc  449
-rw-r--r--  src/arch/hsail/operand.hh  768
-rw-r--r--  src/gpu-compute/GPU.py  310
-rw-r--r--  src/gpu-compute/LdsState.py  51
-rw-r--r--  src/gpu-compute/SConscript  99
-rw-r--r--  src/gpu-compute/X86GPUTLB.py  77
-rw-r--r--  src/gpu-compute/brig_object.cc  474
-rw-r--r--  src/gpu-compute/brig_object.hh  134
-rw-r--r--  src/gpu-compute/cl_driver.cc  272
-rw-r--r--  src/gpu-compute/cl_driver.hh  77
-rw-r--r--  src/gpu-compute/cl_event.hh  51
-rw-r--r--  src/gpu-compute/code_enums.hh  116
-rw-r--r--  src/gpu-compute/compute_unit.cc  1817
-rw-r--r--  src/gpu-compute/compute_unit.hh  767
-rw-r--r--  src/gpu-compute/condition_register_state.cc  83
-rw-r--r--  src/gpu-compute/condition_register_state.hh  101
-rw-r--r--  src/gpu-compute/dispatcher.cc  394
-rw-r--r--  src/gpu-compute/dispatcher.hh  163
-rw-r--r--  src/gpu-compute/exec_stage.cc  203
-rw-r--r--  src/gpu-compute/exec_stage.hh  129
-rw-r--r--  src/gpu-compute/fetch_stage.cc  106
-rw-r--r--  src/gpu-compute/fetch_stage.hh  78
-rw-r--r--  src/gpu-compute/fetch_unit.cc  293
-rw-r--r--  src/gpu-compute/fetch_unit.hh  89
-rw-r--r--  src/gpu-compute/global_memory_pipeline.cc  242
-rw-r--r--  src/gpu-compute/global_memory_pipeline.hh  123
-rw-r--r--  src/gpu-compute/gpu_dyn_inst.cc  198
-rw-r--r--  src/gpu-compute/gpu_dyn_inst.hh  464
-rw-r--r--  src/gpu-compute/gpu_exec_context.cc  53
-rw-r--r--  src/gpu-compute/gpu_exec_context.hh  54
-rw-r--r--  src/gpu-compute/gpu_static_inst.cc  42
-rw-r--r--  src/gpu-compute/gpu_static_inst.hh  166
-rw-r--r--  src/gpu-compute/gpu_tlb.cc  1801
-rw-r--r--  src/gpu-compute/gpu_tlb.hh  465
-rw-r--r--  src/gpu-compute/hsa_code.hh  101
-rw-r--r--  src/gpu-compute/hsa_kernel_info.hh  79
-rw-r--r--  src/gpu-compute/hsa_object.cc  76
-rw-r--r--  src/gpu-compute/hsa_object.hh  74
-rw-r--r--  src/gpu-compute/hsail_code.cc  453
-rw-r--r--  src/gpu-compute/hsail_code.hh  447
-rw-r--r--  src/gpu-compute/kernel_cfg.cc  296
-rw-r--r--  src/gpu-compute/kernel_cfg.hh  133
-rw-r--r--  src/gpu-compute/lds_state.cc  341
-rw-r--r--  src/gpu-compute/lds_state.hh  512
-rw-r--r--  src/gpu-compute/local_memory_pipeline.cc  200
-rw-r--r--  src/gpu-compute/local_memory_pipeline.hh  98
-rw-r--r--  src/gpu-compute/misc.hh  162
-rw-r--r--  src/gpu-compute/ndrange.hh  70
-rw-r--r--  src/gpu-compute/of_scheduling_policy.cc  76
-rw-r--r--  src/gpu-compute/of_scheduling_policy.hh  61
-rw-r--r--  src/gpu-compute/pool_manager.cc  42
-rw-r--r--  src/gpu-compute/pool_manager.hh  66
-rw-r--r--  src/gpu-compute/qstruct.hh  201
-rw-r--r--  src/gpu-compute/rr_scheduling_policy.cc  67
-rw-r--r--  src/gpu-compute/rr_scheduling_policy.hh  65
-rw-r--r--  src/gpu-compute/schedule_stage.cc  151
-rw-r--r--  src/gpu-compute/schedule_stage.hh  95
-rw-r--r--  src/gpu-compute/scheduler.cc  71
-rw-r--r--  src/gpu-compute/scheduler.hh  63
-rw-r--r--  src/gpu-compute/scheduling_policy.hh  57
-rw-r--r--  src/gpu-compute/scoreboard_check_stage.cc  173
-rw-r--r--  src/gpu-compute/scoreboard_check_stage.hh  106
-rw-r--r--  src/gpu-compute/shader.cc  412
-rw-r--r--  src/gpu-compute/shader.hh  212
-rw-r--r--  src/gpu-compute/simple_pool_manager.cc  108
-rw-r--r--  src/gpu-compute/simple_pool_manager.hh  72
-rw-r--r--  src/gpu-compute/tlb_coalescer.cc  583
-rw-r--r--  src/gpu-compute/tlb_coalescer.hh  252
-rw-r--r--  src/gpu-compute/vector_register_file.cc  251
-rw-r--r--  src/gpu-compute/vector_register_file.hh  142
-rw-r--r--  src/gpu-compute/vector_register_state.cc  58
-rw-r--r--  src/gpu-compute/vector_register_state.hh  101
-rw-r--r--  src/gpu-compute/wavefront.cc  925
-rw-r--r--  src/gpu-compute/wavefront.hh  368
-rw-r--r--  src/mem/protocol/GPU_RfO-SQC.sm  667
-rw-r--r--  src/mem/protocol/GPU_RfO-TCC.sm  1199
-rw-r--r--  src/mem/protocol/GPU_RfO-TCCdir.sm  2672
-rw-r--r--  src/mem/protocol/GPU_RfO-TCP.sm  1009
-rw-r--r--  src/mem/protocol/GPU_RfO.slicc  11
-rw-r--r--  src/mem/protocol/GPU_VIPER-SQC.sm  322
-rw-r--r--  src/mem/protocol/GPU_VIPER-TCC.sm  739
-rw-r--r--  src/mem/protocol/GPU_VIPER-TCP.sm  747
-rw-r--r--  src/mem/protocol/GPU_VIPER.slicc  9
-rw-r--r--  src/mem/protocol/GPU_VIPER_Baseline.slicc  9
-rw-r--r--  src/mem/protocol/GPU_VIPER_Region-TCC.sm  773
-rw-r--r--  src/mem/protocol/GPU_VIPER_Region.slicc  11
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-CorePair.sm  2904
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-L3cache.sm  1130
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm  3009
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-Region-dir.sm  2038
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-Region-msg.sm  291
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm  1368
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-RegionDir.sm  1187
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-dir.sm  1137
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-msg.sm  362
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base-probeFilter.sm  1408
-rw-r--r--  src/mem/protocol/MOESI_AMD_Base.slicc  6
-rw-r--r--  src/mem/protocol/RubySlicc_ComponentMapping.sm  3
-rw-r--r--  src/mem/protocol/RubySlicc_Exports.sm  11
-rw-r--r--  src/mem/protocol/RubySlicc_Types.sm  45
-rw-r--r--  src/mem/protocol/SConsopts  5
-rw-r--r--  src/mem/ruby/SConscript  15
-rw-r--r--  src/mem/ruby/profiler/Profiler.cc  4
-rw-r--r--  src/mem/ruby/slicc_interface/AbstractCacheEntry.hh  6
-rw-r--r--  src/mem/ruby/slicc_interface/AbstractController.cc  6
-rw-r--r--  src/mem/ruby/slicc_interface/AbstractController.hh  3
-rw-r--r--  src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh  29
-rw-r--r--  src/mem/ruby/structures/CacheMemory.cc  50
-rw-r--r--  src/mem/ruby/structures/CacheMemory.hh  5
-rw-r--r--  src/mem/ruby/structures/RubyCache.py  1
-rw-r--r--  src/mem/ruby/system/GPUCoalescer.cc  1397
-rw-r--r--  src/mem/ruby/system/GPUCoalescer.hh  368
-rw-r--r--  src/mem/ruby/system/GPUCoalescer.py  48
-rw-r--r--  src/mem/ruby/system/RubyPort.cc  3
-rw-r--r--  src/mem/ruby/system/RubyPort.hh  4
-rw-r--r--  src/mem/ruby/system/RubySystem.cc  2
-rw-r--r--  src/mem/ruby/system/SConscript  10
-rw-r--r--  src/mem/ruby/system/Sequencer.cc  5
-rw-r--r--  src/mem/ruby/system/Sequencer.hh  3
-rw-r--r--  src/mem/ruby/system/Sequencer.py  86
-rw-r--r--  src/mem/ruby/system/VIPERCoalescer.cc  287
-rw-r--r--  src/mem/ruby/system/VIPERCoalescer.hh  75
-rw-r--r--  src/mem/ruby/system/VIPERCoalescer.py  45
-rw-r--r--  src/mem/ruby/system/WeightedLRUPolicy.cc  113
-rw-r--r--  src/mem/ruby/system/WeightedLRUPolicy.hh  62
-rw-r--r--  src/mem/ruby/system/WeightedLRUReplacementPolicy.py  45
-rw-r--r--  src/mem/slicc/symbols/StateMachine.py  44
148 files changed, 52249 insertions, 80 deletions
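
Editorial note: for orientation, below is a minimal sketch of the config/the_gpu_isa.hh header that the new makeTheGPUISA rule in src/SConscript (first hunk of the diff that follows) would emit, assuming a build whose only enumerated GPU ISA is 'hsail' (i.e. TARGET_GPU_ISA is 'hsail' and all_gpu_isa_list contains just 'hsail'); with more ISAs the defines and GPUArch enumerators simply repeat per ISA.

    #ifndef __CONFIG_THE_GPU_ISA_HH__
    #define __CONFIG_THE_GPU_ISA_HH__

    // One define per enumerated GPU ISA; values start at 1 (assumed single-ISA build).
    #define HSAIL_ISA 1

    // Run-time ISA identification reuses the namespace-style names.
    enum class GPUArch {
        HsailISA = HSAIL_ISA
    };

    #define THE_GPU_ISA HSAIL_ISA
    #define TheGpuISA HsailISA
    #define THE_GPU_ISA_STR "hsail"

    #endif // __CONFIG_THE_GPU_ISA_HH__
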
diff --git a/src/SConscript b/src/SConscript
index 322212cb7..2bac0bff3 100755
--- a/src/SConscript
+++ b/src/SConscript
@@ -78,7 +78,7 @@ class SourceMeta(type):
def __init__(cls, name, bases, dict):
super(SourceMeta, cls).__init__(name, bases, dict)
cls.all = []
-
+
def get(cls, **guards):
'''Find all files that match the specified guards. If a source
file does not specify a flag, the default is False'''
@@ -367,9 +367,9 @@ def makeTheISA(source, target, env):
target_isa = env['TARGET_ISA']
def define(isa):
return isa.upper() + '_ISA'
-
+
def namespace(isa):
- return isa[0].upper() + isa[1:].lower() + 'ISA'
+ return isa[0].upper() + isa[1:].lower() + 'ISA'
code = code_formatter()
@@ -407,6 +407,51 @@ def makeTheISA(source, target, env):
env.Command('config/the_isa.hh', map(Value, all_isa_list),
MakeAction(makeTheISA, Transform("CFG ISA", 0)))
+def makeTheGPUISA(source, target, env):
+ isas = [ src.get_contents() for src in source ]
+ target_gpu_isa = env['TARGET_GPU_ISA']
+ def define(isa):
+ return isa.upper() + '_ISA'
+
+ def namespace(isa):
+ return isa[0].upper() + isa[1:].lower() + 'ISA'
+
+
+ code = code_formatter()
+ code('''\
+#ifndef __CONFIG_THE_GPU_ISA_HH__
+#define __CONFIG_THE_GPU_ISA_HH__
+
+''')
+
+ # create defines for the preprocessing and compile-time determination
+ for i,isa in enumerate(isas):
+ code('#define $0 $1', define(isa), i + 1)
+ code()
+
+ # create an enum for any run-time determination of the ISA, we
+ # reuse the same name as the namespaces
+ code('enum class GPUArch {')
+ for i,isa in enumerate(isas):
+ if i + 1 == len(isas):
+ code(' $0 = $1', namespace(isa), define(isa))
+ else:
+ code(' $0 = $1,', namespace(isa), define(isa))
+ code('};')
+
+ code('''
+
+#define THE_GPU_ISA ${{define(target_gpu_isa)}}
+#define TheGpuISA ${{namespace(target_gpu_isa)}}
+#define THE_GPU_ISA_STR "${{target_gpu_isa}}"
+
+#endif // __CONFIG_THE_GPU_ISA_HH__''')
+
+ code.write(str(target[0]))
+
+env.Command('config/the_gpu_isa.hh', map(Value, all_gpu_isa_list),
+ MakeAction(makeTheGPUISA, Transform("CFG ISA", 0)))
+
########################################################################
#
# Prevent any SimObjects from being added after this point, they
@@ -784,7 +829,7 @@ extern "C" {
EmbeddedSwig embed_swig_${module}(init_${module});
''')
code.write(str(target[0]))
-
+
# Build all swig modules
for swig in SwigSource.all:
env.Command([swig.cc_source.tnode, swig.py_source.tnode], swig.tnode,
@@ -959,7 +1004,7 @@ const uint8_t data_${sym}[] = {
x = array.array('B', data[i:i+step])
code(''.join('%d,' % d for d in x))
code.dedent()
-
+
code('''};
EmbeddedPython embedded_${sym}(
diff --git a/src/arch/SConscript b/src/arch/SConscript
index e0d6845f5..b022cb01f 100644
--- a/src/arch/SConscript
+++ b/src/arch/SConscript
@@ -68,6 +68,14 @@ isa_switch_hdrs = Split('''
# Set up this directory to support switching headers
make_switching_dir('arch', isa_switch_hdrs, env)
+if env['BUILD_GPU']:
+ gpu_isa_switch_hdrs = Split('''
+ gpu_decoder.hh
+ gpu_types.hh
+ ''')
+
+ make_gpu_switching_dir('arch', gpu_isa_switch_hdrs, env)
+
#################################################################
#
# Include architecture-specific files.
diff --git a/src/arch/hsail/Brig.h b/src/arch/hsail/Brig.h
new file mode 100644
index 000000000..b260157ab
--- /dev/null
+++ b/src/arch/hsail/Brig.h
@@ -0,0 +1,67 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+// HSA Team
+//
+// Advanced Micro Devices, Inc
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the names of the LLVM Team, University of Illinois at
+// Urbana-Champaign, nor the names of its contributors may be used to
+// endorse or promote products derived from this Software without specific
+// prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+#ifndef INTERNAL_BRIG_H
+#define INTERNAL_BRIG_H
+
+#include <stdint.h>
+
+namespace Brig {
+#include "Brig_new.hpp"
+
+// These typedefs provide some backward compatibility with earlier versions
+// of Brig.h, reducing the number of code changes. The distinct names also
+// increase legibility by showing the code's intent.
+typedef BrigBase BrigDirective;
+typedef BrigBase BrigOperand;
+
+enum BrigMemoryFenceSegments { // for internal use only
+ //.mnemo={ s/^BRIG_MEMORY_FENCE_SEGMENT_//;lc }
+ //.mnemo_token=_EMMemoryFenceSegments
+ //.mnemo_context=EInstModifierInstFenceContext
+ BRIG_MEMORY_FENCE_SEGMENT_GLOBAL = 0,
+ BRIG_MEMORY_FENCE_SEGMENT_GROUP = 1,
+ BRIG_MEMORY_FENCE_SEGMENT_IMAGE = 2,
+ BRIG_MEMORY_FENCE_SEGMENT_LAST = 3 //.skip
+};
+
+}
+
+#endif // defined(INTERNAL_BRIG_H)
diff --git a/src/arch/hsail/Brig_new.hpp b/src/arch/hsail/Brig_new.hpp
new file mode 100644
index 000000000..60e6f4dea
--- /dev/null
+++ b/src/arch/hsail/Brig_new.hpp
@@ -0,0 +1,1587 @@
+// University of Illinois/NCSA
+// Open Source License
+//
+// Copyright (c) 2013-2015, Advanced Micro Devices, Inc.
+// All rights reserved.
+//
+// Developed by:
+//
+// HSA Team
+//
+// Advanced Micro Devices, Inc
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of
+// this software and associated documentation files (the "Software"), to deal with
+// the Software without restriction, including without limitation the rights to
+// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+// of the Software, and to permit persons to whom the Software is furnished to do
+// so, subject to the following conditions:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimers in the
+// documentation and/or other materials provided with the distribution.
+//
+// * Neither the names of the LLVM Team, University of Illinois at
+// Urbana-Champaign, nor the names of its contributors may be used to
+// endorse or promote products derived from this Software without specific
+// prior written permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+// SOFTWARE.
+
+//.ignore{
+
+#ifndef INCLUDED_BRIG_H
+#define INCLUDED_BRIG_H
+
+#include <stdint.h>
+
+enum BrigAuxDefs {
+ MAX_OPERANDS_NUM = 6
+};
+
+//}
+
+typedef uint32_t BrigVersion32_t;
+
+enum BrigVersion {
+
+ //.nowrap
+ //.nodump
+ //.nollvm
+
+ BRIG_VERSION_HSAIL_MAJOR = 1,
+ BRIG_VERSION_HSAIL_MINOR = 0,
+ BRIG_VERSION_BRIG_MAJOR = 1,
+ BRIG_VERSION_BRIG_MINOR = 0
+};
+
+typedef uint8_t BrigAlignment8_t; //.defValue=BRIG_ALIGNMENT_NONE
+
+typedef uint8_t BrigAllocation8_t; //.defValue=BRIG_ALLOCATION_NONE
+
+typedef uint8_t BrigAluModifier8_t;
+
+typedef uint8_t BrigAtomicOperation8_t;
+
+typedef uint32_t BrigCodeOffset32_t; //.defValue=0 //.wtype=ItemRef<Code>
+
+typedef uint8_t BrigCompareOperation8_t;
+
+typedef uint16_t BrigControlDirective16_t;
+
+typedef uint32_t BrigDataOffset32_t;
+
+typedef BrigDataOffset32_t BrigDataOffsetCodeList32_t; //.wtype=ListRef<Code> //.defValue=0
+
+typedef BrigDataOffset32_t BrigDataOffsetOperandList32_t; //.wtype=ListRef<Operand> //.defValue=0
+
+typedef BrigDataOffset32_t BrigDataOffsetString32_t; //.wtype=StrRef //.defValue=0
+
+typedef uint8_t BrigExecutableModifier8_t;
+
+typedef uint8_t BrigImageChannelOrder8_t; //.defValue=BRIG_CHANNEL_ORDER_UNKNOWN
+
+typedef uint8_t BrigImageChannelType8_t; //.defValue=BRIG_CHANNEL_TYPE_UNKNOWN
+
+typedef uint8_t BrigImageGeometry8_t; //.defValue=BRIG_GEOMETRY_UNKNOWN
+
+typedef uint8_t BrigImageQuery8_t;
+
+typedef uint16_t BrigKind16_t;
+
+typedef uint8_t BrigLinkage8_t; //.defValue=BRIG_LINKAGE_NONE
+
+typedef uint8_t BrigMachineModel8_t; //.defValue=BRIG_MACHINE_LARGE
+
+typedef uint8_t BrigMemoryModifier8_t;
+
+typedef uint8_t BrigMemoryOrder8_t; //.defValue=BRIG_MEMORY_ORDER_RELAXED
+
+typedef uint8_t BrigMemoryScope8_t; //.defValue=BRIG_MEMORY_SCOPE_SYSTEM
+
+typedef uint16_t BrigOpcode16_t;
+
+typedef uint32_t BrigOperandOffset32_t; //.defValue=0 //.wtype=ItemRef<Operand>
+
+typedef uint8_t BrigPack8_t; //.defValue=BRIG_PACK_NONE
+
+typedef uint8_t BrigProfile8_t; //.defValue=BRIG_PROFILE_FULL
+
+typedef uint16_t BrigRegisterKind16_t;
+
+typedef uint8_t BrigRound8_t; //.defValue=BRIG_ROUND_NONE
+
+typedef uint8_t BrigSamplerAddressing8_t; //.defValue=BRIG_ADDRESSING_CLAMP_TO_EDGE
+
+typedef uint8_t BrigSamplerCoordNormalization8_t;
+
+typedef uint8_t BrigSamplerFilter8_t;
+
+typedef uint8_t BrigSamplerQuery8_t;
+
+typedef uint32_t BrigSectionIndex32_t;
+
+typedef uint8_t BrigSegCvtModifier8_t;
+
+typedef uint8_t BrigSegment8_t; //.defValue=BRIG_SEGMENT_NONE
+
+typedef uint32_t BrigStringOffset32_t; //.defValue=0 //.wtype=StrRef
+
+typedef uint16_t BrigType16_t;
+
+typedef uint8_t BrigVariableModifier8_t;
+
+typedef uint8_t BrigWidth8_t;
+
+typedef uint32_t BrigExceptions32_t;
+
+enum BrigKind {
+
+ //.nollvm
+ //
+ //.wname={ s/^BRIG_KIND//; MACRO2Name($_) }
+ //.mnemo=$wname{ $wname }
+ //
+ //.sizeof=$wname{ "sizeof(".$structs->{"Brig".$wname}->{rawbrig}.")" }
+ //.sizeof_switch //.sizeof_proto="int size_of_brig_record(unsigned arg)" //.sizeof_default="return -1"
+ //
+ //.isBodyOnly={ "false" }
+ //.isBodyOnly_switch //.isBodyOnly_proto="bool isBodyOnly(Directive d)" //.isBodyOnly_arg="d.kind()"
+ //.isBodyOnly_default="assert(false); return false"
+ //
+ //.isToplevelOnly={ "false" }
+ //.isToplevelOnly_switch //.isToplevelOnly_proto="bool isToplevelOnly(Directive d)" //.isToplevelOnly_arg="d.kind()"
+ //.isToplevelOnly_default="assert(false); return false"
+
+ BRIG_KIND_NONE = 0x0000, //.skip
+
+ BRIG_KIND_DIRECTIVE_BEGIN = 0x1000, //.skip
+ BRIG_KIND_DIRECTIVE_ARG_BLOCK_END = 0x1000, //.isBodyOnly=true
+ BRIG_KIND_DIRECTIVE_ARG_BLOCK_START = 0x1001, //.isBodyOnly=true
+ BRIG_KIND_DIRECTIVE_COMMENT = 0x1002,
+ BRIG_KIND_DIRECTIVE_CONTROL = 0x1003, //.isBodyOnly=true
+ BRIG_KIND_DIRECTIVE_EXTENSION = 0x1004, //.isToplevelOnly=true
+ BRIG_KIND_DIRECTIVE_FBARRIER = 0x1005,
+ BRIG_KIND_DIRECTIVE_FUNCTION = 0x1006, //.isToplevelOnly=true
+ BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION = 0x1007, //.isToplevelOnly=true
+ BRIG_KIND_DIRECTIVE_KERNEL = 0x1008, //.isToplevelOnly=true
+ BRIG_KIND_DIRECTIVE_LABEL = 0x1009, //.isBodyOnly=true
+ BRIG_KIND_DIRECTIVE_LOC = 0x100a,
+ BRIG_KIND_DIRECTIVE_MODULE = 0x100b, //.isToplevelOnly=true
+ BRIG_KIND_DIRECTIVE_PRAGMA = 0x100c,
+ BRIG_KIND_DIRECTIVE_SIGNATURE = 0x100d, //.isToplevelOnly=true
+ BRIG_KIND_DIRECTIVE_VARIABLE = 0x100e,
+ BRIG_KIND_DIRECTIVE_END = 0x100f, //.skip
+
+ BRIG_KIND_INST_BEGIN = 0x2000, //.skip
+ BRIG_KIND_INST_ADDR = 0x2000,
+ BRIG_KIND_INST_ATOMIC = 0x2001,
+ BRIG_KIND_INST_BASIC = 0x2002,
+ BRIG_KIND_INST_BR = 0x2003,
+ BRIG_KIND_INST_CMP = 0x2004,
+ BRIG_KIND_INST_CVT = 0x2005,
+ BRIG_KIND_INST_IMAGE = 0x2006,
+ BRIG_KIND_INST_LANE = 0x2007,
+ BRIG_KIND_INST_MEM = 0x2008,
+ BRIG_KIND_INST_MEM_FENCE = 0x2009,
+ BRIG_KIND_INST_MOD = 0x200a,
+ BRIG_KIND_INST_QUERY_IMAGE = 0x200b,
+ BRIG_KIND_INST_QUERY_SAMPLER = 0x200c,
+ BRIG_KIND_INST_QUEUE = 0x200d,
+ BRIG_KIND_INST_SEG = 0x200e,
+ BRIG_KIND_INST_SEG_CVT = 0x200f,
+ BRIG_KIND_INST_SIGNAL = 0x2010,
+ BRIG_KIND_INST_SOURCE_TYPE = 0x2011,
+ BRIG_KIND_INST_END = 0x2012, //.skip
+
+ BRIG_KIND_OPERAND_BEGIN = 0x3000, //.skip
+ BRIG_KIND_OPERAND_ADDRESS = 0x3000,
+ BRIG_KIND_OPERAND_ALIGN = 0x3001,
+ BRIG_KIND_OPERAND_CODE_LIST = 0x3002,
+ BRIG_KIND_OPERAND_CODE_REF = 0x3003,
+ BRIG_KIND_OPERAND_CONSTANT_BYTES = 0x3004,
+ BRIG_KIND_OPERAND_RESERVED = 0x3005, //.skip
+ BRIG_KIND_OPERAND_CONSTANT_IMAGE = 0x3006,
+ BRIG_KIND_OPERAND_CONSTANT_OPERAND_LIST = 0x3007,
+ BRIG_KIND_OPERAND_CONSTANT_SAMPLER = 0x3008,
+ BRIG_KIND_OPERAND_OPERAND_LIST = 0x3009,
+ BRIG_KIND_OPERAND_REGISTER = 0x300a,
+ BRIG_KIND_OPERAND_STRING = 0x300b,
+ BRIG_KIND_OPERAND_WAVESIZE = 0x300c,
+ BRIG_KIND_OPERAND_END = 0x300d //.skip
+};
+
+enum BrigAlignment {
+
+ //.mnemo={ s/^BRIG_ALIGNMENT_//; lc }
+ //.mnemo_proto="const char* align2str(unsigned arg)"
+ //
+ //.bytes={ /(\d+)/ ? $1 : undef }
+ //.bytes_switch //.bytes_proto="unsigned align2num(unsigned arg)" //.bytes_default="assert(false); return -1"
+ //
+ //.rbytes=$bytes{ $bytes }
+ //.rbytes_switch //.rbytes_reverse //.rbytes_proto="BrigAlignment num2align(uint64_t arg)"
+ //.rbytes_default="return BRIG_ALIGNMENT_LAST"
+ //
+ //.print=$bytes{ $bytes>1 ? "_align($bytes)" : "" }
+
+ BRIG_ALIGNMENT_NONE = 0, //.no_mnemo
+ BRIG_ALIGNMENT_1 = 1, //.mnemo=""
+ BRIG_ALIGNMENT_2 = 2,
+ BRIG_ALIGNMENT_4 = 3,
+ BRIG_ALIGNMENT_8 = 4,
+ BRIG_ALIGNMENT_16 = 5,
+ BRIG_ALIGNMENT_32 = 6,
+ BRIG_ALIGNMENT_64 = 7,
+ BRIG_ALIGNMENT_128 = 8,
+ BRIG_ALIGNMENT_256 = 9,
+
+ BRIG_ALIGNMENT_LAST, //.skip
+ BRIG_ALIGNMENT_MAX = BRIG_ALIGNMENT_LAST - 1 //.skip
+};
+
+enum BrigAllocation {
+
+ //.mnemo={ s/^BRIG_ALLOCATION_//;lc }
+ //.mnemo_token=EAllocKind
+
+ BRIG_ALLOCATION_NONE = 0, //.mnemo=""
+ BRIG_ALLOCATION_PROGRAM = 1,
+ BRIG_ALLOCATION_AGENT = 2,
+ BRIG_ALLOCATION_AUTOMATIC = 3
+};
+
+enum BrigAluModifierMask {
+ BRIG_ALU_FTZ = 1
+};
+
+enum BrigAtomicOperation {
+
+ //.tdcaption="Atomic Operations"
+ //
+ //.mnemo={ s/^BRIG_ATOMIC_//;lc }
+ //.mnemo_token=_EMAtomicOp
+ //.mnemo_context=EInstModifierInstAtomicContext
+ //
+ //.print=$mnemo{ "_$mnemo" }
+
+ BRIG_ATOMIC_ADD = 0,
+ BRIG_ATOMIC_AND = 1,
+ BRIG_ATOMIC_CAS = 2,
+ BRIG_ATOMIC_EXCH = 3,
+ BRIG_ATOMIC_LD = 4,
+ BRIG_ATOMIC_MAX = 5,
+ BRIG_ATOMIC_MIN = 6,
+ BRIG_ATOMIC_OR = 7,
+ BRIG_ATOMIC_ST = 8,
+ BRIG_ATOMIC_SUB = 9,
+ BRIG_ATOMIC_WRAPDEC = 10,
+ BRIG_ATOMIC_WRAPINC = 11,
+ BRIG_ATOMIC_XOR = 12,
+ BRIG_ATOMIC_WAIT_EQ = 13,
+ BRIG_ATOMIC_WAIT_NE = 14,
+ BRIG_ATOMIC_WAIT_LT = 15,
+ BRIG_ATOMIC_WAIT_GTE = 16,
+ BRIG_ATOMIC_WAITTIMEOUT_EQ = 17,
+ BRIG_ATOMIC_WAITTIMEOUT_NE = 18,
+ BRIG_ATOMIC_WAITTIMEOUT_LT = 19,
+ BRIG_ATOMIC_WAITTIMEOUT_GTE = 20
+};
+
+enum BrigCompareOperation {
+
+ //.tdcaption="Comparison Operators"
+ //
+ //.mnemo={ s/^BRIG_COMPARE_//;lc }
+ //.mnemo_token=_EMCompare
+ //
+ //.print=$mnemo{ "_$mnemo" }
+
+ BRIG_COMPARE_EQ = 0,
+ BRIG_COMPARE_NE = 1,
+ BRIG_COMPARE_LT = 2,
+ BRIG_COMPARE_LE = 3,
+ BRIG_COMPARE_GT = 4,
+ BRIG_COMPARE_GE = 5,
+ BRIG_COMPARE_EQU = 6,
+ BRIG_COMPARE_NEU = 7,
+ BRIG_COMPARE_LTU = 8,
+ BRIG_COMPARE_LEU = 9,
+ BRIG_COMPARE_GTU = 10,
+ BRIG_COMPARE_GEU = 11,
+ BRIG_COMPARE_NUM = 12,
+ BRIG_COMPARE_NAN = 13,
+ BRIG_COMPARE_SEQ = 14,
+ BRIG_COMPARE_SNE = 15,
+ BRIG_COMPARE_SLT = 16,
+ BRIG_COMPARE_SLE = 17,
+ BRIG_COMPARE_SGT = 18,
+ BRIG_COMPARE_SGE = 19,
+ BRIG_COMPARE_SGEU = 20,
+ BRIG_COMPARE_SEQU = 21,
+ BRIG_COMPARE_SNEU = 22,
+ BRIG_COMPARE_SLTU = 23,
+ BRIG_COMPARE_SLEU = 24,
+ BRIG_COMPARE_SNUM = 25,
+ BRIG_COMPARE_SNAN = 26,
+ BRIG_COMPARE_SGTU = 27
+};
+
+enum BrigControlDirective {
+
+ //.mnemo={ s/^BRIG_CONTROL_//;lc }
+ //.mnemo_token=EControl
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_CONTROL_NONE = 0, //.skip
+ BRIG_CONTROL_ENABLEBREAKEXCEPTIONS = 1,
+ BRIG_CONTROL_ENABLEDETECTEXCEPTIONS = 2,
+ BRIG_CONTROL_MAXDYNAMICGROUPSIZE = 3,
+ BRIG_CONTROL_MAXFLATGRIDSIZE = 4,
+ BRIG_CONTROL_MAXFLATWORKGROUPSIZE = 5,
+ BRIG_CONTROL_REQUIREDDIM = 6,
+ BRIG_CONTROL_REQUIREDGRIDSIZE = 7,
+ BRIG_CONTROL_REQUIREDWORKGROUPSIZE = 8,
+ BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS = 9
+};
+
+enum BrigExecutableModifierMask {
+ //.nodump
+ BRIG_EXECUTABLE_DEFINITION = 1
+};
+
+enum BrigImageChannelOrder {
+
+ //.mnemo={ s/^BRIG_CHANNEL_ORDER_?//;lc }
+ //.mnemo_token=EImageOrder
+ //.mnemo_context=EImageOrderContext
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_CHANNEL_ORDER_A = 0,
+ BRIG_CHANNEL_ORDER_R = 1,
+ BRIG_CHANNEL_ORDER_RX = 2,
+ BRIG_CHANNEL_ORDER_RG = 3,
+ BRIG_CHANNEL_ORDER_RGX = 4,
+ BRIG_CHANNEL_ORDER_RA = 5,
+ BRIG_CHANNEL_ORDER_RGB = 6,
+ BRIG_CHANNEL_ORDER_RGBX = 7,
+ BRIG_CHANNEL_ORDER_RGBA = 8,
+ BRIG_CHANNEL_ORDER_BGRA = 9,
+ BRIG_CHANNEL_ORDER_ARGB = 10,
+ BRIG_CHANNEL_ORDER_ABGR = 11,
+ BRIG_CHANNEL_ORDER_SRGB = 12,
+ BRIG_CHANNEL_ORDER_SRGBX = 13,
+ BRIG_CHANNEL_ORDER_SRGBA = 14,
+ BRIG_CHANNEL_ORDER_SBGRA = 15,
+ BRIG_CHANNEL_ORDER_INTENSITY = 16,
+ BRIG_CHANNEL_ORDER_LUMINANCE = 17,
+ BRIG_CHANNEL_ORDER_DEPTH = 18,
+ BRIG_CHANNEL_ORDER_DEPTH_STENCIL = 19,
+
+ // used internally
+ BRIG_CHANNEL_ORDER_UNKNOWN, //.mnemo="" // used when no order is specified
+
+ BRIG_CHANNEL_ORDER_FIRST_USER_DEFINED = 128 //.skip
+
+};
+
+enum BrigImageChannelType {
+
+ //.mnemo={ s/^BRIG_CHANNEL_TYPE_//;lc }
+ //.mnemo_token=EImageFormat
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_CHANNEL_TYPE_SNORM_INT8 = 0,
+ BRIG_CHANNEL_TYPE_SNORM_INT16 = 1,
+ BRIG_CHANNEL_TYPE_UNORM_INT8 = 2,
+ BRIG_CHANNEL_TYPE_UNORM_INT16 = 3,
+ BRIG_CHANNEL_TYPE_UNORM_INT24 = 4,
+ BRIG_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
+ BRIG_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
+ BRIG_CHANNEL_TYPE_UNORM_INT_101010 = 7,
+ BRIG_CHANNEL_TYPE_SIGNED_INT8 = 8,
+ BRIG_CHANNEL_TYPE_SIGNED_INT16 = 9,
+ BRIG_CHANNEL_TYPE_SIGNED_INT32 = 10,
+ BRIG_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
+ BRIG_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
+ BRIG_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
+ BRIG_CHANNEL_TYPE_HALF_FLOAT = 14,
+ BRIG_CHANNEL_TYPE_FLOAT = 15,
+
+ // used internally
+ BRIG_CHANNEL_TYPE_UNKNOWN, //.mnemo=""
+
+ BRIG_CHANNEL_TYPE_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigImageGeometry {
+
+ //.tdcaption="Geometry"
+ //
+ //.mnemo={ s/^BRIG_GEOMETRY_//;lc }
+ //.mnemo_token=EImageGeometry
+ //
+ //.dim={/_([0-9]+D)(A)?/ ? $1+(defined $2?1:0) : undef}
+ //.dim_switch //.dim_proto="unsigned getBrigGeometryDim(unsigned geo)" //.dim_arg="geo"
+ //.dim_default="assert(0); return 0"
+ //
+ //.depth={/DEPTH$/?"true":"false"}
+ //.depth_switch //.depth_proto="bool isBrigGeometryDepth(unsigned geo)" //.depth_arg="geo"
+ //.depth_default="return false"
+
+ BRIG_GEOMETRY_1D = 0,
+ BRIG_GEOMETRY_2D = 1,
+ BRIG_GEOMETRY_3D = 2,
+ BRIG_GEOMETRY_1DA = 3,
+ BRIG_GEOMETRY_2DA = 4,
+ BRIG_GEOMETRY_1DB = 5,
+ BRIG_GEOMETRY_2DDEPTH = 6,
+ BRIG_GEOMETRY_2DADEPTH = 7,
+
+ // used internally
+ BRIG_GEOMETRY_UNKNOWN, //.mnemo=""
+
+ BRIG_GEOMETRY_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigImageQuery {
+
+ //.mnemo={ s/^BRIG_IMAGE_QUERY_//;lc }
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_IMAGE_QUERY_WIDTH = 0,
+ BRIG_IMAGE_QUERY_HEIGHT = 1,
+ BRIG_IMAGE_QUERY_DEPTH = 2,
+ BRIG_IMAGE_QUERY_ARRAY = 3,
+ BRIG_IMAGE_QUERY_CHANNELORDER = 4,
+ BRIG_IMAGE_QUERY_CHANNELTYPE = 5,
+ BRIG_IMAGE_QUERY_NUMMIPLEVELS = 6
+};
+
+enum BrigLinkage {
+
+ //.mnemo={ s/^BRIG_LINKAGE_//;s/NONE//;lc }
+
+ BRIG_LINKAGE_NONE = 0,
+ BRIG_LINKAGE_PROGRAM = 1,
+ BRIG_LINKAGE_MODULE = 2,
+ BRIG_LINKAGE_FUNCTION = 3,
+ BRIG_LINKAGE_ARG = 4
+};
+
+enum BrigMachineModel {
+
+ //.mnemo={ s/^BRIG_MACHINE_//; '$'.lc }
+ //.mnemo_token=ETargetMachine
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_MACHINE_SMALL = 0,
+ BRIG_MACHINE_LARGE = 1,
+
+ BRIG_MACHINE_UNDEF = 2 //.skip
+};
+
+enum BrigMemoryModifierMask { //.tddef=0
+ BRIG_MEMORY_CONST = 1
+};
+
+enum BrigMemoryOrder {
+
+ //.mnemo={ s/^BRIG_MEMORY_ORDER_//; lc }
+ //.mnemo_token=_EMMemoryOrder
+ //
+ //.print=$mnemo{ "_$mnemo" }
+
+ BRIG_MEMORY_ORDER_NONE = 0, //.mnemo=""
+ BRIG_MEMORY_ORDER_RELAXED = 1, //.mnemo=rlx
+ BRIG_MEMORY_ORDER_SC_ACQUIRE = 2, //.mnemo=scacq
+ BRIG_MEMORY_ORDER_SC_RELEASE = 3, //.mnemo=screl
+ BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE = 4, //.mnemo=scar
+
+ BRIG_MEMORY_ORDER_LAST = 5 //.skip
+};
+
+enum BrigMemoryScope {
+
+ //.mnemo={ s/^BRIG_MEMORY_SCOPE_//; lc }
+ //.mnemo_token=_EMMemoryScope
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_MEMORY_SCOPE_NONE = 0, //.mnemo=""
+ BRIG_MEMORY_SCOPE_WORKITEM = 1, //.mnemo=""
+ BRIG_MEMORY_SCOPE_WAVEFRONT = 2, //.mnemo=wave
+ BRIG_MEMORY_SCOPE_WORKGROUP = 3, //.mnemo=wg
+ BRIG_MEMORY_SCOPE_AGENT = 4, //.mnemo=agent
+ BRIG_MEMORY_SCOPE_SYSTEM = 5, //.mnemo=system
+
+ BRIG_MEMORY_SCOPE_LAST = 6 //.skip
+};
+
+enum BrigOpcode {
+
+ //.tdcaption="Instruction Opcodes"
+ //
+ //.k={ "BASIC" }
+ //.pscode=$k{ MACRO2Name("_".$k) }
+ //.opcodeparser=$pscode{ return $pscode && "parseMnemo$pscode" }
+ //.opcodeparser_incfile=ParserUtilities
+ //.opcodeparser_switch //.opcodeparser_proto="OpcodeParser getOpcodeParser(BrigOpcode16_t arg)" //.opcodeparser_default="return parseMnemoBasic"
+ //
+ //.psopnd={undef}
+ //.opndparser=$psopnd{ return $psopnd && "&Parser::parse$psopnd" }
+ //.opndparser_incfile=ParserUtilities
+ //.opndparser_switch //.opndparser_proto="Parser::OperandParser Parser::getOperandParser(BrigOpcode16_t arg)" //.opndparser_default="return &Parser::parseOperands"
+ //
+ //.mnemo={ s/^BRIG_OPCODE_//; s/GCN([^_])/GCN_$1/; lc }
+ //.mnemo_scanner=Instructions //.mnemo_token=EInstruction
+ //.mnemo_context=EDefaultContext
+ //
+ //.has_memory_order={undef}
+ //.semsupport=$has_memory_order{ return $has_memory_order && "true" }
+ //
+ //.hasType=$k{ return ($k and $k eq "BASIC_NO_TYPE") ? "false" : undef; }
+ //.hasType_switch //.hasType_proto="bool instHasType(BrigOpcode16_t arg)" //.hasType_default="return true"
+ //
+ //.opcodevis=$pscode{ s/^BRIG_OPCODE_//; sprintf("%-47s(","vis.visitOpcode_".$_) . ($pscode =~m/^(BasicOrMod|Nop)$/? "inst" : "HSAIL_ASM::Inst". ($pscode=~m/BasicNoType/? "Basic":$pscode) ."(inst)").")" }
+ //.opcodevis_switch //.opcodevis_proto="template <typename RetType, typename Visitor> RetType visitOpcode_gen(HSAIL_ASM::Inst inst, Visitor& vis)"
+ //.opcodevis_arg="inst.opcode()" //.opcodevis_default="return RetType()"
+ //.opcodevis_incfile=ItemUtils
+ //
+ //.ftz=$k{ return ($k eq "BASIC_OR_MOD" or $k eq "CMP" or $k eq "CVT") ? "true" : undef }
+ //.ftz_incfile=ItemUtils //.ftz_switch //.ftz_proto="inline bool instSupportsFtz(BrigOpcode16_t arg)" //.ftz_default="return false"
+ //
+ //.vecOpndIndex={undef}
+ //.vecOpndIndex_switch //.vecOpndIndex_proto="int vecOpndIndex(BrigOpcode16_t arg)" //.vecOpndIndex_default="return -1"
+ //.vecOpndIndex_incfile=ParserUtilities
+ //
+ //.numdst={undef}
+ //.numdst_switch //.numdst_proto="int instNumDstOperands(BrigOpcode16_t arg)" //.numdst_default="return 1"
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_OPCODE_NOP = 0, //.k=NOP //.hasType=false
+ BRIG_OPCODE_ABS = 1, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_ADD = 2, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_BORROW = 3,
+ BRIG_OPCODE_CARRY = 4,
+ BRIG_OPCODE_CEIL = 5, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_COPYSIGN = 6, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_DIV = 7, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_FLOOR = 8, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_FMA = 9, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_FRACT = 10, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_MAD = 11, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_MAX = 12, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_MIN = 13, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_MUL = 14, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_MULHI = 15, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_NEG = 16, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_REM = 17,
+ BRIG_OPCODE_RINT = 18, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_SQRT = 19, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_SUB = 20, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_TRUNC = 21, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_MAD24 = 22,
+ BRIG_OPCODE_MAD24HI = 23,
+ BRIG_OPCODE_MUL24 = 24,
+ BRIG_OPCODE_MUL24HI = 25,
+ BRIG_OPCODE_SHL = 26,
+ BRIG_OPCODE_SHR = 27,
+ BRIG_OPCODE_AND = 28,
+ BRIG_OPCODE_NOT = 29,
+ BRIG_OPCODE_OR = 30,
+ BRIG_OPCODE_POPCOUNT = 31, //.k=SOURCE_TYPE
+ BRIG_OPCODE_XOR = 32,
+ BRIG_OPCODE_BITEXTRACT = 33,
+ BRIG_OPCODE_BITINSERT = 34,
+ BRIG_OPCODE_BITMASK = 35,
+ BRIG_OPCODE_BITREV = 36,
+ BRIG_OPCODE_BITSELECT = 37,
+ BRIG_OPCODE_FIRSTBIT = 38, //.k=SOURCE_TYPE
+ BRIG_OPCODE_LASTBIT = 39, //.k=SOURCE_TYPE
+ BRIG_OPCODE_COMBINE = 40, //.k=SOURCE_TYPE //.vecOpndIndex=1
+ BRIG_OPCODE_EXPAND = 41, //.k=SOURCE_TYPE //.vecOpndIndex=0
+ BRIG_OPCODE_LDA = 42, //.k=ADDR
+ BRIG_OPCODE_MOV = 43,
+ BRIG_OPCODE_SHUFFLE = 44,
+ BRIG_OPCODE_UNPACKHI = 45,
+ BRIG_OPCODE_UNPACKLO = 46,
+ BRIG_OPCODE_PACK = 47, //.k=SOURCE_TYPE
+ BRIG_OPCODE_UNPACK = 48, //.k=SOURCE_TYPE
+ BRIG_OPCODE_CMOV = 49,
+ BRIG_OPCODE_CLASS = 50, //.k=SOURCE_TYPE
+ BRIG_OPCODE_NCOS = 51,
+ BRIG_OPCODE_NEXP2 = 52,
+ BRIG_OPCODE_NFMA = 53,
+ BRIG_OPCODE_NLOG2 = 54,
+ BRIG_OPCODE_NRCP = 55,
+ BRIG_OPCODE_NRSQRT = 56,
+ BRIG_OPCODE_NSIN = 57,
+ BRIG_OPCODE_NSQRT = 58,
+ BRIG_OPCODE_BITALIGN = 59,
+ BRIG_OPCODE_BYTEALIGN = 60,
+ BRIG_OPCODE_PACKCVT = 61, //.k=SOURCE_TYPE
+ BRIG_OPCODE_UNPACKCVT = 62, //.k=SOURCE_TYPE
+ BRIG_OPCODE_LERP = 63,
+ BRIG_OPCODE_SAD = 64, //.k=SOURCE_TYPE
+ BRIG_OPCODE_SADHI = 65, //.k=SOURCE_TYPE
+ BRIG_OPCODE_SEGMENTP = 66, //.k=SEG_CVT
+ BRIG_OPCODE_FTOS = 67, //.k=SEG_CVT
+ BRIG_OPCODE_STOF = 68, //.k=SEG_CVT
+ BRIG_OPCODE_CMP = 69, //.k=CMP
+ BRIG_OPCODE_CVT = 70, //.k=CVT
+ BRIG_OPCODE_LD = 71, //.k=MEM //.has_memory_order //.vecOpndIndex=0
+ BRIG_OPCODE_ST = 72, //.k=MEM //.has_memory_order //.vecOpndIndex=0 //.numdst=0
+ BRIG_OPCODE_ATOMIC = 73, //.k=ATOMIC
+ BRIG_OPCODE_ATOMICNORET = 74, //.k=ATOMIC //.numdst=0
+ BRIG_OPCODE_SIGNAL = 75, //.k=SIGNAL
+ BRIG_OPCODE_SIGNALNORET = 76, //.k=SIGNAL //.numdst=0
+ BRIG_OPCODE_MEMFENCE = 77, //.k=MEM_FENCE //.numdst=0
+ BRIG_OPCODE_RDIMAGE = 78, //.k=IMAGE //.vecOpndIndex=0
+ BRIG_OPCODE_LDIMAGE = 79, //.k=IMAGE //.vecOpndIndex=0
+ BRIG_OPCODE_STIMAGE = 80, //.k=IMAGE //.vecOpndIndex=0 //.numdst=0
+ BRIG_OPCODE_IMAGEFENCE = 81, //.k=BASIC_NO_TYPE
+ BRIG_OPCODE_QUERYIMAGE = 82, //.k=QUERY_IMAGE
+ BRIG_OPCODE_QUERYSAMPLER = 83, //.k=QUERY_SAMPLER
+ BRIG_OPCODE_CBR = 84, //.k=BR //.numdst=0
+ BRIG_OPCODE_BR = 85, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_SBR = 86, //.k=BR //.numdst=0 //.psopnd=SbrOperands
+ BRIG_OPCODE_BARRIER = 87, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_WAVEBARRIER = 88, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_ARRIVEFBAR = 89, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_INITFBAR = 90, //.k=BASIC_NO_TYPE //.numdst=0 //.hasType=false
+ BRIG_OPCODE_JOINFBAR = 91, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_LEAVEFBAR = 92, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_RELEASEFBAR = 93, //.k=BASIC_NO_TYPE //.numdst=0
+ BRIG_OPCODE_WAITFBAR = 94, //.k=BR //.numdst=0 //.hasType=false
+ BRIG_OPCODE_LDF = 95,
+ BRIG_OPCODE_ACTIVELANECOUNT = 96, //.k=LANE
+ BRIG_OPCODE_ACTIVELANEID = 97, //.k=LANE
+ BRIG_OPCODE_ACTIVELANEMASK = 98, //.k=LANE //.vecOpndIndex=0
+ BRIG_OPCODE_ACTIVELANEPERMUTE = 99, //.k=LANE
+ BRIG_OPCODE_CALL = 100, //.k=BR //.psopnd=CallOperands //.numdst=0 //.hasType=false
+ BRIG_OPCODE_SCALL = 101, //.k=BR //.psopnd=CallOperands //.numdst=0
+ BRIG_OPCODE_ICALL = 102, //.k=BR //.psopnd=CallOperands //.numdst=0
+ BRIG_OPCODE_RET = 103, //.k=BASIC_NO_TYPE
+ BRIG_OPCODE_ALLOCA = 104, //.k=MEM
+ BRIG_OPCODE_CURRENTWORKGROUPSIZE = 105,
+ BRIG_OPCODE_CURRENTWORKITEMFLATID = 106,
+ BRIG_OPCODE_DIM = 107,
+ BRIG_OPCODE_GRIDGROUPS = 108,
+ BRIG_OPCODE_GRIDSIZE = 109,
+ BRIG_OPCODE_PACKETCOMPLETIONSIG = 110,
+ BRIG_OPCODE_PACKETID = 111,
+ BRIG_OPCODE_WORKGROUPID = 112,
+ BRIG_OPCODE_WORKGROUPSIZE = 113,
+ BRIG_OPCODE_WORKITEMABSID = 114,
+ BRIG_OPCODE_WORKITEMFLATABSID = 115,
+ BRIG_OPCODE_WORKITEMFLATID = 116,
+ BRIG_OPCODE_WORKITEMID = 117,
+ BRIG_OPCODE_CLEARDETECTEXCEPT = 118, //.numdst=0
+ BRIG_OPCODE_GETDETECTEXCEPT = 119,
+ BRIG_OPCODE_SETDETECTEXCEPT = 120, //.numdst=0
+ BRIG_OPCODE_ADDQUEUEWRITEINDEX = 121, //.k=QUEUE
+ BRIG_OPCODE_CASQUEUEWRITEINDEX = 122, //.k=QUEUE
+ BRIG_OPCODE_LDQUEUEREADINDEX = 123, //.k=QUEUE
+ BRIG_OPCODE_LDQUEUEWRITEINDEX = 124, //.k=QUEUE
+ BRIG_OPCODE_STQUEUEREADINDEX = 125, //.k=QUEUE //.numdst=0
+ BRIG_OPCODE_STQUEUEWRITEINDEX = 126, //.k=QUEUE //.numdst=0
+ BRIG_OPCODE_CLOCK = 127,
+ BRIG_OPCODE_CUID = 128,
+ BRIG_OPCODE_DEBUGTRAP = 129, //.numdst=0
+ BRIG_OPCODE_GROUPBASEPTR = 130,
+ BRIG_OPCODE_KERNARGBASEPTR = 131,
+ BRIG_OPCODE_LANEID = 132,
+ BRIG_OPCODE_MAXCUID = 133,
+ BRIG_OPCODE_MAXWAVEID = 134,
+ BRIG_OPCODE_NULLPTR = 135, //.k=SEG
+ BRIG_OPCODE_WAVEID = 136,
+ BRIG_OPCODE_FIRST_USER_DEFINED = 32768, //.skip
+
+ BRIG_OPCODE_GCNMADU = (1u << 15) | 0, //.k=BASIC_NO_TYPE
+ BRIG_OPCODE_GCNMADS = (1u << 15) | 1, //.k=BASIC_NO_TYPE
+ BRIG_OPCODE_GCNMAX3 = (1u << 15) | 2,
+ BRIG_OPCODE_GCNMIN3 = (1u << 15) | 3,
+ BRIG_OPCODE_GCNMED3 = (1u << 15) | 4,
+ BRIG_OPCODE_GCNFLDEXP = (1u << 15) | 5, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_GCNFREXP_EXP = (1u << 15) | 6, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_GCNFREXP_MANT = (1u << 15) | 7, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_GCNTRIG_PREOP = (1u << 15) | 8, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_GCNBFM = (1u << 15) | 9,
+ BRIG_OPCODE_GCNLD = (1u << 15) | 10, //.k=MEM //.has_memory_order //.vecOpndIndex=0
+ BRIG_OPCODE_GCNST = (1u << 15) | 11, //.k=MEM //.has_memory_order //.vecOpndIndex=0
+ BRIG_OPCODE_GCNATOMIC = (1u << 15) | 12, //.k=ATOMIC
+ BRIG_OPCODE_GCNATOMICNORET = (1u << 15) | 13, //.k=ATOMIC //.mnemo=gcn_atomicNoRet
+ BRIG_OPCODE_GCNSLEEP = (1u << 15) | 14,
+ BRIG_OPCODE_GCNPRIORITY = (1u << 15) | 15,
+ BRIG_OPCODE_GCNREGIONALLOC = (1u << 15) | 16, //.k=BASIC_NO_TYPE //.mnemo=gcn_region_alloc
+ BRIG_OPCODE_GCNMSAD = (1u << 15) | 17,
+ BRIG_OPCODE_GCNQSAD = (1u << 15) | 18,
+ BRIG_OPCODE_GCNMQSAD = (1u << 15) | 19,
+ BRIG_OPCODE_GCNMQSAD4 = (1u << 15) | 20, //.k=BASIC_NO_TYPE
+ BRIG_OPCODE_GCNSADW = (1u << 15) | 21,
+ BRIG_OPCODE_GCNSADD = (1u << 15) | 22,
+ BRIG_OPCODE_GCNCONSUME = (1u << 15) | 23, //.k=ADDR //.mnemo=gcn_atomic_consume
+ BRIG_OPCODE_GCNAPPEND = (1u << 15) | 24, //.k=ADDR //.mnemo=gcn_atomic_append
+ BRIG_OPCODE_GCNB4XCHG = (1u << 15) | 25, //.mnemo=gcn_b4xchg
+ BRIG_OPCODE_GCNB32XCHG = (1u << 15) | 26, //.mnemo=gcn_b32xchg
+ BRIG_OPCODE_GCNMAX = (1u << 15) | 27,
+ BRIG_OPCODE_GCNMIN = (1u << 15) | 28,
+ BRIG_OPCODE_GCNDIVRELAXED = (1u << 15) | 29, //.k=BASIC_OR_MOD
+ BRIG_OPCODE_GCNDIVRELAXEDNARROW = (1u << 15) | 30,
+
+ BRIG_OPCODE_AMDRDIMAGELOD = (1u << 15) | 31, //.k=IMAGE //.mnemo=amd_rdimagelod //.vecOpndIndex=0
+ BRIG_OPCODE_AMDRDIMAGEGRAD = (1u << 15) | 32, //.k=IMAGE //.mnemo=amd_rdimagegrad //.vecOpndIndex=0
+ BRIG_OPCODE_AMDLDIMAGEMIP = (1u << 15) | 33, //.k=IMAGE //.mnemo=amd_ldimagemip //.vecOpndIndex=0
+ BRIG_OPCODE_AMDSTIMAGEMIP = (1u << 15) | 34, //.k=IMAGE //.mnemo=amd_stimagemip //.vecOpndIndex=0 //.numdst=0
+ BRIG_OPCODE_AMDQUERYIMAGE = (1u << 15) | 35 //.k=QUERY_IMAGE //.mnemo=amd_queryimage
+};
+
+enum BrigPack {
+
+ //.tdcaption="Packing"
+ //
+ //.mnemo={ s/^BRIG_PACK_//;s/SAT$/_sat/;lc }
+ //.mnemo_token=_EMPacking
+ //
+ //.print=$mnemo{ "_$mnemo" }
+
+ BRIG_PACK_NONE = 0, //.mnemo=""
+ BRIG_PACK_PP = 1,
+ BRIG_PACK_PS = 2,
+ BRIG_PACK_SP = 3,
+ BRIG_PACK_SS = 4,
+ BRIG_PACK_S = 5,
+ BRIG_PACK_P = 6,
+ BRIG_PACK_PPSAT = 7,
+ BRIG_PACK_PSSAT = 8,
+ BRIG_PACK_SPSAT = 9,
+ BRIG_PACK_SSSAT = 10,
+ BRIG_PACK_SSAT = 11,
+ BRIG_PACK_PSAT = 12
+};
+
+enum BrigProfile {
+
+ //.mnemo={ s/^BRIG_PROFILE_//;'$'.lc }
+ //.mnemo_token=ETargetProfile
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_PROFILE_BASE = 0,
+ BRIG_PROFILE_FULL = 1,
+
+ BRIG_PROFILE_UNDEF = 2 //.skip
+};
+
+enum BrigRegisterKind {
+
+ //.mnemo={ s/^BRIG_REGISTER_KIND_//;'$'.lc(substr($_,0,1)) }
+ //
+ //.bits={ }
+ //.bits_switch //.bits_proto="unsigned getRegBits(BrigRegisterKind16_t arg)" //.bits_default="return (unsigned)-1"
+ //
+ //.nollvm
+
+ BRIG_REGISTER_KIND_CONTROL = 0, //.bits=1
+ BRIG_REGISTER_KIND_SINGLE = 1, //.bits=32
+ BRIG_REGISTER_KIND_DOUBLE = 2, //.bits=64
+ BRIG_REGISTER_KIND_QUAD = 3 //.bits=128
+};
+
+enum BrigRound {
+
+ //.mnemo={}
+ //.mnemo_fn=round2str //.mnemo_token=_EMRound
+ //
+ //.sat={/_SAT$/? "true" : "false"}
+ //.sat_switch //.sat_proto="bool isSatRounding(unsigned rounding)" //.sat_arg="rounding"
+ //.sat_default="return false"
+ //
+ //.sig={/_SIGNALING_/? "true" : "false"}
+ //.sig_switch //.sig_proto="bool isSignalingRounding(unsigned rounding)" //.sig_arg="rounding"
+ //.sig_default="return false"
+ //
+ //.int={/_INTEGER_/? "true" : "false"}
+ //.int_switch //.int_proto="bool isIntRounding(unsigned rounding)" //.int_arg="rounding"
+ //.int_default="return false"
+ //
+ //.flt={/_FLOAT_/? "true" : "false"}
+ //.flt_switch //.flt_proto="bool isFloatRounding(unsigned rounding)" //.flt_arg="rounding"
+ //.flt_default="return false"
+ //
+ //.print=$mnemo{ "_$mnemo" }
+
+ BRIG_ROUND_NONE = 0, //.no_mnemo
+ BRIG_ROUND_FLOAT_DEFAULT = 1, //.no_mnemo
+ BRIG_ROUND_FLOAT_NEAR_EVEN = 2, //.mnemo=near
+ BRIG_ROUND_FLOAT_ZERO = 3, //.mnemo=zero
+ BRIG_ROUND_FLOAT_PLUS_INFINITY = 4, //.mnemo=up
+ BRIG_ROUND_FLOAT_MINUS_INFINITY = 5, //.mnemo=down
+ BRIG_ROUND_INTEGER_NEAR_EVEN = 6, //.mnemo=neari
+ BRIG_ROUND_INTEGER_ZERO = 7, //.mnemo=zeroi
+ BRIG_ROUND_INTEGER_PLUS_INFINITY = 8, //.mnemo=upi
+ BRIG_ROUND_INTEGER_MINUS_INFINITY = 9, //.mnemo=downi
+ BRIG_ROUND_INTEGER_NEAR_EVEN_SAT = 10, //.mnemo=neari_sat
+ BRIG_ROUND_INTEGER_ZERO_SAT = 11, //.mnemo=zeroi_sat
+ BRIG_ROUND_INTEGER_PLUS_INFINITY_SAT = 12, //.mnemo=upi_sat
+ BRIG_ROUND_INTEGER_MINUS_INFINITY_SAT = 13, //.mnemo=downi_sat
+ BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN = 14, //.mnemo=sneari
+ BRIG_ROUND_INTEGER_SIGNALING_ZERO = 15, //.mnemo=szeroi
+ BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY = 16, //.mnemo=supi
+ BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY = 17, //.mnemo=sdowni
+ BRIG_ROUND_INTEGER_SIGNALING_NEAR_EVEN_SAT = 18, //.mnemo=sneari_sat
+ BRIG_ROUND_INTEGER_SIGNALING_ZERO_SAT = 19, //.mnemo=szeroi_sat
+ BRIG_ROUND_INTEGER_SIGNALING_PLUS_INFINITY_SAT = 20, //.mnemo=supi_sat
+ BRIG_ROUND_INTEGER_SIGNALING_MINUS_INFINITY_SAT = 21 //.mnemo=sdowni_sat
+};
+
+enum BrigSamplerAddressing {
+
+ //.mnemo={ s/^BRIG_ADDRESSING_//;lc }
+ //.mnemo_token=ESamplerAddressingMode
+
+ BRIG_ADDRESSING_UNDEFINED = 0,
+ BRIG_ADDRESSING_CLAMP_TO_EDGE = 1,
+ BRIG_ADDRESSING_CLAMP_TO_BORDER = 2,
+ BRIG_ADDRESSING_REPEAT = 3,
+ BRIG_ADDRESSING_MIRRORED_REPEAT = 4,
+
+ BRIG_ADDRESSING_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigSamplerCoordNormalization {
+
+ //.mnemo={ s/^BRIG_COORD_//;lc }
+ //.mnemo_token=ESamplerCoord
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_COORD_UNNORMALIZED = 0,
+ BRIG_COORD_NORMALIZED = 1
+};
+
+enum BrigSamplerFilter {
+
+ //.mnemo={ s/^BRIG_FILTER_//;lc }
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_FILTER_NEAREST = 0,
+ BRIG_FILTER_LINEAR = 1,
+
+ BRIG_FILTER_FIRST_USER_DEFINED = 128 //.skip
+};
+
+enum BrigSamplerQuery {
+
+ //.mnemo={ s/^BRIG_SAMPLER_QUERY_//;lc }
+ //.mnemo_token=_EMSamplerQuery
+ //
+ //.print=$mnemo{ $mnemo }
+
+ BRIG_SAMPLER_QUERY_ADDRESSING = 0,
+ BRIG_SAMPLER_QUERY_COORD = 1,
+ BRIG_SAMPLER_QUERY_FILTER = 2
+};
+
+enum BrigSectionIndex {
+
+ //.nollvm
+ //
+ //.mnemo={ s/^BRIG_SECTION_INDEX_/HSA_/;lc }
+
+ BRIG_SECTION_INDEX_DATA = 0,
+ BRIG_SECTION_INDEX_CODE = 1,
+ BRIG_SECTION_INDEX_OPERAND = 2,
+ BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED = 3,
+
+ // used internally
+ BRIG_SECTION_INDEX_IMPLEMENTATION_DEFINED = BRIG_SECTION_INDEX_BEGIN_IMPLEMENTATION_DEFINED //.skip
+};
+
+enum BrigSegCvtModifierMask {
+ BRIG_SEG_CVT_NONULL = 1 //.mnemo="nonull" //.print="_nonull"
+};
+
+enum BrigSegment {
+
+ //.mnemo={ s/^BRIG_SEGMENT_//;lc}
+ //.mnemo_token=_EMSegment
+ //.mnemo_context=EInstModifierContext
+ //
+ //.print=$mnemo{ $mnemo ? "_$mnemo" : "" }
+
+ BRIG_SEGMENT_NONE = 0, //.mnemo=""
+ BRIG_SEGMENT_FLAT = 1, //.mnemo=""
+ BRIG_SEGMENT_GLOBAL = 2,
+ BRIG_SEGMENT_READONLY = 3,
+ BRIG_SEGMENT_KERNARG = 4,
+ BRIG_SEGMENT_GROUP = 5,
+ BRIG_SEGMENT_PRIVATE = 6,
+ BRIG_SEGMENT_SPILL = 7,
+ BRIG_SEGMENT_ARG = 8,
+
+ BRIG_SEGMENT_FIRST_USER_DEFINED = 128, //.skip
+
+ BRIG_SEGMENT_AMD_GCN = 9, //.mnemo="region"
+};
+
+enum BrigPackedTypeBits {
+
+ //.nodump
+ //
+ //.nollvm
+
+ BRIG_TYPE_BASE_SIZE = 5,
+ BRIG_TYPE_PACK_SIZE = 2,
+ BRIG_TYPE_ARRAY_SIZE = 1,
+
+ BRIG_TYPE_BASE_SHIFT = 0,
+ BRIG_TYPE_PACK_SHIFT = BRIG_TYPE_BASE_SHIFT + BRIG_TYPE_BASE_SIZE,
+ BRIG_TYPE_ARRAY_SHIFT = BRIG_TYPE_PACK_SHIFT + BRIG_TYPE_PACK_SIZE,
+
+ BRIG_TYPE_BASE_MASK = ((1 << BRIG_TYPE_BASE_SIZE) - 1) << BRIG_TYPE_BASE_SHIFT,
+ BRIG_TYPE_PACK_MASK = ((1 << BRIG_TYPE_PACK_SIZE) - 1) << BRIG_TYPE_PACK_SHIFT,
+ BRIG_TYPE_ARRAY_MASK = ((1 << BRIG_TYPE_ARRAY_SIZE) - 1) << BRIG_TYPE_ARRAY_SHIFT,
+
+ BRIG_TYPE_PACK_NONE = 0 << BRIG_TYPE_PACK_SHIFT,
+ BRIG_TYPE_PACK_32 = 1 << BRIG_TYPE_PACK_SHIFT,
+ BRIG_TYPE_PACK_64 = 2 << BRIG_TYPE_PACK_SHIFT,
+ BRIG_TYPE_PACK_128 = 3 << BRIG_TYPE_PACK_SHIFT,
+
+ BRIG_TYPE_ARRAY = 1 << BRIG_TYPE_ARRAY_SHIFT
+};
+
+enum BrigType {
+
+ //.numBits={ /ARRAY$/ ? undef : /([0-9]+)X([0-9]+)/ ? $1*$2 : /([0-9]+)/ ? $1 : undef }
+ //.numBits_switch //.numBits_proto="unsigned getBrigTypeNumBits(unsigned arg)" //.numBits_default="assert(0); return 0"
+ //.numBytes=$numBits{ $numBits > 1 ? $numBits/8 : undef }
+ //.numBytes_switch //.numBytes_proto="unsigned getBrigTypeNumBytes(unsigned arg)" //.numBytes_default="assert(0); return 0"
+ //
+ //.mnemo={ s/^BRIG_TYPE_//;lc }
+ //.mnemo_token=_EMType
+ //
+ //.array={/ARRAY$/?"true":"false"}
+ //.array_switch //.array_proto="bool isArrayType(unsigned type)" //.array_arg="type"
+ //.array_default="return false"
+ //
+ //.a2e={/(.*)_ARRAY$/? $1 : "BRIG_TYPE_NONE"}
+ //.a2e_switch //.a2e_proto="unsigned arrayType2elementType(unsigned type)" //.a2e_arg="type"
+ //.a2e_default="return BRIG_TYPE_NONE"
+ //
+ //.e2a={/_ARRAY$/? "BRIG_TYPE_NONE" : /_NONE$/ ? "BRIG_TYPE_NONE" : /_B1$/ ? "BRIG_TYPE_NONE" : $_ . "_ARRAY"}
+ //.e2a_switch //.e2a_proto="unsigned elementType2arrayType(unsigned type)" //.e2a_arg="type"
+ //.e2a_default="return BRIG_TYPE_NONE"
+ //
+ //.t2s={s/^BRIG_TYPE_//;lc s/_ARRAY$/[]/;lc}
+ //.t2s_switch //.t2s_proto="const char* type2name(unsigned type)" //.t2s_arg="type"
+ //.t2s_default="return NULL"
+ //
+ //.dispatch_switch //.dispatch_incfile=TemplateUtilities
+ //.dispatch_proto="template<typename RetType, typename Visitor>\nRetType dispatchByType_gen(unsigned type, Visitor& v)"
+ //.dispatch={ /ARRAY$/ ? "v.visitNone(type)" : /^BRIG_TYPE_([BUSF]|SIG)[0-9]+/ ? "v.template visit< BrigTypeTraits<$_> >()" : "v.visitNone(type)" }
+ //.dispatch_arg="type" //.dispatch_default="return v.visitNone(type)"
+ //
+ //- .tdname=BrigType
+ //
+ //.print=$mnemo{ "_$mnemo" }
+
+ BRIG_TYPE_NONE = 0, //.mnemo="" //.print=""
+ BRIG_TYPE_U8 = 1, //.ctype=uint8_t
+ BRIG_TYPE_U16 = 2, //.ctype=uint16_t
+ BRIG_TYPE_U32 = 3, //.ctype=uint32_t
+ BRIG_TYPE_U64 = 4, //.ctype=uint64_t
+ BRIG_TYPE_S8 = 5, //.ctype=int8_t
+ BRIG_TYPE_S16 = 6, //.ctype=int16_t
+ BRIG_TYPE_S32 = 7, //.ctype=int32_t
+ BRIG_TYPE_S64 = 8, //.ctype=int64_t
+ BRIG_TYPE_F16 = 9, //.ctype=f16_t
+ BRIG_TYPE_F32 = 10, //.ctype=float
+ BRIG_TYPE_F64 = 11, //.ctype=double
+ BRIG_TYPE_B1 = 12, //.ctype=bool //.numBytes=1
+ BRIG_TYPE_B8 = 13, //.ctype=uint8_t
+ BRIG_TYPE_B16 = 14, //.ctype=uint16_t
+ BRIG_TYPE_B32 = 15, //.ctype=uint32_t
+ BRIG_TYPE_B64 = 16, //.ctype=uint64_t
+ BRIG_TYPE_B128 = 17, //.ctype=b128_t
+ BRIG_TYPE_SAMP = 18, //.mnemo=samp //.numBits=64
+ BRIG_TYPE_ROIMG = 19, //.mnemo=roimg //.numBits=64
+ BRIG_TYPE_WOIMG = 20, //.mnemo=woimg //.numBits=64
+ BRIG_TYPE_RWIMG = 21, //.mnemo=rwimg //.numBits=64
+ BRIG_TYPE_SIG32 = 22, //.mnemo=sig32 //.numBits=64
+ BRIG_TYPE_SIG64 = 23, //.mnemo=sig64 //.numBits=64
+
+ BRIG_TYPE_U8X4 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_32, //.ctype=uint8_t
+ BRIG_TYPE_U8X8 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_64, //.ctype=uint8_t
+ BRIG_TYPE_U8X16 = BRIG_TYPE_U8 | BRIG_TYPE_PACK_128, //.ctype=uint8_t
+ BRIG_TYPE_U16X2 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_32, //.ctype=uint16_t
+ BRIG_TYPE_U16X4 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_64, //.ctype=uint16_t
+ BRIG_TYPE_U16X8 = BRIG_TYPE_U16 | BRIG_TYPE_PACK_128, //.ctype=uint16_t
+ BRIG_TYPE_U32X2 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_64, //.ctype=uint32_t
+ BRIG_TYPE_U32X4 = BRIG_TYPE_U32 | BRIG_TYPE_PACK_128, //.ctype=uint32_t
+ BRIG_TYPE_U64X2 = BRIG_TYPE_U64 | BRIG_TYPE_PACK_128, //.ctype=uint64_t
+ BRIG_TYPE_S8X4 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_32, //.ctype=int8_t
+ BRIG_TYPE_S8X8 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_64, //.ctype=int8_t
+ BRIG_TYPE_S8X16 = BRIG_TYPE_S8 | BRIG_TYPE_PACK_128, //.ctype=int8_t
+ BRIG_TYPE_S16X2 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_32, //.ctype=int16_t
+ BRIG_TYPE_S16X4 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_64, //.ctype=int16_t
+ BRIG_TYPE_S16X8 = BRIG_TYPE_S16 | BRIG_TYPE_PACK_128, //.ctype=int16_t
+ BRIG_TYPE_S32X2 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_64, //.ctype=int32_t
+ BRIG_TYPE_S32X4 = BRIG_TYPE_S32 | BRIG_TYPE_PACK_128, //.ctype=int32_t
+ BRIG_TYPE_S64X2 = BRIG_TYPE_S64 | BRIG_TYPE_PACK_128, //.ctype=int64_t
+ BRIG_TYPE_F16X2 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_32, //.ctype=f16_t
+ BRIG_TYPE_F16X4 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_64, //.ctype=f16_t
+ BRIG_TYPE_F16X8 = BRIG_TYPE_F16 | BRIG_TYPE_PACK_128, //.ctype=f16_t
+ BRIG_TYPE_F32X2 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_64, //.ctype=float
+ BRIG_TYPE_F32X4 = BRIG_TYPE_F32 | BRIG_TYPE_PACK_128, //.ctype=float
+ BRIG_TYPE_F64X2 = BRIG_TYPE_F64 | BRIG_TYPE_PACK_128, //.ctype=double
+
+ BRIG_TYPE_U8_ARRAY = BRIG_TYPE_U8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U16_ARRAY = BRIG_TYPE_U16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U32_ARRAY = BRIG_TYPE_U32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U64_ARRAY = BRIG_TYPE_U64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S8_ARRAY = BRIG_TYPE_S8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S16_ARRAY = BRIG_TYPE_S16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S32_ARRAY = BRIG_TYPE_S32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S64_ARRAY = BRIG_TYPE_S64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F16_ARRAY = BRIG_TYPE_F16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F32_ARRAY = BRIG_TYPE_F32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F64_ARRAY = BRIG_TYPE_F64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_B8_ARRAY = BRIG_TYPE_B8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_B16_ARRAY = BRIG_TYPE_B16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_B32_ARRAY = BRIG_TYPE_B32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_B64_ARRAY = BRIG_TYPE_B64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_B128_ARRAY = BRIG_TYPE_B128 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_SAMP_ARRAY = BRIG_TYPE_SAMP | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_ROIMG_ARRAY = BRIG_TYPE_ROIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_WOIMG_ARRAY = BRIG_TYPE_WOIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_RWIMG_ARRAY = BRIG_TYPE_RWIMG | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_SIG32_ARRAY = BRIG_TYPE_SIG32 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_SIG64_ARRAY = BRIG_TYPE_SIG64 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U8X4_ARRAY = BRIG_TYPE_U8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U8X8_ARRAY = BRIG_TYPE_U8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U8X16_ARRAY = BRIG_TYPE_U8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U16X2_ARRAY = BRIG_TYPE_U16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U16X4_ARRAY = BRIG_TYPE_U16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U16X8_ARRAY = BRIG_TYPE_U16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U32X2_ARRAY = BRIG_TYPE_U32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U32X4_ARRAY = BRIG_TYPE_U32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_U64X2_ARRAY = BRIG_TYPE_U64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S8X4_ARRAY = BRIG_TYPE_S8X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S8X8_ARRAY = BRIG_TYPE_S8X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S8X16_ARRAY = BRIG_TYPE_S8X16 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S16X2_ARRAY = BRIG_TYPE_S16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S16X4_ARRAY = BRIG_TYPE_S16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S16X8_ARRAY = BRIG_TYPE_S16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S32X2_ARRAY = BRIG_TYPE_S32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S32X4_ARRAY = BRIG_TYPE_S32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_S64X2_ARRAY = BRIG_TYPE_S64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F16X2_ARRAY = BRIG_TYPE_F16X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F16X4_ARRAY = BRIG_TYPE_F16X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F16X8_ARRAY = BRIG_TYPE_F16X8 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F32X2_ARRAY = BRIG_TYPE_F32X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F32X4_ARRAY = BRIG_TYPE_F32X4 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+ BRIG_TYPE_F64X2_ARRAY = BRIG_TYPE_F64X2 | BRIG_TYPE_ARRAY, //.mnemo="" //.print=""
+
+ // Used internally
+ BRIG_TYPE_INVALID = (unsigned) -1 //.skip
+};
+
+enum BrigVariableModifierMask {
+
+ //.nodump
+
+ BRIG_VARIABLE_DEFINITION = 1,
+ BRIG_VARIABLE_CONST = 2
+};
+
+enum BrigWidth {
+
+ //.tddef=1
+ //
+ //.print={ s/^BRIG_WIDTH_//; "_width($_)" }
+
+ BRIG_WIDTH_NONE = 0,
+ BRIG_WIDTH_1 = 1,
+ BRIG_WIDTH_2 = 2,
+ BRIG_WIDTH_4 = 3,
+ BRIG_WIDTH_8 = 4,
+ BRIG_WIDTH_16 = 5,
+ BRIG_WIDTH_32 = 6,
+ BRIG_WIDTH_64 = 7,
+ BRIG_WIDTH_128 = 8,
+ BRIG_WIDTH_256 = 9,
+ BRIG_WIDTH_512 = 10,
+ BRIG_WIDTH_1024 = 11,
+ BRIG_WIDTH_2048 = 12,
+ BRIG_WIDTH_4096 = 13,
+ BRIG_WIDTH_8192 = 14,
+ BRIG_WIDTH_16384 = 15,
+ BRIG_WIDTH_32768 = 16,
+ BRIG_WIDTH_65536 = 17,
+ BRIG_WIDTH_131072 = 18,
+ BRIG_WIDTH_262144 = 19,
+ BRIG_WIDTH_524288 = 20,
+ BRIG_WIDTH_1048576 = 21,
+ BRIG_WIDTH_2097152 = 22,
+ BRIG_WIDTH_4194304 = 23,
+ BRIG_WIDTH_8388608 = 24,
+ BRIG_WIDTH_16777216 = 25,
+ BRIG_WIDTH_33554432 = 26,
+ BRIG_WIDTH_67108864 = 27,
+ BRIG_WIDTH_134217728 = 28,
+ BRIG_WIDTH_268435456 = 29,
+ BRIG_WIDTH_536870912 = 30,
+ BRIG_WIDTH_1073741824 = 31,
+ BRIG_WIDTH_2147483648 = 32,
+ BRIG_WIDTH_WAVESIZE = 33,
+ BRIG_WIDTH_ALL = 34,
+
+ BRIG_WIDTH_LAST //.skip
+};
+
+struct BrigUInt64 { //.isroot //.standalone
+ uint32_t lo; //.defValue=0
+ uint32_t hi; //.defValue=0
+
+ //+hcode KLASS& operator=(uint64_t rhs);
+ //+hcode operator uint64_t();
+ //+implcode inline KLASS& KLASS::operator=(uint64_t rhs) { lo() = (uint32_t)rhs; hi() = (uint32_t)(rhs >> 32); return *this; }
+ //+implcode inline KLASS::operator uint64_t() { return ((uint64_t)hi()) << 32 | lo(); }
+};
+
+struct BrigAluModifier { //.isroot //.standalone
+ BrigAluModifier8_t allBits; //.defValue=0
+ //^^ bool ftz; //.wtype=BitValRef<0>
+};
+
+struct BrigBase { //.nowrap
+ uint16_t byteCount;
+ BrigKind16_t kind;
+};
+
+//.alias Code:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_CODE };
+//.alias Directive:Code { //.generic };
+//.alias Operand:Base { //.generic //.isroot //.section=BRIG_SECTION_INDEX_OPERAND };
+
+struct BrigData {
+ //.nowrap
+ uint32_t byteCount;
+ uint8_t bytes[1];
+};
+
+struct BrigExecutableModifier { //.isroot //.standalone
+ BrigExecutableModifier8_t allBits; //.defValue=0
+ //^^ bool isDefinition; //.wtype=BitValRef<0>
+};
+
+struct BrigMemoryModifier { //.isroot //.standalone
+ BrigMemoryModifier8_t allBits; //.defValue=0
+ //^^ bool isConst; //.wtype=BitValRef<0>
+};
+
+struct BrigSegCvtModifier { //.isroot //.standalone
+ BrigSegCvtModifier8_t allBits; //.defValue=0
+ //^^ bool isNoNull; //.wtype=BitValRef<0>
+};
+
+struct BrigVariableModifier { //.isroot //.standalone
+ BrigVariableModifier8_t allBits; //.defValue=0
+
+ //^^ bool isDefinition; //.wtype=BitValRef<0>
+ //^^ bool isConst; //.wtype=BitValRef<1>
+};
+
+struct BrigDirectiveArgBlockEnd {
+ BrigBase base;
+};
+
+struct BrigDirectiveArgBlockStart {
+ BrigBase base;
+};
+
+struct BrigDirectiveComment {
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveControl {
+ BrigBase base;
+ BrigControlDirective16_t control;
+ uint16_t reserved; //.defValue=0
+ BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveExecutable { //.generic
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ uint16_t outArgCount; //.defValue=0
+ uint16_t inArgCount; //.defValue=0
+ BrigCodeOffset32_t firstInArg;
+ BrigCodeOffset32_t firstCodeBlockEntry;
+ BrigCodeOffset32_t nextModuleEntry;
+ BrigExecutableModifier modifier; //.acc=subItem<ExecutableModifier> //.wtype=ExecutableModifier
+ BrigLinkage8_t linkage;
+ uint16_t reserved; //.defValue=0
+};
+
+//.alias DirectiveKernel:DirectiveExecutable { };
+//.alias DirectiveFunction:DirectiveExecutable { };
+//.alias DirectiveSignature:DirectiveExecutable { };
+//.alias DirectiveIndirectFunction:DirectiveExecutable { };
+
+struct BrigDirectiveExtension {
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveFbarrier {
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier
+ BrigLinkage8_t linkage;
+ uint16_t reserved; //.defValue=0
+};
+
+struct BrigDirectiveLabel {
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+};
+
+struct BrigDirectiveLoc {
+ BrigBase base;
+ BrigDataOffsetString32_t filename;
+ uint32_t line;
+ uint32_t column; //.defValue=1
+};
+
+struct BrigDirectiveNone { //.enum=BRIG_KIND_NONE
+ BrigBase base;
+};
+
+struct BrigDirectivePragma {
+ BrigBase base;
+ BrigDataOffsetOperandList32_t operands;
+};
+
+struct BrigDirectiveVariable {
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ BrigOperandOffset32_t init;
+ BrigType16_t type;
+
+ //+hcode bool isArray();
+ //+implcode inline bool KLASS::isArray() { return isArrayType(type()); }
+
+ //+hcode unsigned elementType();
+ //+implcode inline unsigned KLASS::elementType() { return isArray()? arrayType2elementType(type()) : type(); }
+
+ BrigSegment8_t segment;
+ BrigAlignment8_t align;
+ BrigUInt64 dim; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigVariableModifier modifier; //.acc=subItem<VariableModifier> //.wtype=VariableModifier
+ BrigLinkage8_t linkage;
+ BrigAllocation8_t allocation;
+ uint8_t reserved; //.defValue=0
+};
+
+struct BrigDirectiveModule {
+ BrigBase base;
+ BrigDataOffsetString32_t name;
+ BrigVersion32_t hsailMajor; //.wtype=ValRef<uint32_t>
+ BrigVersion32_t hsailMinor; //.wtype=ValRef<uint32_t>
+ BrigProfile8_t profile;
+ BrigMachineModel8_t machineModel;
+ BrigRound8_t defaultFloatRound;
+ uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstBase { //.wname=Inst //.generic //.parent=BrigCode
+ BrigBase base;
+ BrigOpcode16_t opcode;
+ BrigType16_t type;
+ BrigDataOffsetOperandList32_t operands;
+
+ //+hcode Operand operand(int index);
+ //+implcode inline Operand KLASS::operand(int index) { return operands()[index]; }
+};
+
+struct BrigInstAddr {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstAtomic {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigMemoryScope8_t memoryScope;
+ BrigAtomicOperation8_t atomicOperation;
+ uint8_t equivClass;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstBasic {
+ BrigInstBase base;
+};
+
+struct BrigInstBr {
+ BrigInstBase base;
+ BrigWidth8_t width;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstCmp {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+ BrigCompareOperation8_t compare;
+ BrigPack8_t pack;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstCvt {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+ BrigRound8_t round;
+};
+
+struct BrigInstImage {
+ BrigInstBase base;
+ BrigType16_t imageType;
+ BrigType16_t coordType;
+ BrigImageGeometry8_t geometry;
+ uint8_t equivClass;
+ uint16_t reserved; //.defValue=0
+};
+
+struct BrigInstLane {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ BrigWidth8_t width;
+ uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstMem {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigAlignment8_t align;
+ uint8_t equivClass;
+ BrigWidth8_t width;
+ BrigMemoryModifier modifier; //.acc=subItem<MemoryModifier> //.wtype=MemoryModifier
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstMemFence {
+ BrigInstBase base;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigMemoryScope8_t globalSegmentMemoryScope;
+ BrigMemoryScope8_t groupSegmentMemoryScope;
+ BrigMemoryScope8_t imageSegmentMemoryScope;
+};
+
+struct BrigInstMod {
+ BrigInstBase base;
+ BrigAluModifier modifier; //.acc=subItem<AluModifier> //.wtype=AluModifier
+ BrigRound8_t round;
+ BrigPack8_t pack;
+ uint8_t reserved; //.defValue=0
+};
+
+struct BrigInstQueryImage {
+ BrigInstBase base;
+ BrigType16_t imageType;
+ BrigImageGeometry8_t geometry;
+ BrigImageQuery8_t imageQuery;
+};
+
+struct BrigInstQuerySampler {
+ BrigInstBase base;
+ BrigSamplerQuery8_t samplerQuery;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstQueue {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ BrigMemoryOrder8_t memoryOrder;
+ uint16_t reserved; //.defValue=0
+};
+
+struct BrigInstSeg {
+ BrigInstBase base;
+ BrigSegment8_t segment;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigInstSegCvt {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ BrigSegment8_t segment;
+ BrigSegCvtModifier modifier; //.acc=subItem<SegCvtModifier> //.wtype=SegCvtModifier
+};
+
+struct BrigInstSignal {
+ BrigInstBase base;
+ BrigType16_t signalType;
+ BrigMemoryOrder8_t memoryOrder;
+ BrigAtomicOperation8_t signalOperation;
+};
+
+struct BrigInstSourceType {
+ BrigInstBase base;
+ BrigType16_t sourceType;
+ uint16_t reserved; //.defValue=0
+};
+
+struct BrigOperandAddress {
+ BrigBase base;
+ BrigCodeOffset32_t symbol; //.wtype=ItemRef<DirectiveVariable>
+ BrigOperandOffset32_t reg; //.wtype=ItemRef<OperandRegister>
+ BrigUInt64 offset; //.acc=subItem<UInt64> //.wtype=UInt64
+};
+
+struct BrigOperandAlign {
+ BrigBase base;
+ BrigAlignment8_t align;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigOperandCodeList {
+ BrigBase base;
+ BrigDataOffsetCodeList32_t elements;
+
+ //+hcode unsigned elementCount();
+ //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+ //+hcode Code elements(int index);
+ //+implcode inline Code KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandCodeRef {
+ BrigBase base;
+ BrigCodeOffset32_t ref;
+};
+
+struct BrigOperandConstantBytes {
+ BrigBase base;
+ BrigType16_t type; //.defValue=0
+ uint16_t reserved; //.defValue=0
+ BrigDataOffsetString32_t bytes;
+};
+
+struct BrigOperandConstantOperandList {
+ BrigBase base;
+ BrigType16_t type;
+ uint16_t reserved; //.defValue=0
+ BrigDataOffsetOperandList32_t elements;
+
+ //+hcode unsigned elementCount();
+ //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+ //+hcode Operand elements(int index);
+ //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandConstantImage {
+ BrigBase base;
+ BrigType16_t type;
+ BrigImageGeometry8_t geometry;
+ BrigImageChannelOrder8_t channelOrder;
+ BrigImageChannelType8_t channelType;
+ uint8_t reserved[3]; //.defValue=0
+ BrigUInt64 width; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigUInt64 height; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigUInt64 depth; //.acc=subItem<UInt64> //.wtype=UInt64
+ BrigUInt64 array; //.acc=subItem<UInt64> //.wtype=UInt64
+};
+
+struct BrigOperandOperandList {
+ BrigBase base;
+ BrigDataOffsetOperandList32_t elements;
+
+ //+hcode unsigned elementCount();
+ //+implcode inline unsigned KLASS::elementCount() { return elements().size(); }
+ //+hcode Operand elements(int index);
+ //+implcode inline Operand KLASS::elements(int index) { return elements()[index]; }
+};
+
+struct BrigOperandRegister {
+ BrigBase base;
+ BrigRegisterKind16_t regKind;
+ uint16_t regNum;
+};
+
+struct BrigOperandConstantSampler {
+ BrigBase base;
+ BrigType16_t type;
+ BrigSamplerCoordNormalization8_t coord;
+ BrigSamplerFilter8_t filter;
+ BrigSamplerAddressing8_t addressing;
+ uint8_t reserved[3]; //.defValue=0
+};
+
+struct BrigOperandString {
+ BrigBase base;
+ BrigDataOffsetString32_t string;
+};
+
+struct BrigOperandWavesize {
+ BrigBase base;
+};
+
+//.ignore{
+
+enum BrigExceptionsMask {
+ BRIG_EXCEPTIONS_INVALID_OPERATION = 1 << 0,
+ BRIG_EXCEPTIONS_DIVIDE_BY_ZERO = 1 << 1,
+ BRIG_EXCEPTIONS_OVERFLOW = 1 << 2,
+ BRIG_EXCEPTIONS_UNDERFLOW = 1 << 3,
+ BRIG_EXCEPTIONS_INEXACT = 1 << 4,
+
+ BRIG_EXCEPTIONS_FIRST_USER_DEFINED = 1 << 16
+};
+
+struct BrigSectionHeader {
+ uint64_t byteCount;
+ uint32_t headerByteCount;
+ uint32_t nameLength;
+ uint8_t name[1];
+};
+
+#define MODULE_IDENTIFICATION_LENGTH (8)
+
+struct BrigModuleHeader {
+ char identification[MODULE_IDENTIFICATION_LENGTH];
+ BrigVersion32_t brigMajor;
+ BrigVersion32_t brigMinor;
+ uint64_t byteCount;
+ uint8_t hash[64];
+ uint32_t reserved;
+ uint32_t sectionCount;
+ uint64_t sectionIndex;
+};
+
+typedef BrigModuleHeader* BrigModule_t;
+
+#endif // defined(INCLUDED_BRIG_H)
+//}
diff --git a/src/arch/hsail/SConscript b/src/arch/hsail/SConscript
new file mode 100644
index 000000000..3455823a6
--- /dev/null
+++ b/src/arch/hsail/SConscript
@@ -0,0 +1,54 @@
+# -*- mode:python -*-
+
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+if not env['BUILD_GPU']:
+ Return()
+
+if env['TARGET_GPU_ISA'] == 'hsail':
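+    # gen.py takes the three generated files as its arguments, in the order
+    # header (gen_decl.hh), decoder (gpu_decoder.cc), exec (gen_exec.cc).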
+ env.Command(['insts/gen_decl.hh', 'gpu_decoder.cc', 'insts/gen_exec.cc'],
+ 'gen.py', '$SOURCE $TARGETS')
+
+ Source('generic_types.cc')
+ Source('gpu_decoder.cc')
+ Source('insts/branch.cc')
+ Source('insts/gen_exec.cc')
+ Source('insts/gpu_static_inst.cc')
+ Source('insts/main.cc')
+ Source('insts/pseudo_inst.cc')
+ Source('insts/mem.cc')
+ Source('operand.cc')
diff --git a/src/arch/hsail/SConsopts b/src/arch/hsail/SConsopts
new file mode 100644
index 000000000..641963c82
--- /dev/null
+++ b/src/arch/hsail/SConsopts
@@ -0,0 +1,40 @@
+# -*- mode:python -*-
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+all_gpu_isa_list.append('hsail')
diff --git a/src/arch/hsail/gen.py b/src/arch/hsail/gen.py
new file mode 100755
index 000000000..f2996019b
--- /dev/null
+++ b/src/arch/hsail/gen.py
@@ -0,0 +1,806 @@
+#! /usr/bin/python
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Steve Reinhardt
+#
+
+import sys, re
+
+from m5.util import code_formatter
+
+if len(sys.argv) != 4:
+ print "Error: need 3 args (file names)"
+    sys.exit(1)
+
+header_code = code_formatter()
+decoder_code = code_formatter()
+exec_code = code_formatter()
+
+###############
+#
+# Generate file prologs (includes etc.)
+#
+###############
+
+header_code('''
+#include "arch/hsail/insts/decl.hh"
+#include "base/bitfield.hh"
+#include "gpu-compute/hsail_code.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+''')
+header_code.indent()
+
+decoder_code('''
+#include "arch/hsail/gpu_decoder.hh"
+#include "arch/hsail/insts/branch.hh"
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gen_decl.hh"
+#include "arch/hsail/insts/mem.hh"
+#include "arch/hsail/insts/mem_impl.hh"
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+ std::vector<GPUStaticInst*> Decoder::decodedInsts;
+
+ GPUStaticInst*
+ Decoder::decode(MachInst machInst)
+ {
+ using namespace Brig;
+
+ const BrigInstBase *ib = machInst.brigInstBase;
+ const BrigObject *obj = machInst.brigObj;
+
+ switch(ib->opcode) {
+''')
+decoder_code.indent()
+decoder_code.indent()
+
+exec_code('''
+#include "arch/hsail/insts/gen_decl.hh"
+#include "base/intmath.hh"
+
+namespace HsailISA
+{
+''')
+exec_code.indent()
+
+###############
+#
+# Define code templates for class declarations (for header file)
+#
+###############
+
+# Basic header template for an instruction with no template parameters.
+header_template_nodt = '''
+class $class_name : public $base_class
+{
+ public:
+ typedef $base_class Base;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+# Basic header template for an instruction with a single DataType
+# template parameter.
+header_template_1dt = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType>
+{
+ public:
+ typedef $base_class<DataType> Base;
+ typedef typename DataType::CType CType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+header_template_1dt_noexec = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType>
+{
+ public:
+ typedef $base_class<DataType> Base;
+ typedef typename DataType::CType CType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+};
+
+'''
+
+# Same as header_template_1dt, except the base class has a second
+# template parameter NumSrcOperands to allow a variable number of
+# source operands. Note that since this is implemented with an array,
+# it only works for instructions where all sources are of the same
+# type (like most arithmetics).
+header_template_1dt_varsrcs = '''
+template<typename DataType>
+class $class_name : public $base_class<DataType, $num_srcs>
+{
+ public:
+ typedef $base_class<DataType, $num_srcs> Base;
+ typedef typename DataType::CType CType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+# Header template for instruction with two DataType template
+# parameters, one for the dest and one for the source. This is used
+# by compare and convert.
+header_template_2dt = '''
+template<typename DestDataType, class SrcDataType>
+class $class_name : public $base_class<DestDataType, SrcDataType>
+{
+ public:
+ typedef $base_class<DestDataType, SrcDataType> Base;
+ typedef typename DestDataType::CType DestCType;
+ typedef typename SrcDataType::CType SrcCType;
+
+ $class_name(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "$opcode")
+ {
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+};
+
+'''
+
+header_templates = {
+ 'ArithInst': header_template_1dt_varsrcs,
+ 'CmovInst': header_template_1dt,
+ 'ClassInst': header_template_1dt,
+ 'ShiftInst': header_template_1dt,
+ 'ExtractInsertInst': header_template_1dt,
+ 'CmpInst': header_template_2dt,
+ 'CvtInst': header_template_2dt,
+ 'LdInst': '',
+ 'StInst': '',
+ 'SpecialInstNoSrc': header_template_nodt,
+ 'SpecialInst1Src': header_template_nodt,
+ 'SpecialInstNoSrcNoDest': '',
+}
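+# Note: the empty entries above mean no class declaration is generated for
+# those base classes; presumably their declarations are hand-written
+# elsewhere (e.g. in insts/mem.hh or insts/decl.hh).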
+
+###############
+#
+# Define code templates for exec functions
+#
+###############
+
+# exec function body
+exec_template_nodt_nosrc = '''
+void
+$class_name::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef Base::DestCType DestCType;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ DestCType dest_val = $expr;
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_nodt_1src = '''
+void
+$class_name::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef Base::DestCType DestCType;
+ typedef Base::SrcCType SrcCType;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ SrcCType src_val0 = this->src0.get<SrcCType>(w, lane);
+ DestCType dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_1dt_varsrcs = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType dest_val;
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<CType>(w, lane);
+ }
+
+ CType src_val[$num_srcs];
+
+ for (int i = 0; i < $num_srcs; ++i) {
+ src_val[i] = this->src[i].template get<CType>(w, lane);
+ }
+
+ dest_val = (CType)($expr);
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_1dt_3srcs = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename Base::Src0CType Src0T;
+ typedef typename Base::Src1CType Src1T;
+ typedef typename Base::Src2CType Src2T;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType dest_val;
+
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<CType>(w, lane);
+ }
+
+ Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
+ Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
+ Src2T src_val2 = this->src2.template get<Src2T>(w, lane);
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_1dt_2src_1dest = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename Base::DestCType DestT;
+ typedef CType Src0T;
+ typedef typename Base::Src1CType Src1T;
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ DestT dest_val;
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<DestT>(w, lane);
+ }
+ Src0T src_val0 = this->src0.template get<Src0T>(w, lane);
+ Src1T src_val1 = this->src1.template get<Src1T>(w, lane);
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_shift = '''
+template<typename DataType>
+void
+$class_name<DataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType dest_val;
+
+ if ($dest_is_src_flag) {
+ dest_val = this->dest.template get<CType>(w, lane);
+ }
+
+ CType src_val0 = this->src0.template get<CType>(w, lane);
+ uint32_t src_val1 = this->src1.template get<uint32_t>(w, lane);
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_template_2dt = '''
+template<typename DestDataType, class SrcDataType>
+void
+$class_name<DestDataType, SrcDataType>::execute(GPUDynInstPtr gpuDynInst)
+{
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ DestCType dest_val;
+ SrcCType src_val[$num_srcs];
+
+ for (int i = 0; i < $num_srcs; ++i) {
+ src_val[i] = this->src[i].template get<SrcCType>(w, lane);
+ }
+
+ dest_val = $expr;
+
+ this->dest.set(w, lane, dest_val);
+ }
+ }
+}
+
+'''
+
+exec_templates = {
+ 'ArithInst': exec_template_1dt_varsrcs,
+ 'CmovInst': exec_template_1dt_3srcs,
+ 'ExtractInsertInst': exec_template_1dt_3srcs,
+ 'ClassInst': exec_template_1dt_2src_1dest,
+ 'CmpInst': exec_template_2dt,
+ 'CvtInst': exec_template_2dt,
+ 'LdInst': '',
+ 'StInst': '',
+ 'SpecialInstNoSrc': exec_template_nodt_nosrc,
+ 'SpecialInst1Src': exec_template_nodt_1src,
+ 'SpecialInstNoSrcNoDest': '',
+}
+
+###############
+#
+# Define code templates for the decoder cases
+#
+###############
+
+# decode template for nodt-opcode case
+decode_nodt_template = '''
+ case BRIG_OPCODE_$brig_opcode_upper: return $constructor(ib, obj);'''
+
+decode_case_prolog_class_inst = '''
+ case BRIG_OPCODE_$brig_opcode_upper:
+ {
+ //const BrigOperandBase *baseOp = obj->getOperand(ib->operands[1]);
+ BrigType16_t type = ((BrigInstSourceType*)ib)->sourceType;
+ //switch (baseOp->kind) {
+ // case BRIG_OPERAND_REG:
+ // type = ((const BrigOperandReg*)baseOp)->type;
+ // break;
+ // case BRIG_OPERAND_IMMED:
+ // type = ((const BrigOperandImmed*)baseOp)->type;
+ // break;
+ // default:
+ // fatal("CLASS unrecognized kind of operand %d\\n",
+ // baseOp->kind);
+ //}
+ switch (type) {'''
+
+# common prolog for 1dt- or 2dt-opcode case: switch on data type
+decode_case_prolog = '''
+ case BRIG_OPCODE_$brig_opcode_upper:
+ {
+ switch (ib->type) {'''
+
+# single-level decode case entry (for 1dt opcodes)
+decode_case_entry = \
+' case BRIG_TYPE_$type_name: return $constructor(ib, obj);'
+
+decode_store_prolog = \
+' case BRIG_TYPE_$type_name: {'
+
+decode_store_case_epilog = '''
+ }'''
+
+decode_store_case_entry = \
+' return $constructor(ib, obj);'
+
+# common epilog for type switch
+decode_case_epilog = '''
+ default: fatal("$brig_opcode_upper: unrecognized type %d\\n",
+ ib->type);
+ }
+ }
+ break;'''
+
+# Additional templates for nested decode on a second type field (for
+# compare and convert). These are used in place of the
+# decode_case_entry template to create a second-level switch on the
+# second type field inside each case of the first-level type switch.
+# Because the name and location of the second type can vary, the Brig
+# instruction type must be provided in $brig_type, and the name of the
+# second type field must be provided in $type_field.
+decode_case2_prolog = '''
+ case BRIG_TYPE_$type_name:
+ switch (((Brig$brig_type*)ib)->$type2_field) {'''
+
+decode_case2_entry = \
+' case BRIG_TYPE_$type2_name: return $constructor(ib, obj);'
+
+decode_case2_epilog = '''
+ default: fatal("$brig_opcode_upper: unrecognized $type2_field %d\\n",
+ ((Brig$brig_type*)ib)->$type2_field);
+ }
+ break;'''
+
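+# Roughly, for an opcode like Cvt (brig_type = 'InstCvt', type2_field =
+# 'sourceType'), these templates expand into a nested switch along the
+# lines of:
+#   case BRIG_TYPE_U32:
+#     switch (((BrigInstCvt*)ib)->sourceType) {
+#       case BRIG_TYPE_F32: return new Cvt<U32,F32>(ib, obj);
+#       ...
+#       default: fatal(...);
+#     }
+#     break;
+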
+# Figure out how many source operands an expr needs by looking for the
+# highest-numbered srcN value referenced. Since sources are numbered
+# starting at 0, the return value is N+1.
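+# For example, num_src_operands('src0 * src1 + src2') returns 3, while an
+# expression with no srcN references (e.g. 'lane') returns 0.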
+def num_src_operands(expr):
+ if expr.find('src2') != -1:
+ return 3
+ elif expr.find('src1') != -1:
+ return 2
+ elif expr.find('src0') != -1:
+ return 1
+ else:
+ return 0
+
+###############
+#
+# Define final code generation methods
+#
+# The gen() and gen_special() functions are the interface for
+# generating actual instructions.
+#
+###############
+
+# Generate class declaration, exec function, and decode switch case
+# for a brig_opcode with a single-level type switch.  The 'types'
+# parameter is a list or tuple of types for which the instruction
+# should be instantiated.
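+# For example, gen('Add', arith_types, 'src0 + src1') declares Add<DataType>,
+# generates an execute() that computes src_val[0] + src_val[1] for each
+# active lane, and emits decode cases keyed on ib->type.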
+def gen(brig_opcode, types=None, expr=None, base_class='ArithInst',
+ type2_info=None, constructor_prefix='new ', is_store=False):
+ brig_opcode_upper = brig_opcode.upper()
+ class_name = brig_opcode
+ opcode = class_name.lower()
+
+ if base_class == 'ArithInst':
+ # note that expr must be provided with ArithInst so we can
+ # derive num_srcs for the template
+ assert expr
+
+ if expr:
+ # Derive several bits of info from expr. If expr is not used,
+ # this info will be irrelevant.
+ num_srcs = num_src_operands(expr)
+ # if the RHS expression includes 'dest', then we're doing an RMW
+ # on the reg and we need to treat it like a source
+ dest_is_src = expr.find('dest') != -1
+ dest_is_src_flag = str(dest_is_src).lower() # for C++
+ if base_class in ['ShiftInst']:
+ expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
+ elif base_class in ['ArithInst', 'CmpInst', 'CvtInst']:
+ expr = re.sub(r'\bsrc(\d)\b', r'src_val[\1]', expr)
+ else:
+ expr = re.sub(r'\bsrc(\d)\b', r'src_val\1', expr)
+ expr = re.sub(r'\bdest\b', r'dest_val', expr)
+
+ # Strip template arguments off of base class before looking up
+ # appropriate templates
+ base_class_base = re.sub(r'<.*>$', '', base_class)
+ header_code(header_templates[base_class_base])
+
+ if base_class.startswith('SpecialInst'):
+ exec_code(exec_templates[base_class_base])
+ elif base_class.startswith('ShiftInst'):
+ header_code(exec_template_shift)
+ else:
+ header_code(exec_templates[base_class_base])
+
+ if not types or isinstance(types, str):
+ # Just a single type
+ constructor = constructor_prefix + class_name
+ decoder_code(decode_nodt_template)
+ else:
+ # multiple types, need at least one level of decode
+ if brig_opcode == 'Class':
+ decoder_code(decode_case_prolog_class_inst)
+ else:
+ decoder_code(decode_case_prolog)
+ if not type2_info:
+ if is_store == False:
+                # single list of types, so do a basic one-level decode
+ for type_name in types:
+ full_class_name = '%s<%s>' % (class_name, type_name.upper())
+ constructor = constructor_prefix + full_class_name
+ decoder_code(decode_case_entry)
+ else:
+                # single list of types, so do a basic one-level decode
+ for type_name in types:
+ decoder_code(decode_store_prolog)
+ type_size = int(re.findall(r'[0-9]+', type_name)[0])
+ src_size = 32
+ type_type = type_name[0]
+ full_class_name = '%s<%s,%s>' % (class_name, \
+ type_name.upper(), \
+ '%s%d' % \
+ (type_type.upper(), \
+ type_size))
+ constructor = constructor_prefix + full_class_name
+ decoder_code(decode_store_case_entry)
+ decoder_code(decode_store_case_epilog)
+ else:
+ # need secondary type switch (convert, compare)
+ # unpack extra info on second switch
+ (type2_field, types2) = type2_info
+ brig_type = 'Inst%s' % brig_opcode
+ for type_name in types:
+ decoder_code(decode_case2_prolog)
+ fmt = '%s<%s,%%s>' % (class_name, type_name.upper())
+ for type2_name in types2:
+ full_class_name = fmt % type2_name.upper()
+ constructor = constructor_prefix + full_class_name
+ decoder_code(decode_case2_entry)
+
+ decoder_code(decode_case2_epilog)
+
+ decoder_code(decode_case_epilog)
+
+###############
+#
+# Generate instructions
+#
+###############
+
+# handy abbreviations for common sets of types
+
+# arithmetic ops are typically defined only on 32- and 64-bit sizes
+arith_int_types = ('S32', 'U32', 'S64', 'U64')
+arith_float_types = ('F32', 'F64')
+arith_types = arith_int_types + arith_float_types
+
+bit_types = ('B1', 'B32', 'B64')
+
+all_int_types = ('S8', 'U8', 'S16', 'U16') + arith_int_types
+
+# I think you might be able to do 'f16' memory ops too, but we'll
+# ignore them for now.
+mem_types = all_int_types + arith_float_types
+mem_atom_types = all_int_types + ('B32', 'B64')
+
+##### Arithmetic & logical operations
+gen('Add', arith_types, 'src0 + src1')
+gen('Sub', arith_types, 'src0 - src1')
+gen('Mul', arith_types, 'src0 * src1')
+gen('Div', arith_types, 'src0 / src1')
+gen('Min', arith_types, 'std::min(src0, src1)')
+gen('Max', arith_types, 'std::max(src0, src1)')
+gen('Gcnmin', arith_types, 'std::min(src0, src1)')
+
+gen('CopySign', arith_float_types,
+ 'src1 < 0 ? -std::abs(src0) : std::abs(src0)')
+gen('Sqrt', arith_float_types, 'sqrt(src0)')
+gen('Floor', arith_float_types, 'floor(src0)')
+
+# "fast" sqrt... same as slow for us
+gen('Nsqrt', arith_float_types, 'sqrt(src0)')
+gen('Nrsqrt', arith_float_types, '1.0/sqrt(src0)')
+gen('Nrcp', arith_float_types, '1.0/src0')
+gen('Fract', arith_float_types,
+ '(src0 >= 0.0)?(src0-floor(src0)):(floor(src0)-src0)')
+
+gen('Ncos', arith_float_types, 'cos(src0)')
+gen('Nsin', arith_float_types, 'sin(src0)')
+
+gen('And', bit_types, 'src0 & src1')
+gen('Or', bit_types, 'src0 | src1')
+gen('Xor', bit_types, 'src0 ^ src1')
+
+gen('Bitselect', bit_types, '(src1 & src0) | (src2 & ~src0)')
+gen('Firstbit',bit_types, 'firstbit(src0)')
+gen('Popcount', ('B32', 'B64'), '__builtin_popcount(src0)')
+
+gen('Shl', arith_int_types, 'src0 << (unsigned)src1', 'ShiftInst')
+gen('Shr', arith_int_types, 'src0 >> (unsigned)src1', 'ShiftInst')
+
+# gen('Mul_hi', types=('s32','u32', '??'))
+# gen('Mul24', types=('s32','u32', '??'))
+gen('Rem', arith_int_types, 'src0 - ((src0 / src1) * src1)')
+
+gen('Abs', arith_types, 'std::abs(src0)')
+gen('Neg', arith_types, '-src0')
+
+gen('Mov', bit_types, 'src0')
+gen('Not', bit_types, 'heynot(src0)')
+
+# mad and fma differ only in rounding behavior, which we don't emulate
+# also there's an integer form of mad, but not of fma
+gen('Mad', arith_types, 'src0 * src1 + src2')
+gen('Fma', arith_float_types, 'src0 * src1 + src2')
+
+#native floating point operations
+gen('Nfma', arith_float_types, 'src0 * src1 + src2')
+
+gen('Cmov', bit_types, 'src0 ? src1 : src2', 'CmovInst')
+gen('BitAlign', bit_types, '(src0 << src2)|(src1 >> (32 - src2))')
+gen('ByteAlign', bit_types, '(src0 << 8 * src2)|(src1 >> (32 - 8 * src2))')
+
+# see base/bitfield.hh
+gen('BitExtract', arith_int_types, 'bits(src0, src1, src1 + src2 - 1)',
+ 'ExtractInsertInst')
+
+gen('BitInsert', arith_int_types, 'insertBits(dest, src1, src2, src0)',
+ 'ExtractInsertInst')
+
+##### Compare
+gen('Cmp', ('B1', 'S32', 'U32', 'F32'), 'compare(src0, src1, this->cmpOp)',
+ 'CmpInst', ('sourceType', arith_types + bit_types))
+gen('Class', arith_float_types, 'fpclassify(src0,src1)','ClassInst')
+
+##### Conversion
+
+# Conversion operations are only defined on B1, not B32 or B64
+cvt_types = ('B1',) + mem_types
+
+gen('Cvt', cvt_types, 'src0', 'CvtInst', ('sourceType', cvt_types))
+
+
+##### Load & Store
+gen('Lda', mem_types, base_class = 'LdInst', constructor_prefix='decode')
+gen('Ld', mem_types, base_class = 'LdInst', constructor_prefix='decode')
+gen('St', mem_types, base_class = 'StInst', constructor_prefix='decode',
+ is_store=True)
+gen('Atomic', mem_atom_types, base_class='StInst', constructor_prefix='decode')
+gen('AtomicNoRet', mem_atom_types, base_class='StInst',
+ constructor_prefix='decode')
+
+gen('Cbr', base_class = 'LdInst', constructor_prefix='decode')
+gen('Br', base_class = 'LdInst', constructor_prefix='decode')
+
+##### Special operations
+def gen_special(brig_opcode, expr, dest_type='U32'):
+ num_srcs = num_src_operands(expr)
+ if num_srcs == 0:
+ base_class = 'SpecialInstNoSrc<%s>' % dest_type
+ elif num_srcs == 1:
+ base_class = 'SpecialInst1Src<%s>' % dest_type
+ else:
+        assert False
+
+ gen(brig_opcode, None, expr, base_class)
+
+gen_special('WorkItemId', 'w->workitemid[src0][lane]')
+gen_special('WorkItemAbsId',
+ 'w->workitemid[src0][lane] + (w->workgroupid[src0] * w->workgroupsz[src0])')
+gen_special('WorkGroupId', 'w->workgroupid[src0]')
+gen_special('WorkGroupSize', 'w->workgroupsz[src0]')
+gen_special('CurrentWorkGroupSize', 'w->workgroupsz[src0]')
+gen_special('GridSize', 'w->gridsz[src0]')
+gen_special('GridGroups',
+ 'divCeil(w->gridsz[src0],w->workgroupsz[src0])')
+gen_special('LaneId', 'lane')
+gen_special('WaveId', 'w->dynwaveid')
+gen_special('Clock', 'w->computeUnit->shader->tick_cnt', 'U64')
+
+# gen_special('CU'', ')
+
+gen('Ret', base_class='SpecialInstNoSrcNoDest')
+gen('Barrier', base_class='SpecialInstNoSrcNoDest')
+gen('MemFence', base_class='SpecialInstNoSrcNoDest')
+
+# Map magic instructions to the BrigSyscall opcode
+# Magic instructions are defined in magic.hh
+#
+# In the future, real HSA kernel system calls can be implemented and coexist
+# with magic instructions.
+gen('Call', base_class='SpecialInstNoSrcNoDest')
+
+###############
+#
+# Generate file epilogs
+#
+###############
+header_code.dedent()
+header_code('''
+} // namespace HsailISA
+''')
+
+# close off main decode switch
+decoder_code.dedent()
+decoder_code.dedent()
+decoder_code('''
+ default: fatal("unrecognized Brig opcode %d\\n", ib->opcode);
+ } // end switch(ib->opcode)
+ } // end decode()
+} // namespace HsailISA
+''')
+
+exec_code.dedent()
+exec_code('''
+} // namespace HsailISA
+''')
+
+###############
+#
+# Output accumulated code to files
+#
+###############
+header_code.write(sys.argv[1])
+decoder_code.write(sys.argv[2])
+exec_code.write(sys.argv[3])
diff --git a/src/arch/hsail/generic_types.cc b/src/arch/hsail/generic_types.cc
new file mode 100644
index 000000000..0cd55d1d5
--- /dev/null
+++ b/src/arch/hsail/generic_types.cc
@@ -0,0 +1,47 @@
+#include "arch/hsail/generic_types.hh"
+#include "base/misc.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+ Enums::GenericMemoryOrder
+ getGenericMemoryOrder(BrigMemoryOrder brig_memory_order)
+ {
+ switch(brig_memory_order) {
+ case BRIG_MEMORY_ORDER_NONE:
+ return Enums::MEMORY_ORDER_NONE;
+ case BRIG_MEMORY_ORDER_RELAXED:
+ return Enums::MEMORY_ORDER_RELAXED;
+ case BRIG_MEMORY_ORDER_SC_ACQUIRE:
+ return Enums::MEMORY_ORDER_SC_ACQUIRE;
+ case BRIG_MEMORY_ORDER_SC_RELEASE:
+ return Enums::MEMORY_ORDER_SC_RELEASE;
+ case BRIG_MEMORY_ORDER_SC_ACQUIRE_RELEASE:
+ return Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE;
+ default:
+ fatal("HsailISA::MemInst::getGenericMemoryOrder -> ",
+ "bad BrigMemoryOrder\n");
+ }
+ }
+
+ Enums::GenericMemoryScope
+ getGenericMemoryScope(BrigMemoryScope brig_memory_scope)
+ {
+ switch(brig_memory_scope) {
+ case BRIG_MEMORY_SCOPE_NONE:
+ return Enums::MEMORY_SCOPE_NONE;
+ case BRIG_MEMORY_SCOPE_WORKITEM:
+ return Enums::MEMORY_SCOPE_WORKITEM;
+ case BRIG_MEMORY_SCOPE_WORKGROUP:
+ return Enums::MEMORY_SCOPE_WORKGROUP;
+ case BRIG_MEMORY_SCOPE_AGENT:
+ return Enums::MEMORY_SCOPE_DEVICE;
+ case BRIG_MEMORY_SCOPE_SYSTEM:
+ return Enums::MEMORY_SCOPE_SYSTEM;
+ default:
+ fatal("HsailISA::MemInst::getGenericMemoryScope -> ",
+ "bad BrigMemoryScope\n");
+ }
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/generic_types.hh b/src/arch/hsail/generic_types.hh
new file mode 100644
index 000000000..50e430bef
--- /dev/null
+++ b/src/arch/hsail/generic_types.hh
@@ -0,0 +1,16 @@
+#ifndef __ARCH_HSAIL_GENERIC_TYPES_HH__
+#define __ARCH_HSAIL_GENERIC_TYPES_HH__
+
+#include "arch/hsail/Brig.h"
+#include "enums/GenericMemoryOrder.hh"
+#include "enums/GenericMemoryScope.hh"
+
+namespace HsailISA
+{
+ Enums::GenericMemoryOrder
+ getGenericMemoryOrder(Brig::BrigMemoryOrder brig_memory_order);
+ Enums::GenericMemoryScope
+ getGenericMemoryScope(Brig::BrigMemoryScope brig_memory_scope);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GENERIC_TYPES_HH__
diff --git a/src/arch/hsail/gpu_decoder.hh b/src/arch/hsail/gpu_decoder.hh
new file mode 100644
index 000000000..98a689664
--- /dev/null
+++ b/src/arch/hsail/gpu_decoder.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_GPU_DECODER_HH__
+#define __ARCH_HSAIL_GPU_DECODER_HH__
+
+#include <vector>
+
+#include "arch/hsail/gpu_types.hh"
+
+class BrigObject;
+class GPUStaticInst;
+
+namespace Brig
+{
+ class BrigInstBase;
+}
+
+namespace HsailISA
+{
+ class Decoder
+ {
+ public:
+ GPUStaticInst* decode(MachInst machInst);
+
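+        // Return the instruction previously saved at index 'inst', or
+        // nullptr if the index is out of range.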
+ GPUStaticInst*
+ decode(RawMachInst inst)
+ {
+ return inst < decodedInsts.size() ? decodedInsts.at(inst) : nullptr;
+ }
+
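+        // Remember a decoded instruction and return its index in
+        // decodedInsts; this index is what the model uses as a RawMachInst.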
+ RawMachInst
+ saveInst(GPUStaticInst *decodedInst)
+ {
+ decodedInsts.push_back(decodedInst);
+
+ return decodedInsts.size() - 1;
+ }
+
+ private:
+ static std::vector<GPUStaticInst*> decodedInsts;
+ };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_GPU_DECODER_HH__
diff --git a/src/arch/hsail/gpu_types.hh b/src/arch/hsail/gpu_types.hh
new file mode 100644
index 000000000..4b3a66a9a
--- /dev/null
+++ b/src/arch/hsail/gpu_types.hh
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_GPU_TYPES_HH__
+#define __ARCH_HSAIL_GPU_TYPES_HH__
+
+#include <cstdint>
+
+namespace Brig
+{
+ class BrigInstBase;
+}
+
+class BrigObject;
+
+namespace HsailISA
+{
+ // A raw machine instruction represents the raw bits that
+ // our model uses to represent an actual instruction. In
+ // the case of HSAIL this is just an index into a list of
+ // instruction objects.
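+    // (The index is handed out by HsailISA::Decoder::saveInst() and mapped
+    // back to a GPUStaticInst by Decoder::decode(RawMachInst).)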
+ typedef uint64_t RawMachInst;
+
+ // The MachInst is a representation of an instruction
+ // that has more information than just the machine code.
+ // For HSAIL the actual machine code is a BrigInstBase
+ // and the BrigObject contains more pertinent
+    // information related to operands, etc.
+
+ struct MachInst
+ {
+ const Brig::BrigInstBase *brigInstBase;
+ const BrigObject *brigObj;
+ };
+}
+
+#endif // __ARCH_HSAIL_GPU_TYPES_HH__
diff --git a/src/arch/hsail/insts/branch.cc b/src/arch/hsail/insts/branch.cc
new file mode 100644
index 000000000..d65279cc8
--- /dev/null
+++ b/src/arch/hsail/insts/branch.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/branch.hh"
+
+#include "gpu-compute/hsail_code.hh"
+
+namespace HsailISA
+{
+ GPUStaticInst*
+ decodeBrn(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // register operand.
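+        // e.g., 'brn @label' (direct) vs. 'brn $s1' (indirect).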
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new BrnIndirectInst(ib, obj);
+ } else {
+ return new BrnDirectInst(ib, obj);
+ }
+ }
+
+ GPUStaticInst*
+ decodeCbr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // second register operand (after the condition).
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new CbrIndirectInst(ib, obj);
+ } else {
+ return new CbrDirectInst(ib, obj);
+ }
+ }
+
+ GPUStaticInst*
+ decodeBr(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ // Detect direct vs indirect branch by seeing whether we have a
+ // second register operand (after the condition).
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+ const Brig::BrigOperand *reg = obj->getOperand(op_offs);
+
+ if (reg->kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ return new BrIndirectInst(ib, obj);
+ } else {
+ return new BrDirectInst(ib, obj);
+ }
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/branch.hh b/src/arch/hsail/insts/branch.hh
new file mode 100644
index 000000000..54ad9a042
--- /dev/null
+++ b/src/arch/hsail/insts/branch.hh
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_BRANCH_HH__
+#define __ARCH_HSAIL_INSTS_BRANCH_HH__
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/wavefront.hh"
+
+namespace HsailISA
+{
+
+ // The main difference between a direct branch and an indirect branch
+ // is whether the target is a register or a label, so we can share a
+ // lot of code if we template the base implementation on that type.
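+    // For example, BrnDirectInst instantiates this base with LabelOperand,
+    // while BrnIndirectInst instantiates it with SRegOperand (see
+    // decodeBrn() in branch.cc).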
+ template<typename TargetType>
+ class BrnInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ Brig::BrigWidth8_t width;
+ TargetType target;
+
+ BrnInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "brn")
+ {
+ o_type = Enums::OT_BRANCH;
+ width = ((Brig::BrigInstBr*)ib)->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ bool unconditionalJumpInstruction() override { return true; }
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isScalarRegister();
+ }
+
+ bool isSrcOperand(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return true;
+ }
+
+ bool isDstOperand(int operandIndex) {
+ return false;
+ }
+
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.opSize();
+ }
+
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.regIndex();
+ }
+
+ int getNumOperands() {
+ return 1;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ template<typename TargetType>
+ void
+ BrnInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width != 1) {
+ widthClause = csprintf("_width(%d)", width);
+ }
+
+ disassembly = csprintf("%s%s %s", opcode, widthClause,
+ target.disassemble());
+ }
+
+ template<typename TargetType>
+ void
+ BrnInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ if (getTargetPc() == w->rpc()) {
+ w->popFromReconvergenceStack();
+ } else {
+ // Rpc and execution mask remain the same
+ w->pc(getTargetPc());
+ }
+ w->discardFetch();
+ }
+
+ class BrnDirectInst : public BrnInstBase<LabelOperand>
+ {
+ public:
+ BrnDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrnInstBase<LabelOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class BrnIndirectInst : public BrnInstBase<SRegOperand>
+ {
+ public:
+ BrnIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrnInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeBrn(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+ template<typename TargetType>
+ class CbrInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ Brig::BrigWidth8_t width;
+ CRegOperand cond;
+ TargetType target;
+
+ CbrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "cbr")
+ {
+ o_type = Enums::OT_BRANCH;
+ width = ((Brig::BrigInstBr *)ib)->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ cond.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ target.init(op_offs, obj);
+ o_type = Enums::OT_BRANCH;
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ // Assumption: Target is operand 0, Condition Register is operand 1
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.isVectorRegister();
+ else
+ return false;
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.isCondRegister();
+ else
+ return true;
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return target.isScalarRegister();
+ else
+ return false;
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == 0)
+ return true;
+ return false;
+ }
+ // both Condition Register and Target are source operands
+ bool isDstOperand(int operandIndex) {
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.opSize();
+ else
+ return 1;
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ if (!operandIndex)
+ return target.regIndex();
+ else
+ return -1;
+ }
+
+ // Operands = Target, Condition Register
+ int getNumOperands() {
+ return 2;
+ }
+ };
+
+ template<typename TargetType>
+ void
+ CbrInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width != 1) {
+ widthClause = csprintf("_width(%d)", width);
+ }
+
+ disassembly = csprintf("%s%s %s,%s", opcode, widthClause,
+ cond.disassemble(), target.disassemble());
+ }
+
+ template<typename TargetType>
+ void
+ CbrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const uint32_t curr_pc = w->pc();
+ const uint32_t curr_rpc = w->rpc();
+ const VectorMask curr_mask = w->execMask();
+
+ /**
+ * TODO: can we move this pop outside the instruction, and
+ * into the wavefront?
+ */
+ w->popFromReconvergenceStack();
+
+ // immediate post-dominator instruction
+ const uint32_t rpc = static_cast<uint32_t>(ipdInstNum());
+ if (curr_rpc != rpc) {
+ w->pushToReconvergenceStack(rpc, curr_rpc, curr_mask);
+ }
+
+ // taken branch
+ const uint32_t true_pc = getTargetPc();
+ VectorMask true_mask;
+ for (unsigned int lane = 0; lane < VSZ; ++lane) {
+ true_mask[lane] = cond.get<bool>(w, lane) & curr_mask[lane];
+ }
+
+ // not taken branch
+ const uint32_t false_pc = curr_pc + 1;
+ assert(true_pc != false_pc);
+ if (false_pc != rpc && true_mask.count() < curr_mask.count()) {
+ VectorMask false_mask = curr_mask & ~true_mask;
+ w->pushToReconvergenceStack(false_pc, rpc, false_mask);
+ }
+
+ if (true_pc != rpc && true_mask.count()) {
+ w->pushToReconvergenceStack(true_pc, rpc, true_mask);
+ }
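+        // Since these entries live on a stack, the taken path (pushed last)
+        // presumably executes first, then the not-taken path, and finally
+        // the reconvergence point as entries are popped.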
+ assert(w->pc() != curr_pc);
+ w->discardFetch();
+ }
+
+
+ class CbrDirectInst : public CbrInstBase<LabelOperand>
+ {
+ public:
+ CbrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : CbrInstBase<LabelOperand>(ib, obj)
+ {
+ }
+ // the source operand of a conditional branch is a Condition
+ // Register which is not stored in the VRF
+ // so we do not count it as a source-register operand
+ // even though, formally, it is one.
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class CbrIndirectInst : public CbrInstBase<SRegOperand>
+ {
+ public:
+ CbrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : CbrInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ // one source operand of the conditional indirect branch is a Condition
+ // register which is not stored in the VRF so we do not count it
+ // as a source-register operand even though, formally, it is one.
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeCbr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+
+ template<typename TargetType>
+ class BrInstBase : public HsailGPUStaticInst
+ {
+ public:
+ void generateDisassembly();
+
+ ImmOperand<uint32_t> width;
+ TargetType target;
+
+ BrInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "br")
+ {
+ o_type = Enums::OT_BRANCH;
+ width.init(((Brig::BrigInstBr *)ib)->width, obj);
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ target.init(op_offs, obj);
+ }
+
+ uint32_t getTargetPc() override { return target.getTarget(0, 0); }
+
+ bool unconditionalJumpInstruction() override { return true; }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ bool isVectorRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return true;
+ }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return target.regIndex();
+ }
+ int getNumOperands() { return 1; }
+ };
+
+ template<typename TargetType>
+ void
+ BrInstBase<TargetType>::generateDisassembly()
+ {
+ std::string widthClause;
+
+ if (width.bits != 1) {
+ widthClause = csprintf("_width(%d)", width.bits);
+ }
+
+ disassembly = csprintf("%s%s %s", opcode, widthClause,
+ target.disassemble());
+ }
+
+ template<typename TargetType>
+ void
+ BrInstBase<TargetType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
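+        // branching to the current reconvergence point means this path has
+        // reached its immediate post-dominator, so simply pop the stack entry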
+ if (getTargetPc() == w->rpc()) {
+ w->popFromReconvergenceStack();
+ } else {
+ // Rpc and execution mask remain the same
+ w->pc(getTargetPc());
+ }
+ w->discardFetch();
+ }
+
+ class BrDirectInst : public BrInstBase<LabelOperand>
+ {
+ public:
+ BrDirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrInstBase<LabelOperand>(ib, obj)
+ {
+ }
+
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ };
+
+ class BrIndirectInst : public BrInstBase<SRegOperand>
+ {
+ public:
+ BrIndirectInst(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : BrInstBase<SRegOperand>(ib, obj)
+ {
+ }
+ int numSrcRegOperands() { return target.isVectorRegister(); }
+ int numDstRegOperands() { return 0; }
+ };
+
+ GPUStaticInst* decodeBr(const Brig::BrigInstBase *ib,
+ const BrigObject *obj);
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_BRANCH_HH__
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
new file mode 100644
index 000000000..e2da501b9
--- /dev/null
+++ b/src/arch/hsail/insts/decl.hh
@@ -0,0 +1,1106 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_DECL_HH__
+#define __ARCH_HSAIL_INSTS_DECL_HH__
+
+#include <cmath>
+
+#include "arch/hsail/generic_types.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+#include "debug/HSAIL.hh"
+#include "enums/OpType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+namespace HsailISA
+{
+ template<typename _DestOperand, typename _SrcOperand>
+ class HsailOperandType
+ {
+ public:
+ typedef _DestOperand DestOperand;
+ typedef _SrcOperand SrcOperand;
+ };
+
+ typedef HsailOperandType<CRegOperand, CRegOrImmOperand> CRegOperandType;
+ typedef HsailOperandType<SRegOperand, SRegOrImmOperand> SRegOperandType;
+ typedef HsailOperandType<DRegOperand, DRegOrImmOperand> DRegOperandType;
+
+    // The IsBits parameter serves only to disambiguate the B* types from
+ // the U* types, which otherwise would be identical (and
+ // indistinguishable).
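+    // For example, B32 and U32 both map to uint32_t and M_U32; only the
+    // IsBits argument (1 vs. 0) keeps them distinct instantiations.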
+ template<typename _OperandType, typename _CType, Enums::MemType _memType,
+ vgpr_type _vgprType, int IsBits=0>
+ class HsailDataType
+ {
+ public:
+ typedef _OperandType OperandType;
+ typedef _CType CType;
+ static const Enums::MemType memType = _memType;
+ static const vgpr_type vgprType = _vgprType;
+ static const char *label;
+ };
+
+ typedef HsailDataType<CRegOperandType, bool, Enums::M_U8, VT_32, 1> B1;
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32, 1> B8;
+
+ typedef HsailDataType<SRegOperandType, uint16_t,
+ Enums::M_U16, VT_32, 1> B16;
+
+ typedef HsailDataType<SRegOperandType, uint32_t,
+ Enums::M_U32, VT_32, 1> B32;
+
+ typedef HsailDataType<DRegOperandType, uint64_t,
+ Enums::M_U64, VT_64, 1> B64;
+
+ typedef HsailDataType<SRegOperandType, int8_t, Enums::M_S8, VT_32> S8;
+ typedef HsailDataType<SRegOperandType, int16_t, Enums::M_S16, VT_32> S16;
+ typedef HsailDataType<SRegOperandType, int32_t, Enums::M_S32, VT_32> S32;
+ typedef HsailDataType<DRegOperandType, int64_t, Enums::M_S64, VT_64> S64;
+
+ typedef HsailDataType<SRegOperandType, uint8_t, Enums::M_U8, VT_32> U8;
+ typedef HsailDataType<SRegOperandType, uint16_t, Enums::M_U16, VT_32> U16;
+ typedef HsailDataType<SRegOperandType, uint32_t, Enums::M_U32, VT_32> U32;
+ typedef HsailDataType<DRegOperandType, uint64_t, Enums::M_U64, VT_64> U64;
+
+ typedef HsailDataType<SRegOperandType, float, Enums::M_F32, VT_32> F32;
+ typedef HsailDataType<DRegOperandType, double, Enums::M_F64, VT_64> F64;
+
+ template<typename DestOperandType, typename SrcOperandType,
+ int NumSrcOperands>
+ class CommonInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename SrcOperandType::SrcOperand src[NumSrcOperands];
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s%s %s", opcode, opcode_suffix(),
+ dest.disassemble());
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ disassembly += ",";
+ disassembly += src[i].disassemble();
+ }
+ }
+
+ virtual std::string opcode_suffix() = 0;
+
+ public:
+ CommonInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+
+ dest.init(op_offs, obj);
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ op_offs = obj->getOperandPtr(ib->operands, i + 1);
+ src[i].init(op_offs, obj);
+ }
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return true;
+ return false;
+ }
+
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= NumSrcOperands)
+ return true;
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].regIndex();
+ else
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() {
+ int operands = 0;
+ for (int i = 0; i < NumSrcOperands; i++) {
+ if (src[i].isVectorRegister() == true) {
+ operands++;
+ }
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return NumSrcOperands + 1; }
+ };
+
+ template<typename DataType, int NumSrcOperands>
+ class ArithInst : public CommonInstBase<typename DataType::OperandType,
+ typename DataType::OperandType,
+ NumSrcOperands>
+ {
+ public:
+ std::string opcode_suffix() { return csprintf("_%s", DataType::label); }
+
+ ArithInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : CommonInstBase<typename DataType::OperandType,
+ typename DataType::OperandType,
+ NumSrcOperands>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType, typename Src2OperandType>
+ class ThreeNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+ typename Src2OperandType::SrcOperand src2;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble(),
+ src2.disassemble());
+ }
+
+ public:
+ ThreeNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj,
+ const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 3);
+ src2.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else if (operandIndex == 2)
+ return src2.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else if (operandIndex == 2)
+ return src2.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else if (operandIndex == 2)
+ return src2.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 3)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 3)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else if (operandIndex == 2)
+ return src2.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else if (operandIndex == 2)
+ return src2.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src2.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 4; }
+ };
+
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType, typename Src2DataType>
+ class ThreeNonUniformSourceInst :
+ public ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+ typedef typename Src2DataType::CType Src2CType;
+
+ ThreeNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : ThreeNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType,
+ typename Src2DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class CmovInst : public ThreeNonUniformSourceInst<DataType, B1,
+ DataType, DataType>
+ {
+ public:
+ CmovInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, B1, DataType,
+ DataType>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class ExtractInsertInst : public ThreeNonUniformSourceInst<DataType,
+ DataType, U32,
+ U32>
+ {
+ public:
+ ExtractInsertInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : ThreeNonUniformSourceInst<DataType, DataType, U32,
+ U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType, typename Src0OperandType,
+ typename Src1OperandType>
+ class TwoNonUniformSourceInstBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+ typename Src0OperandType::SrcOperand src0;
+ typename Src1OperandType::SrcOperand src1;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s,%s", opcode, dest.disassemble(),
+ src0.disassemble(), src1.disassemble());
+ }
+
+ public:
+ TwoNonUniformSourceInstBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : HsailGPUStaticInst(obj, opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isVectorRegister();
+ else if (operandIndex == 1)
+ return src1.isVectorRegister();
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isCondRegister();
+ else if (operandIndex == 1)
+ return src1.isCondRegister();
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.isScalarRegister();
+ else if (operandIndex == 1)
+ return src1.isScalarRegister();
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < 2)
+ return true;
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex >= 2)
+ return true;
+ else
+ return false;
+ }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.opSize();
+ else if (operandIndex == 1)
+ return src1.opSize();
+ else
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (!operandIndex)
+ return src0.regIndex();
+ else if (operandIndex == 1)
+ return src1.regIndex();
+ else
+ return dest.regIndex();
+ }
+
+ int numSrcRegOperands() {
+ int operands = 0;
+ if (src0.isVectorRegister() == true) {
+ operands++;
+ }
+ if (src1.isVectorRegister() == true) {
+ operands++;
+ }
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 3; }
+ };
+
+ template<typename DestDataType, typename Src0DataType,
+ typename Src1DataType>
+ class TwoNonUniformSourceInst :
+ public TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+ typedef typename Src0DataType::CType Src0CType;
+ typedef typename Src1DataType::CType Src1CType;
+
+ TwoNonUniformSourceInst(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *opcode)
+ : TwoNonUniformSourceInstBase<typename DestDataType::OperandType,
+ typename Src0DataType::OperandType,
+ typename Src1DataType::OperandType>(ib,
+ obj, opcode)
+ {
+ }
+ };
+
+ // helper function for ClassInst
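+    // src1 is a bit mask selecting which floating-point classes to test
+    // for: 0x1 signaling NaN, 0x2 quiet NaN, 0x4 -inf, 0x8 -normal,
+    // 0x10 -subnormal, 0x20 -0, 0x40 +0, 0x80 +subnormal, 0x100 +normal,
+    // 0x200 +inf (see the checks below)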
+ template<typename T>
+ bool
+ fpclassify(T src0, uint32_t src1)
+ {
+ int fpclass = std::fpclassify(src0);
+
+ if ((src1 & 0x3) && (fpclass == FP_NAN)) {
+ return true;
+ }
+
+        // use std::signbit() so that +0.0 is not treated as negative
+        // (note that +0.0 <= -0.0 evaluates to true)
+        if (std::signbit(src0)) {
+ if ((src1 & 0x4) && fpclass == FP_INFINITE)
+ return true;
+ if ((src1 & 0x8) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x10) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x20) && fpclass == FP_ZERO)
+ return true;
+ } else {
+ if ((src1 & 0x40) && fpclass == FP_ZERO)
+ return true;
+ if ((src1 & 0x80) && fpclass == FP_SUBNORMAL)
+ return true;
+ if ((src1 & 0x100) && fpclass == FP_NORMAL)
+ return true;
+ if ((src1 & 0x200) && fpclass == FP_INFINITE)
+ return true;
+ }
+ return false;
+ }
+
+ template<typename DataType>
+ class ClassInst : public TwoNonUniformSourceInst<B1, DataType, U32>
+ {
+ public:
+ ClassInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<B1, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ template<typename DataType>
+ class ShiftInst : public TwoNonUniformSourceInst<DataType, DataType, U32>
+ {
+ public:
+ ShiftInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *opcode)
+ : TwoNonUniformSourceInst<DataType, DataType, U32>(ib, obj, opcode)
+ {
+ }
+ };
+
+ // helper function for CmpInst
+ template<typename T>
+ bool
+ compare(T src0, T src1, Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
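+        // the unordered (*U) and signaling (S*) variants are treated the
+        // same as their base comparisons here; NaN-aware behavior is only
+        // implemented for the num/nan cases
+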
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ case BRIG_COMPARE_EQU:
+ case BRIG_COMPARE_SEQ:
+ case BRIG_COMPARE_SEQU:
+ return (src0 == src1);
+
+ case BRIG_COMPARE_NE:
+ case BRIG_COMPARE_NEU:
+ case BRIG_COMPARE_SNE:
+ case BRIG_COMPARE_SNEU:
+ return (src0 != src1);
+
+ case BRIG_COMPARE_LT:
+ case BRIG_COMPARE_LTU:
+ case BRIG_COMPARE_SLT:
+ case BRIG_COMPARE_SLTU:
+ return (src0 < src1);
+
+ case BRIG_COMPARE_LE:
+ case BRIG_COMPARE_LEU:
+ case BRIG_COMPARE_SLE:
+ case BRIG_COMPARE_SLEU:
+ return (src0 <= src1);
+
+ case BRIG_COMPARE_GT:
+ case BRIG_COMPARE_GTU:
+ case BRIG_COMPARE_SGT:
+ case BRIG_COMPARE_SGTU:
+ return (src0 > src1);
+
+ case BRIG_COMPARE_GE:
+ case BRIG_COMPARE_GEU:
+ case BRIG_COMPARE_SGE:
+ case BRIG_COMPARE_SGEU:
+ return (src0 >= src1);
+
+          case BRIG_COMPARE_NUM:
+          case BRIG_COMPARE_SNUM:
+            // ordered: true only if neither operand is a NaN
+            return (src0 == src0) && (src1 == src1);
+
+ case BRIG_COMPARE_NAN:
+ case BRIG_COMPARE_SNAN:
+ return (src0 != src0) || (src1 != src1);
+
+ default:
+ fatal("Bad cmpOp value %d\n", (int)cmpOp);
+ }
+ }
+
+ template<typename T>
+ int32_t
+ firstbit(T src0)
+ {
+ if (!src0)
+ return -1;
+
+        // handle positive and negative numbers
+        T tmp = (src0 < 0) ? (~src0) : (src0);
+
+        // a value of all 1s complements to 0 and, like 0 itself, has no
+        // first bit; bail out here to avoid an unbounded scan below
+        if (!tmp)
+            return -1;
+
+        // the starting position is the MSB
+        int pos = 8 * sizeof(T) - 1;
+        int cnt = 0;
+
+        // search for the first bit set to 1; shift tmp right rather than
+        // building a (1 << pos) mask, which would overflow int for 64-bit
+        // operand types
+        while (!((tmp >> pos) & 0x1)) {
+ ++cnt;
+ --pos;
+ }
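+        // e.g., for a 32-bit operand, firstbit(0x00f0) == 24: there are 24
+        // leading zero bits before the most significant 1 bit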
+ return cnt;
+ }
+
+ const char* cmpOpToString(Brig::BrigCompareOperation cmpOp);
+
+ template<typename DestOperandType, typename SrcOperandType>
+ class CmpInstBase : public CommonInstBase<DestOperandType, SrcOperandType,
+ 2>
+ {
+ protected:
+ Brig::BrigCompareOperation cmpOp;
+
+ public:
+ CmpInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<DestOperandType, SrcOperandType, 2>(ib, obj,
+ _opcode)
+ {
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_CMP);
+ Brig::BrigInstCmp *i = (Brig::BrigInstCmp*)ib;
+ cmpOp = (Brig::BrigCompareOperation)i->compare;
+ }
+ };
+
+ template<typename DestDataType, typename SrcDataType>
+ class CmpInst : public CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>
+ {
+ public:
+ std::string
+ opcode_suffix()
+ {
+ return csprintf("_%s_%s_%s", cmpOpToString(this->cmpOp),
+ DestDataType::label, SrcDataType::label);
+ }
+
+ CmpInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CmpInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ template<typename DestDataType, typename SrcDataType>
+ class CvtInst : public CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType, 1>
+ {
+ public:
+ std::string opcode_suffix()
+ {
+ return csprintf("_%s_%s", DestDataType::label, SrcDataType::label);
+ }
+
+ CvtInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : CommonInstBase<typename DestDataType::OperandType,
+ typename SrcDataType::OperandType,
+ 1>(ib, obj, _opcode)
+ {
+ }
+ };
+
+ class SpecialInstNoSrcNoDest : public HsailGPUStaticInst
+ {
+ public:
+ SpecialInstNoSrcNoDest(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ }
+
+ bool isVectorRegister(int operandIndex) { return false; }
+ bool isCondRegister(int operandIndex) { return false; }
+ bool isScalarRegister(int operandIndex) { return false; }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) { return 0; }
+ int getRegisterIndex(int operandIndex) { return -1; }
+
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ int getNumOperands() { return 0; }
+ };
+
+ template<typename DestOperandType>
+ class SpecialInstNoSrcBase : public HsailGPUStaticInst
+ {
+ protected:
+ typename DestOperandType::DestOperand dest;
+
+ void generateDisassembly()
+ {
+ disassembly = csprintf("%s %s", opcode, dest.disassemble());
+ }
+
+ public:
+ SpecialInstNoSrcBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return true; }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 1; }
+ };
+
+ template<typename DestDataType>
+ class SpecialInstNoSrc :
+ public SpecialInstNoSrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInstNoSrc(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInstNoSrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
+ template<typename DestOperandType>
+ class SpecialInst1SrcBase : public HsailGPUStaticInst
+ {
+ protected:
+ typedef int SrcCType; // used in execute() template
+
+ typename DestOperandType::DestOperand dest;
+ ImmOperand<SrcCType> src0;
+
+ void
+ generateDisassembly()
+ {
+ disassembly = csprintf("%s %s,%s", opcode, dest.disassemble(),
+ src0.disassemble());
+ }
+
+ public:
+ SpecialInst1SrcBase(const Brig::BrigInstBase *ib,
+ const BrigObject *obj, const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+ }
+ bool isVectorRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return true; }
+ int getOperandSize(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.opSize();
+ }
+ int getRegisterIndex(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return dest.regIndex();
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands() { return 1; }
+ };
+
+ template<typename DestDataType>
+ class SpecialInst1Src :
+ public SpecialInst1SrcBase<typename DestDataType::OperandType>
+ {
+ public:
+ typedef typename DestDataType::CType DestCType;
+
+ SpecialInst1Src(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : SpecialInst1SrcBase<typename DestDataType::OperandType>(ib, obj,
+ _opcode)
+ {
+ }
+ };
+
+ class Ret : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+
+ Ret(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "ret")
+ {
+ o_type = Enums::OT_RET;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ class Barrier : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+ uint8_t width;
+
+ Barrier(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "barrier")
+ {
+ o_type = Enums::OT_BARRIER;
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_BR);
+ width = (uint8_t)((Brig::BrigInstBr*)ib)->width;
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ class MemFence : public SpecialInstNoSrcNoDest
+ {
+ public:
+ typedef SpecialInstNoSrcNoDest Base;
+
+ Brig::BrigMemoryOrder memFenceMemOrder;
+ Brig::BrigMemoryScope memFenceScopeSegGroup;
+ Brig::BrigMemoryScope memFenceScopeSegGlobal;
+ Brig::BrigMemoryScope memFenceScopeSegImage;
+
+ MemFence(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : Base(ib, obj, "memfence")
+ {
+ assert(ib->base.kind == Brig::BRIG_KIND_INST_MEM_FENCE);
+
+ memFenceScopeSegGlobal = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->globalSegmentMemoryScope;
+
+ memFenceScopeSegGroup = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->groupSegmentMemoryScope;
+
+ memFenceScopeSegImage = (Brig::BrigMemoryScope)
+ ((Brig::BrigInstMemFence*)ib)->imageSegmentMemoryScope;
+
+ memFenceMemOrder = (Brig::BrigMemoryOrder)
+ ((Brig::BrigInstMemFence*)ib)->memoryOrder;
+
+ // set o_type based on scopes
+ if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE &&
+ memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_BOTH_MEMFENCE;
+ } else if (memFenceScopeSegGlobal != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_GLOBAL_MEMFENCE;
+ } else if (memFenceScopeSegGroup != Brig::BRIG_MEMORY_SCOPE_NONE) {
+ o_type = Enums::OT_SHARED_MEMFENCE;
+ } else {
+ fatal("MemFence constructor: bad scope specifiers\n");
+ }
+ }
+
+ void
+ initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *wave = gpuDynInst->wavefront();
+ wave->computeUnit->injectGlobalMemFence(gpuDynInst);
+ }
+
+ void
+ execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+ // 2 cases:
+ // * memfence to a sequentially consistent memory (e.g., LDS).
+ // These can be handled as no-ops.
+ // * memfence to a relaxed consistency cache (e.g., Hermes, Viper,
+ // etc.). We send a packet, tagged with the memory order and
+ // scope, and let the GPU coalescer handle it.
+
+ if (o_type == Enums::OT_GLOBAL_MEMFENCE ||
+ o_type == Enums::OT_BOTH_MEMFENCE) {
+ gpuDynInst->simdId = w->simdId;
+ gpuDynInst->wfSlotId = w->wfSlotId;
+ gpuDynInst->wfDynId = w->wfDynId;
+ gpuDynInst->kern_id = w->kern_id;
+ gpuDynInst->cu_id = w->computeUnit->cu_id;
+
+ gpuDynInst->memoryOrder =
+ getGenericMemoryOrder(memFenceMemOrder);
+ gpuDynInst->scope =
+ getGenericMemoryScope(memFenceScopeSegGlobal);
+ gpuDynInst->useContinuation = false;
+ GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
+ gmp->getGMReqFIFO().push(gpuDynInst);
+
+ w->wr_gm_reqs_in_pipe--;
+ w->rd_gm_reqs_in_pipe--;
+ w->mem_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ } else if (o_type == Enums::OT_SHARED_MEMFENCE) {
+ // no-op
+ } else {
+ fatal("MemFence execute: bad o_type\n");
+ }
+ }
+ };
+
+ class Call : public HsailGPUStaticInst
+ {
+ public:
+ // private helper functions
+ void calcAddr(Wavefront* w, GPUDynInstPtr m);
+
+ void
+ generateDisassembly()
+ {
+ if (dest.disassemble() == "") {
+ disassembly = csprintf("%s %s (%s)", opcode, src0.disassemble(),
+ src1.disassemble());
+ } else {
+ disassembly = csprintf("%s %s (%s) (%s)", opcode,
+ src0.disassemble(), dest.disassemble(),
+ src1.disassemble());
+ }
+ }
+
+ bool
+ isPseudoOp()
+ {
+ std::string func_name = src0.disassemble();
+ if (func_name.find("__gem5_hsail_op") != std::string::npos) {
+ return true;
+ }
+ return false;
+ }
+
+ // member variables
+ ListOperand dest;
+ FunctionRefOperand src0;
+ ListOperand src1;
+ HsailCode *func_ptr;
+
+ // exec function for pseudo instructions mapped on top of call opcode
+ void execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+ // user-defined pseudo instructions
+ void MagicPrintLane(Wavefront *w);
+ void MagicPrintLane64(Wavefront *w);
+ void MagicPrintWF32(Wavefront *w);
+ void MagicPrintWF64(Wavefront *w);
+ void MagicPrintWFFloat(Wavefront *w);
+ void MagicSimBreak(Wavefront *w);
+ void MagicPrefixSum(Wavefront *w);
+ void MagicReduction(Wavefront *w);
+ void MagicMaskLower(Wavefront *w);
+ void MagicMaskUpper(Wavefront *w);
+ void MagicJoinWFBar(Wavefront *w);
+ void MagicWaitWFBar(Wavefront *w);
+ void MagicPanic(Wavefront *w);
+
+ void MagicAtomicNRAddGlobalU32Reg(Wavefront *w,
+ GPUDynInstPtr gpuDynInst);
+
+ void MagicAtomicNRAddGroupU32Reg(Wavefront *w,
+ GPUDynInstPtr gpuDynInst);
+
+ void MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst);
+
+ void MagicXactCasLd(Wavefront *w);
+ void MagicMostSigThread(Wavefront *w);
+ void MagicMostSigBroadcast(Wavefront *w);
+
+ void MagicPrintWF32ID(Wavefront *w);
+ void MagicPrintWFID64(Wavefront *w);
+
+ Call(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ : HsailGPUStaticInst(obj, "call")
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src0.init(op_offs, obj);
+
+ func_ptr = nullptr;
+ std::string func_name = src0.disassemble();
+ if (!isPseudoOp()) {
+ func_ptr = dynamic_cast<HsailCode*>(obj->
+ getFunction(func_name));
+
+ if (!func_ptr)
+ fatal("call::exec cannot find function: %s\n", func_name);
+ }
+
+ op_offs = obj->getOperandPtr(ib->operands, 2);
+ src1.init(op_offs, obj);
+ }
+
+ bool isVectorRegister(int operandIndex) { return false; }
+ bool isCondRegister(int operandIndex) { return false; }
+ bool isScalarRegister(int operandIndex) { return false; }
+ bool isSrcOperand(int operandIndex) { return false; }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex) { return 0; }
+ int getRegisterIndex(int operandIndex) { return -1; }
+
+ void
+ execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ std::string func_name = src0.disassemble();
+ if (isPseudoOp()) {
+ execPseudoInst(w, gpuDynInst);
+ } else {
+ fatal("Native HSAIL functions are not yet implemented: %s\n",
+ func_name);
+ }
+ }
+ int numSrcRegOperands() { return 0; }
+ int numDstRegOperands() { return 0; }
+ int getNumOperands() { return 2; }
+ };
+
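+    // bitwise complement for the integer types, but logical negation for
+    // bool (b1), where operator~ would not yield a 0/1 value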
+ template<typename T> T heynot(T arg) { return ~arg; }
+ template<> inline bool heynot<bool>(bool arg) { return !arg; }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_DECL_HH__
diff --git a/src/arch/hsail/insts/gpu_static_inst.cc b/src/arch/hsail/insts/gpu_static_inst.cc
new file mode 100644
index 000000000..bbaeb13e6
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "arch/hsail/insts/gpu_static_inst.hh"
+
+#include "gpu-compute/brig_object.hh"
+
+namespace HsailISA
+{
+ HsailGPUStaticInst::HsailGPUStaticInst(const BrigObject *obj,
+ const std::string &opcode)
+ : GPUStaticInst(opcode), hsailCode(obj->currentCode)
+ {
+ }
+
+ void
+ HsailGPUStaticInst::generateDisassembly()
+ {
+ disassembly = opcode;
+ }
+
+ const std::string&
+ HsailGPUStaticInst::disassemble()
+ {
+ if (disassembly.empty()) {
+ generateDisassembly();
+ assert(!disassembly.empty());
+ }
+
+ return disassembly;
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/gpu_static_inst.hh b/src/arch/hsail/insts/gpu_static_inst.hh
new file mode 100644
index 000000000..29aab1f70
--- /dev/null
+++ b/src/arch/hsail/insts/gpu_static_inst.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+#define __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing HSAIL GPU static instructions.
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+class BrigObject;
+class HsailCode;
+
+namespace HsailISA
+{
+ class HsailGPUStaticInst : public GPUStaticInst
+ {
+ public:
+ HsailGPUStaticInst(const BrigObject *obj, const std::string &opcode);
+ void generateDisassembly();
+ const std::string &disassemble();
+ uint32_t instSize() { return 4; }
+
+ protected:
+ HsailCode *hsailCode;
+ };
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_GPU_STATIC_INST_HH__
diff --git a/src/arch/hsail/insts/main.cc b/src/arch/hsail/insts/main.cc
new file mode 100644
index 000000000..4e70bf46a
--- /dev/null
+++ b/src/arch/hsail/insts/main.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/decl.hh"
+#include "debug/GPUExec.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+
+namespace HsailISA
+{
+ template<> const char *B1::label = "b1";
+ template<> const char *B8::label = "b8";
+ template<> const char *B16::label = "b16";
+ template<> const char *B32::label = "b32";
+ template<> const char *B64::label = "b64";
+
+ template<> const char *S8::label = "s8";
+ template<> const char *S16::label = "s16";
+ template<> const char *S32::label = "s32";
+ template<> const char *S64::label = "s64";
+
+ template<> const char *U8::label = "u8";
+ template<> const char *U16::label = "u16";
+ template<> const char *U32::label = "u32";
+ template<> const char *U64::label = "u64";
+
+ template<> const char *F32::label = "f32";
+ template<> const char *F64::label = "f64";
+
+ const char*
+ cmpOpToString(Brig::BrigCompareOperation cmpOp)
+ {
+ using namespace Brig;
+
+ switch (cmpOp) {
+ case BRIG_COMPARE_EQ:
+ return "eq";
+ case BRIG_COMPARE_NE:
+ return "ne";
+ case BRIG_COMPARE_LT:
+ return "lt";
+ case BRIG_COMPARE_LE:
+ return "le";
+ case BRIG_COMPARE_GT:
+ return "gt";
+ case BRIG_COMPARE_GE:
+ return "ge";
+ case BRIG_COMPARE_EQU:
+ return "equ";
+ case BRIG_COMPARE_NEU:
+ return "neu";
+ case BRIG_COMPARE_LTU:
+ return "ltu";
+ case BRIG_COMPARE_LEU:
+ return "leu";
+ case BRIG_COMPARE_GTU:
+ return "gtu";
+ case BRIG_COMPARE_GEU:
+ return "geu";
+ case BRIG_COMPARE_NUM:
+ return "num";
+ case BRIG_COMPARE_NAN:
+ return "nan";
+ case BRIG_COMPARE_SEQ:
+ return "seq";
+ case BRIG_COMPARE_SNE:
+ return "sne";
+ case BRIG_COMPARE_SLT:
+ return "slt";
+ case BRIG_COMPARE_SLE:
+ return "sle";
+ case BRIG_COMPARE_SGT:
+ return "sgt";
+ case BRIG_COMPARE_SGE:
+ return "sge";
+ case BRIG_COMPARE_SGEU:
+ return "sgeu";
+ case BRIG_COMPARE_SEQU:
+ return "sequ";
+ case BRIG_COMPARE_SNEU:
+ return "sneu";
+ case BRIG_COMPARE_SLTU:
+ return "sltu";
+ case BRIG_COMPARE_SLEU:
+ return "sleu";
+ case BRIG_COMPARE_SNUM:
+ return "snum";
+ case BRIG_COMPARE_SNAN:
+ return "snan";
+ case BRIG_COMPARE_SGTU:
+ return "sgtu";
+ default:
+ return "unknown";
+ }
+ }
+
+ void
+ Ret::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ const VectorMask &mask = w->get_pred();
+
+ // mask off completed work-items
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->init_mask[lane] = 0;
+ }
+
+ }
+
+ // delete extra instructions fetched for completed work-items
+ w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+ w->instructionBuffer.end());
+ if (w->pendingFetch) {
+ w->dropFetch = true;
+ }
+
+ // if all work-items have completed, then wave-front is done
+ if (w->init_mask.none()) {
+ w->status = Wavefront::S_STOPPED;
+
+ int32_t refCount = w->computeUnit->getLds().
+ decreaseRefCounter(w->dispatchid, w->wg_id);
+
+ DPRINTF(GPUExec, "CU%d: decrease ref ctr WG[%d] to [%d]\n",
+ w->computeUnit->cu_id, w->wg_id, refCount);
+
+ // free the vector registers of the completed wavefront
+ w->computeUnit->vectorRegsReserved[w->simdId] -=
+ w->reservedVectorRegs;
+
+ assert(w->computeUnit->vectorRegsReserved[w->simdId] >= 0);
+
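+            // the reserved VRF region may wrap around the end of the
+            // register file, hence the modulo when computing the end index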
+ uint32_t endIndex = (w->startVgprIndex +
+ w->reservedVectorRegs - 1) %
+ w->computeUnit->vrf[w->simdId]->numRegs();
+
+ w->computeUnit->vrf[w->simdId]->manager->
+ freeRegion(w->startVgprIndex, endIndex);
+
+ w->reservedVectorRegs = 0;
+ w->startVgprIndex = 0;
+ w->computeUnit->completedWfs++;
+
+ DPRINTF(GPUExec, "Doing return for CU%d: WF[%d][%d][%d]\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, w->wfDynId);
+
+ if (!refCount) {
+ // Notify Memory System of Kernel Completion
+ // Kernel End = isKernel + isRelease
+ w->status = Wavefront::S_RETURNING;
+ GPUDynInstPtr local_mempacket = gpuDynInst;
+ local_mempacket->memoryOrder = Enums::MEMORY_ORDER_SC_RELEASE;
+ local_mempacket->scope = Enums::MEMORY_SCOPE_SYSTEM;
+ local_mempacket->useContinuation = false;
+ local_mempacket->simdId = w->simdId;
+ local_mempacket->wfSlotId = w->wfSlotId;
+ local_mempacket->wfDynId = w->wfDynId;
+ w->computeUnit->injectGlobalMemFence(local_mempacket, true);
+ } else {
+ w->computeUnit->shader->dispatcher->scheduleDispatch();
+ }
+ }
+ }
+
+ void
+ Barrier::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ assert(w->barrier_cnt == w->old_barrier_cnt);
+ w->barrier_cnt = w->old_barrier_cnt + 1;
+ w->stalledAtBarrier = true;
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.cc b/src/arch/hsail/insts/mem.cc
new file mode 100644
index 000000000..97d4c902b
--- /dev/null
+++ b/src/arch/hsail/insts/mem.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/insts/mem.hh"
+
+#include "arch/hsail/Brig.h"
+#include "enums/OpType.hh"
+
+using namespace Brig;
+
+namespace HsailISA
+{
+ const char* atomicOpToString(BrigAtomicOperation brigOp);
+
+ Enums::MemOpType
+ brigAtomicToMemOpType(BrigOpcode brigOpCode, BrigAtomicOperation brigOp)
+ {
+ if (brigOpCode == Brig::BRIG_OPCODE_ATOMIC) {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return Enums::MO_AAND;
+ case BRIG_ATOMIC_OR:
+ return Enums::MO_AOR;
+ case BRIG_ATOMIC_XOR:
+ return Enums::MO_AXOR;
+ case BRIG_ATOMIC_CAS:
+ return Enums::MO_ACAS;
+ case BRIG_ATOMIC_EXCH:
+ return Enums::MO_AEXCH;
+ case BRIG_ATOMIC_ADD:
+ return Enums::MO_AADD;
+ case BRIG_ATOMIC_WRAPINC:
+ return Enums::MO_AINC;
+ case BRIG_ATOMIC_WRAPDEC:
+ return Enums::MO_ADEC;
+ case BRIG_ATOMIC_MIN:
+ return Enums::MO_AMIN;
+ case BRIG_ATOMIC_MAX:
+ return Enums::MO_AMAX;
+ case BRIG_ATOMIC_SUB:
+ return Enums::MO_ASUB;
+ default:
+ fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+ }
+ } else if (brigOpCode == Brig::BRIG_OPCODE_ATOMICNORET) {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return Enums::MO_ANRAND;
+ case BRIG_ATOMIC_OR:
+ return Enums::MO_ANROR;
+ case BRIG_ATOMIC_XOR:
+ return Enums::MO_ANRXOR;
+ case BRIG_ATOMIC_CAS:
+ return Enums::MO_ANRCAS;
+ case BRIG_ATOMIC_EXCH:
+ return Enums::MO_ANREXCH;
+ case BRIG_ATOMIC_ADD:
+ return Enums::MO_ANRADD;
+ case BRIG_ATOMIC_WRAPINC:
+ return Enums::MO_ANRINC;
+ case BRIG_ATOMIC_WRAPDEC:
+ return Enums::MO_ANRDEC;
+ case BRIG_ATOMIC_MIN:
+ return Enums::MO_ANRMIN;
+ case BRIG_ATOMIC_MAX:
+ return Enums::MO_ANRMAX;
+ case BRIG_ATOMIC_SUB:
+ return Enums::MO_ANRSUB;
+ default:
+ fatal("Bad BrigAtomicOperation code %d\n", brigOp);
+ }
+ } else {
+ fatal("Bad BrigAtomicOpcode %d\n", brigOpCode);
+ }
+ }
+
+ const char*
+ atomicOpToString(BrigAtomicOperation brigOp)
+ {
+ switch (brigOp) {
+ case BRIG_ATOMIC_AND:
+ return "and";
+ case BRIG_ATOMIC_OR:
+ return "or";
+ case BRIG_ATOMIC_XOR:
+ return "xor";
+ case BRIG_ATOMIC_CAS:
+ return "cas";
+ case BRIG_ATOMIC_EXCH:
+ return "exch";
+ case BRIG_ATOMIC_ADD:
+ return "add";
+ case BRIG_ATOMIC_WRAPINC:
+ return "inc";
+ case BRIG_ATOMIC_WRAPDEC:
+ return "dec";
+ case BRIG_ATOMIC_MIN:
+ return "min";
+ case BRIG_ATOMIC_MAX:
+ return "max";
+ case BRIG_ATOMIC_SUB:
+ return "sub";
+ default:
+ return "unknown";
+ }
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/mem.hh b/src/arch/hsail/insts/mem.hh
new file mode 100644
index 000000000..d3ce76dee
--- /dev/null
+++ b/src/arch/hsail/insts/mem.hh
@@ -0,0 +1,1629 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_INSTS_MEM_HH__
+#define __ARCH_HSAIL_INSTS_MEM_HH__
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/gpu_static_inst.hh"
+#include "arch/hsail/operand.hh"
+
+namespace HsailISA
+{
+ class MemInst
+ {
+ public:
+ MemInst() : size(0), addr_operand(nullptr) { }
+
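+        // infer the access size in bytes from the memory type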
+ MemInst(Enums::MemType m_type)
+ {
+ if (m_type == Enums::M_U64 ||
+ m_type == Enums::M_S64 ||
+ m_type == Enums::M_F64) {
+ size = 8;
+ } else if (m_type == Enums::M_U32 ||
+ m_type == Enums::M_S32 ||
+ m_type == Enums::M_F32) {
+ size = 4;
+ } else if (m_type == Enums::M_U16 ||
+ m_type == Enums::M_S16 ||
+ m_type == Enums::M_F16) {
+ size = 2;
+ } else {
+ size = 1;
+ }
+
+ addr_operand = nullptr;
+ }
+
+ void
+ init_addr(AddrOperandBase *_addr_operand)
+ {
+ addr_operand = _addr_operand;
+ }
+
+ private:
+ int size;
+ AddrOperandBase *addr_operand;
+
+ public:
+ int getMemOperandSize() { return size; }
+ AddrOperandBase *getAddressOperand() { return addr_operand; }
+ };
+
+ template<typename DestOperandType, typename AddrOperandType>
+ class LdaInstBase : public HsailGPUStaticInst
+ {
+ public:
+ typename DestOperandType::DestOperand dest;
+ AddrOperandType addr;
+
+ LdaInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ using namespace Brig;
+
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ addr.init(op_offs, obj);
+ }
+
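+        // the address only counts as a (vector) register operand when it is
+        // register-based; a direct address contributes no register operands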
+ int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ bool isVectorRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.isVectorRegister() :
+ this->addr.isVectorRegister());
+ }
+ bool isCondRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.isCondRegister() :
+ this->addr.isCondRegister());
+ }
+ bool isScalarRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.isScalarRegister() :
+ this->addr.isScalarRegister());
+ }
+ bool isSrcOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex > 0)
+ return(this->addr.isVectorRegister());
+ return false;
+ }
+ bool isDstOperand(int operandIndex) {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return(operandIndex == 0);
+ }
+ int getOperandSize(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.opSize() :
+ this->addr.opSize());
+ }
+ int getRegisterIndex(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.regIndex() :
+ this->addr.regIndex());
+ }
+ int getNumOperands()
+ {
+ if (this->addr.isVectorRegister())
+ return 2;
+ return 1;
+ }
+ };
+
+ template<typename DestDataType, typename AddrOperandType>
+ class LdaInst :
+ public LdaInstBase<typename DestDataType::OperandType, AddrOperandType>,
+ public MemInst
+ {
+ public:
+ void generateDisassembly();
+
+ LdaInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : LdaInstBase<typename DestDataType::OperandType,
+ AddrOperandType>(ib, obj, _opcode)
+ {
+ init_addr(&this->addr);
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ template<typename DataType>
+ GPUStaticInst*
+ decodeLda(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 1);
+ BrigRegOperandInfo regDataType = findRegDataType(op_offs, obj);
+
+ if (regDataType.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+ return new LdaInst<DataType, NoRegAddrOperand>(ib, obj, "ldas");
+ } else if (regDataType.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ // V2/V4 not allowed
+ switch (regDataType.regKind) {
+ case Brig::BRIG_REGISTER_KIND_SINGLE:
+ return new LdaInst<DataType, SRegAddrOperand>(ib, obj, "ldas");
+ case Brig::BRIG_REGISTER_KIND_DOUBLE:
+ return new LdaInst<DataType, DRegAddrOperand>(ib, obj, "ldas");
+ default:
+ fatal("Bad ldas register operand type %d\n", regDataType.type);
+ }
+ } else {
+ fatal("Bad ldas register operand kind %d\n", regDataType.kind);
+ }
+ }
+
+ template<typename MemOperandType, typename DestOperandType,
+ typename AddrOperandType>
+ class LdInstBase : public HsailGPUStaticInst
+ {
+ public:
+ Brig::BrigWidth8_t width;
+ typename DestOperandType::DestOperand dest;
+ AddrOperandType addr;
+
+ Brig::BrigSegment segment;
+ Brig::BrigMemoryOrder memoryOrder;
+ Brig::BrigMemoryScope memoryScope;
+ unsigned int equivClass;
+ bool isArgLoad()
+ {
+ return segment == Brig::BRIG_SEGMENT_KERNARG ||
+ segment == Brig::BRIG_SEGMENT_ARG;
+ }
+ void
+ initLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ {
+ using namespace Brig;
+
+ const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+ segment = (BrigSegment)ldst->segment;
+ memoryOrder = BRIG_MEMORY_ORDER_NONE;
+ memoryScope = BRIG_MEMORY_SCOPE_NONE;
+ equivClass = ldst->equivClass;
+
+ switch (segment) {
+ case BRIG_SEGMENT_GLOBAL:
+ o_type = Enums::OT_GLOBAL_READ;
+ break;
+
+ case BRIG_SEGMENT_GROUP:
+ o_type = Enums::OT_SHARED_READ;
+ break;
+
+ case BRIG_SEGMENT_PRIVATE:
+ o_type = Enums::OT_PRIVATE_READ;
+ break;
+
+ case BRIG_SEGMENT_READONLY:
+ o_type = Enums::OT_READONLY_READ;
+ break;
+
+ case BRIG_SEGMENT_SPILL:
+ o_type = Enums::OT_SPILL_READ;
+ break;
+
+ case BRIG_SEGMENT_FLAT:
+ o_type = Enums::OT_FLAT_READ;
+ break;
+
+ case BRIG_SEGMENT_KERNARG:
+ o_type = Enums::OT_KERN_READ;
+ break;
+
+ case BRIG_SEGMENT_ARG:
+ o_type = Enums::OT_ARG;
+ break;
+
+ default:
+ panic("Ld: segment %d not supported\n", segment);
+ }
+
+ width = ldst->width;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+ if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ addr.init(op_offs, obj);
+ }
+
+ void
+ initAtomicLd(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ {
+ using namespace Brig;
+
+ const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+ segment = (BrigSegment)at->segment;
+ memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+ memoryScope = (BrigMemoryScope)at->memoryScope;
+ equivClass = 0;
+
+ switch (segment) {
+ case BRIG_SEGMENT_GLOBAL:
+ o_type = Enums::OT_GLOBAL_READ;
+ break;
+
+ case BRIG_SEGMENT_GROUP:
+ o_type = Enums::OT_SHARED_READ;
+ break;
+
+ case BRIG_SEGMENT_PRIVATE:
+ o_type = Enums::OT_PRIVATE_READ;
+ break;
+
+ case BRIG_SEGMENT_READONLY:
+ o_type = Enums::OT_READONLY_READ;
+ break;
+
+ case BRIG_SEGMENT_SPILL:
+ o_type = Enums::OT_SPILL_READ;
+ break;
+
+ case BRIG_SEGMENT_FLAT:
+ o_type = Enums::OT_FLAT_READ;
+ break;
+
+ case BRIG_SEGMENT_KERNARG:
+ o_type = Enums::OT_KERN_READ;
+ break;
+
+ case BRIG_SEGMENT_ARG:
+ o_type = Enums::OT_ARG;
+ break;
+
+ default:
+ panic("Ld: segment %d not supported\n", segment);
+ }
+
+ width = BRIG_WIDTH_1;
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+ if (brigOp->kind == BRIG_KIND_OPERAND_REGISTER)
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands,1);
+ addr.init(op_offs, obj);
+ }
+
+ LdInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ using namespace Brig;
+
+ if (ib->opcode == BRIG_OPCODE_LD) {
+ initLd(ib, obj, _opcode);
+ } else {
+ initAtomicLd(ib, obj, _opcode);
+ }
+ }
+
+ int numSrcRegOperands() { return(this->addr.isVectorRegister()); }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands()
+ {
+ if (this->addr.isVectorRegister())
+ return 2;
+ else
+ return 1;
+ }
+ bool isVectorRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.isVectorRegister() :
+ this->addr.isVectorRegister());
+ }
+ bool isCondRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.isCondRegister() :
+ this->addr.isCondRegister());
+ }
+ bool isScalarRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.isScalarRegister() :
+ this->addr.isScalarRegister());
+ }
+ bool isSrcOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex > 0)
+ return(this->addr.isVectorRegister());
+ return false;
+ }
+ bool isDstOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return(operandIndex == 0);
+ }
+ int getOperandSize(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.opSize() :
+ this->addr.opSize());
+ }
+ int getRegisterIndex(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return((operandIndex == 0) ? dest.regIndex() :
+ this->addr.regIndex());
+ }
+ };
+
+ template<typename MemDataType, typename DestDataType,
+ typename AddrOperandType>
+ class LdInst :
+ public LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType, AddrOperandType>,
+ public MemInst
+ {
+ typename DestDataType::OperandType::DestOperand dest_vect[4];
+ uint16_t num_dest_operands;
+ void generateDisassembly();
+
+ public:
+ LdInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType,
+ AddrOperandType>(ib, obj, _opcode),
+ MemInst(MemDataType::memType)
+ {
+ init_addr(&this->addr);
+
+ unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+ const Brig::BrigOperand *brigOp = obj->getOperand(op_offs);
+
+ if (brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ const Brig::BrigOperandOperandList *brigRegVecOp =
+ (const Brig::BrigOperandOperandList*)brigOp;
+
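+                // the operand list's data block begins with its byte
+                // count; each entry is a 4-byte operand offset, so the
+                // byte count / 4 gives the number of destination registers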
+ num_dest_operands =
+ *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+ assert(num_dest_operands <= 4);
+ } else {
+ num_dest_operands = 1;
+ }
+
+ if (num_dest_operands > 1) {
+ assert(brigOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+ for (int i = 0; i < num_dest_operands; ++i) {
+ dest_vect[i].init_from_vect(op_offs, obj, i);
+ }
+ }
+ }
+
+ void
+ initiateAcc(GPUDynInstPtr gpuDynInst) override
+ {
+ typedef typename MemDataType::CType c0;
+
+ gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
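+            // a vector load expects num_dest_operands responses per active
+            // lane; record that per-lane count (0 for inactive lanes) so
+            // completion can be tracked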
+ if (num_dest_operands > 1) {
+ for (int i = 0; i < VSZ; ++i)
+ if (gpuDynInst->exec_mask[i])
+ gpuDynInst->statusVector.push_back(num_dest_operands);
+ else
+ gpuDynInst->statusVector.push_back(0);
+ }
+
+ for (int k = 0; k < num_dest_operands; ++k) {
+
+ c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (gpuDynInst->exec_mask[i]) {
+ Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+ if (isLocalMem()) {
+ // load from shared memory
+ *d = gpuDynInst->wavefront()->ldsChunk->
+ read<c0>(vaddr);
+ } else {
+ Request *req = new Request(0, vaddr, sizeof(c0), 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, i);
+
+ gpuDynInst->setRequestFlags(req);
+ PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+ pkt->dataStatic(d);
+
+ if (gpuDynInst->computeUnit()->shader->
+ separate_acquire_release &&
+ gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_ACQUIRE) {
+ // if this load has acquire semantics,
+ // set the response continuation function
+ // to perform an Acquire request
+ gpuDynInst->execContinuation =
+ &GPUStaticInst::execLdAcq;
+
+ gpuDynInst->useContinuation = true;
+ } else {
+ // the request will be finished when
+ // the load completes
+ gpuDynInst->useContinuation = false;
+ }
+ // translation is performed in sendRequest()
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+ i, pkt);
+ }
+ }
+ ++d;
+ }
+ }
+
+ gpuDynInst->updateStats();
+ }
+
+ private:
+ void
+ execLdAcq(GPUDynInstPtr gpuDynInst) override
+ {
+            // after the load has completed, and if the load has acquire
+            // semantics, issue an acquire request.
+ if (!isLocalMem()) {
+ if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+ && gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_ACQUIRE) {
+ gpuDynInst->statusBitVector = VectorMask(1);
+ gpuDynInst->useContinuation = false;
+ // create request
+ Request *req = new Request(0, 0, 0, 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, -1);
+ req->setFlags(Request::ACQUIRE);
+ gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+ }
+ }
+ }
+
+ public:
+ bool
+ isLocalMem() const override
+ {
+ return this->segment == Brig::BRIG_SEGMENT_GROUP;
+ }
+
+ bool isVectorRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return(this->addr.isVectorRegister());
+ if (num_dest_operands > 1) {
+ return dest_vect[operandIndex].isVectorRegister();
+ }
+ else if (num_dest_operands == 1) {
+ return LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType,
+ AddrOperandType>::dest.isVectorRegister();
+ }
+ return false;
+ }
+ bool isCondRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return(this->addr.isCondRegister());
+ if (num_dest_operands > 1)
+ return dest_vect[operandIndex].isCondRegister();
+ else if (num_dest_operands == 1)
+ return LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType,
+ AddrOperandType>::dest.isCondRegister();
+ return false;
+ }
+ bool isScalarRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return(this->addr.isScalarRegister());
+ if (num_dest_operands > 1)
+ return dest_vect[operandIndex].isScalarRegister();
+ else if (num_dest_operands == 1)
+ return LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType,
+ AddrOperandType>::dest.isScalarRegister();
+ return false;
+ }
+ bool isSrcOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return(this->addr.isVectorRegister());
+ return false;
+ }
+ bool isDstOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return false;
+ return true;
+ }
+ int getOperandSize(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return(this->addr.opSize());
+ if (num_dest_operands > 1)
+ return(dest_vect[operandIndex].opSize());
+ else if (num_dest_operands == 1)
+ return(LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType,
+ AddrOperandType>::dest.opSize());
+ return 0;
+ }
+ int getRegisterIndex(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if ((num_dest_operands != getNumOperands()) &&
+ (operandIndex == (getNumOperands()-1)))
+ return(this->addr.regIndex());
+ if (num_dest_operands > 1)
+ return(dest_vect[operandIndex].regIndex());
+ else if (num_dest_operands == 1)
+ return(LdInstBase<typename MemDataType::CType,
+ typename DestDataType::OperandType,
+ AddrOperandType>::dest.regIndex());
+ return -1;
+ }
+ int getNumOperands()
+ {
+ if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+ return(num_dest_operands+1);
+ else
+ return(num_dest_operands);
+ }
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ template<typename MemDT, typename DestDT>
+ GPUStaticInst*
+ decodeLd2(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands,1);
+ BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+ if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+ return new LdInst<MemDT, DestDT, NoRegAddrOperand>(ib, obj, "ld");
+ } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+ tmp.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ switch (tmp.regKind) {
+ case Brig::BRIG_REGISTER_KIND_SINGLE:
+ return new LdInst<MemDT, DestDT,
+ SRegAddrOperand>(ib, obj, "ld");
+ case Brig::BRIG_REGISTER_KIND_DOUBLE:
+ return new LdInst<MemDT, DestDT,
+ DRegAddrOperand>(ib, obj, "ld");
+ default:
+ fatal("Bad ld register operand type %d\n", tmp.regKind);
+ }
+ } else {
+ fatal("Bad ld register operand kind %d\n", tmp.kind);
+ }
+ }
+
+ template<typename MemDT>
+ GPUStaticInst*
+ decodeLd(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ unsigned op_offs = obj->getOperandPtr(ib->operands,0);
+ BrigRegOperandInfo dest = findRegDataType(op_offs, obj);
+
+ assert(dest.kind == Brig::BRIG_KIND_OPERAND_REGISTER ||
+ dest.kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
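+        // the destination register kind selects a 32- or 64-bit dest type;
+        // floating-point destinations reuse the unsigned type of the same
+        // width, since the load just moves raw bits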
+ switch(dest.regKind) {
+ case Brig::BRIG_REGISTER_KIND_SINGLE:
+ switch (ib->type) {
+ case Brig::BRIG_TYPE_B8:
+ case Brig::BRIG_TYPE_B16:
+ case Brig::BRIG_TYPE_B32:
+ return decodeLd2<MemDT, B32>(ib, obj);
+ case Brig::BRIG_TYPE_U8:
+ case Brig::BRIG_TYPE_U16:
+ case Brig::BRIG_TYPE_U32:
+ return decodeLd2<MemDT, U32>(ib, obj);
+ case Brig::BRIG_TYPE_S8:
+ case Brig::BRIG_TYPE_S16:
+ case Brig::BRIG_TYPE_S32:
+ return decodeLd2<MemDT, S32>(ib, obj);
+ case Brig::BRIG_TYPE_F16:
+ case Brig::BRIG_TYPE_F32:
+ return decodeLd2<MemDT, U32>(ib, obj);
+ default:
+ fatal("Bad ld register operand type %d, %d\n",
+ dest.regKind, ib->type);
+ };
+ case Brig::BRIG_REGISTER_KIND_DOUBLE:
+ switch (ib->type) {
+ case Brig::BRIG_TYPE_B64:
+ return decodeLd2<MemDT, B64>(ib, obj);
+ case Brig::BRIG_TYPE_U64:
+ return decodeLd2<MemDT, U64>(ib, obj);
+ case Brig::BRIG_TYPE_S64:
+ return decodeLd2<MemDT, S64>(ib, obj);
+ case Brig::BRIG_TYPE_F64:
+ return decodeLd2<MemDT, U64>(ib, obj);
+ default:
+ fatal("Bad ld register operand type %d, %d\n",
+ dest.regKind, ib->type);
+ };
+ default:
+ fatal("Bad ld register operand type %d, %d\n", dest.regKind,
+ ib->type);
+ }
+ }
+
+ template<typename MemDataType, typename SrcOperandType,
+ typename AddrOperandType>
+ class StInstBase : public HsailGPUStaticInst
+ {
+ public:
+ typename SrcOperandType::SrcOperand src;
+ AddrOperandType addr;
+
+ Brig::BrigSegment segment;
+ Brig::BrigMemoryScope memoryScope;
+ Brig::BrigMemoryOrder memoryOrder;
+ unsigned int equivClass;
+
+ void
+ initSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ {
+ using namespace Brig;
+
+ const BrigInstMem *ldst = (const BrigInstMem*)ib;
+
+ segment = (BrigSegment)ldst->segment;
+ memoryOrder = BRIG_MEMORY_ORDER_NONE;
+ memoryScope = BRIG_MEMORY_SCOPE_NONE;
+ equivClass = ldst->equivClass;
+
+ switch (segment) {
+ case BRIG_SEGMENT_GLOBAL:
+ o_type = Enums::OT_GLOBAL_WRITE;
+ break;
+
+ case BRIG_SEGMENT_GROUP:
+ o_type = Enums::OT_SHARED_WRITE;
+ break;
+
+ case BRIG_SEGMENT_PRIVATE:
+ o_type = Enums::OT_PRIVATE_WRITE;
+ break;
+
+ case BRIG_SEGMENT_READONLY:
+ o_type = Enums::OT_READONLY_WRITE;
+ break;
+
+ case BRIG_SEGMENT_SPILL:
+ o_type = Enums::OT_SPILL_WRITE;
+ break;
+
+ case BRIG_SEGMENT_FLAT:
+ o_type = Enums::OT_FLAT_WRITE;
+ break;
+
+ case BRIG_SEGMENT_ARG:
+ o_type = Enums::OT_ARG;
+ break;
+
+ default:
+ panic("St: segment %d not supported\n", segment);
+ }
+
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ const BrigOperand *baseOp = obj->getOperand(op_offs);
+
+ if ((baseOp->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) ||
+ (baseOp->kind == BRIG_KIND_OPERAND_REGISTER)) {
+ src.init(op_offs, obj);
+ }
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ addr.init(op_offs, obj);
+ }
+
+ void
+ initAtomicSt(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ {
+ using namespace Brig;
+
+ const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+ segment = (BrigSegment)at->segment;
+ memoryScope = (BrigMemoryScope)at->memoryScope;
+ memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+ equivClass = 0;
+
+ switch (segment) {
+ case BRIG_SEGMENT_GLOBAL:
+ o_type = Enums::OT_GLOBAL_WRITE;
+ break;
+
+ case BRIG_SEGMENT_GROUP:
+ o_type = Enums::OT_SHARED_WRITE;
+ break;
+
+ case BRIG_SEGMENT_PRIVATE:
+ o_type = Enums::OT_PRIVATE_WRITE;
+ break;
+
+ case BRIG_SEGMENT_READONLY:
+ o_type = Enums::OT_READONLY_WRITE;
+ break;
+
+ case BRIG_SEGMENT_SPILL:
+ o_type = Enums::OT_SPILL_WRITE;
+ break;
+
+ case BRIG_SEGMENT_FLAT:
+ o_type = Enums::OT_FLAT_WRITE;
+ break;
+
+ case BRIG_SEGMENT_ARG:
+ o_type = Enums::OT_ARG;
+ break;
+
+ default:
+ panic("St: segment %d not supported\n", segment);
+ }
+
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ addr.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ src.init(op_offs, obj);
+ }
+
+ StInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ using namespace Brig;
+
+ if (ib->opcode == BRIG_OPCODE_ST) {
+ initSt(ib, obj, _opcode);
+ } else {
+ initAtomicSt(ib, obj, _opcode);
+ }
+ }
+
+ int numDstRegOperands() { return 0; }
+ int numSrcRegOperands()
+ {
+ return src.isVectorRegister() + this->addr.isVectorRegister();
+ }
+ int getNumOperands()
+ {
+ if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+ return 2;
+ else
+ return 1;
+ }
+ bool isVectorRegister(int operandIndex)
+ {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return !operandIndex ? src.isVectorRegister() :
+ this->addr.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex)
+ {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return !operandIndex ? src.isCondRegister() :
+ this->addr.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex)
+ {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return !operandIndex ? src.isScalarRegister() :
+ this->addr.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return true;
+ }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex)
+ {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return !operandIndex ? src.opSize() : this->addr.opSize();
+ }
+ int getRegisterIndex(int operandIndex)
+ {
+ assert(operandIndex >= 0 && operandIndex < getNumOperands());
+ return !operandIndex ? src.regIndex() : this->addr.regIndex();
+ }
+ };
+
+
+ template<typename MemDataType, typename SrcDataType,
+ typename AddrOperandType>
+ class StInst :
+ public StInstBase<MemDataType, typename SrcDataType::OperandType,
+ AddrOperandType>,
+ public MemInst
+ {
+ public:
+ typename SrcDataType::OperandType::SrcOperand src_vect[4];
+ uint16_t num_src_operands;
+ void generateDisassembly();
+
+ StInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode, int srcIdx)
+ : StInstBase<MemDataType, typename SrcDataType::OperandType,
+ AddrOperandType>(ib, obj, _opcode),
+ MemInst(SrcDataType::memType)
+ {
+ init_addr(&this->addr);
+
+ BrigRegOperandInfo rinfo;
+ unsigned op_offs = obj->getOperandPtr(ib->operands,srcIdx);
+ const Brig::BrigOperand *baseOp = obj->getOperand(op_offs);
+
+ if (baseOp->kind == Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+ const Brig::BrigOperandConstantBytes *op =
+ (Brig::BrigOperandConstantBytes*)baseOp;
+
+ rinfo = BrigRegOperandInfo((Brig::BrigKind16_t)op->base.kind,
+ Brig::BRIG_TYPE_NONE);
+ } else {
+ rinfo = findRegDataType(op_offs, obj);
+ }
+
+ if (baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ const Brig::BrigOperandOperandList *brigRegVecOp =
+ (const Brig::BrigOperandOperandList*)baseOp;
+
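+                // as with vector loads, the operand list's byte count
+                // divided by the 4-byte entry size gives the number of
+                // source registers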
+ num_src_operands =
+ *((unsigned*)obj->getData(brigRegVecOp->elements)) / 4;
+
+ assert(num_src_operands <= 4);
+ } else {
+ num_src_operands = 1;
+ }
+
+ if (num_src_operands > 1) {
+ assert(baseOp->kind == Brig::BRIG_KIND_OPERAND_OPERAND_LIST);
+
+ for (int i = 0; i < num_src_operands; ++i) {
+ src_vect[i].init_from_vect(op_offs, obj, i);
+ }
+ }
+ }
+
+ void
+ initiateAcc(GPUDynInstPtr gpuDynInst) override
+ {
+ // before performing a store, check if this store has
+ // release semantics, and if so issue a release first
+ if (!isLocalMem()) {
+ if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+ && gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_RELEASE) {
+
+ gpuDynInst->statusBitVector = VectorMask(1);
+ gpuDynInst->execContinuation = &GPUStaticInst::execSt;
+ gpuDynInst->useContinuation = true;
+ // create request
+ Request *req = new Request(0, 0, 0, 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, -1);
+ req->setFlags(Request::RELEASE);
+ gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+ return;
+ }
+ }
+
+ // if there is no release semantic, perform stores immediately
+ execSt(gpuDynInst);
+ }
+
+ bool
+ isLocalMem() const override
+ {
+ return this->segment == Brig::BRIG_SEGMENT_GROUP;
+ }
+
+ private:
+ // execSt may be called through a continuation
+ // if the store had release semantics. see comment for
+ // execSt in gpu_static_inst.hh
+ void
+ execSt(GPUDynInstPtr gpuDynInst) override
+ {
+ typedef typename MemDataType::CType c0;
+
+ gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+ if (num_src_operands > 1) {
+ for (int i = 0; i < VSZ; ++i)
+ if (gpuDynInst->exec_mask[i])
+ gpuDynInst->statusVector.push_back(num_src_operands);
+ else
+ gpuDynInst->statusVector.push_back(0);
+ }
+
+ for (int k = 0; k < num_src_operands; ++k) {
+ c0 *d = &((c0*)gpuDynInst->d_data)[k * VSZ];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (gpuDynInst->exec_mask[i]) {
+ Addr vaddr = gpuDynInst->addr[i] + k * sizeof(c0);
+
+ if (isLocalMem()) {
+ //store to shared memory
+ gpuDynInst->wavefront()->ldsChunk->write<c0>(vaddr,
+ *d);
+ } else {
+ Request *req =
+ new Request(0, vaddr, sizeof(c0), 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, i);
+
+ gpuDynInst->setRequestFlags(req);
+ PacketPtr pkt = new Packet(req, MemCmd::WriteReq);
+ pkt->dataStatic<c0>(d);
+
+ // translation is performed in sendRequest()
+ // the request will be finished when the store completes
+ gpuDynInst->useContinuation = false;
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst,
+ i, pkt);
+
+ }
+ }
+ ++d;
+ }
+ }
+
+ gpuDynInst->updateStats();
+ }
+
+ public:
+ bool isVectorRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == num_src_operands)
+ return this->addr.isVectorRegister();
+ if (num_src_operands > 1)
+ return src_vect[operandIndex].isVectorRegister();
+ else if (num_src_operands == 1)
+ return StInstBase<MemDataType,
+ typename SrcDataType::OperandType,
+ AddrOperandType>::src.isVectorRegister();
+ return false;
+ }
+ bool isCondRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == num_src_operands)
+ return this->addr.isCondRegister();
+ if (num_src_operands > 1)
+ return src_vect[operandIndex].isCondRegister();
+ else if (num_src_operands == 1)
+ return StInstBase<MemDataType,
+ typename SrcDataType::OperandType,
+ AddrOperandType>::src.isCondRegister();
+ return false;
+ }
+ bool isScalarRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == num_src_operands)
+ return this->addr.isScalarRegister();
+ if (num_src_operands > 1)
+ return src_vect[operandIndex].isScalarRegister();
+ else if (num_src_operands == 1)
+ return StInstBase<MemDataType,
+ typename SrcDataType::OperandType,
+ AddrOperandType>::src.isScalarRegister();
+ return false;
+ }
+ bool isSrcOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ return true;
+ }
+ bool isDstOperand(int operandIndex) { return false; }
+ int getOperandSize(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == num_src_operands)
+ return this->addr.opSize();
+ if (num_src_operands > 1)
+ return src_vect[operandIndex].opSize();
+ else if (num_src_operands == 1)
+ return StInstBase<MemDataType,
+ typename SrcDataType::OperandType,
+ AddrOperandType>::src.opSize();
+ return 0;
+ }
+ int getRegisterIndex(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex == num_src_operands)
+ return this->addr.regIndex();
+ if (num_src_operands > 1)
+ return src_vect[operandIndex].regIndex();
+ else if (num_src_operands == 1)
+ return StInstBase<MemDataType,
+ typename SrcDataType::OperandType,
+ AddrOperandType>::src.regIndex();
+ return -1;
+ }
+ int getNumOperands()
+ {
+ if (this->addr.isVectorRegister() || this->addr.isScalarRegister())
+ return num_src_operands + 1;
+ else
+ return num_src_operands;
+ }
+ void execute(GPUDynInstPtr gpuDynInst);
+ };
+
+ template<typename DataType, typename SrcDataType>
+ GPUStaticInst*
+ decodeSt(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
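+        // a plain st encodes the stored value as operand 0 and the address
+        // as operand 1; the atomic store forms put the address first, so
+        // the indices are swapped below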
+ int srcIdx = 0;
+ int destIdx = 1;
+ if (ib->opcode == Brig::BRIG_OPCODE_ATOMIC ||
+ ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET) {
+ srcIdx = 1;
+ destIdx = 0;
+ }
+ unsigned op_offs = obj->getOperandPtr(ib->operands,destIdx);
+
+ BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+ if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+ return new StInst<DataType, SrcDataType,
+ NoRegAddrOperand>(ib, obj, "st", srcIdx);
+ } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ // V2/V4 not allowed
+ switch (tmp.regKind) {
+ case Brig::BRIG_REGISTER_KIND_SINGLE:
+ return new StInst<DataType, SrcDataType,
+ SRegAddrOperand>(ib, obj, "st", srcIdx);
+ case Brig::BRIG_REGISTER_KIND_DOUBLE:
+ return new StInst<DataType, SrcDataType,
+ DRegAddrOperand>(ib, obj, "st", srcIdx);
+ default:
+ fatal("Bad st register operand type %d\n", tmp.type);
+ }
+ } else {
+ fatal("Bad st register operand kind %d\n", tmp.kind);
+ }
+ }
+
+ Enums::MemOpType brigAtomicToMemOpType(Brig::BrigOpcode brigOpCode,
+ Brig::BrigAtomicOperation brigOp);
+
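+    // NumSrcOperands is 2 for CAS (compare and swap values) and 1 for all
+    // other atomics; HasDst is false for the atomicNoRet flavor, which does
+    // not return the old value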
+ template<typename OperandType, typename AddrOperandType, int NumSrcOperands,
+ bool HasDst>
+ class AtomicInstBase : public HsailGPUStaticInst
+ {
+ public:
+ typename OperandType::DestOperand dest;
+ typename OperandType::SrcOperand src[NumSrcOperands];
+ AddrOperandType addr;
+
+ Brig::BrigSegment segment;
+ Brig::BrigMemoryOrder memoryOrder;
+ Brig::BrigAtomicOperation atomicOperation;
+ Brig::BrigMemoryScope memoryScope;
+ Brig::BrigOpcode opcode;
+ Enums::MemOpType opType;
+
+ AtomicInstBase(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : HsailGPUStaticInst(obj, _opcode)
+ {
+ using namespace Brig;
+
+ const BrigInstAtomic *at = (const BrigInstAtomic*)ib;
+
+ segment = (BrigSegment)at->segment;
+ memoryScope = (BrigMemoryScope)at->memoryScope;
+ memoryOrder = (BrigMemoryOrder)at->memoryOrder;
+ atomicOperation = (BrigAtomicOperation)at->atomicOperation;
+ opcode = (BrigOpcode)ib->opcode;
+ opType = brigAtomicToMemOpType(opcode, atomicOperation);
+
+ switch (segment) {
+ case BRIG_SEGMENT_GLOBAL:
+ o_type = Enums::OT_GLOBAL_ATOMIC;
+ break;
+
+ case BRIG_SEGMENT_GROUP:
+ o_type = Enums::OT_SHARED_ATOMIC;
+ break;
+
+ case BRIG_SEGMENT_FLAT:
+ o_type = Enums::OT_FLAT_ATOMIC;
+ break;
+
+ default:
+ panic("Atomic: segment %d not supported\n", segment);
+ }
+
+ if (HasDst) {
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ dest.init(op_offs, obj);
+
+ op_offs = obj->getOperandPtr(ib->operands, 1);
+ addr.init(op_offs, obj);
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ op_offs = obj->getOperandPtr(ib->operands, i + 2);
+ src[i].init(op_offs, obj);
+ }
+ } else {
+
+ unsigned op_offs = obj->getOperandPtr(ib->operands, 0);
+ addr.init(op_offs, obj);
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ op_offs = obj->getOperandPtr(ib->operands, i + 1);
+ src[i].init(op_offs, obj);
+ }
+ }
+ }
+
+ int numSrcRegOperands()
+ {
+ int operands = 0;
+ for (int i = 0; i < NumSrcOperands; i++) {
+ if (src[i].isVectorRegister() == true) {
+ operands++;
+ }
+ }
+ if (addr.isVectorRegister())
+ operands++;
+ return operands;
+ }
+ int numDstRegOperands() { return dest.isVectorRegister(); }
+ int getNumOperands()
+ {
+ if (addr.isVectorRegister())
+ return(NumSrcOperands + 2);
+ return(NumSrcOperands + 1);
+ }
+ bool isVectorRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isVectorRegister();
+ else if (operandIndex == NumSrcOperands)
+ return(addr.isVectorRegister());
+ else
+ return dest.isVectorRegister();
+ }
+ bool isCondRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isCondRegister();
+ else if (operandIndex == NumSrcOperands)
+ return(addr.isCondRegister());
+ else
+ return dest.isCondRegister();
+ }
+ bool isScalarRegister(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return src[operandIndex].isScalarRegister();
+ else if (operandIndex == NumSrcOperands)
+ return(addr.isScalarRegister());
+ else
+ return dest.isScalarRegister();
+ }
+ bool isSrcOperand(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return true;
+ else if (operandIndex == NumSrcOperands)
+ return(addr.isVectorRegister());
+ else
+ return false;
+ }
+ bool isDstOperand(int operandIndex)
+ {
+ if (operandIndex <= NumSrcOperands)
+ return false;
+ else
+ return true;
+ }
+ int getOperandSize(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return(src[operandIndex].opSize());
+ else if (operandIndex == NumSrcOperands)
+ return(addr.opSize());
+ else
+ return(dest.opSize());
+ }
+ int getRegisterIndex(int operandIndex)
+ {
+ assert((operandIndex >= 0) && (operandIndex < getNumOperands()));
+ if (operandIndex < NumSrcOperands)
+ return(src[operandIndex].regIndex());
+ else if (operandIndex == NumSrcOperands)
+ return(addr.regIndex());
+ else
+ return(dest.regIndex());
+ return -1;
+ }
+ };
+
+ template<typename MemDataType, typename AddrOperandType, int NumSrcOperands,
+ bool HasDst>
+ class AtomicInst :
+ public AtomicInstBase<typename MemDataType::OperandType,
+ AddrOperandType, NumSrcOperands, HasDst>,
+ public MemInst
+ {
+ public:
+ void generateDisassembly();
+
+ AtomicInst(const Brig::BrigInstBase *ib, const BrigObject *obj,
+ const char *_opcode)
+ : AtomicInstBase<typename MemDataType::OperandType, AddrOperandType,
+ NumSrcOperands, HasDst>
+ (ib, obj, _opcode),
+ MemInst(MemDataType::memType)
+ {
+ init_addr(&this->addr);
+ }
+
+ void
+ initiateAcc(GPUDynInstPtr gpuDynInst) override
+ {
+ // before doing the RMW, check if this atomic has
+ // release semantics, and if so issue a release first
+ if (!isLocalMem()) {
+ if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+ && (gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_RELEASE || gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE)) {
+
+ gpuDynInst->statusBitVector = VectorMask(1);
+
+ gpuDynInst->execContinuation = &GPUStaticInst::execAtomic;
+ gpuDynInst->useContinuation = true;
+
+ // create request
+ Request *req = new Request(0, 0, 0, 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, -1);
+ req->setFlags(Request::RELEASE);
+ gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+
+ return;
+ }
+ }
+
+ // if there is no release semantic, execute the RMW immediately
+ execAtomic(gpuDynInst);
+
+ }
+
+ void execute(GPUDynInstPtr gpuDynInst);
+
+ bool
+ isLocalMem() const override
+ {
+ return this->segment == Brig::BRIG_SEGMENT_GROUP;
+ }
+
+ private:
+ // execAtomic may be called through a continuation
+ // if the RMW had release semantics. see comment for
+ // execContinuation in gpu_dyn_inst.hh
+ void
+ execAtomic(GPUDynInstPtr gpuDynInst) override
+ {
+ gpuDynInst->statusBitVector = gpuDynInst->exec_mask;
+
+ typedef typename MemDataType::CType c0;
+
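+            // d points at the per-lane destination (the old value read from
+            // memory), e at the first source operand (the compare value for
+            // CAS), and f at the second source operand (the swap value)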
+ c0 *d = &((c0*) gpuDynInst->d_data)[0];
+ c0 *e = &((c0*) gpuDynInst->a_data)[0];
+ c0 *f = &((c0*) gpuDynInst->x_data)[0];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (gpuDynInst->exec_mask[i]) {
+ Addr vaddr = gpuDynInst->addr[i];
+
+ if (isLocalMem()) {
+ Wavefront *wavefront = gpuDynInst->wavefront();
+ *d = wavefront->ldsChunk->read<c0>(vaddr);
+
+ switch (this->opType) {
+ case Enums::MO_AADD:
+ case Enums::MO_ANRADD:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) + (*e));
+ break;
+ case Enums::MO_ASUB:
+ case Enums::MO_ANRSUB:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) - (*e));
+ break;
+ case Enums::MO_AMAX:
+ case Enums::MO_ANRMAX:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ std::max(wavefront->ldsChunk->read<c0>(vaddr),
+ (*e)));
+ break;
+ case Enums::MO_AMIN:
+ case Enums::MO_ANRMIN:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ std::min(wavefront->ldsChunk->read<c0>(vaddr),
+ (*e)));
+ break;
+ case Enums::MO_AAND:
+ case Enums::MO_ANRAND:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) & (*e));
+ break;
+ case Enums::MO_AOR:
+ case Enums::MO_ANROR:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) | (*e));
+ break;
+ case Enums::MO_AXOR:
+ case Enums::MO_ANRXOR:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) ^ (*e));
+ break;
+ case Enums::MO_AINC:
+ case Enums::MO_ANRINC:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) + 1);
+ break;
+ case Enums::MO_ADEC:
+ case Enums::MO_ANRDEC:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ wavefront->ldsChunk->read<c0>(vaddr) - 1);
+ break;
+ case Enums::MO_AEXCH:
+ case Enums::MO_ANREXCH:
+ wavefront->ldsChunk->write<c0>(vaddr, (*e));
+ break;
+ case Enums::MO_ACAS:
+ case Enums::MO_ANRCAS:
+ wavefront->ldsChunk->write<c0>(vaddr,
+ (wavefront->ldsChunk->read<c0>(vaddr) == (*e)) ?
+ (*f) : wavefront->ldsChunk->read<c0>(vaddr));
+ break;
+ default:
+ fatal("Unrecognized or invalid HSAIL atomic op "
+ "type.\n");
+ break;
+ }
+ } else {
+ Request *req =
+ new Request(0, vaddr, sizeof(c0), 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, i,
+ gpuDynInst->makeAtomicOpFunctor<c0>(e,
+ f, this->opType));
+
+ gpuDynInst->setRequestFlags(req);
+ PacketPtr pkt = new Packet(req, MemCmd::SwapReq);
+ pkt->dataStatic(d);
+
+ if (gpuDynInst->computeUnit()->shader->
+ separate_acquire_release &&
+ (gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_ACQUIRE)) {
+ // if this atomic has acquire semantics,
+ // schedule the continuation to perform an
+ // acquire after the RMW completes
+ gpuDynInst->execContinuation =
+ &GPUStaticInst::execAtomicAcq;
+
+ gpuDynInst->useContinuation = true;
+ } else {
+ // the request will be finished when the RMW completes
+ gpuDynInst->useContinuation = false;
+ }
+ // translation is performed in sendRequest()
+ gpuDynInst->computeUnit()->sendRequest(gpuDynInst, i,
+ pkt);
+ }
+ }
+
+ ++d;
+ ++e;
+ ++f;
+ }
+
+ gpuDynInst->updateStats();
+ }
+
+        // execAtomicAcq will always be called through a continuation.
+ // see comment for execContinuation in gpu_dyn_inst.hh
+ void
+ execAtomicAcq(GPUDynInstPtr gpuDynInst) override
+ {
+ // after performing the RMW, check to see if this instruction
+ // has acquire semantics, and if so, issue an acquire
+ if (!isLocalMem()) {
+ if (gpuDynInst->computeUnit()->shader->separate_acquire_release
+ && gpuDynInst->memoryOrder ==
+ Enums::MEMORY_ORDER_SC_ACQUIRE) {
+ gpuDynInst->statusBitVector = VectorMask(1);
+
+ // the request will be finished when
+ // the acquire completes
+ gpuDynInst->useContinuation = false;
+ // create request
+ Request *req = new Request(0, 0, 0, 0,
+ gpuDynInst->computeUnit()->masterId(),
+ 0, gpuDynInst->wfDynId, -1);
+ req->setFlags(Request::ACQUIRE);
+ gpuDynInst->computeUnit()->injectGlobalMemFence(gpuDynInst, false, req);
+ }
+ }
+ }
+ };
+
+ template<typename DataType, typename AddrOperandType, int NumSrcOperands>
+ GPUStaticInst*
+ constructAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
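+        // atomic_ld and atomic_st behave like ordinary loads and stores
+        // with ordering semantics attached, so reuse the ld/st decoders for
+        // them; everything else becomes an AtomicInst (or noret variant)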
+ if (at->atomicOperation == Brig::BRIG_ATOMIC_LD) {
+ return decodeLd<DataType>(ib, obj);
+ } else if (at->atomicOperation == Brig::BRIG_ATOMIC_ST) {
+ switch (ib->type) {
+ case Brig::BRIG_TYPE_B8:
+ return decodeSt<S8,S8>(ib, obj);
+ case Brig::BRIG_TYPE_B16:
+ return decodeSt<S8,S16>(ib, obj);
+ case Brig::BRIG_TYPE_B32:
+ return decodeSt<S8,S32>(ib, obj);
+ case Brig::BRIG_TYPE_B64:
+ return decodeSt<S8,S64>(ib, obj);
+ default: fatal("AtomicSt: Operand type mismatch %d\n", ib->type);
+ }
+ } else {
+ if ((Brig::BrigOpcode)ib->opcode == Brig::BRIG_OPCODE_ATOMICNORET)
+ return new AtomicInst<DataType, AddrOperandType,
+ NumSrcOperands, false>(ib, obj, "atomicnoret");
+ else
+ return new AtomicInst<DataType, AddrOperandType,
+ NumSrcOperands, true>(ib, obj, "atomic");
+ }
+ }
+
+ template<typename DataType, int NumSrcOperands>
+ GPUStaticInst*
+ decodeAtomicHelper(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
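+        // atomics that return the old value place the destination at
+        // operand 0 and the address at operand 1; atomicNoRet has no
+        // destination, so its address is operand 0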
+ unsigned addrIndex = (Brig::BrigOpcode)ib->opcode ==
+ Brig::BRIG_OPCODE_ATOMICNORET ? 0 : 1;
+
+ unsigned op_offs = obj->getOperandPtr(ib->operands,addrIndex);
+
+ BrigRegOperandInfo tmp = findRegDataType(op_offs, obj);
+
+ if (tmp.kind == Brig::BRIG_KIND_OPERAND_ADDRESS) {
+ return constructAtomic<DataType, NoRegAddrOperand,
+ NumSrcOperands>(ib, obj);
+ } else if (tmp.kind == Brig::BRIG_KIND_OPERAND_REGISTER) {
+ // V2/V4 not allowed
+ switch (tmp.regKind) {
+ case Brig::BRIG_REGISTER_KIND_SINGLE:
+ return constructAtomic<DataType, SRegAddrOperand,
+ NumSrcOperands>(ib, obj);
+ case Brig::BRIG_REGISTER_KIND_DOUBLE:
+ return constructAtomic<DataType, DRegAddrOperand,
+ NumSrcOperands>(ib, obj);
+ default:
+ fatal("Bad atomic register operand type %d\n", tmp.type);
+ }
+ } else {
+ fatal("Bad atomic register operand kind %d\n", tmp.kind);
+ }
+ }
+
+
+ template<typename DataType>
+ GPUStaticInst*
+ decodeAtomic(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+
+ if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
+ return decodeAtomicHelper<DataType, 2>(ib, obj);
+ } else {
+ return decodeAtomicHelper<DataType, 1>(ib, obj);
+ }
+ }
+
+ template<typename DataType>
+ GPUStaticInst*
+ decodeAtomicNoRet(const Brig::BrigInstBase *ib, const BrigObject *obj)
+ {
+ const Brig::BrigInstAtomic *at = (const Brig::BrigInstAtomic*)ib;
+ if (at->atomicOperation == Brig::BRIG_ATOMIC_CAS) {
+ return decodeAtomicHelper<DataType, 2>(ib, obj);
+ } else {
+ return decodeAtomicHelper<DataType, 1>(ib, obj);
+ }
+ }
+} // namespace HsailISA
+
+#endif // __ARCH_HSAIL_INSTS_MEM_HH__
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
new file mode 100644
index 000000000..94f0cd6aa
--- /dev/null
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/generic_types.hh"
+#include "gpu-compute/hsail_code.hh"
+
+// defined in code.cc, but not worth sucking in all of code.h for this
+// at this point
+extern const char *segmentNames[];
+
+namespace HsailISA
+{
+ template<typename DestDataType, typename AddrRegOperandType>
+ void
+ LdaInst<DestDataType, AddrRegOperandType>::generateDisassembly()
+ {
+ this->disassembly = csprintf("%s_%s %s,%s", this->opcode,
+ DestDataType::label,
+ this->dest.disassemble(),
+ this->addr.disassemble());
+ }
+
+ template<typename DestDataType, typename AddrRegOperandType>
+ void
+ LdaInst<DestDataType, AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename DestDataType::CType CType M5_VAR_USED;
+ const VectorMask &mask = w->get_pred();
+ uint64_t addr_vec[VSZ];
+ this->addr.calcVector(w, addr_vec);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ this->dest.set(w, lane, addr_vec[lane]);
+ }
+ }
+ }
+
+ template<typename MemDataType, typename DestDataType,
+ typename AddrRegOperandType>
+ void
+ LdInst<MemDataType, DestDataType, AddrRegOperandType>::generateDisassembly()
+ {
+ switch (num_dest_operands) {
+ case 1:
+ this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+ segmentNames[this->segment],
+ MemDataType::label,
+ this->dest.disassemble(),
+ this->addr.disassemble());
+ break;
+ case 2:
+ this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+ segmentNames[this->segment],
+ MemDataType::label,
+ this->dest_vect[0].disassemble(),
+ this->dest_vect[1].disassemble(),
+ this->addr.disassemble());
+ break;
+ case 4:
+ this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+ this->opcode,
+ segmentNames[this->segment],
+ MemDataType::label,
+ this->dest_vect[0].disassemble(),
+ this->dest_vect[1].disassemble(),
+ this->dest_vect[2].disassemble(),
+ this->dest_vect[3].disassemble(),
+ this->addr.disassemble());
+ break;
+ default:
+ fatal("Bad ld register dest operand, num vector operands: %d \n",
+ num_dest_operands);
+ break;
+ }
+ }
+
+ static Addr
+ calcPrivAddr(Addr addr, Wavefront *w, int lane, GPUStaticInst *i)
+ {
+        // what is the size of the object we are accessing?
+        // NOTE: the compiler doesn't generate enough information
+        // to do this yet, so we have to just line up all the private
+        // work-item spaces back to back for now
+ /*
+ StorageElement* se =
+ i->parent->findSymbol(Brig::BrigPrivateSpace, addr);
+ assert(se);
+
+ return w->wfSlotId * w->privSizePerItem * VSZ +
+ se->offset * VSZ +
+ lane * se->size;
+ */
+
+ // addressing strategy: interleave the private spaces of
+ // work-items in a wave-front on 8 byte granularity.
+ // this won't be perfect coalescing like the spill space
+ // strategy, but it's better than nothing. The spill space
+ // strategy won't work with private because the same address
+ // may be accessed by different sized loads/stores.
+
+ // Note: I'm assuming that the largest load/store to private
+        // is 8 bytes. If it is larger, the stride will have to increase.
+
+ Addr addr_div8 = addr / 8;
+ Addr addr_mod8 = addr % 8;
+
+ Addr ret = addr_div8 * 8 * VSZ + lane * 8 + addr_mod8 + w->privBase;
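+        // worked example (assuming VSZ == 64): private address 13 in lane 2
+        // maps to privBase + (13 / 8) * 8 * 64 + 2 * 8 + (13 % 8)
+        //         = privBase + 512 + 16 + 5 = privBase + 533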
+
+ assert(ret < w->privBase + (w->privSizePerItem * VSZ));
+
+ return ret;
+ }
+
+ template<typename MemDataType, typename DestDataType,
+ typename AddrRegOperandType>
+ void
+ LdInst<MemDataType, DestDataType,
+ AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename MemDataType::CType MemCType;
+ const VectorMask &mask = w->get_pred();
+
+ // Kernarg references are handled uniquely for now (no Memory Request
+ // is used), so special-case them up front. Someday we should
+        // make this more realistic, at which point we should get rid of
+        // this block and fold this case into the switch below.
+ if (this->segment == Brig::BRIG_SEGMENT_KERNARG) {
+ MemCType val;
+
+ // I assume no vector ld for kernargs
+ assert(num_dest_operands == 1);
+
+ // assuming for the moment that we'll never do register
+ // offsets into kernarg space... just to make life simpler
+ uint64_t address = this->addr.calcUniform();
+
+ val = *(MemCType*)&w->kernelArgs[address];
+
+ DPRINTF(HSAIL, "ld_kernarg [%d] -> %d\n", address, val);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ this->dest.set(w, lane, val);
+ }
+ }
+
+ return;
+ } else if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+ uint64_t address = this->addr.calcUniform();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ MemCType val = w->readCallArgMem<MemCType>(lane, address);
+
+ DPRINTF(HSAIL, "ld_arg [%d] -> %llu\n", address,
+ (unsigned long long)val);
+
+ this->dest.set(w, lane, val);
+ }
+ }
+
+ return;
+ }
+
+ GPUDynInstPtr m = gpuDynInst;
+
+ this->addr.calcVector(w, m->addr);
+
+ m->m_op = Enums::MO_LD;
+ m->m_type = MemDataType::memType;
+ m->v_type = DestDataType::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = this->equivClass;
+ m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+ m->scope = getGenericMemoryScope(this->memoryScope);
+
+ if (num_dest_operands == 1) {
+ m->dst_reg = this->dest.regIndex();
+ m->n_reg = 1;
+ } else {
+ m->n_reg = num_dest_operands;
+ for (int i = 0; i < num_dest_operands; ++i) {
+ m->dst_reg_vec[i] = this->dest_vect[i].regIndex();
+ }
+ }
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->kern_id = w->kern_id;
+ m->cu_id = w->computeUnit->cu_id;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ switch (this->segment) {
+ case Brig::BRIG_SEGMENT_GLOBAL:
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug:
+            // the compiler currently generates global accesses for private
+            // addresses (starting from 0), so we need to add the private
+            // offset
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (m->addr[lane] < w->privSizePerItem) {
+ if (mask[lane]) {
+ // what is the size of the object we are accessing?
+                        // find the base for this wavefront
+
+ // calcPrivAddr will fail if accesses are unaligned
+ assert(!((sizeof(MemCType) - 1) & m->addr[lane]));
+
+ Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+ this);
+
+ m->addr[lane] = privAddr;
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_SPILL:
+ assert(num_dest_operands == 1);
+ m->s_type = SEG_SPILL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ // note: this calculation will NOT WORK if the compiler
+ // ever generates loads/stores to the same address with
+ // different widths (e.g., a ld_u32 addr and a ld_u16 addr)
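+                    // for example, assuming spillWidth == 256 and a
+                    // 4-byte type, spill offset 4 in lane 3 maps to
+                    // spillBase + 4 * 256 + 3 * 4 = spillBase + 1036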
+ if (mask[lane]) {
+ assert(m->addr[lane] < w->spillSizePerItem);
+
+ m->addr[lane] = m->addr[lane] * w->spillWidth +
+ lane * sizeof(MemCType) + w->spillBase;
+
+ w->last_addr[lane] = m->addr[lane];
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_GROUP:
+ m->s_type = SEG_SHARED;
+ m->pipeId = LDSMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(24));
+ w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+ w->outstanding_reqs_rd_lm++;
+ w->rd_lm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_READONLY:
+ m->s_type = SEG_READONLY;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ assert(m->addr[lane] + sizeof(MemCType) <= w->roSize);
+ m->addr[lane] += w->roBase;
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_PRIVATE:
+ m->s_type = SEG_PRIVATE;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ assert(m->addr[lane] < w->privSizePerItem);
+
+ m->addr[lane] = m->addr[lane] +
+ lane * sizeof(MemCType) + w->privBase;
+ }
+ }
+ }
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ default:
+ fatal("Load to unsupported segment %d %llxe\n", this->segment,
+ m->addr[0]);
+ }
+
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ template<typename OperationType, typename SrcDataType,
+ typename AddrRegOperandType>
+ void
+ StInst<OperationType, SrcDataType,
+ AddrRegOperandType>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ Wavefront *w = gpuDynInst->wavefront();
+
+ typedef typename OperationType::CType CType;
+
+ const VectorMask &mask = w->get_pred();
+
+ // arg references are handled uniquely for now (no Memory Request
+ // is used), so special-case them up front. Someday we should
+        // make this more realistic, at which point we should get rid of
+        // this block and fold this case into the switch below.
+ if (this->segment == Brig::BRIG_SEGMENT_ARG) {
+ uint64_t address = this->addr.calcUniform();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ CType data = this->src.template get<CType>(w, lane);
+ DPRINTF(HSAIL, "st_arg [%d] <- %d\n", address, data);
+ w->writeCallArgMem<CType>(lane, address, data);
+ }
+ }
+
+ return;
+ }
+
+ GPUDynInstPtr m = gpuDynInst;
+
+ m->exec_mask = w->execMask();
+
+ this->addr.calcVector(w, m->addr);
+
+ if (num_src_operands == 1) {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ ((CType*)m->d_data)[lane] =
+ this->src.template get<CType>(w, lane);
+ }
+ }
+ } else {
+ for (int k= 0; k < num_src_operands; ++k) {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ ((CType*)m->d_data)[k * VSZ + lane] =
+ this->src_vect[k].template get<CType>(w, lane);
+ }
+ }
+ }
+ }
+
+ m->m_op = Enums::MO_ST;
+ m->m_type = OperationType::memType;
+ m->v_type = OperationType::vgprType;
+
+ m->statusBitVector = 0;
+ m->equiv = this->equivClass;
+
+ if (num_src_operands == 1) {
+ m->n_reg = 1;
+ } else {
+ m->n_reg = num_src_operands;
+ }
+
+ m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+ m->scope = getGenericMemoryScope(this->memoryScope);
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->kern_id = w->kern_id;
+ m->cu_id = w->computeUnit->cu_id;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ switch (this->segment) {
+ case Brig::BRIG_SEGMENT_GLOBAL:
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+
+            // this is a complete hack to get around a compiler bug:
+            // the compiler currently generates global accesses for private
+            // addresses (starting from 0), so we need to add the private
+            // offset
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ if (m->addr[lane] < w->privSizePerItem) {
+
+ // calcPrivAddr will fail if accesses are unaligned
+ assert(!((sizeof(CType)-1) & m->addr[lane]));
+
+ Addr privAddr = calcPrivAddr(m->addr[lane], w, lane,
+ this);
+
+ m->addr[lane] = privAddr;
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_SPILL:
+ assert(num_src_operands == 1);
+ m->s_type = SEG_SPILL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ assert(m->addr[lane] < w->spillSizePerItem);
+
+ m->addr[lane] = m->addr[lane] * w->spillWidth +
+ lane * sizeof(CType) + w->spillBase;
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_GROUP:
+ m->s_type = SEG_SHARED;
+ m->pipeId = LDSMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(24));
+ w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+ w->outstanding_reqs_wr_lm++;
+ w->wr_lm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_PRIVATE:
+ m->s_type = SEG_PRIVATE;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ assert(m->addr[lane] < w->privSizePerItem);
+ m->addr[lane] = m->addr[lane] + lane *
+ sizeof(CType)+w->privBase;
+ }
+ }
+ }
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ break;
+
+ default:
+ fatal("Store to unsupported segment %d\n", this->segment);
+ }
+
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ template<typename OperationType, typename SrcDataType,
+ typename AddrRegOperandType>
+ void
+ StInst<OperationType, SrcDataType,
+ AddrRegOperandType>::generateDisassembly()
+ {
+ switch (num_src_operands) {
+ case 1:
+ this->disassembly = csprintf("%s_%s_%s %s,%s", this->opcode,
+ segmentNames[this->segment],
+ OperationType::label,
+ this->src.disassemble(),
+ this->addr.disassemble());
+ break;
+ case 2:
+ this->disassembly = csprintf("%s_%s_%s (%s,%s), %s", this->opcode,
+ segmentNames[this->segment],
+ OperationType::label,
+ this->src_vect[0].disassemble(),
+ this->src_vect[1].disassemble(),
+ this->addr.disassemble());
+ break;
+ case 4:
+ this->disassembly = csprintf("%s_%s_%s (%s,%s,%s,%s), %s",
+ this->opcode,
+ segmentNames[this->segment],
+ OperationType::label,
+ this->src_vect[0].disassemble(),
+ this->src_vect[1].disassemble(),
+ this->src_vect[2].disassemble(),
+ this->src_vect[3].disassemble(),
+ this->addr.disassemble());
+ break;
+ default: fatal("Bad ld register src operand, num vector operands: "
+ "%d \n", num_src_operands);
+ break;
+ }
+ }
+
+ template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+ bool HasDst>
+ void
+ AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+ HasDst>::execute(GPUDynInstPtr gpuDynInst)
+ {
+ typedef typename DataType::CType CType;
+
+ Wavefront *w = gpuDynInst->wavefront();
+
+ GPUDynInstPtr m = gpuDynInst;
+
+ this->addr.calcVector(w, m->addr);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ ((CType *)m->a_data)[lane] =
+ this->src[0].template get<CType>(w, lane);
+ }
+
+ // load second source operand for CAS
+ if (NumSrcOperands > 1) {
+ for (int lane = 0; lane < VSZ; ++lane) {
+ ((CType*)m->x_data)[lane] =
+ this->src[1].template get<CType>(w, lane);
+ }
+ }
+
+ assert(NumSrcOperands <= 2);
+
+ m->m_op = this->opType;
+ m->m_type = DataType::memType;
+ m->v_type = DataType::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0; // atomics don't have an equivalence class operand
+ m->n_reg = 1;
+ m->memoryOrder = getGenericMemoryOrder(this->memoryOrder);
+
+ m->scope = getGenericMemoryScope(this->memoryScope);
+
+ if (HasDst) {
+ m->dst_reg = this->dest.regIndex();
+ }
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->kern_id = w->kern_id;
+ m->cu_id = w->computeUnit->cu_id;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ switch (this->segment) {
+ case Brig::BRIG_SEGMENT_GLOBAL:
+ m->s_type = SEG_GLOBAL;
+ m->latency.set(w->computeUnit->shader->ticks(64));
+ m->pipeId = GLBMEM_PIPE;
+
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ break;
+
+ case Brig::BRIG_SEGMENT_GROUP:
+ m->s_type = SEG_SHARED;
+ m->pipeId = LDSMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(24));
+ w->computeUnit->localMemoryPipe.getLMReqFIFO().push(m);
+ w->outstanding_reqs_wr_lm++;
+ w->wr_lm_reqs_in_pipe--;
+ w->outstanding_reqs_rd_lm++;
+ w->rd_lm_reqs_in_pipe--;
+ break;
+
+ default:
+ fatal("Atomic op to unsupported segment %d\n",
+ this->segment);
+ }
+
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ const char* atomicOpToString(Brig::BrigAtomicOperation atomicOp);
+
+ template<typename DataType, typename AddrRegOperandType, int NumSrcOperands,
+ bool HasDst>
+ void
+ AtomicInst<DataType, AddrRegOperandType, NumSrcOperands,
+ HasDst>::generateDisassembly()
+ {
+ if (HasDst) {
+ this->disassembly =
+ csprintf("%s_%s_%s_%s %s,%s", this->opcode,
+ atomicOpToString(this->atomicOperation),
+ segmentNames[this->segment],
+ DataType::label, this->dest.disassemble(),
+ this->addr.disassemble());
+ } else {
+ this->disassembly =
+ csprintf("%s_%s_%s_%s %s", this->opcode,
+ atomicOpToString(this->atomicOperation),
+ segmentNames[this->segment],
+ DataType::label, this->addr.disassemble());
+ }
+
+ for (int i = 0; i < NumSrcOperands; ++i) {
+ this->disassembly += ",";
+ this->disassembly += this->src[i].disassemble();
+ }
+ }
+} // namespace HsailISA
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
new file mode 100644
index 000000000..9506a80ab
--- /dev/null
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -0,0 +1,787 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Marc Orr
+ */
+
+#include <csignal>
+#include <cstdio>
+
+#include "arch/hsail/insts/decl.hh"
+#include "arch/hsail/insts/mem.hh"
+
+namespace HsailISA
+{
+ // Pseudo (or magic) instructions are overloaded on the hsail call
+ // instruction, because of its flexible parameter signature.
+
+    // To add a new magic instruction:
+    // 1. Add an entry to the enum below.
+    // 2. Implement it in the switch statement in Call::execPseudoInst below.
+    // 3. Add a utility function to hsa/hsail-gpu-compute/util/magicinst.h,
+    //    so it's easy to call from an OpenCL kernel (an illustrative sketch
+    //    follows the enum below).
+
+ // This enum should be identical to the enum in
+ // hsa/hsail-gpu-compute/util/magicinst.h
+ enum
+ {
+ MAGIC_PRINT_WF_32 = 0,
+ MAGIC_PRINT_WF_64,
+ MAGIC_PRINT_LANE,
+ MAGIC_PRINT_LANE_64,
+ MAGIC_PRINT_WF_FLOAT,
+ MAGIC_SIM_BREAK,
+ MAGIC_PREF_SUM,
+ MAGIC_REDUCTION,
+ MAGIC_MASKLANE_LOWER,
+ MAGIC_MASKLANE_UPPER,
+ MAGIC_JOIN_WF_BAR,
+ MAGIC_WAIT_WF_BAR,
+ MAGIC_PANIC,
+ MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG,
+ MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG,
+ MAGIC_LOAD_GLOBAL_U32_REG,
+ MAGIC_XACT_CAS_LD,
+ MAGIC_MOST_SIG_THD,
+ MAGIC_MOST_SIG_BROADCAST,
+ MAGIC_PRINT_WFID_32,
+ MAGIC_PRINT_WFID_64
+ };
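+
+    // The following sketch is illustrative only and is not part of
+    // magicinst.h; it assumes a hypothetical helper, gem5_magic_call(),
+    // that lowers to the overloaded HSAIL call with the magic opcode as
+    // its first argument. It shows what step 3 above might look like for
+    // MAGIC_PRINT_WF_32; the helper name and argument layout are
+    // assumptions, not the actual utility interface.
+    //
+    //     static inline void
+    //     magic_print_wf_32(int value, int print_hex)
+    //     {
+    //         // arg 0 selects the opcode; args 1 and 2 are read per lane
+    //         // by Call::MagicPrintWF32 below
+    //         gem5_magic_call(MAGIC_PRINT_WF_32, value, print_hex);
+    //     }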
+
+ void
+ Call::execPseudoInst(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ const VectorMask &mask = w->get_pred();
+
+ int op = 0;
+ bool got_op = false;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val0 = src1.get<int>(w, lane, 0);
+ if (got_op) {
+ if (src_val0 != op) {
+ fatal("Multiple magic instructions per PC not "
+ "supported\n");
+ }
+ } else {
+ op = src_val0;
+ got_op = true;
+ }
+ }
+ }
+
+ switch(op) {
+ case MAGIC_PRINT_WF_32:
+ MagicPrintWF32(w);
+ break;
+ case MAGIC_PRINT_WF_64:
+ MagicPrintWF64(w);
+ break;
+ case MAGIC_PRINT_LANE:
+ MagicPrintLane(w);
+ break;
+ case MAGIC_PRINT_LANE_64:
+ MagicPrintLane64(w);
+ break;
+ case MAGIC_PRINT_WF_FLOAT:
+ MagicPrintWFFloat(w);
+ break;
+ case MAGIC_SIM_BREAK:
+ MagicSimBreak(w);
+ break;
+ case MAGIC_PREF_SUM:
+ MagicPrefixSum(w);
+ break;
+ case MAGIC_REDUCTION:
+ MagicReduction(w);
+ break;
+ case MAGIC_MASKLANE_LOWER:
+ MagicMaskLower(w);
+ break;
+ case MAGIC_MASKLANE_UPPER:
+ MagicMaskUpper(w);
+ break;
+ case MAGIC_JOIN_WF_BAR:
+ MagicJoinWFBar(w);
+ break;
+ case MAGIC_WAIT_WF_BAR:
+ MagicWaitWFBar(w);
+ break;
+ case MAGIC_PANIC:
+ MagicPanic(w);
+ break;
+
+ // atomic instructions
+ case MAGIC_ATOMIC_NR_ADD_GLOBAL_U32_REG:
+ MagicAtomicNRAddGlobalU32Reg(w, gpuDynInst);
+ break;
+
+ case MAGIC_ATOMIC_NR_ADD_GROUP_U32_REG:
+ MagicAtomicNRAddGroupU32Reg(w, gpuDynInst);
+ break;
+
+ case MAGIC_LOAD_GLOBAL_U32_REG:
+ MagicLoadGlobalU32Reg(w, gpuDynInst);
+ break;
+
+ case MAGIC_XACT_CAS_LD:
+ MagicXactCasLd(w);
+ break;
+
+ case MAGIC_MOST_SIG_THD:
+ MagicMostSigThread(w);
+ break;
+
+ case MAGIC_MOST_SIG_BROADCAST:
+ MagicMostSigBroadcast(w);
+ break;
+
+ case MAGIC_PRINT_WFID_32:
+ MagicPrintWF32ID(w);
+ break;
+
+ case MAGIC_PRINT_WFID_64:
+ MagicPrintWFID64(w);
+ break;
+
+ default: fatal("unrecognized magic instruction: %d\n", op);
+ }
+ }
+
+ void
+ Call::MagicPrintLane(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ if (src_val2) {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ } else {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ }
+ }
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintLane64(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ if (src_val2) {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: 0x%x\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ } else {
+ DPRINTFN("krl_prt (%s): CU%d, WF[%d][%d], lane %d: %d\n",
+ disassemble(), w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, lane, src_val1);
+ }
+ }
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintWF32(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 7)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+
+ if (src_val2) {
+ res_str += csprintf("%08x", src_val1);
+ } else {
+ res_str += csprintf("%08d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxx");
+ }
+
+ if ((lane & 7) == 7) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ #endif
+ }
+
+ void
+ Call::MagicPrintWF32ID(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ int src_val3 = -1;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 7)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ src_val3 = src1.get<int>(w, lane, 3);
+
+ if (src_val2) {
+ res_str += csprintf("%08x", src_val1);
+ } else {
+ res_str += csprintf("%08d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxx");
+ }
+
+ if ((lane & 7) == 7) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ if (w->wfDynId == src_val3) {
+ DPRINTFN(res_str.c_str());
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintWF64(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 3)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+
+ if (src_val2) {
+ res_str += csprintf("%016x", src_val1);
+ } else {
+ res_str += csprintf("%016d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxxxxxxxxxx");
+ }
+
+ if ((lane & 3) == 3) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ #endif
+ }
+
+ void
+ Call::MagicPrintWFID64(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ int src_val3 = -1;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 3)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ int64_t src_val1 = src1.get<int64_t>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ src_val3 = src1.get<int>(w, lane, 3);
+
+ if (src_val2) {
+ res_str += csprintf("%016x", src_val1);
+ } else {
+ res_str += csprintf("%016d", src_val1);
+ }
+ } else {
+ res_str += csprintf("xxxxxxxxxxxxxxxx");
+ }
+
+ if ((lane & 3) == 3) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ if (w->wfDynId == src_val3) {
+ DPRINTFN(res_str.c_str());
+ }
+ #endif
+ }
+
+ void
+ Call::MagicPrintWFFloat(Wavefront *w)
+ {
+ #if TRACING_ON
+ const VectorMask &mask = w->get_pred();
+ std::string res_str;
+ res_str = csprintf("krl_prt (%s)\n", disassemble());
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (!(lane & 7)) {
+ res_str += csprintf("DB%03d: ", (int)w->wfDynId);
+ }
+
+ if (mask[lane]) {
+ float src_val1 = src1.get<float>(w, lane, 1);
+ res_str += csprintf("%08f", src_val1);
+ } else {
+ res_str += csprintf("xxxxxxxx");
+ }
+
+ if ((lane & 7) == 7) {
+ res_str += csprintf("\n");
+ } else {
+ res_str += csprintf(" ");
+ }
+ }
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ #endif
+ }
+
+    // Raises a signal that GDB will catch; when done with the break, type
+    // "signal 0" in gdb to continue (an example session is sketched after
+    // this function).
+ void
+ Call::MagicSimBreak(Wavefront *w)
+ {
+ std::string res_str;
+ // print out state for this wavefront and then break
+ res_str = csprintf("Breakpoint encountered for wavefront %i\n",
+ w->wfSlotId);
+
+ res_str += csprintf(" Kern ID: %i\n", w->kern_id);
+ res_str += csprintf(" Phase ID: %i\n", w->simdId);
+ res_str += csprintf(" Executing on CU #%i\n", w->computeUnit->cu_id);
+ res_str += csprintf(" Exec mask: ");
+
+ for (int i = VSZ - 1; i >= 0; --i) {
+ if (w->execMask(i))
+ res_str += "1";
+ else
+ res_str += "0";
+
+ if ((i & 7) == 7)
+ res_str += " ";
+ }
+
+ res_str += csprintf("(0x%016llx)\n", w->execMask().to_ullong());
+
+ res_str += "\nHelpful debugging hints:\n";
+ res_str += " Check out w->s_reg / w->d_reg for register state\n";
+
+ res_str += "\n\n";
+ DPRINTFN(res_str.c_str());
+ fflush(stdout);
+
+ raise(SIGTRAP);
+ }
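+
+    // Illustrative only -- a typical debugger interaction once the SIGTRAP
+    // above fires (the exact commands and output depend on the host gdb):
+    //
+    //     (gdb) run ...          # gem5 stops here when SIGTRAP is raised
+    //     (gdb) info threads     # inspect simulator state as needed
+    //     (gdb) signal 0         # resume without re-delivering the signal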
+
+ void
+ Call::MagicPrefixSum(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ dest.set<int>(w, lane, res);
+ res += src_val1;
+ }
+ }
+ }
+
+ void
+ Call::MagicReduction(Wavefront *w)
+ {
+ // reduction magic instruction
+ // The reduction instruction takes up to 64 inputs (one from
+ // each thread in a WF) and sums them. It returns the sum to
+ // each thread in the WF.
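+        //
+        // Worked example (illustrative): if only four lanes are active and
+        // they pass 1, 2, 3 and 4 as arg 1, every active lane's dest
+        // register receives 1 + 2 + 3 + 4 = 10.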
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ res += src_val1;
+ }
+ }
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+ void
+ Call::MagicMaskLower(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+
+ if (src_val1) {
+ if (lane < (VSZ/2)) {
+ res = res | ((uint32_t)(1) << lane);
+ }
+ }
+ }
+ }
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+ void
+ Call::MagicMaskUpper(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+
+ if (src_val1) {
+ if (lane >= (VSZ/2)) {
+ res = res | ((uint32_t)(1) << (lane - (VSZ/2)));
+ }
+ }
+ }
+ }
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+ void
+ Call::MagicJoinWFBar(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int max_cnt = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->bar_cnt[lane]++;
+
+ if (w->bar_cnt[lane] > max_cnt) {
+ max_cnt = w->bar_cnt[lane];
+ }
+ }
+ }
+
+ if (max_cnt > w->max_bar_cnt) {
+ w->max_bar_cnt = max_cnt;
+ }
+ }
+
+ void
+ Call::MagicWaitWFBar(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int max_cnt = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ w->bar_cnt[lane]--;
+ }
+
+ if (w->bar_cnt[lane] > max_cnt) {
+ max_cnt = w->bar_cnt[lane];
+ }
+ }
+
+ if (max_cnt < w->max_bar_cnt) {
+ w->max_bar_cnt = max_cnt;
+ }
+
+ w->instructionBuffer.erase(w->instructionBuffer.begin() + 1,
+ w->instructionBuffer.end());
+ if (w->pendingFetch)
+ w->dropFetch = true;
+ }
+
+ void
+ Call::MagicPanic(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+                panic("OpenCL code failed assertion #%d. Triggered by lane %d",
+                      src_val1, lane);
+ }
+ }
+ }
+
+ void
+ Call::calcAddr(Wavefront *w, GPUDynInstPtr m)
+ {
+        // the 64-bit address is assembled from two 32-bit call arguments:
+        // arg 1 supplies the upper 32 bits and arg 2 the lower 32 bits
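+        // For example (illustrative values): with arg 1 = 0x1 and
+        // arg 2 = 0x2000ab00 the lane's address is
+        // (0x1ULL << 32) | 0x2000ab00 = 0x12000ab00.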
+ for (int lane = 0; lane < VSZ; ++lane) {
+ int src_val1 = src1.get<int>(w, lane, 1);
+ int src_val2 = src1.get<int>(w, lane, 2);
+ Addr addr = (((Addr) src_val1) << 32) | ((Addr) src_val2);
+
+ m->addr[lane] = addr;
+ }
+    }
+
+ void
+ Call::MagicAtomicNRAddGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ GPUDynInstPtr m = gpuDynInst;
+
+ calcAddr(w, m);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 3);
+ }
+
+ m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+ Brig::BRIG_ATOMIC_ADD);
+ m->m_type = U32::memType;
+ m->v_type = U32::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0; // atomics don't have an equivalence class operand
+ m->n_reg = 1;
+ m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+ m->scope = Enums::MEMORY_SCOPE_NONE;
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(64));
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ void
+ Call::MagicAtomicNRAddGroupU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ GPUDynInstPtr m = gpuDynInst;
+ calcAddr(w, m);
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ ((int*)m->a_data)[lane] = src1.get<int>(w, lane, 1);
+ }
+
+ m->m_op = brigAtomicToMemOpType(Brig::BRIG_OPCODE_ATOMICNORET,
+ Brig::BRIG_ATOMIC_ADD);
+ m->m_type = U32::memType;
+ m->v_type = U32::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0; // atomics don't have an equivalence class operand
+ m->n_reg = 1;
+ m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+ m->scope = Enums::MEMORY_SCOPE_NONE;
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(64));
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_wr_gm++;
+ w->wr_gm_reqs_in_pipe--;
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ void
+ Call::MagicLoadGlobalU32Reg(Wavefront *w, GPUDynInstPtr gpuDynInst)
+ {
+ GPUDynInstPtr m = gpuDynInst;
+ // calculate the address
+ calcAddr(w, m);
+
+ m->m_op = Enums::MO_LD;
+ m->m_type = U32::memType; //MemDataType::memType;
+ m->v_type = U32::vgprType; //DestDataType::vgprType;
+
+ m->exec_mask = w->execMask();
+ m->statusBitVector = 0;
+ m->equiv = 0;
+ m->n_reg = 1;
+ m->memoryOrder = Enums::MEMORY_ORDER_NONE;
+ m->scope = Enums::MEMORY_SCOPE_NONE;
+
+ // FIXME
+ //m->dst_reg = this->dest.regIndex();
+
+ m->simdId = w->simdId;
+ m->wfSlotId = w->wfSlotId;
+ m->wfDynId = w->wfDynId;
+ m->latency.init(&w->computeUnit->shader->tick_cnt);
+
+ m->s_type = SEG_GLOBAL;
+ m->pipeId = GLBMEM_PIPE;
+ m->latency.set(w->computeUnit->shader->ticks(1));
+ w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+ w->outstanding_reqs_rd_gm++;
+ w->rd_gm_reqs_in_pipe--;
+ w->outstanding_reqs++;
+ w->mem_reqs_in_pipe--;
+ }
+
+ void
+ Call::MagicXactCasLd(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int src_val1 = 0;
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (mask[lane]) {
+ src_val1 = src1.get<int>(w, lane, 1);
+ break;
+ }
+ }
+
+ if (!w->computeUnit->xactCasLoadMap.count(src_val1)) {
+ w->computeUnit->xactCasLoadMap[src_val1] = ComputeUnit::waveQueue();
+ w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue.clear();
+ }
+
+ w->computeUnit->xactCasLoadMap[src_val1].waveIDQueue
+ .push_back(ComputeUnit::waveIdentifier(w->simdId, w->wfSlotId));
+ }
+
+ void
+ Call::MagicMostSigThread(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ unsigned mst = true;
+
+ for (int lane = VSZ - 1; lane >= 0; --lane) {
+ if (mask[lane]) {
+ dest.set<int>(w, lane, mst);
+ mst = false;
+ }
+ }
+ }
+
+ void
+ Call::MagicMostSigBroadcast(Wavefront *w)
+ {
+ const VectorMask &mask = w->get_pred();
+ int res = 0;
+ bool got_res = false;
+
+ for (int lane = VSZ - 1; lane >= 0; --lane) {
+ if (mask[lane]) {
+ if (!got_res) {
+ res = src1.get<int>(w, lane, 1);
+ got_res = true;
+ }
+ dest.set<int>(w, lane, res);
+ }
+ }
+ }
+
+} // namespace HsailISA
diff --git a/src/arch/hsail/operand.cc b/src/arch/hsail/operand.cc
new file mode 100644
index 000000000..d0e6c5541
--- /dev/null
+++ b/src/arch/hsail/operand.cc
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "arch/hsail/operand.hh"
+
+using namespace Brig;
+
+bool
+BaseRegOperand::init(unsigned opOffset, const BrigObject *obj,
+ unsigned &maxRegIdx, char _regFileChar)
+{
+ regFileChar = _regFileChar;
+ const BrigOperand *brigOp = obj->getOperand(opOffset);
+
+ if (brigOp->kind != BRIG_KIND_OPERAND_REGISTER)
+ return false;
+
+ const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)brigOp;
+
+ regIdx = brigRegOp->regNum;
+
+ DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
+ brigRegOp->regKind);
+
+ maxRegIdx = std::max(maxRegIdx, regIdx);
+
+ return true;
+}
+
+void
+ListOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+ const BrigOperand *brigOp = (const BrigOperand*)obj->getOperand(opOffset);
+
+ switch (brigOp->kind) {
+ case BRIG_KIND_OPERAND_CODE_LIST:
+ {
+ const BrigOperandCodeList *opList =
+ (const BrigOperandCodeList*)brigOp;
+
+ const Brig::BrigData *oprnd_data =
+ obj->getBrigBaseData(opList->elements);
+
+            // Note: for calls, the dest list of operands can have size 0.
+ elementCount = oprnd_data->byteCount / 4;
+
+ DPRINTF(GPUReg, "Operand Code List: # elements: %d\n",
+ elementCount);
+
+ for (int i = 0; i < elementCount; ++i) {
+ unsigned *data_offset =
+ (unsigned*)obj->getData(opList->elements + 4 * (i + 1));
+
+ const BrigDirectiveVariable *p =
+ (const BrigDirectiveVariable*)obj->
+ getCodeSectionEntry(*data_offset);
+
+ StorageElement *se = obj->currentCode->storageMap->
+ findSymbol(BRIG_SEGMENT_ARG, p);
+
+ assert(se);
+ callArgs.push_back(se);
+ }
+ }
+ break;
+ default:
+ fatal("ListOperand: bad operand kind %d\n", brigOp->kind);
+ }
+}
+
+std::string
+ListOperand::disassemble()
+{
+ std::string res_str("");
+
+ for (auto it : callArgs) {
+ res_str += csprintf("%s ", it->name.c_str());
+ }
+
+ return res_str;
+}
+
+void
+FunctionRefOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+ const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+ if (baseOp->kind != BRIG_KIND_OPERAND_CODE_REF) {
+ fatal("FunctionRefOperand: bad operand kind %d\n", baseOp->kind);
+ }
+
+ const BrigOperandCodeRef *brigOp = (const BrigOperandCodeRef*)baseOp;
+
+ const BrigDirectiveExecutable *p =
+ (const BrigDirectiveExecutable*)obj->getCodeSectionEntry(brigOp->ref);
+
+ func_name = obj->getString(p->name);
+}
+
+std::string
+FunctionRefOperand::disassemble()
+{
+ DPRINTF(GPUReg, "Operand Func-ref name: %s\n", func_name);
+
+ return csprintf("%s", func_name);
+}
+
+bool
+BaseRegOperand::init_from_vect(unsigned opOffset, const BrigObject *obj,
+ int at, unsigned &maxRegIdx, char _regFileChar)
+{
+ regFileChar = _regFileChar;
+ const BrigOperand *brigOp = obj->getOperand(opOffset);
+
+ if (brigOp->kind != BRIG_KIND_OPERAND_OPERAND_LIST)
+ return false;
+
+ const Brig::BrigOperandOperandList *brigRegVecOp =
+ (const Brig::BrigOperandOperandList*)brigOp;
+
+ unsigned *data_offset =
+ (unsigned*)obj->getData(brigRegVecOp->elements + 4 * (at + 1));
+
+ const BrigOperand *p =
+ (const BrigOperand*)obj->getOperand(*data_offset);
+ if (p->kind != BRIG_KIND_OPERAND_REGISTER) {
+ return false;
+ }
+
+    const BrigOperandRegister *brigRegOp = (const BrigOperandRegister*)p;
+
+ regIdx = brigRegOp->regNum;
+
+    DPRINTF(GPUReg, "Operand: regNum: %d, kind: %d\n", regIdx,
+ brigRegOp->regKind);
+
+ maxRegIdx = std::max(maxRegIdx, regIdx);
+
+ return true;
+}
+
+void
+BaseRegOperand::initWithStrOffset(unsigned strOffset, const BrigObject *obj,
+ unsigned &maxRegIdx, char _regFileChar)
+{
+    const char *name = obj->getString(strOffset);
+
+    if (name[0] != '$' || name[1] != _regFileChar) {
+        fatal("register operand parse error on \"%s\"\n", name);
+    }
+
+    char *endptr;
+    regIdx = strtoul(name + 2, &endptr, 10);
+
+ maxRegIdx = std::max(maxRegIdx, regIdx);
+}
+
+unsigned SRegOperand::maxRegIdx;
+unsigned DRegOperand::maxRegIdx;
+unsigned CRegOperand::maxRegIdx;
+
+std::string
+SRegOperand::disassemble()
+{
+ return csprintf("$s%d", regIdx);
+}
+
+std::string
+DRegOperand::disassemble()
+{
+ return csprintf("$d%d", regIdx);
+}
+
+std::string
+CRegOperand::disassemble()
+{
+ return csprintf("$c%d", regIdx);
+}
+
+BrigRegOperandInfo
+findRegDataType(unsigned opOffset, const BrigObject *obj)
+{
+ const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+ switch (baseOp->kind) {
+ case BRIG_KIND_OPERAND_REGISTER:
+ {
+ const BrigOperandRegister *op = (BrigOperandRegister*)baseOp;
+
+ return BrigRegOperandInfo((BrigKind16_t)baseOp->kind,
+ (BrigRegisterKind)op->regKind);
+ }
+ break;
+
+ case BRIG_KIND_OPERAND_OPERAND_LIST:
+ {
+ const BrigOperandOperandList *op =
+ (BrigOperandOperandList*)baseOp;
+ const BrigData *data_p = (BrigData*)obj->getData(op->elements);
+
+ int num_operands = 0;
+ BrigRegisterKind reg_kind = (BrigRegisterKind)0;
+ for (int offset = 0; offset < data_p->byteCount; offset += 4) {
+ const BrigOperand *op_p = (const BrigOperand *)
+ obj->getOperand(((int *)data_p->bytes)[offset/4]);
+
+ if (op_p->kind == BRIG_KIND_OPERAND_REGISTER) {
+ const BrigOperandRegister *brigRegOp =
+ (const BrigOperandRegister*)op_p;
+ reg_kind = (BrigRegisterKind)brigRegOp->regKind;
+ } else if (op_p->kind == BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+ uint16_t num_bytes =
+ ((Brig::BrigOperandConstantBytes*)op_p)->base.byteCount
+ - sizeof(BrigBase);
+ if (num_bytes == sizeof(uint32_t)) {
+ reg_kind = BRIG_REGISTER_KIND_SINGLE;
+ } else if (num_bytes == sizeof(uint64_t)) {
+ reg_kind = BRIG_REGISTER_KIND_DOUBLE;
+ } else {
+ fatal("OperandList: bad operand size %d\n", num_bytes);
+ }
+ } else {
+ fatal("OperandList: bad operand kind %d\n", op_p->kind);
+ }
+
+ num_operands++;
+ }
+ assert(baseOp->kind == BRIG_KIND_OPERAND_OPERAND_LIST);
+
+ return BrigRegOperandInfo((BrigKind16_t)baseOp->kind, reg_kind);
+ }
+ break;
+
+ case BRIG_KIND_OPERAND_ADDRESS:
+ {
+ const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
+
+ if (!op->reg) {
+ BrigType type = BRIG_TYPE_NONE;
+
+ if (op->symbol) {
+ const BrigDirective *dir = (BrigDirective*)
+ obj->getCodeSectionEntry(op->symbol);
+
+ assert(dir->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
+
+ const BrigDirectiveVariable *sym =
+ (const BrigDirectiveVariable*)dir;
+
+ type = (BrigType)sym->type;
+ }
+ return BrigRegOperandInfo(BRIG_KIND_OPERAND_ADDRESS,
+ (BrigType)type);
+ } else {
+ const BrigOperandAddress *b = (const BrigOperandAddress*)baseOp;
+ const BrigOperand *reg = obj->getOperand(b->reg);
+ const BrigOperandRegister *rop = (BrigOperandRegister*)reg;
+
+ return BrigRegOperandInfo(BRIG_KIND_OPERAND_REGISTER,
+ (BrigRegisterKind)rop->regKind);
+ }
+ }
+ break;
+
+ default:
+ fatal("AddrOperand: bad operand kind %d\n", baseOp->kind);
+ break;
+ }
+}
+
+void
+AddrOperandBase::parseAddr(const BrigOperandAddress *op, const BrigObject *obj)
+{
+ assert(op->base.kind == BRIG_KIND_OPERAND_ADDRESS);
+
+ const BrigDirective *d =
+ (BrigDirective*)obj->getCodeSectionEntry(op->symbol);
+
+ assert(d->kind == BRIG_KIND_DIRECTIVE_VARIABLE);
+ const BrigDirectiveVariable *sym = (BrigDirectiveVariable*)d;
+ name = obj->getString(sym->name);
+
+ if (sym->segment != BRIG_SEGMENT_ARG) {
+ storageElement =
+ obj->currentCode->storageMap->findSymbol(sym->segment, name);
+ assert(storageElement);
+ offset = 0;
+ } else {
+        // sym->name does not work for BRIG_SEGMENT_ARG in the following
+        // case:
+        //
+        //     void foo(int a);
+        //     void bar(double a);
+        //
+        //     foo(...) --> arg_u32 %param_p0;
+        //                  st_arg_u32 $s0, [%param_p0];
+        //                  call &foo (%param_p0);
+        //     bar(...) --> arg_f64 %param_p0;
+        //                  st_arg_u64 $d0, [%param_p0];
+        //                  call &bar (%param_p0);
+        //
+        // Both functions use the same variable name (param_p0), which may
+        // well be a compiler bug.
+        //
+        // Solution:
+        // Use the directive pointer (BrigDirectiveVariable) to
+        // differentiate the two versions of param_p0.
+        //
+        // Note that this workaround is awkward: we pull the directive
+        // pointer out of the brig binary and put it into the symbol table,
+        // but then index the symbol table by that same directive pointer,
+        // which makes the symbol table largely redundant. It is kept this
+        // way to avoid disturbing the rest of the infrastructure.
+        //
+        // When the compiler is updated, we should check whether this
+        // problem goes away. If so, some of this functionality can be
+        // folded into the code for kernel arguments. If not, the symbol
+        // name could perhaps be indexed on a hash of the variable AND
+        // function name.
+ storageElement = obj->currentCode->
+ storageMap->findSymbol((Brig::BrigSegment)sym->segment, sym);
+
+ assert(storageElement);
+ }
+}
+
+uint64_t
+AddrOperandBase::calcUniformBase()
+{
+ // start with offset, will be 0 if not specified
+ uint64_t address = offset;
+
+ // add in symbol value if specified
+ if (storageElement) {
+ address += storageElement->offset;
+ }
+
+ return address;
+}
+
+std::string
+AddrOperandBase::disassemble(std::string reg_disassembly)
+{
+ std::string disasm;
+
+ if (offset || reg_disassembly != "") {
+ disasm += "[";
+
+ if (reg_disassembly != "") {
+ disasm += reg_disassembly;
+
+ if (offset > 0) {
+ disasm += "+";
+ }
+ }
+
+ if (offset) {
+ disasm += csprintf("%d", offset);
+ }
+
+ disasm += "]";
+ } else if (name) {
+ disasm += csprintf("[%s]", name);
+ }
+
+ return disasm;
+}
+
+void
+NoRegAddrOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+ const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+ if (baseOp->kind == BRIG_KIND_OPERAND_ADDRESS) {
+ BrigOperandAddress *addrOp = (BrigOperandAddress*)baseOp;
+ parseAddr(addrOp, obj);
+ offset = (uint64_t(addrOp->offset.hi) << 32) |
+ uint64_t(addrOp->offset.lo);
+ } else {
+ fatal("NoRegAddrOperand: bad operand kind %d\n", baseOp->kind);
+ }
+}
+
+std::string
+NoRegAddrOperand::disassemble()
+{
+ return AddrOperandBase::disassemble(std::string(""));
+}
+
+void
+LabelOperand::init(unsigned opOffset, const BrigObject *obj)
+{
+ const BrigOperandCodeRef *op =
+ (const BrigOperandCodeRef*)obj->getOperand(opOffset);
+
+ assert(op->base.kind == BRIG_KIND_OPERAND_CODE_REF);
+
+ const BrigDirective *dir =
+ (const BrigDirective*)obj->getCodeSectionEntry(op->ref);
+
+ assert(dir->kind == BRIG_KIND_DIRECTIVE_LABEL);
+ label = obj->currentCode->refLabel((BrigDirectiveLabel*)dir, obj);
+}
+
+uint32_t
+LabelOperand::getTarget(Wavefront *w, int lane)
+{
+ return label->get();
+}
+
+std::string
+LabelOperand::disassemble()
+{
+ return label->name;
+}
diff --git a/src/arch/hsail/operand.hh b/src/arch/hsail/operand.hh
new file mode 100644
index 000000000..e3d275b10
--- /dev/null
+++ b/src/arch/hsail/operand.hh
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __ARCH_HSAIL_OPERAND_HH__
+#define __ARCH_HSAIL_OPERAND_HH__
+
+/**
+ * @file operand.hh
+ *
+ * Defines classes encapsulating HSAIL instruction operands.
+ */
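+
+// A rough usage sketch (illustrative; not taken verbatim from any single
+// instruction class). An instruction typically declares operand members,
+// calls init() on them at decode time, and reads or writes them per lane
+// during execution. The names below are placeholders:
+//
+//     SRegOperand dest;               // 32-bit vector register destination
+//     SRegOrImmOperand src0;          // register-or-immediate source
+//
+//     // at decode time:
+//     dest.init(destOpOffset, obj);
+//     src0.init(srcOpOffset, obj);
+//
+//     // in execute(), for each active lane:
+//     uint32_t tmp = src0.get<uint32_t>(w, lane);
+//     dest.set<uint32_t>(w, lane, tmp);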
+
+#include <string>
+#include <vector>
+
+#include "arch/hsail/Brig.h"
+#include "base/trace.hh"
+#include "base/types.hh"
+#include "debug/GPUReg.hh"
+#include "enums/RegisterType.hh"
+#include "gpu-compute/brig_object.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/hsail_code.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+class Label;
+class StorageElement;
+
+class BaseOperand
+{
+ public:
+ Enums::RegisterType registerType;
+ uint32_t regOperandSize;
+ BaseOperand() { registerType = Enums::RT_NONE; regOperandSize = 0; }
+ bool isVectorRegister() { return registerType == Enums::RT_VECTOR; }
+ bool isScalarRegister() { return registerType == Enums::RT_SCALAR; }
+ bool isCondRegister() { return registerType == Enums::RT_CONDITION; }
+ unsigned int regIndex() { return 0; }
+ uint32_t opSize() { return regOperandSize; }
+ virtual ~BaseOperand() { }
+};
+
+class BrigRegOperandInfo
+{
+ public:
+ Brig::BrigKind16_t kind;
+ Brig::BrigType type;
+ Brig::BrigRegisterKind regKind;
+
+ BrigRegOperandInfo(Brig::BrigKind16_t _kind,
+ Brig::BrigRegisterKind _regKind)
+ : kind(_kind), regKind(_regKind)
+ {
+ }
+
+ BrigRegOperandInfo(Brig::BrigKind16_t _kind, Brig::BrigType _type)
+ : kind(_kind), type(_type)
+ {
+ }
+
+ BrigRegOperandInfo() : kind(Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES),
+ type(Brig::BRIG_TYPE_NONE)
+ {
+ }
+};
+
+BrigRegOperandInfo findRegDataType(unsigned opOffset, const BrigObject *obj);
+
+class BaseRegOperand : public BaseOperand
+{
+ public:
+ unsigned regIdx;
+ char regFileChar;
+
+ bool init(unsigned opOffset, const BrigObject *obj,
+ unsigned &maxRegIdx, char _regFileChar);
+
+ bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at,
+ unsigned &maxRegIdx, char _regFileChar);
+
+ void initWithStrOffset(unsigned strOffset, const BrigObject *obj,
+ unsigned &maxRegIdx, char _regFileChar);
+ unsigned int regIndex() { return regIdx; }
+};
+
+class SRegOperand : public BaseRegOperand
+{
+ public:
+ static unsigned maxRegIdx;
+
+ bool
+ init(unsigned opOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init(opOffset, obj, maxRegIdx, 's');
+ }
+
+ bool
+ init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+ {
+ regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+ 's');
+ }
+
+ void
+ initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
+ 's');
+ }
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ assert(sizeof(OperandType) <= sizeof(uint32_t));
+ assert(regIdx < w->maxSpVgprs);
+ // if OperandType is smaller than 32-bit, we truncate the value
+ OperandType ret;
+ uint32_t vgprIdx;
+
+ switch (sizeof(OperandType)) {
+ case 1: // 1 byte operand
+ vgprIdx = w->remap(regIdx, 1, 1);
+ ret = (w->computeUnit->vrf[w->simdId]->
+ read<uint32_t>(vgprIdx, lane)) & 0xff;
+ break;
+ case 2: // 2 byte operand
+ vgprIdx = w->remap(regIdx, 2, 1);
+ ret = (w->computeUnit->vrf[w->simdId]->
+ read<uint32_t>(vgprIdx, lane)) & 0xffff;
+ break;
+ case 4: // 4 byte operand
+ vgprIdx = w->remap(regIdx,sizeof(OperandType), 1);
+ ret = w->computeUnit->vrf[w->simdId]->
+ read<OperandType>(vgprIdx, lane);
+ break;
+ default:
+ panic("Bad OperandType\n");
+ break;
+ }
+
+ return (OperandType)ret;
+ }
+
+ // special get method for compatibility with LabelOperand
+ uint32_t
+ getTarget(Wavefront *w, int lane)
+ {
+ return get<uint32_t>(w, lane);
+ }
+
+ template<typename OperandType>
+ void set(Wavefront *w, int lane, OperandType &val);
+ std::string disassemble();
+};
+
+template<typename OperandType>
+void
+SRegOperand::set(Wavefront *w, int lane, OperandType &val)
+{
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
+
+ assert(sizeof(OperandType) == sizeof(uint32_t));
+ assert(regIdx < w->maxSpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
+ w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
+}
+
+template<>
+inline void
+SRegOperand::set(Wavefront *w, int lane, uint64_t &val)
+{
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $s%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx, val);
+
+ assert(regIdx < w->maxSpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(uint32_t), 1);
+ w->computeUnit->vrf[w->simdId]->write<uint32_t>(vgprIdx, val, lane);
+}
+
+class DRegOperand : public BaseRegOperand
+{
+ public:
+ static unsigned maxRegIdx;
+
+ bool
+ init(unsigned opOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'd');
+ }
+
+ bool
+ init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+ {
+ regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+ 'd');
+ }
+
+ void
+ initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+
+ return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
+ 'd');
+ }
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ assert(sizeof(OperandType) <= sizeof(uint64_t));
+ // TODO: this check is valid only for HSAIL
+ assert(regIdx < w->maxDpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
+
+ return w->computeUnit->vrf[w->simdId]->read<OperandType>(vgprIdx,lane);
+ }
+
+ template<typename OperandType>
+ void
+ set(Wavefront *w, int lane, OperandType &val)
+ {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $d%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
+ val);
+
+ assert(sizeof(OperandType) <= sizeof(uint64_t));
+ // TODO: this check is valid only for HSAIL
+ assert(regIdx < w->maxDpVgprs);
+ uint32_t vgprIdx = w->remap(regIdx, sizeof(OperandType), 1);
+ w->computeUnit->vrf[w->simdId]->write<OperandType>(vgprIdx,val,lane);
+ }
+
+ std::string disassemble();
+};
+
+class CRegOperand : public BaseRegOperand
+{
+ public:
+ static unsigned maxRegIdx;
+
+ bool
+ init(unsigned opOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint8_t);
+ registerType = Enums::RT_CONDITION;
+
+ return BaseRegOperand::init(opOffset, obj, maxRegIdx, 'c');
+ }
+
+ bool
+ init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+ {
+ regOperandSize = sizeof(uint8_t);
+ registerType = Enums::RT_CONDITION;
+
+ return BaseRegOperand::init_from_vect(opOffset, obj, at, maxRegIdx,
+ 'c');
+ }
+
+ void
+ initWithStrOffset(unsigned strOffset, const BrigObject *obj)
+ {
+ regOperandSize = sizeof(uint8_t);
+ registerType = Enums::RT_CONDITION;
+
+ return BaseRegOperand::initWithStrOffset(strOffset, obj, maxRegIdx,
+ 'c');
+ }
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ assert(regIdx < w->condRegState->numRegs());
+
+ return w->condRegState->read<OperandType>((int)regIdx, lane);
+ }
+
+ template<typename OperandType>
+ void
+ set(Wavefront *w, int lane, OperandType &val)
+ {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: $c%d <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane, regIdx,
+ val);
+
+ assert(regIdx < w->condRegState->numRegs());
+ w->condRegState->write<OperandType>(regIdx,lane,val);
+ }
+
+ std::string disassemble();
+};
+
+template<typename T>
+class ImmOperand : public BaseOperand
+{
+ public:
+ T bits;
+
+ bool init(unsigned opOffset, const BrigObject *obj);
+ bool init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
+ std::string disassemble();
+
+ template<typename OperandType>
+ OperandType
+ get()
+ {
+ assert(sizeof(OperandType) <= sizeof(T));
+
+ return *(OperandType*)&bits;
+ }
+
+ // This version of get() takes a WF* and a lane id for
+ // compatibility with the register-based get() methods.
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ return get<OperandType>();
+ }
+};
+
+template<typename T>
+bool
+ImmOperand<T>::init(unsigned opOffset, const BrigObject *obj)
+{
+ const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
+
+ switch (brigOp->kind) {
+ // this is immediate operand
+ case Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES:
+ {
+ DPRINTF(GPUReg, "sizeof(T): %lu, byteCount: %d\n", sizeof(T),
+ brigOp->byteCount);
+
+ auto cbptr = (Brig::BrigOperandConstantBytes*)brigOp;
+
+ bits = *((T*)(obj->getData(cbptr->bytes + 4)));
+
+ return true;
+ }
+ break;
+
+ case Brig::BRIG_KIND_OPERAND_WAVESIZE:
+ bits = VSZ;
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+template <typename T>
+bool
+ImmOperand<T>::init_from_vect(unsigned opOffset, const BrigObject *obj, int at)
+{
+ const Brig::BrigOperand *brigOp = obj->getOperand(opOffset);
+
+ if (brigOp->kind != Brig::BRIG_KIND_OPERAND_OPERAND_LIST) {
+ return false;
+ }
+
+ const Brig::BrigOperandOperandList *brigVecOp =
+ (const Brig::BrigOperandOperandList *)brigOp;
+
+ unsigned *data_offset =
+ (unsigned *)obj->getData(brigVecOp->elements + 4 * (at + 1));
+
+ const Brig::BrigOperand *p =
+ (const Brig::BrigOperand *)obj->getOperand(*data_offset);
+
+ if (p->kind != Brig::BRIG_KIND_OPERAND_CONSTANT_BYTES) {
+ return false;
+ }
+
+ return init(*data_offset, obj);
+}
+
+template<typename T>
+std::string
+ImmOperand<T>::disassemble()
+{
+ return csprintf("0x%08x", bits);
+}
+
+template<typename RegOperand, typename T>
+class RegOrImmOperand : public BaseOperand
+{
+ private:
+ bool is_imm;
+
+ public:
+ void setImm(const bool value) { is_imm = value; }
+
+ ImmOperand<T> imm_op;
+ RegOperand reg_op;
+
+ RegOrImmOperand() { is_imm = false; }
+ void init(unsigned opOffset, const BrigObject *obj);
+ void init_from_vect(unsigned opOffset, const BrigObject *obj, int at);
+ std::string disassemble();
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane)
+ {
+ return is_imm ? imm_op.template get<OperandType>() :
+ reg_op.template get<OperandType>(w, lane);
+ }
+
+ uint32_t
+ opSize()
+ {
+ if (!is_imm) {
+ return reg_op.opSize();
+ }
+
+ return 0;
+ }
+
+ bool
+ isVectorRegister()
+ {
+ if (!is_imm) {
+ return reg_op.registerType == Enums::RT_VECTOR;
+ }
+ return false;
+ }
+
+ bool
+ isCondRegister()
+ {
+ if (!is_imm) {
+ return reg_op.registerType == Enums::RT_CONDITION;
+ }
+
+ return false;
+ }
+
+ bool
+ isScalarRegister()
+ {
+ if (!is_imm) {
+ return reg_op.registerType == Enums::RT_SCALAR;
+ }
+
+ return false;
+ }
+
+ unsigned int
+ regIndex()
+ {
+ if (!is_imm) {
+ return reg_op.regIndex();
+ }
+ return 0;
+ }
+};
+
+template<typename RegOperand, typename T>
+void
+RegOrImmOperand<RegOperand, T>::init(unsigned opOffset, const BrigObject *obj)
+{
+ is_imm = false;
+
+ if (reg_op.init(opOffset, obj)) {
+ return;
+ }
+
+ if (imm_op.init(opOffset, obj)) {
+ is_imm = true;
+ return;
+ }
+
+ fatal("RegOrImmOperand::init(): bad operand kind %d\n",
+ obj->getOperand(opOffset)->kind);
+}
+
+template<typename RegOperand, typename T>
+void
+RegOrImmOperand<RegOperand, T>::init_from_vect(unsigned opOffset,
+ const BrigObject *obj, int at)
+{
+ if (reg_op.init_from_vect(opOffset, obj, at)) {
+ is_imm = false;
+
+ return;
+ }
+
+ if (imm_op.init_from_vect(opOffset, obj, at)) {
+ is_imm = true;
+
+ return;
+ }
+
+ fatal("RegOrImmOperand::init(): bad operand kind %d\n",
+ obj->getOperand(opOffset)->kind);
+}
+
+template<typename RegOperand, typename T>
+std::string
+RegOrImmOperand<RegOperand, T>::disassemble()
+{
+ return is_imm ? imm_op.disassemble() : reg_op.disassemble();
+}
+
+typedef RegOrImmOperand<SRegOperand, uint32_t> SRegOrImmOperand;
+typedef RegOrImmOperand<DRegOperand, uint64_t> DRegOrImmOperand;
+typedef RegOrImmOperand<CRegOperand, bool> CRegOrImmOperand;
+
+class AddrOperandBase : public BaseOperand
+{
+ protected:
+ // helper function for init()
+ void parseAddr(const Brig::BrigOperandAddress *op, const BrigObject *obj);
+
+ // helper function for disassemble()
+ std::string disassemble(std::string reg_disassembly);
+ uint64_t calcUniformBase();
+
+ public:
+ virtual void calcVector(Wavefront *w, uint64_t *addrVec) = 0;
+ virtual uint64_t calcLane(Wavefront *w, int lane=0) = 0;
+
+ uint64_t offset;
+ const char *name = nullptr;
+ StorageElement *storageElement;
+};
+
+template<typename RegOperandType>
+class RegAddrOperand : public AddrOperandBase
+{
+ public:
+ RegOperandType reg;
+ void init(unsigned opOffset, const BrigObject *obj);
+ uint64_t calcUniform();
+ void calcVector(Wavefront *w, uint64_t *addrVec);
+ uint64_t calcLane(Wavefront *w, int lane=0);
+ uint32_t opSize() { return reg.opSize(); }
+ bool isVectorRegister() { return reg.registerType == Enums::RT_VECTOR; }
+ bool isCondRegister() { return reg.registerType == Enums::RT_CONDITION; }
+ bool isScalarRegister() { return reg.registerType == Enums::RT_SCALAR; }
+ unsigned int regIndex() { return reg.regIndex(); }
+ std::string disassemble();
+};
+
+template<typename RegOperandType>
+void
+RegAddrOperand<RegOperandType>::init(unsigned opOffset, const BrigObject *obj)
+{
+ using namespace Brig;
+
+ const BrigOperand *baseOp = obj->getOperand(opOffset);
+
+ switch (baseOp->kind) {
+ case BRIG_KIND_OPERAND_ADDRESS:
+ {
+ const BrigOperandAddress *op = (BrigOperandAddress*)baseOp;
+ storageElement = nullptr;
+
+ offset = (uint64_t(op->offset.hi) << 32) | uint64_t(op->offset.lo);
+ reg.init(op->reg, obj);
+
+ if (reg.regFileChar == 's') {
+ reg.regOperandSize = sizeof(uint32_t);
+ registerType = Enums::RT_VECTOR;
+ }
+ else if (reg.regFileChar == 'd') {
+ reg.regOperandSize = sizeof(uint64_t);
+ registerType = Enums::RT_VECTOR;
+ }
+ }
+ break;
+
+ default:
+ fatal("RegAddrOperand: bad operand kind %d\n", baseOp->kind);
+ break;
+ }
+}
+
+template<typename RegOperandType>
+uint64_t
+RegAddrOperand<RegOperandType>::calcUniform()
+{
+ fatal("can't do calcUniform() on register-based address\n");
+
+ return 0;
+}
+
+template<typename RegOperandType>
+void
+RegAddrOperand<RegOperandType>::calcVector(Wavefront *w, uint64_t *addrVec)
+{
+ Addr address = calcUniformBase();
+
+ for (int lane = 0; lane < VSZ; ++lane) {
+ if (w->execMask(lane)) {
+ if (reg.regFileChar == 's') {
+ addrVec[lane] = address + reg.template get<uint32_t>(w, lane);
+ } else {
+ addrVec[lane] = address + reg.template get<Addr>(w, lane);
+ }
+ }
+ }
+}
+
+template<typename RegOperandType>
+uint64_t
+RegAddrOperand<RegOperandType>::calcLane(Wavefront *w, int lane)
+{
+ Addr address = calcUniformBase();
+
+ return address + reg.template get<Addr>(w, lane);
+}
+
+template<typename RegOperandType>
+std::string
+RegAddrOperand<RegOperandType>::disassemble()
+{
+ return AddrOperandBase::disassemble(reg.disassemble());
+}
+
+typedef RegAddrOperand<SRegOperand> SRegAddrOperand;
+typedef RegAddrOperand<DRegOperand> DRegAddrOperand;
+
+class NoRegAddrOperand : public AddrOperandBase
+{
+ public:
+ void init(unsigned opOffset, const BrigObject *obj);
+ uint64_t calcUniform();
+ void calcVector(Wavefront *w, uint64_t *addrVec);
+ uint64_t calcLane(Wavefront *w, int lane=0);
+ std::string disassemble();
+};
+
+inline uint64_t
+NoRegAddrOperand::calcUniform()
+{
+ return AddrOperandBase::calcUniformBase();
+}
+
+inline uint64_t
+NoRegAddrOperand::calcLane(Wavefront *w, int lane)
+{
+ return calcUniform();
+}
+
+inline void
+NoRegAddrOperand::calcVector(Wavefront *w, uint64_t *addrVec)
+{
+ uint64_t address = calcUniformBase();
+
+ for (int lane = 0; lane < VSZ; ++lane)
+ addrVec[lane] = address;
+}
+
+class LabelOperand : public BaseOperand
+{
+ public:
+ Label *label;
+
+ void init(unsigned opOffset, const BrigObject *obj);
+ std::string disassemble();
+
+ // special get method for compatibility with SRegOperand
+ uint32_t getTarget(Wavefront *w, int lane);
+
+};
+
+class ListOperand : public BaseOperand
+{
+ public:
+ int elementCount;
+ std::vector<StorageElement*> callArgs;
+
+ int
+ getSrcOperand(int idx)
+ {
+ DPRINTF(GPUReg, "getSrcOperand, idx: %d, sz_args: %d\n", idx,
+ callArgs.size());
+
+ return callArgs.at(idx)->offset;
+ }
+
+ void init(unsigned opOffset, const BrigObject *obj);
+
+ std::string disassemble();
+
+ template<typename OperandType>
+ OperandType
+ get(Wavefront *w, int lane, int arg_idx)
+ {
+ return w->readCallArgMem<OperandType>(lane, getSrcOperand(arg_idx));
+ }
+
+ template<typename OperandType>
+ void
+ set(Wavefront *w, int lane, OperandType val)
+ {
+ w->writeCallArgMem<OperandType>(lane, getSrcOperand(0), val);
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: arg[%d] <- %d\n",
+ w->computeUnit->cu_id, w->simdId, w->wfSlotId, lane,
+ getSrcOperand(0), val);
+ }
+};
+
+class FunctionRefOperand : public BaseOperand
+{
+ public:
+ const char *func_name;
+
+ void init(unsigned opOffset, const BrigObject *obj);
+ std::string disassemble();
+};
+
+#endif // __ARCH_HSAIL_OPERAND_HH__
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
new file mode 100644
index 000000000..bd95f6335
--- /dev/null
+++ b/src/gpu-compute/GPU.py
@@ -0,0 +1,310 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Steve Reinhardt
+#
+
+from ClockedObject import ClockedObject
+from Device import DmaDevice
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+from m5.SimObject import SimObject
+from MemObject import MemObject
+from Process import EmulatedDriver
+from Bridge import Bridge
+from LdsState import LdsState
+
+class PrefetchType(Enum): vals = [
+ 'PF_CU',
+ 'PF_PHASE',
+ 'PF_WF',
+ 'PF_STRIDE',
+ 'PF_END',
+ ]
+
+class VectorRegisterFile(SimObject):
+ type = 'VectorRegisterFile'
+ cxx_class = 'VectorRegisterFile'
+ cxx_header = 'gpu-compute/vector_register_file.hh'
+
+ simd_id = Param.Int(0, 'SIMD ID associated with this VRF')
+ num_regs_per_simd = Param.Int(2048, 'number of vector registers per SIMD')
+ min_alloc = Param.Int(4, 'min number of VGPRs allocated per WF')
+
+class Wavefront(SimObject):
+ type = 'Wavefront'
+ cxx_class = 'Wavefront'
+ cxx_header = 'gpu-compute/wavefront.hh'
+
+ simdId = Param.Int('SIMD id (0-ComputeUnit.num_SIMDs)')
+ wf_slot_id = Param.Int('wavefront id (0-ComputeUnit.max_wfs)')
+
+class ComputeUnit(MemObject):
+ type = 'ComputeUnit'
+ cxx_class = 'ComputeUnit'
+ cxx_header = 'gpu-compute/compute_unit.hh'
+
+ wavefronts = VectorParam.Wavefront('Number of wavefronts')
+ wfSize = Param.Int(64, 'Wavefront size (in work items)')
+ num_SIMDs = Param.Int(4, 'number of SIMD units per CU')
+
+ spbypass_pipe_length = Param.Int(4, 'vector ALU Single Precision bypass '\
+ 'latency')
+
+ dpbypass_pipe_length = Param.Int(8, 'vector ALU Double Precision bypass '\
+ 'latency')
+
+ issue_period = Param.Int(4, 'number of cycles per issue period')
+ num_global_mem_pipes = Param.Int(1,'number of global memory pipes per CU')
+ num_shared_mem_pipes = Param.Int(1,'number of shared memory pipes per CU')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ mem_req_latency = Param.Int(9, "Latency for request from the cu to ruby. "\
+ "Represents the pipeline to reach the TCP and "\
+ "specified in GPU clock cycles")
+ mem_resp_latency = Param.Int(9, "Latency for responses from ruby to the "\
+ "cu. Represents the pipeline between the TCP "\
+ "and cu as well as TCP data array access. "\
+ "Specified in GPU clock cycles")
+ system = Param.System(Parent.any, "system object")
+ cu_id = Param.Int('CU id')
+ vrf_to_coalescer_bus_width = Param.Int(32, "VRF->Coalescer data bus width "\
+ "in bytes")
+ coalescer_to_vrf_bus_width = Param.Int(32, "Coalescer->VRF data bus width "\
+ "in bytes")
+
+ memory_port = VectorMasterPort("Port to the memory system")
+ translation_port = VectorMasterPort('Port to the TLB hierarchy')
+    sqc_port = MasterPort("Port to the SQC (I-cache)")
+ sqc_tlb_port = MasterPort("Port to the TLB for the SQC (I-cache)")
+ perLaneTLB = Param.Bool(False, "enable per-lane TLB")
+    prefetch_depth = Param.Int(0, "Number of prefetches triggered at a time "\
+                               "(0 turns off prefetching)")
+ prefetch_stride = Param.Int(1, "Fixed Prefetch Stride (1 means next-page)")
+ prefetch_prev_type = Param.PrefetchType('PF_PHASE', "Prefetch the stride "\
+ "from last mem req in lane of "\
+ "CU|Phase|Wavefront")
+    execPolicy = Param.String("OLDEST-FIRST", "WF execution selection policy")
+    xactCasMode = Param.Bool(False, "Behavior of xact_cas_load magic instr.")
+ debugSegFault = Param.Bool(False, "enable debugging GPU seg faults")
+ functionalTLB = Param.Bool(False, "Assume TLB causes no delay")
+
+ localMemBarrier = Param.Bool(False, "Assume Barriers do not wait on "\
+ "kernel end")
+
+ countPages = Param.Bool(False, "Generate per-CU file of all pages touched "\
+ "and how many times")
+ global_mem_queue_size = Param.Int(256, "Number of entries in the global "
+ "memory pipeline's queues")
+ local_mem_queue_size = Param.Int(256, "Number of entries in the local "
+ "memory pipeline's queues")
+ ldsBus = Bridge() # the bridge between the CU and its LDS
+ ldsPort = MasterPort("The port that goes to the LDS")
+ localDataStore = Param.LdsState("the LDS for this CU")
+
+ vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
+ "file")
+
+class Shader(ClockedObject):
+ type = 'Shader'
+ cxx_class = 'Shader'
+ cxx_header = 'gpu-compute/shader.hh'
+
+ CUs = VectorParam.ComputeUnit('Number of compute units')
+ n_wf = Param.Int(1, 'Number of wavefront slots per SIMD')
+ impl_kern_boundary_sync = Param.Bool(True, """Insert acq/rel packets into
+ ruby at kernel boundaries""")
+ separate_acquire_release = Param.Bool(False,
+ """Do ld_acquire/st_release generate separate requests for the
+ acquire and release?""")
+ globalmem = Param.MemorySize('64kB', 'Memory size')
+ timing = Param.Bool(False, 'timing memory accesses')
+
+ cpu_pointer = Param.BaseCPU(NULL, "pointer to base CPU")
+ translation = Param.Bool(False, "address translation")
+
+class ClDriver(EmulatedDriver):
+ type = 'ClDriver'
+ cxx_header = 'gpu-compute/cl_driver.hh'
+ codefile = VectorParam.String('code file name(s)')
+
+class GpuDispatcher(DmaDevice):
+ type = 'GpuDispatcher'
+ cxx_header = 'gpu-compute/dispatcher.hh'
+ # put at 8GB line for now
+ pio_addr = Param.Addr(0x200000000, "Device Address")
+ pio_latency = Param.Latency('1ns', "Programmed IO latency")
+ shader_pointer = Param.Shader('pointer to shader')
+ translation_port = MasterPort('Port to the dispatcher TLB')
+ cpu = Param.BaseCPU("CPU to wake up on kernel completion")
+
+ cl_driver = Param.ClDriver('pointer to driver')
+
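+# Illustrative wiring sketch (not part of this changeset; the object names
+# and CU count below are hypothetical). A config script would typically tie
+# these SimObjects together along the lines of:
+#
+#   driver = ClDriver(codefile=['kernels.brig'])
+#   shader = Shader(CUs=[ComputeUnit(cu_id=i) for i in range(4)])
+#   dispatcher = GpuDispatcher(shader_pointer=shader, cl_driver=driver)
+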
+class OpType(Enum): vals = [
+ 'OT_NULL',
+ 'OT_ALU',
+ 'OT_SPECIAL',
+ 'OT_GLOBAL_READ',
+ 'OT_GLOBAL_WRITE',
+ 'OT_GLOBAL_ATOMIC',
+ 'OT_GLOBAL_HIST',
+ 'OT_GLOBAL_LDAS',
+ 'OT_SHARED_READ',
+ 'OT_SHARED_WRITE',
+ 'OT_SHARED_ATOMIC',
+ 'OT_SHARED_HIST',
+ 'OT_SHARED_LDAS',
+ 'OT_PRIVATE_READ',
+ 'OT_PRIVATE_WRITE',
+ 'OT_PRIVATE_ATOMIC',
+ 'OT_PRIVATE_HIST',
+ 'OT_PRIVATE_LDAS',
+ 'OT_SPILL_READ',
+ 'OT_SPILL_WRITE',
+ 'OT_SPILL_ATOMIC',
+ 'OT_SPILL_HIST',
+ 'OT_SPILL_LDAS',
+ 'OT_READONLY_READ',
+ 'OT_READONLY_WRITE',
+ 'OT_READONLY_ATOMIC',
+ 'OT_READONLY_HIST',
+ 'OT_READONLY_LDAS',
+ 'OT_FLAT_READ',
+ 'OT_FLAT_WRITE',
+ 'OT_FLAT_ATOMIC',
+ 'OT_FLAT_HIST',
+ 'OT_FLAT_LDAS',
+ 'OT_KERN_READ',
+ 'OT_BRANCH',
+
+ # note: only OT_BOTH_MEMFENCE appears to be supported by the 1.0F version
+ # of the compiler.
+ 'OT_SHARED_MEMFENCE',
+ 'OT_GLOBAL_MEMFENCE',
+ 'OT_BOTH_MEMFENCE',
+
+ 'OT_BARRIER',
+ 'OT_PRINT',
+ 'OT_RET',
+ 'OT_NOP',
+ 'OT_ARG'
+ ]
+
+class MemType(Enum): vals = [
+ 'M_U8',
+ 'M_U16',
+ 'M_U32',
+ 'M_U64',
+ 'M_S8',
+ 'M_S16',
+ 'M_S32',
+ 'M_S64',
+ 'M_F16',
+ 'M_F32',
+ 'M_F64',
+ ]
+
+class MemOpType(Enum): vals = [
+ 'MO_LD',
+ 'MO_ST',
+ 'MO_LDAS',
+ 'MO_LDA',
+ 'MO_AAND',
+ 'MO_AOR',
+ 'MO_AXOR',
+ 'MO_ACAS',
+ 'MO_AEXCH',
+ 'MO_AADD',
+ 'MO_ASUB',
+ 'MO_AINC',
+ 'MO_ADEC',
+ 'MO_AMAX',
+ 'MO_AMIN',
+ 'MO_ANRAND',
+ 'MO_ANROR',
+ 'MO_ANRXOR',
+ 'MO_ANRCAS',
+ 'MO_ANREXCH',
+ 'MO_ANRADD',
+ 'MO_ANRSUB',
+ 'MO_ANRINC',
+ 'MO_ANRDEC',
+ 'MO_ANRMAX',
+ 'MO_ANRMIN',
+ 'MO_HAND',
+ 'MO_HOR',
+ 'MO_HXOR',
+ 'MO_HCAS',
+ 'MO_HEXCH',
+ 'MO_HADD',
+ 'MO_HSUB',
+ 'MO_HINC',
+ 'MO_HDEC',
+ 'MO_HMAX',
+ 'MO_HMIN',
+ 'MO_UNDEF'
+ ]
+
+class StorageClassType(Enum): vals = [
+ 'SC_SPILL',
+ 'SC_GLOBAL',
+ 'SC_SHARED',
+ 'SC_PRIVATE',
+ 'SC_READONLY',
+ 'SC_KERNARG',
+ 'SC_NONE',
+ ]
+
+class RegisterType(Enum): vals = [
+ 'RT_VECTOR',
+ 'RT_SCALAR',
+ 'RT_CONDITION',
+ 'RT_HARDWARE',
+ 'RT_NONE',
+ ]
+
+class GenericMemoryOrder(Enum): vals = [
+ 'MEMORY_ORDER_NONE',
+ 'MEMORY_ORDER_RELAXED',
+ 'MEMORY_ORDER_SC_ACQUIRE',
+ 'MEMORY_ORDER_SC_RELEASE',
+ 'MEMORY_ORDER_SC_ACQUIRE_RELEASE',
+ ]
+
+class GenericMemoryScope(Enum): vals = [
+ 'MEMORY_SCOPE_NONE',
+ 'MEMORY_SCOPE_WORKITEM',
+ 'MEMORY_SCOPE_WAVEFRONT',
+ 'MEMORY_SCOPE_WORKGROUP',
+ 'MEMORY_SCOPE_DEVICE',
+ 'MEMORY_SCOPE_SYSTEM',
+ ]
diff --git a/src/gpu-compute/LdsState.py b/src/gpu-compute/LdsState.py
new file mode 100644
index 000000000..6ea9f6427
--- /dev/null
+++ b/src/gpu-compute/LdsState.py
@@ -0,0 +1,51 @@
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Joe Gross
+#
+
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+
+from MemObject import MemObject
+
+class LdsState(MemObject):
+ type = 'LdsState'
+ cxx_class = 'LdsState'
+ cxx_header = 'gpu-compute/lds_state.hh'
+ size = Param.Int(65536, 'the size of the LDS in bytes')
+ range = Param.AddrRange('64kB', "address space of the LDS")
+ bankConflictPenalty = Param.Int(1, 'penalty per LDS bank conflict when '\
+ 'accessing data')
+ banks = Param.Int(32, 'Number of LDS banks')
+ cuPort = SlavePort("port that goes to the compute unit")
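+
+# Illustrative only: with the defaults above, a 64kB LDS with 32 banks and a
+# one-cycle bank-conflict penalty could be spelled out explicitly as, e.g.,
+#   lds = LdsState(size=65536, banks=32, bankConflictPenalty=1)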
diff --git a/src/gpu-compute/SConscript b/src/gpu-compute/SConscript
new file mode 100644
index 000000000..2de96df24
--- /dev/null
+++ b/src/gpu-compute/SConscript
@@ -0,0 +1,99 @@
+# -*- mode:python -*-
+
+#
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Anthony Gutierrez
+#
+
+Import('*')
+
+if not env['BUILD_GPU']:
+ Return()
+
+SimObject('GPU.py')
+SimObject('LdsState.py')
+SimObject('X86GPUTLB.py')
+
+if env['TARGET_GPU_ISA'] == 'hsail':
+ Source('brig_object.cc')
+ Source('hsail_code.cc')
+
+Source('cl_driver.cc')
+Source('compute_unit.cc')
+Source('condition_register_state.cc')
+Source('dispatcher.cc')
+Source('exec_stage.cc')
+Source('fetch_stage.cc')
+Source('fetch_unit.cc')
+Source('global_memory_pipeline.cc')
+Source('gpu_dyn_inst.cc')
+Source('gpu_exec_context.cc')
+Source('gpu_static_inst.cc')
+Source('gpu_tlb.cc')
+Source('hsa_object.cc')
+Source('kernel_cfg.cc')
+Source('lds_state.cc')
+Source('local_memory_pipeline.cc')
+Source('of_scheduling_policy.cc')
+Source('pool_manager.cc')
+Source('rr_scheduling_policy.cc')
+Source('schedule_stage.cc')
+Source('scheduler.cc')
+Source('scoreboard_check_stage.cc')
+Source('shader.cc')
+Source('simple_pool_manager.cc')
+Source('tlb_coalescer.cc')
+Source('vector_register_file.cc')
+Source('vector_register_state.cc')
+Source('wavefront.cc')
+
+DebugFlag('BRIG')
+DebugFlag('GPUCoalescer')
+DebugFlag('GPUDisp')
+DebugFlag('GPUExec')
+DebugFlag('GPUFetch')
+DebugFlag('GPUHsailCFInfo')
+DebugFlag('GPUMem')
+DebugFlag('GPUPort')
+DebugFlag('GPUPrefetch')
+DebugFlag('GPUReg')
+DebugFlag('GPUSync')
+DebugFlag('GPUTLB')
+DebugFlag('HSALoader')
+DebugFlag('HSAIL')
+DebugFlag('HSAILObject')
+DebugFlag('Predictor')
+DebugFlag('WavefrontStack')
+
+CompoundFlag('GPUALL', ['GPUCoalescer', 'GPUDisp', 'GPUExec', 'GPUFetch',
+ 'GPUMem', 'GPUPort', 'GPUSync', 'GPUTLB', 'HSAIL'])
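+
+# The debug flags above are enabled from the gem5 command line; for example,
+# something along the lines of
+#   gem5.opt --debug-flags=GPUDisp,GPUExec <config script>
+# (or --debug-flags=GPUALL for the compound flag) traces dispatch/execution.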
diff --git a/src/gpu-compute/X86GPUTLB.py b/src/gpu-compute/X86GPUTLB.py
new file mode 100644
index 000000000..51f8e514e
--- /dev/null
+++ b/src/gpu-compute/X86GPUTLB.py
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Lisa Hsu
+#
+
+from m5.defines import buildEnv
+from m5.params import *
+from m5.proxy import *
+
+from m5.objects.MemObject import MemObject
+
+if buildEnv['FULL_SYSTEM']:
+ class X86PagetableWalker(MemObject):
+ type = 'X86PagetableWalker'
+ cxx_class = 'X86ISA::Walker'
+ port = SlavePort("Port for the hardware table walker")
+ system = Param.System(Parent.any, "system object")
+
+class X86GPUTLB(MemObject):
+ type = 'X86GPUTLB'
+ cxx_class = 'X86ISA::GpuTLB'
+ cxx_header = 'gpu-compute/gpu_tlb.hh'
+ size = Param.Int(64, "TLB size (number of entries)")
+ assoc = Param.Int(64, "TLB associativity")
+
+ if buildEnv['FULL_SYSTEM']:
+ walker = Param.X86PagetableWalker(X86PagetableWalker(),
+ "page table walker")
+
+ hitLatency = Param.Int(2, "Latency of a TLB hit")
+ missLatency1 = Param.Int(5, "Latency #1 of a TLB miss")
+ missLatency2 = Param.Int(100, "Latency #2 of a TLB miss")
+ maxOutstandingReqs = Param.Int(64, "# of maximum outstanding requests")
+ slave = VectorSlavePort("Port on side closer to CPU/CU")
+ master = VectorMasterPort("Port on side closer to memory")
+ allocationPolicy = Param.Bool(True, "Allocate on an access")
+ accessDistance = Param.Bool(False, "print accessDistance stats")
+
+class TLBCoalescer(MemObject):
+ type = 'TLBCoalescer'
+ cxx_class = 'TLBCoalescer'
+ cxx_header = 'gpu-compute/tlb_coalescer.hh'
+ probesPerCycle = Param.Int(2, "Number of TLB probes per cycle")
+ coalescingWindow = Param.Int(1, "Permit coalescing across that many ticks")
+ slave = VectorSlavePort("Port on side closer to CPU/CU")
+ master = VectorMasterPort("Port on side closer to memory")
+ disableCoalescing = Param.Bool(False, "Disable coalescing")
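+
+# Illustrative topology (assumed, not enforced by this file): a CU's
+# translation_port typically connects to a TLBCoalescer slave port, the
+# coalescer's master port to an X86GPUTLB slave port, and further TLB levels
+# are chained through their master/slave vector ports.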
diff --git a/src/gpu-compute/brig_object.cc b/src/gpu-compute/brig_object.cc
new file mode 100644
index 000000000..7cc9b7cc4
--- /dev/null
+++ b/src/gpu-compute/brig_object.cc
@@ -0,0 +1,474 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt, Anthony Gutierrez
+ */
+
+#include "gpu-compute/brig_object.hh"
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+
+#include "arch/hsail/Brig.h"
+#include "base/misc.hh"
+#include "base/trace.hh"
+#include "debug/BRIG.hh"
+#include "debug/HSAILObject.hh"
+#include "debug/HSALoader.hh"
+
+using namespace Brig;
+
+std::vector<std::function<HsaObject*(const std::string&, int, uint8_t*)>>
+ HsaObject::tryFileFuncs = { BrigObject::tryFile };
+
+extern int getBrigDataTypeBytes(BrigType16_t t);
+
+const char *BrigObject::sectionNames[] =
+{
+ "hsa_data",
+ "hsa_code",
+ "hsa_operand",
+ ".shstrtab"
+};
+
+const char *segmentNames[] =
+{
+ "none",
+ "flat",
+ "global",
+ "readonly",
+ "kernarg",
+ "group",
+ "private",
+ "spill",
+ "args"
+};
+
+const uint8_t*
+BrigObject::getSectionOffset(enum SectionIndex sec, int offs) const
+{
+ // allow offs == size for dummy end pointers
+ assert(offs <= sectionInfo[sec].size);
+
+ return sectionInfo[sec].ptr + offs;
+}
+
+const char*
+BrigObject::getString(int offs) const
+{
+ return (const char*)(getSectionOffset(DataSectionIndex, offs) + 4);
+}
+
+const BrigBase*
+BrigObject::getCodeSectionEntry(int offs) const
+{
+ return (const BrigBase*)getSectionOffset(CodeSectionIndex, offs);
+}
+
+const BrigData*
+BrigObject::getBrigBaseData(int offs) const
+{
+ return (Brig::BrigData*)(getSectionOffset(DataSectionIndex, offs));
+}
+
+const uint8_t*
+BrigObject::getData(int offs) const
+{
+ return getSectionOffset(DataSectionIndex, offs);
+}
+
+const BrigOperand*
+BrigObject::getOperand(int offs) const
+{
+ return (const BrigOperand*)getSectionOffset(OperandsSectionIndex, offs);
+}
+
+unsigned
+BrigObject::getOperandPtr(int offs, int index) const
+{
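+ // the operand list at 'offs' is stored as a BrigData entry: a 4-byte
+ // byteCount followed by an array of 4-byte operand offsets, so element
+ // 'index' lives at offs + 4 * (index + 1)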
+ unsigned *op_offs = (unsigned*)(getData(offs + 4 * (index + 1)));
+
+ return *op_offs;
+}
+
+const BrigInstBase*
+BrigObject::getInst(int offs) const
+{
+ return (const BrigInstBase*)getSectionOffset(CodeSectionIndex, offs);
+}
+
+HsaCode*
+BrigObject::getKernel(const std::string &name) const
+{
+ return nullptr;
+}
+
+HsaCode*
+BrigObject::getFunction(const std::string &name) const
+{
+ for (int i = 0; i < functions.size(); ++i) {
+ if (functions[i]->name() == name) {
+ return functions[i];
+ }
+ }
+
+ return nullptr;
+}
+
+void
+BrigObject::processDirectives(const BrigBase *dirPtr, const BrigBase *endPtr,
+ StorageMap *storageMap)
+{
+ while (dirPtr < endPtr) {
+ if (!dirPtr->byteCount) {
+ fatal("Bad directive size 0\n");
+ }
+
+ // calculate next pointer now so we can override it if needed
+ const BrigBase *nextDirPtr = brigNext(dirPtr);
+
+ DPRINTF(HSAILObject, "Code section entry kind: #%x, byte count: %d\n",
+ dirPtr->kind, dirPtr->byteCount);
+
+ switch (dirPtr->kind) {
+ case BRIG_KIND_DIRECTIVE_FUNCTION:
+ {
+ const BrigDirectiveExecutable *p M5_VAR_USED =
+ reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
+
+ DPRINTF(HSAILObject,"DIRECTIVE_FUNCTION: %s offset: "
+ "%d next: %d\n", getString(p->name),
+ p->firstCodeBlockEntry, p->nextModuleEntry);
+
+ if (p->firstCodeBlockEntry != p->nextModuleEntry) {
+ panic("Function calls are not fully supported yet!!: %s\n",
+ getString(p->name));
+
+ const char *name = getString(p->name);
+
+ HsailCode *code_obj = nullptr;
+
+ for (int i = 0; i < functions.size(); ++i) {
+ if (functions[i]->name() == name) {
+ code_obj = functions[i];
+ break;
+ }
+ }
+
+ if (!code_obj) {
+ // create new local storage map for kernel-local symbols
+ code_obj = new HsailCode(name, p, this,
+ new StorageMap(storageMap));
+ functions.push_back(code_obj);
+ } else {
+ panic("Multiple definition of Function!!: %s\n",
+ getString(p->name));
+ }
+
+ }
+ nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_KERNEL:
+ {
+ const BrigDirectiveExecutable *p =
+ reinterpret_cast<const BrigDirectiveExecutable*>(dirPtr);
+
+ DPRINTF(HSAILObject,"DIRECTIVE_KERNEL: %s offset: %d "
+ "next: %d\n", getString(p->name),
+ p->firstCodeBlockEntry, p->nextModuleEntry);
+
+ const char *name = getString(p->name);
+
+ if (name[0] == '&')
+ name++;
+
+ std::string str = name;
+ char *temp;
+ int len = str.length();
+
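+ // if the mangled kernel name does not end in a lowercase letter,
+ // drop its final character; otherwise keep the name as-is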
+ if (str[len - 1] >= 'a' && str[len - 1] <= 'z') {
+ temp = new char[str.size() + 1];
+ std::copy(str.begin(), str.end() , temp);
+ temp[str.size()] = '\0';
+ } else {
+ temp = new char[str.size()];
+ std::copy(str.begin(), str.end() - 1 , temp);
+ temp[str.size() - 1 ] = '\0';
+ }
+
+ std::string kernel_name = temp;
+ delete[] temp;
+
+ HsailCode *code_obj = nullptr;
+
+ for (const auto &kernel : kernels) {
+ if (kernel->name() == kernel_name) {
+ code_obj = kernel;
+ break;
+ }
+ }
+
+ if (!code_obj) {
+ // create new local storage map for kernel-local symbols
+ code_obj = new HsailCode(kernel_name, p, this,
+ new StorageMap(storageMap));
+
+ kernels.push_back(code_obj);
+ }
+
+ nextDirPtr = getCodeSectionEntry(p->nextModuleEntry);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_VARIABLE:
+ {
+ const BrigDirectiveVariable *p =
+ reinterpret_cast<const BrigDirectiveVariable*>(dirPtr);
+
+ uint64_t readonlySize_old =
+ storageMap->getSize(BRIG_SEGMENT_READONLY);
+
+ StorageElement* se = storageMap->addSymbol(p, this);
+
+ DPRINTF(HSAILObject, "DIRECTIVE_VARIABLE, symbol %s\n",
+ getString(p->name));
+
+ if (p->segment == BRIG_SEGMENT_READONLY) {
+ // readonly memory has initialization data
+ uint8_t* readonlyData_old = readonlyData;
+
+ readonlyData =
+ new uint8_t[storageMap->getSize(BRIG_SEGMENT_READONLY)];
+
+ if (p->init) {
+ if ((p->type == BRIG_TYPE_ROIMG) ||
+ (p->type == BRIG_TYPE_WOIMG) ||
+ (p->type == BRIG_TYPE_SAMP) ||
+ (p->type == BRIG_TYPE_SIG32) ||
+ (p->type == BRIG_TYPE_SIG64)) {
+ panic("Read only data type not supported: %s\n",
+ getString(p->name));
+ }
+
+ const BrigOperand *brigOp = getOperand(p->init);
+ assert(brigOp->kind ==
+ BRIG_KIND_OPERAND_CONSTANT_BYTES);
+
+ const Brig::BrigData *operand_data M5_VAR_USED =
+ getBrigBaseData(((BrigOperandConstantBytes*)
+ brigOp)->bytes);
+
+ assert((operand_data->byteCount / 4) > 0);
+
+ uint8_t *symbol_data =
+ (uint8_t*)getData(((BrigOperandConstantBytes*)
+ brigOp)->bytes + 4);
+
+ // copy the old data and add the new data
+ if (readonlySize_old > 0) {
+ memcpy(readonlyData, readonlyData_old,
+ readonlySize_old);
+ }
+
+ memcpy(readonlyData + se->offset, symbol_data,
+ se->size);
+
+ delete[] readonlyData_old;
+ }
+ }
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LABEL:
+ {
+ const BrigDirectiveLabel M5_VAR_USED *p =
+ reinterpret_cast<const BrigDirectiveLabel*>(dirPtr);
+
+ panic("Label directives cannot be at the module level: %s\n",
+ getString(p->name));
+
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_COMMENT:
+ {
+ const BrigDirectiveComment M5_VAR_USED *p =
+ reinterpret_cast<const BrigDirectiveComment*>(dirPtr);
+
+ DPRINTF(HSAILObject, "DIRECTIVE_COMMENT: %s\n",
+ getString(p->name));
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LOC:
+ {
+ DPRINTF(HSAILObject, "BRIG_DIRECTIVE_LOC\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_MODULE:
+ {
+ const BrigDirectiveModule M5_VAR_USED *p =
+ reinterpret_cast<const BrigDirectiveModule*>(dirPtr);
+
+ DPRINTF(HSAILObject, "BRIG_DIRECTIVE_MODULE: %s\n",
+ getString(p->name));
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_CONTROL:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_CONTROL\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_PRAGMA:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_PRAGMA\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_EXTENSION:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_EXTENSION\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_START\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
+ {
+ DPRINTF(HSAILObject, "DIRECTIVE_ARG_BLOCK_END\n");
+ }
+ break;
+ default:
+ if (dirPtr->kind >= BRIG_KIND_INST_BEGIN &&
+ dirPtr->kind <= BRIG_KIND_INST_END)
+ break;
+
+ if (dirPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
+ dirPtr->kind <= BRIG_KIND_OPERAND_END)
+ break;
+
+ warn("Unknown Brig directive kind: %d\n", dirPtr->kind);
+ break;
+ }
+
+ dirPtr = nextDirPtr;
+ }
+}
+
+HsaObject*
+BrigObject::tryFile(const std::string &fname, int len, uint8_t *fileData)
+{
+ const char *brig_ident = "HSA BRIG";
+
+ if (memcmp(brig_ident, fileData, MODULE_IDENTIFICATION_LENGTH))
+ return nullptr;
+
+ return new BrigObject(fname, len, fileData);
+}
+
+BrigObject::BrigObject(const std::string &fname, int len, uint8_t *fileData)
+ : HsaObject(fname), storageMap(new StorageMap())
+{
+ const char *brig_ident = "HSA BRIG";
+ BrigModuleHeader *mod_hdr = (BrigModuleHeader*)fileData;
+
+ fatal_if(memcmp(brig_ident, mod_hdr, MODULE_IDENTIFICATION_LENGTH),
+ "%s is not a BRIG file\n", fname);
+
+ if (mod_hdr->brigMajor != BRIG_VERSION_BRIG_MAJOR ||
+ mod_hdr->brigMinor != BRIG_VERSION_BRIG_MINOR) {
+ fatal("%s: BRIG version mismatch, %d.%d != %d.%d\n",
+ fname, mod_hdr->brigMajor, mod_hdr->brigMinor,
+ BRIG_VERSION_BRIG_MAJOR, BRIG_VERSION_BRIG_MINOR);
+ }
+
+ fatal_if(mod_hdr->sectionCount != NumSectionIndices, "%s: BRIG section "
+ "count (%d) != expected value (%d)\n", fname,
+ mod_hdr->sectionCount, NumSectionIndices);
+
+ for (int i = 0; i < NumSectionIndices; ++i) {
+ sectionInfo[i].ptr = nullptr;
+ }
+
+ uint64_t *sec_idx_table = (uint64_t*)(fileData + mod_hdr->sectionIndex);
+ for (int sec_idx = 0; sec_idx < mod_hdr->sectionCount; ++sec_idx) {
+ uint8_t *sec_hdr_byte_ptr = fileData + sec_idx_table[sec_idx];
+ BrigSectionHeader *sec_hdr = (BrigSectionHeader*)sec_hdr_byte_ptr;
+
+ // It doesn't look like cprintf supports string precision values,
+ // but if this breaks, the right answer is to fix that
+ DPRINTF(HSAILObject, "found section %.*s\n", sec_hdr->nameLength,
+ sec_hdr->name);
+
+ sectionInfo[sec_idx].ptr = new uint8_t[sec_hdr->byteCount];
+ memcpy(sectionInfo[sec_idx].ptr, sec_hdr_byte_ptr, sec_hdr->byteCount);
+ sectionInfo[sec_idx].size = sec_hdr->byteCount;
+ }
+
+ BrigSectionHeader *code_hdr =
+ (BrigSectionHeader*)sectionInfo[CodeSectionIndex].ptr;
+
+ DPRINTF(HSAILObject, "Code section hdr, count: %d, hdr count: %d, "
+ "name len: %d\n", code_hdr->byteCount, code_hdr->headerByteCount,
+ code_hdr->nameLength);
+
+ // start at the first entry past the code section header (see BRIG spec)
+ processDirectives(getCodeSectionEntry(code_hdr->headerByteCount),
+ getCodeSectionEntry(sectionInfo[CodeSectionIndex].size),
+ storageMap);
+
+ delete[] fileData;
+
+ DPRINTF(HSALoader, "BRIG object %s loaded.\n", fname);
+}
+
+BrigObject::~BrigObject()
+{
+ for (int i = 0; i < NumSectionIndices; ++i)
+ if (sectionInfo[i].ptr)
+ delete[] sectionInfo[i].ptr;
+}
diff --git a/src/gpu-compute/brig_object.hh b/src/gpu-compute/brig_object.hh
new file mode 100644
index 000000000..59a585914
--- /dev/null
+++ b/src/gpu-compute/brig_object.hh
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt, Anthony Gutierrez
+ */
+
+#ifndef __BRIG_OBJECT_HH__
+#define __BRIG_OBJECT_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "arch/hsail/Brig.h"
+#include "gpu-compute/hsa_object.hh"
+#include "gpu-compute/hsail_code.hh"
+
+class LabelMap;
+class StorageMap;
+
+/**
+ * @class BrigObject
+ * This class implements the BRIG loader object, and is used when the
+ * simulator directly executes HSAIL. It is responsible for extracting
+ * all information about the kernels contained in a BRIG file and
+ * converting them to HsailCode objects that are usable by the
+ * simulator and the emulated runtime.
+ */
+
+class BrigObject final : public HsaObject
+{
+ public:
+ enum SectionIndex
+ {
+ DataSectionIndex,
+ CodeSectionIndex,
+ OperandsSectionIndex,
+ NumSectionIndices
+ };
+
+ static const char *sectionNames[];
+
+ struct SectionInfo
+ {
+ uint8_t *ptr;
+ int size;
+ };
+
+ static HsaObject* tryFile(const std::string &fname, int len,
+ uint8_t *fileData);
+
+ SectionInfo sectionInfo[NumSectionIndices];
+ const uint8_t *getSectionOffset(enum SectionIndex sec, int offs) const;
+
+ std::vector<HsailCode*> kernels;
+ std::vector<HsailCode*> functions;
+ std::string kern_block_name;
+
+ void processDirectives(const Brig::BrigBase *dirPtr,
+ const Brig::BrigBase *endPtr,
+ StorageMap *storageMap);
+
+ BrigObject(const std::string &fname, int len, uint8_t *fileData);
+ ~BrigObject();
+
+ // eventually these will need to be per-kernel not per-object-file
+ StorageMap *storageMap;
+ LabelMap *labelMap;
+
+ const char* getString(int offs) const;
+ const Brig::BrigData* getBrigBaseData(int offs) const;
+ const uint8_t* getData(int offs) const;
+ const Brig::BrigBase* getCodeSectionEntry(int offs) const;
+ const Brig::BrigOperand* getOperand(int offs) const;
+ unsigned getOperandPtr(int offs, int index) const;
+ const Brig::BrigInstBase* getInst(int offs) const;
+
+ HsaCode* getKernel(const std::string &name) const override;
+ HsaCode* getFunction(const std::string &name) const override;
+
+ int numKernels() const override { return kernels.size(); }
+
+ HsaCode* getKernel(int i) const override { return kernels[i]; }
+
+ // pointer to the current kernel/function we're processing, so elements
+ // under construction can reference it. kinda ugly, but easier
+ // than passing it all over for the few places it's needed.
+ mutable HsailCode *currentCode;
+};
+
+// Utility function to bump Brig item pointer to next element given
+// item size in bytes. Really just an add but with lots of casting.
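+// Typical use is to walk a section entry by entry, e.g.:
+//     for (const BrigBase *e = start; e < end; e = brigNext(e)) { ... }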
+template<typename T>
+T*
+brigNext(T *ptr)
+{
+ Brig::BrigBase *base_ptr = (Brig::BrigBase*)ptr;
+ int size = base_ptr->byteCount;
+ assert(size);
+
+ return (T*)((uint8_t*)ptr + size);
+}
+
+#endif // __BRIG_OBJECT_HH__
diff --git a/src/gpu-compute/cl_driver.cc b/src/gpu-compute/cl_driver.cc
new file mode 100644
index 000000000..3b3291c03
--- /dev/null
+++ b/src/gpu-compute/cl_driver.cc
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/cl_driver.hh"
+
+#include "base/intmath.hh"
+#include "cpu/thread_context.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/hsa_code.hh"
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "gpu-compute/hsa_object.hh"
+#include "params/ClDriver.hh"
+#include "sim/process.hh"
+#include "sim/syscall_emul_buf.hh"
+
+ClDriver::ClDriver(ClDriverParams *p)
+ : EmulatedDriver(p), hsaCode(0)
+{
+ for (const auto &codeFile : p->codefile)
+ codeFiles.push_back(&codeFile);
+
+ maxFuncArgsSize = 0;
+
+ for (int i = 0; i < codeFiles.size(); ++i) {
+ HsaObject *obj = HsaObject::createHsaObject(*codeFiles[i]);
+
+ for (int k = 0; k < obj->numKernels(); ++k) {
+ assert(obj->getKernel(k));
+ kernels.push_back(obj->getKernel(k));
+ kernels.back()->setReadonlyData((uint8_t*)obj->readonlyData);
+ int kern_funcargs_size = kernels.back()->funcarg_size;
+ maxFuncArgsSize = maxFuncArgsSize < kern_funcargs_size ?
+ kern_funcargs_size : maxFuncArgsSize;
+ }
+ }
+
+ int name_offs = 0;
+ int code_offs = 0;
+
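+ // lay the kernels out back to back: each name starts right after the
+ // previous one (plus its terminating '\0') in the string table, and each
+ // code block starts right after the previous kernel's instructions
+ // (sizeof(GPUStaticInst*) bytes per instruction)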
+ for (int i = 0; i < kernels.size(); ++i) {
+ kernelInfo.push_back(HsaKernelInfo());
+ HsaCode *k = kernels[i];
+
+ k->generateHsaKernelInfo(&kernelInfo[i]);
+
+ kernelInfo[i].name_offs = name_offs;
+ kernelInfo[i].code_offs = code_offs;
+
+ name_offs += k->name().size() + 1;
+ code_offs += k->numInsts() * sizeof(GPUStaticInst*);
+ }
+}
+
+void
+ClDriver::handshake(GpuDispatcher *_dispatcher)
+{
+ dispatcher = _dispatcher;
+ dispatcher->setFuncargsSize(maxFuncArgsSize);
+}
+
+int
+ClDriver::open(LiveProcess *p, ThreadContext *tc, int mode, int flags)
+{
+ int fd = p->allocFD(-1, filename, 0, 0, false);
+ FDEntry *fde = p->getFDEntry(fd);
+ fde->driver = this;
+
+ return fd;
+}
+
+int
+ClDriver::ioctl(LiveProcess *process, ThreadContext *tc, unsigned req)
+{
+ int index = 2;
+ Addr buf_addr = process->getSyscallArg(tc, index);
+
+ switch (req) {
+ case HSA_GET_SIZES:
+ {
+ TypedBufferArg<HsaDriverSizes> sizes(buf_addr);
+ sizes->num_kernels = kernels.size();
+ sizes->string_table_size = 0;
+ sizes->code_size = 0;
+ sizes->readonly_size = 0;
+
+ if (kernels.size() > 0) {
+ // all kernels will share the same read-only memory
+ sizes->readonly_size =
+ kernels[0]->getSize(HsaCode::MemorySegment::READONLY);
+ // check our assumption
+ for (int i = 1; i < kernels.size(); ++i) {
+ assert(sizes->readonly_size ==
+ kernels[i]->getSize(HsaCode::MemorySegment::READONLY));
+ }
+ }
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ // add one for terminating '\0'
+ sizes->string_table_size += k->name().size() + 1;
+ sizes->code_size += k->numInsts() * sizeof(GPUStaticInst*);
+ }
+
+ sizes.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_KINFO:
+ {
+ TypedBufferArg<HsaKernelInfo>
+ kinfo(buf_addr, sizeof(HsaKernelInfo) * kernels.size());
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaKernelInfo *ki = &kinfo[i];
+ ki->name_offs = kernelInfo[i].name_offs;
+ ki->code_offs = kernelInfo[i].code_offs;
+ ki->sRegCount = kernelInfo[i].sRegCount;
+ ki->dRegCount = kernelInfo[i].dRegCount;
+ ki->cRegCount = kernelInfo[i].cRegCount;
+ ki->static_lds_size = kernelInfo[i].static_lds_size;
+ ki->private_mem_size = kernelInfo[i].private_mem_size;
+ ki->spill_mem_size = kernelInfo[i].spill_mem_size;
+ }
+
+ kinfo.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_STRINGS:
+ {
+ int string_table_size = 0;
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ string_table_size += k->name().size() + 1;
+ }
+
+ BufferArg buf(buf_addr, string_table_size);
+ char *bufp = (char*)buf.bufferPtr();
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ const char *n = k->name().c_str();
+
+ // idiomatic string copy
+ while ((*bufp++ = *n++));
+ }
+
+ assert(bufp - (char *)buf.bufferPtr() == string_table_size);
+
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_READONLY_DATA:
+ {
+ // we can pick any kernel --- they share the same
+ // readonly segment (this assumption is checked in GET_SIZES)
+ uint64_t size =
+ kernels.back()->getSize(HsaCode::MemorySegment::READONLY);
+ BufferArg data(buf_addr, size);
+ char *datap = (char *)data.bufferPtr();
+ memcpy(datap,
+ kernels.back()->readonly_data,
+ size);
+ data.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_CODE:
+ {
+ // set hsaCode pointer
+ hsaCode = buf_addr;
+ int code_size = 0;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+ code_size += k->numInsts() * sizeof(TheGpuISA::RawMachInst);
+ }
+
+ TypedBufferArg<TheGpuISA::RawMachInst> buf(buf_addr, code_size);
+ TheGpuISA::RawMachInst *bufp = buf;
+
+ int buf_idx = 0;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ HsaCode *k = kernels[i];
+
+ for (int j = 0; j < k->numInsts(); ++j) {
+ bufp[buf_idx] = k->insts()->at(j);
+ ++buf_idx;
+ }
+ }
+
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_CU_CNT:
+ {
+ BufferArg buf(buf_addr, sizeof(uint32_t));
+ *((uint32_t*)buf.bufferPtr()) = dispatcher->getNumCUs();
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ case HSA_GET_VSZ:
+ {
+ BufferArg buf(buf_addr, sizeof(uint32_t));
+ *((uint32_t*)buf.bufferPtr()) = VSZ;
+ buf.copyOut(tc->getMemProxy());
+ }
+ break;
+
+ default:
+ fatal("ClDriver: bad ioctl %d\n", req);
+ }
+
+ return 0;
+}
+
+const char*
+ClDriver::codeOffToKernelName(uint64_t code_ptr)
+{
+ assert(hsaCode);
+ uint32_t code_offs = code_ptr - hsaCode;
+
+ for (int i = 0; i < kernels.size(); ++i) {
+ if (code_offs == kernelInfo[i].code_offs) {
+ return kernels[i]->name().c_str();
+ }
+ }
+
+ return nullptr;
+}
+
+ClDriver*
+ClDriverParams::create()
+{
+ return new ClDriver(this);
+}
diff --git a/src/gpu-compute/cl_driver.hh b/src/gpu-compute/cl_driver.hh
new file mode 100644
index 000000000..03567bab5
--- /dev/null
+++ b/src/gpu-compute/cl_driver.hh
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CL_DRIVER_HH__
+#define __CL_DRIVER_HH__
+
+#include <vector>
+
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "sim/emul_driver.hh"
+
+class GpuDispatcher;
+class HsaCode;
+class LiveProcess;
+class ThreadContext;
+
+struct ClDriverParams;
+
+class ClDriver final : public EmulatedDriver
+{
+ public:
+ ClDriver(ClDriverParams *p);
+ void handshake(GpuDispatcher *_dispatcher);
+ int open(LiveProcess *p, ThreadContext *tc, int mode, int flags);
+ int ioctl(LiveProcess *p, ThreadContext *tc, unsigned req);
+ const char* codeOffToKernelName(uint64_t code_ptr);
+
+ private:
+ GpuDispatcher *dispatcher;
+
+ std::vector<const std::string*> codeFiles;
+
+ // All the kernels we know about
+ std::vector<HsaCode*> kernels;
+ std::vector<HsaCode*> functions;
+
+ std::vector<HsaKernelInfo> kernelInfo;
+
+ // maximum size necessary for function arguments
+ int maxFuncArgsSize;
+ // The host virtual address for the kernel code
+ uint64_t hsaCode;
+};
+
+#endif // __CL_DRIVER_HH__
diff --git a/src/gpu-compute/cl_event.hh b/src/gpu-compute/cl_event.hh
new file mode 100644
index 000000000..75297a2d2
--- /dev/null
+++ b/src/gpu-compute/cl_event.hh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Marc Orr
+ */
+
+#ifndef __GPU_CL_EVENT_HH__
+#define __GPU_CL_EVENT_HH__
+
+#include <cstdint>
+
+struct HsaQueueEntry;
+
+class _cl_event {
+ public:
+ _cl_event() : done(false), hsaTaskPtr(nullptr), start(0), end(0) { }
+
+ volatile bool done;
+ HsaQueueEntry *hsaTaskPtr;
+ uint64_t start;
+ uint64_t end;
+};
+
+#endif // __GPU_CL_EVENT_HH__
diff --git a/src/gpu-compute/code_enums.hh b/src/gpu-compute/code_enums.hh
new file mode 100644
index 000000000..126cf6c50
--- /dev/null
+++ b/src/gpu-compute/code_enums.hh
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __CODE_ENUMS_HH__
+#define __CODE_ENUMS_HH__
+
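+// These helpers classify Enums::OpType values by memory segment and access
+// kind. The range-based checks below rely on the OpType enum in GPU.py
+// listing each segment's operations contiguously, from *_READ through *_LDAS.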
+#define IS_OT_GLOBAL(a) ((a)>=Enums::OT_GLOBAL_READ \
+ && (a)<=Enums::OT_GLOBAL_LDAS)
+#define IS_OT_SHARED(a) ((a)>=Enums::OT_SHARED_READ \
+ && (a)<=Enums::OT_SHARED_LDAS)
+#define IS_OT_PRIVATE(a) ((a)>=Enums::OT_PRIVATE_READ \
+ && (a)<=Enums::OT_PRIVATE_LDAS)
+#define IS_OT_SPILL(a) ((a)>=Enums::OT_SPILL_READ \
+ && (a)<=Enums::OT_SPILL_LDAS)
+#define IS_OT_READONLY(a) ((a)>=Enums::OT_READONLY_READ \
+ && (a)<=Enums::OT_READONLY_LDAS)
+#define IS_OT_FLAT(a) ((a)>=Enums::OT_FLAT_READ && (a)<=Enums::OT_FLAT_LDAS)
+
+#define IS_OT_LDAS(a) ((a)==Enums::OT_GLOBAL_LDAS||(a)==Enums::OT_SHARED_LDAS \
+ ||(a)==Enums::OT_PRIVATE_LDAS||(a)==Enums::OT_SPILL_LDAS \
+ ||(a)==Enums::OT_READONLY_LDAS||(a)==Enums::OT_FLAT_LDAS)
+
+#define IS_OT_READ(a) ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SHARED_READ \
+ ||(a)==Enums::OT_PRIVATE_READ||(a)==Enums::OT_SPILL_READ \
+ ||(a)==Enums::OT_READONLY_READ||(a)==Enums::OT_FLAT_READ)
+
+#define IS_OT_READ_GM(a) \
+ ((a)==Enums::OT_GLOBAL_READ||(a)==Enums::OT_SPILL_READ \
+ ||(a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_LM(a) ((a)==Enums::OT_SHARED_READ)
+
+#define IS_OT_READ_RM(a) ((a)==Enums::OT_READONLY_READ)
+
+#define IS_OT_READ_PM(a) ((a)==Enums::OT_PRIVATE_READ)
+
+#define IS_OT_WRITE(a) \
+ ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SHARED_WRITE \
+ ||(a)==Enums::OT_PRIVATE_WRITE||(a)==Enums::OT_SPILL_WRITE \
+ ||(a)==Enums::OT_READONLY_WRITE||(a)==Enums::OT_FLAT_WRITE)
+
+#define IS_OT_WRITE_GM(a) \
+ ((a)==Enums::OT_GLOBAL_WRITE||(a)==Enums::OT_SPILL_WRITE \
+ ||(a)==Enums::OT_READONLY_WRITE)
+
+#define IS_OT_WRITE_LM(a) ((a)==Enums::OT_SHARED_WRITE)
+
+#define IS_OT_WRITE_PM(a) ((a)==Enums::OT_PRIVATE_WRITE)
+
+#define IS_OT_ATOMIC(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+ ||(a)==Enums::OT_SHARED_ATOMIC \
+ ||(a)==Enums::OT_PRIVATE_ATOMIC \
+ ||(a)==Enums::OT_SPILL_ATOMIC \
+ ||(a)==Enums::OT_READONLY_ATOMIC \
+ ||(a)==Enums::OT_FLAT_ATOMIC)
+
+#define IS_OT_ATOMIC_GM(a) ((a)==Enums::OT_GLOBAL_ATOMIC \
+ ||(a)==Enums::OT_SPILL_ATOMIC \
+ ||(a)==Enums::OT_READONLY_ATOMIC \
+ ||(a)==Enums::OT_GLOBAL_MEMFENCE \
+ ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_LM(a) ((a)==Enums::OT_SHARED_ATOMIC \
+ ||(a)==Enums::OT_SHARED_MEMFENCE \
+ ||(a)==Enums::OT_BOTH_MEMFENCE)
+
+#define IS_OT_ATOMIC_PM(a) ((a)==Enums::OT_PRIVATE_ATOMIC)
+
+#define IS_OT_HIST(a) ((a)==Enums::OT_GLOBAL_HIST \
+ ||(a)==Enums::OT_SHARED_HIST \
+ ||(a)==Enums::OT_PRIVATE_HIST \
+ ||(a)==Enums::OT_SPILL_HIST \
+ ||(a)==Enums::OT_READONLY_HIST \
+ ||(a)==Enums::OT_FLAT_HIST)
+
+#define IS_OT_HIST_GM(a) ((a)==Enums::OT_GLOBAL_HIST \
+ ||(a)==Enums::OT_SPILL_HIST \
+ ||(a)==Enums::OT_READONLY_HIST)
+
+#define IS_OT_HIST_LM(a) ((a)==Enums::OT_SHARED_HIST)
+
+#define IS_OT_HIST_PM(a) ((a)==Enums::OT_PRIVATE_HIST)
+
+#endif // __CODE_ENUMS_HH__
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
new file mode 100644
index 000000000..d3622007a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.cc
@@ -0,0 +1,1817 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#include "gpu-compute/compute_unit.hh"
+
+#include "base/output.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUExec.hh"
+#include "debug/GPUFetch.hh"
+#include "debug/GPUMem.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUPrefetch.hh"
+#include "debug/GPUSync.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/page_table.hh"
+#include "sim/process.hh"
+
+ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
+ scoreboardCheckStage(p), scheduleStage(p), execStage(p),
+ globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
+ cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
+ spBypassPipeLength(p->spbypass_pipe_length),
+ dpBypassPipeLength(p->dpbypass_pipe_length),
+ issuePeriod(p->issue_period),
+ numGlbMemUnits(p->num_global_mem_pipes),
+ numLocMemUnits(p->num_shared_mem_pipes),
+ perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
+ prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
+ xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
+ functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
+ countPages(p->countPages), barrier_id(0),
+ vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
+ coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
+ req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
+ resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
+ _masterId(p->system->getMasterId(name() + ".ComputeUnit")),
+ lds(*p->localDataStore), globalSeqNum(0), wavefrontSize(p->wfSize)
+{
+ // this check will be eliminated once configurable wavefront sizes are supported
+ fatal_if(p->wfSize != VSZ, "Wavefront size parameter does not match VSZ");
+ // calculate how many cycles a vector load or store will need to transfer
+ // its data over the corresponding buses
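+ // (e.g., with the default 64-lane wavefront and 32-byte buses this is
+ // ceil(64 * 4 / 32) = 8 cycles per vector transfer)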
+ numCyclesPerStoreTransfer = (uint32_t)ceil((double)(VSZ * sizeof(uint32_t))
+ / (double)vrfToCoalescerBusWidth);
+
+ numCyclesPerLoadTransfer = (VSZ * sizeof(uint32_t))
+ / coalescerToVrfBusWidth;
+
+ lastVaddrWF.resize(numSIMDs);
+ wfList.resize(numSIMDs);
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ lastVaddrWF[j].resize(p->n_wf);
+
+ for (int i = 0; i < p->n_wf; ++i) {
+ lastVaddrWF[j][i].resize(VSZ);
+
+ wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
+ wfList[j][i]->setParent(this);
+
+ for (int k = 0; k < VSZ; ++k) {
+ lastVaddrWF[j][i][k] = 0;
+ }
+ }
+ }
+
+ lastVaddrPhase.resize(numSIMDs);
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ lastVaddrPhase[i] = LastVaddrWave();
+ }
+
+ lastVaddrCU = LastVaddrWave();
+
+ lds.setParent(this);
+
+ if (p->execPolicy == "OLDEST-FIRST") {
+ exec_policy = EXEC_POLICY::OLDEST;
+ } else if (p->execPolicy == "ROUND-ROBIN") {
+ exec_policy = EXEC_POLICY::RR;
+ } else {
+ fatal("Invalid WF execution policy (CU)\n");
+ }
+
+ memPort.resize(VSZ);
+
+ // resize the tlbPort vectorArray
+ int tlbPort_width = perLaneTLB ? VSZ : 1;
+ tlbPort.resize(tlbPort_width);
+
+ cuExitCallback = new CUExitCallback(this);
+ registerExitCallback(cuExitCallback);
+
+ xactCasLoadMap.clear();
+ lastExecCycle.resize(numSIMDs, 0);
+
+ for (int i = 0; i < vrf.size(); ++i) {
+ vrf[i]->setParent(this);
+ }
+
+ numVecRegsPerSimd = vrf[0]->numRegs();
+}
+
+ComputeUnit::~ComputeUnit()
+{
+ // Delete wavefront slots
+
+ for (int j = 0; j < numSIMDs; ++j)
+ for (int i = 0; i < shader->n_wf; ++i) {
+ delete wfList[j][i];
+ }
+
+ readyList.clear();
+ waveStatusList.clear();
+ dispatchList.clear();
+ vectorAluInstAvail.clear();
+ delete cuExitCallback;
+ delete ldsPort;
+}
+
+void
+ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
+{
+ w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
+
+ w->workgroupsz[0] = ndr->q.wgSize[0];
+ w->workgroupsz[1] = ndr->q.wgSize[1];
+ w->workgroupsz[2] = ndr->q.wgSize[2];
+ w->wg_sz = w->workgroupsz[0] * w->workgroupsz[1] * w->workgroupsz[2];
+ w->gridsz[0] = ndr->q.gdSize[0];
+ w->gridsz[1] = ndr->q.gdSize[1];
+ w->gridsz[2] = ndr->q.gdSize[2];
+ w->kernelArgs = ndr->q.args;
+ w->privSizePerItem = ndr->q.privMemPerItem;
+ w->spillSizePerItem = ndr->q.spillMemPerItem;
+ w->roBase = ndr->q.roMemStart;
+ w->roSize = ndr->q.roMemTotal;
+}
+
+void
+ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+ int trueWgSize[], int trueWgSizeTotal,
+ LdsChunk *ldsChunk, uint64_t origSpillMemStart)
+{
+ wfCtx->cnt = cnt;
+
+ VectorMask init_mask;
+ init_mask.reset();
+
+ for (int k = 0; k < VSZ; ++k) {
+ if (k + cnt * VSZ < trueWgSizeTotal)
+ init_mask[k] = 1;
+ }
+
+ wfCtx->init_mask = init_mask.to_ullong();
+ wfCtx->exec_mask = init_mask.to_ullong();
+
+ for (int i = 0; i < VSZ; ++i) {
+ wfCtx->bar_cnt[i] = 0;
+ }
+
+ wfCtx->max_bar_cnt = 0;
+ wfCtx->old_barrier_cnt = 0;
+ wfCtx->barrier_cnt = 0;
+
+ wfCtx->privBase = ndr->q.privMemStart;
+ ndr->q.privMemStart += ndr->q.privMemPerItem * VSZ;
+
+ wfCtx->spillBase = ndr->q.spillMemStart;
+ ndr->q.spillMemStart += ndr->q.spillMemPerItem * VSZ;
+
+ wfCtx->pc = 0;
+ wfCtx->rpc = UINT32_MAX;
+
+ // set the wavefront context to have a pointer to this section of the LDS
+ wfCtx->ldsChunk = ldsChunk;
+
+ // WG state
+ wfCtx->wg_id = ndr->globalWgId;
+ wfCtx->barrier_id = barrier_id;
+
+ // Kernel wide state
+ wfCtx->ndr = ndr;
+}
+
+void
+ComputeUnit::updateEvents()
+{
+ if (!timestampVec.empty()) {
+ uint32_t vecSize = timestampVec.size();
+ uint32_t i = 0;
+ while (i < vecSize) {
+ if (timestampVec[i] <= shader->tick_cnt) {
+ std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
+ vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
+ statusVec[i]);
+ timestampVec.erase(timestampVec.begin() + i);
+ regIdxVec.erase(regIdxVec.begin() + i);
+ statusVec.erase(statusVec.begin() + i);
+ --vecSize;
+ --i;
+ }
+ ++i;
+ }
+ }
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ vrf[i]->updateEvents();
+ }
+}
+
+
+void
+ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+ int trueWgSizeTotal)
+{
+ static int _n_wave = 0;
+ int cnt = wfCtx->cnt;
+ NDRange *ndr = wfCtx->ndr;
+
+ // Fill in Kernel state
+ FillKernelState(w, ndr);
+
+ w->kern_id = ndr->dispatchId;
+ w->dynwaveid = cnt;
+ w->init_mask = wfCtx->init_mask;
+
+ for (int k = 0; k < VSZ; ++k) {
+ w->workitemid[0][k] = (k+cnt*VSZ) % trueWgSize[0];
+ w->workitemid[1][k] = ((k + cnt * VSZ) / trueWgSize[0]) % trueWgSize[1];
+ w->workitemid[2][k] = (k + cnt * VSZ) / (trueWgSize[0] * trueWgSize[1]);
+
+ w->workitemFlatId[k] = w->workitemid[2][k] * trueWgSize[0] *
+ trueWgSize[1] + w->workitemid[1][k] * trueWgSize[0] +
+ w->workitemid[0][k];
+ }
+
+ w->old_barrier_cnt = wfCtx->old_barrier_cnt;
+ w->barrier_cnt = wfCtx->barrier_cnt;
+ w->barrier_slots = divCeil(trueWgSizeTotal, VSZ);
+
+ for (int i = 0; i < VSZ; ++i) {
+ w->bar_cnt[i] = wfCtx->bar_cnt[i];
+ }
+
+ w->max_bar_cnt = wfCtx->max_bar_cnt;
+ w->privBase = wfCtx->privBase;
+ w->spillBase = wfCtx->spillBase;
+
+ w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+
+ // WG state
+ w->wg_id = wfCtx->wg_id;
+ w->dispatchid = wfCtx->ndr->dispatchId;
+ w->workgroupid[0] = w->wg_id % ndr->numWg[0];
+ w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
+ w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
+
+ w->barrier_id = wfCtx->barrier_id;
+ w->stalledAtBarrier = false;
+
+ // move this from the context into the actual wavefront
+ w->ldsChunk = wfCtx->ldsChunk;
+
+ int32_t refCount M5_VAR_USED =
+ lds.increaseRefCounter(w->dispatchid, w->wg_id);
+ DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
+ cu_id, w->wg_id, refCount);
+
+ w->instructionBuffer.clear();
+
+ if (w->pendingFetch)
+ w->dropFetch = true;
+
+ // is this the last wavefront in the workgroup
+ // if set the spillWidth to be the remaining work-items
+ // so that the vector access is correct
+ if ((cnt + 1) * VSZ >= trueWgSizeTotal) {
+ w->spillWidth = trueWgSizeTotal - (cnt * VSZ);
+ } else {
+ w->spillWidth = VSZ;
+ }
+
+ DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
+ "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
+
+ w->start(++_n_wave, ndr->q.code_ptr);
+}
+
+void
+ComputeUnit::StartWorkgroup(NDRange *ndr)
+{
+ // reserve the LDS capacity allocated to the work group
+ // disambiguated by the dispatch ID and workgroup ID, which should be
+ // globally unique
+ LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
+ ndr->q.ldsSize);
+
+ // Send L1 cache acquire
+ // isKernel + isAcquire = Kernel Begin
+ if (shader->impl_kern_boundary_sync) {
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(nullptr,
+ nullptr,
+ nullptr, 0);
+
+ gpuDynInst->useContinuation = false;
+ gpuDynInst->memoryOrder = Enums::MEMORY_ORDER_SC_ACQUIRE;
+ gpuDynInst->scope = Enums::MEMORY_SCOPE_SYSTEM;
+ injectGlobalMemFence(gpuDynInst, true);
+ }
+
+ // Get true size of workgroup (after clamping to grid size)
+ int trueWgSize[3];
+ int trueWgSizeTotal = 1;
+
+ for (int d = 0; d < 3; ++d) {
+ trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+ ndr->wgId[d] * ndr->q.wgSize[d]);
+
+ trueWgSizeTotal *= trueWgSize[d];
+ }
+
+ uint64_t origSpillMemStart = ndr->q.spillMemStart;
+ // calculate the number of 32-bit vector registers required by wavefront
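+    // (each 64-bit register occupies two 32-bit slots)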
+ int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+ int cnt = 0;
+
+ // Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
+ for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
+ Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
+        // Check if this wavefront slot is available:
+        // it must be stopped, i.e., not running and not
+        // waiting for a release to complete (S_RETURNING)
+ if (w->status == Wavefront::S_STOPPED) {
+ // if we have scheduled all work items then stop
+ // scheduling wavefronts
+ if (cnt * VSZ >= trueWgSizeTotal)
+ break;
+
+ // reserve vector registers for the scheduled wavefront
+ assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
+ uint32_t normSize = 0;
+
+ w->startVgprIndex = vrf[m % numSIMDs]->manager->
+ allocateRegion(vregDemand, &normSize);
+
+ w->reservedVectorRegs = normSize;
+ vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
+
+ WFContext wfCtx;
+
+ InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
+ ldsChunk, origSpillMemStart);
+
+ StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
+ ++cnt;
+ }
+ }
+ ++barrier_id;
+}
+
+int
+ComputeUnit::ReadyWorkgroup(NDRange *ndr)
+{
+ // Get true size of workgroup (after clamping to grid size)
+ int trueWgSize[3];
+ int trueWgSizeTotal = 1;
+
+ for (int d = 0; d < 3; ++d) {
+ trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
+ ndr->wgId[d] * ndr->q.wgSize[d]);
+
+ trueWgSizeTotal *= trueWgSize[d];
+ DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
+ }
+
+ DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
+
+ // calculate the number of 32-bit vector registers required by each
+ // work item of the work group
+ int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
+ bool vregAvail = true;
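+    // number of wavefronts needed to cover all work-items (ceiling division)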
+ int numWfs = (trueWgSizeTotal + VSZ - 1) / VSZ;
+ int freeWfSlots = 0;
+ // check if the total number of VGPRs required by all WFs of the WG
+ // fit in the VRFs of all SIMD units
+ assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
+ int numMappedWfs = 0;
+ std::vector<int> numWfsPerSimd;
+ numWfsPerSimd.resize(numSIMDs, 0);
+ // find how many free WF slots we have across all SIMDs
+ for (int j = 0; j < shader->n_wf; ++j) {
+ for (int i = 0; i < numSIMDs; ++i) {
+ if (wfList[i][j]->status == Wavefront::S_STOPPED) {
+ // count the number of free WF slots
+ ++freeWfSlots;
+ if (numMappedWfs < numWfs) {
+ // count the WFs to be assigned per SIMD
+ numWfsPerSimd[i]++;
+ }
+ numMappedWfs++;
+ }
+ }
+ }
+
+ // if there are enough free WF slots then find if there are enough
+ // free VGPRs per SIMD based on the WF->SIMD mapping
+ if (freeWfSlots >= numWfs) {
+ for (int j = 0; j < numSIMDs; ++j) {
+ // find if there are enough free VGPR regions in the SIMD's VRF
+ // to accommodate the WFs of the new WG that would be mapped to
+ // this SIMD unit
+ vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
+ vregDemandPerWI);
+
+ // stop searching if there is at least one SIMD
+ // whose VRF does not have enough free VGPR pools.
+ // This is because a WG is scheduled only if ALL
+ // of its WFs can be scheduled
+ if (!vregAvail)
+ break;
+ }
+ }
+
+ DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
+ freeWfSlots, vregAvail);
+
+ if (!vregAvail) {
+ ++numTimesWgBlockedDueVgprAlloc;
+ }
+
+    // track how often a workgroup is blocked because the LDS cannot
+    // accommodate its requested allocation
+ if (!lds.canReserve(ndr->q.ldsSize)) {
+ wgBlockedDueLdsAllocation++;
+ }
+
+    // Return true if (a) there are enough free WF slots to submit the
+    // workgroup, (b) there are enough VGPRs to schedule all WFs to their
+    // SIMD units, and (c) there is enough space in the LDS
+ return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
+}
+
+int
+ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
+{
+ DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
+ int ccnt = 0;
+
+ for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
+ for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
+ Wavefront *w = wfList[i_simd][i_wf];
+
+ if (w->status == Wavefront::S_RUNNING) {
+ DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
+
+ DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
+ w->barrier_id, _barrier_id);
+
+ DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
+ w->barrier_cnt, bcnt);
+ }
+
+ if (w->status == Wavefront::S_RUNNING &&
+ w->barrier_id == _barrier_id && w->barrier_cnt == bcnt &&
+ !w->outstanding_reqs) {
+ ++ccnt;
+
+ DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
+ "%d\n", i_simd, i_wf, ccnt);
+ }
+ }
+ }
+
+ DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
+ cu_id, ccnt, bslots);
+
+ return ccnt == bslots;
+}
+
+// Check if the current wavefront is blocked on additional resources.
+bool
+ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
+{
+ bool cede = false;
+
+ // If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
+ // magic instructions will impact the scheduling of wavefronts
+ if (xact_cas_mode) {
+ /*
+ * When a wavefront calls xact_cas_ld, it adds itself to a per address
+ * queue. All per address queues are managed by the xactCasLoadMap.
+ *
+ * A wavefront is not blocked if: it is not in ANY per address queue or
+ * if it is at the head of a per address queue.
+ */
+ for (auto itMap : xactCasLoadMap) {
+ std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
+
+ if (!curWaveIDQueue.empty()) {
+ for (auto it : curWaveIDQueue) {
+ waveIdentifier cur_wave = it;
+
+ if (cur_wave.simdId == simdId &&
+ cur_wave.wfSlotId == wfSlotId) {
+ // 2 possibilities
+ // 1: this WF has a green light
+ // 2: another WF has a green light
+ waveIdentifier owner_wave = curWaveIDQueue.front();
+
+ if (owner_wave.simdId != cur_wave.simdId ||
+ owner_wave.wfSlotId != cur_wave.wfSlotId) {
+ // possibility 2
+ cede = true;
+ break;
+ } else {
+ // possibility 1
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return cede;
+}
+
+// Execute one clock worth of work on the ComputeUnit.
+void
+ComputeUnit::exec()
+{
+ updateEvents();
+ // Execute pipeline stages in reverse order to simulate
+ // the pipeline latency
+ globalMemoryPipe.exec();
+ localMemoryPipe.exec();
+ execStage.exec();
+ scheduleStage.exec();
+ scoreboardCheckStage.exec();
+ fetchStage.exec();
+
+ totalCycles++;
+}
+
+void
+ComputeUnit::init()
+{
+ // Initialize CU Bus models
+ glbMemToVrfBus.init(&shader->tick_cnt, 1);
+ locMemToVrfBus.init(&shader->tick_cnt, 1);
+ nextGlbMemBus = 0;
+ nextLocMemBus = 0;
+ fatal_if(numGlbMemUnits > 1,
+ "No support for multiple Global Memory Pipelines exists!!!");
+ vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
+ for (int j = 0; j < numGlbMemUnits; ++j) {
+ vrfToGlobalMemPipeBus[j] = WaitClass();
+ vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, 1);
+ }
+
+ fatal_if(numLocMemUnits > 1,
+ "No support for multiple Local Memory Pipelines exists!!!");
+ vrfToLocalMemPipeBus.resize(numLocMemUnits);
+ for (int j = 0; j < numLocMemUnits; ++j) {
+ vrfToLocalMemPipeBus[j] = WaitClass();
+ vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, 1);
+ }
+ vectorRegsReserved.resize(numSIMDs, 0);
+ aluPipe.resize(numSIMDs);
+ wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
+
+ for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
+ wfWait[i] = WaitClass();
+ wfWait[i].init(&shader->tick_cnt, 1);
+ }
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ aluPipe[i] = WaitClass();
+ aluPipe[i].init(&shader->tick_cnt, 1);
+ }
+
+ // Setup space for call args
+ for (int j = 0; j < numSIMDs; ++j) {
+ for (int i = 0; i < shader->n_wf; ++i) {
+ wfList[j][i]->initCallArgMem(shader->funcargs_size);
+ }
+ }
+
+ // Initializing pipeline resources
+ readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
+ waveStatusList.resize(numSIMDs);
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ for (int i = 0; i < shader->n_wf; ++i) {
+ waveStatusList[j].push_back(
+ std::make_pair(wfList[j][i], BLOCKED));
+ }
+ }
+
+ for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
+ dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
+ }
+
+ fetchStage.init(this);
+ scoreboardCheckStage.init(this);
+ scheduleStage.init(this);
+ execStage.init(this);
+ globalMemoryPipe.init(this);
+ localMemoryPipe.init(this);
+ // initialize state for statistics calculation
+ vectorAluInstAvail.resize(numSIMDs, false);
+ shrMemInstAvail = 0;
+ glbMemInstAvail = 0;
+}
+
+bool
+ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
+{
+ // Ruby has completed the memory op. Schedule the mem_resp_event at the
+ // appropriate cycle to process the timing memory response
+ // This delay represents the pipeline delay
+ SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+ int index = sender_state->port_index;
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+
+ // Is the packet returned a Kernel End or Barrier
+ if (pkt->req->isKernel() && pkt->req->isRelease()) {
+ Wavefront *w =
+ computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
+
+ // Check if we are waiting on Kernel End Release
+ if (w->status == Wavefront::S_RETURNING) {
+ DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
+ computeUnit->cu_id, w->simdId, w->wfSlotId,
+ w->wfDynId, w->kern_id);
+
+ computeUnit->shader->dispatcher->notifyWgCompl(w);
+ w->status = Wavefront::S_STOPPED;
+ } else {
+ w->outstanding_reqs--;
+ }
+
+ DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, w->barrier_cnt);
+
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+ return true;
+ } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+ return true;
+ }
+
+ ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
+ new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
+ pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
+ computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ index, pkt->req->getPaddr());
+
+ computeUnit->schedule(mem_resp_event,
+ curTick() + computeUnit->resp_tick_latency);
+ return true;
+}
+
+void
+ComputeUnit::DataPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ assert(len > 0);
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front().first;
+ GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
+ computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ pkt->req->getPaddr());
+
+ /** Currently Ruby can return false due to conflicts for the particular
+ * cache block or address. Thus other requests should be allowed to
+ * pass and the data port should expect multiple retries. */
+ if (!sendTimingReq(pkt)) {
+ DPRINTF(GPUMem, "failed again!\n");
+ break;
+ } else {
+ DPRINTF(GPUMem, "successful!\n");
+ retries.pop_front();
+ }
+ }
+}
+
+bool
+ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
+{
+ computeUnit->fetchStage.processFetchReturn(pkt);
+
+ return true;
+}
+
+void
+ComputeUnit::SQCPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ assert(len > 0);
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front().first;
+ Wavefront *wavefront M5_VAR_USED = retries.front().second;
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ if (!sendTimingReq(pkt)) {
+ DPRINTF(GPUFetch, "failed again!\n");
+ break;
+ } else {
+ DPRINTF(GPUFetch, "successful!\n");
+ retries.pop_front();
+ }
+ }
+}
+
+void
+ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+{
+ // There must be a way around this check to do the globalMemStart...
+ Addr tmp_vaddr = pkt->req->getVaddr();
+
+ updatePageDivergenceDist(tmp_vaddr);
+
+ pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(),
+ pkt->req->getFlags(), pkt->req->masterId(),
+ pkt->req->getPC());
+
+ // figure out the type of the request to set read/write
+ BaseTLB::Mode TLB_mode;
+ assert(pkt->isRead() || pkt->isWrite());
+
+ // Check write before read for atomic operations
+ // since atomic operations should use BaseTLB::Write
+    if (pkt->isWrite()) {
+ TLB_mode = BaseTLB::Write;
+ } else if (pkt->isRead()) {
+ TLB_mode = BaseTLB::Read;
+ } else {
+ fatal("pkt is not a read nor a write\n");
+ }
+
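+    // latency accounting: subtract the send tick now; the receive tick is
+    // added back when the translation response returns, so tlbCycles
+    // accumulates total translation latency across requests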
+ tlbCycles -= curTick();
+ ++tlbRequests;
+
+ int tlbPort_index = perLaneTLB ? index : 0;
+
+ if (shader->timingSim) {
+ if (debugSegFault) {
+ Process *p = shader->gpuTc->getProcessPtr();
+ Addr vaddr = pkt->req->getVaddr();
+ unsigned size = pkt->getSize();
+
+ if ((vaddr + size - 1) % 64 < vaddr % 64) {
+ panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
+ }
+
+ Addr paddr;
+
+ if (!p->pTable->translate(vaddr, paddr)) {
+ if (!p->fixupStackFault(vaddr)) {
+ panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ vaddr);
+ }
+ }
+ }
+
+ // This is the SenderState needed upon return
+ pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
+
+ // This is the senderState needed by the TLB hierarchy to function
+ TheISA::GpuTLB::TranslationState *translation_state =
+ new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
+ pkt->senderState);
+
+ pkt->senderState = translation_state;
+
+ if (functionalTLB) {
+ tlbPort[tlbPort_index]->sendFunctional(pkt);
+
+ // update the hitLevel distribution
+ int hit_level = translation_state->hitLevel;
+ assert(hit_level != -1);
+ hitsPerTLBLevel[hit_level]++;
+
+ // New SenderState for the memory access
+ X86ISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete sender_state->saved;
+ delete sender_state;
+
+ assert(pkt->req->hasPaddr());
+ assert(pkt->req->hasSize());
+
+ uint8_t *tmpData = pkt->getPtr<uint8_t>();
+
+            // this is necessary because the GPU TLB receives packets instead
+            // of requests. When the translation is complete, all relevant
+            // fields in the request will be populated, but not in the packet.
+ // here we create the new packet so we can set the size, addr,
+ // and proper flags.
+ PacketPtr oldPkt = pkt;
+ pkt = new Packet(oldPkt->req, oldPkt->cmd);
+ delete oldPkt;
+ pkt->dataStatic(tmpData);
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
+ index, nullptr);
+
+ gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
+ gpuDynInst->tlbHitLevel[index] = hit_level;
+
+ // translation is done. Schedule the mem_req_event at the
+ // appropriate cycle to send the timing memory request to ruby
+ ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+ new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
+ "scheduled\n", cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+ } else if (tlbPort[tlbPort_index]->isStalled()) {
+ assert(tlbPort[tlbPort_index]->retries.size() > 0);
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
+ "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ tmp_vaddr);
+
+ tlbPort[tlbPort_index]->retries.push_back(pkt);
+ } else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
+ // Stall the data port;
+            // No more packets will be issued until
+            // Ruby indicates resources are freed by
+            // a recvReqRetry() callback on this port.
+ tlbPort[tlbPort_index]->stallPort();
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
+ "failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ tmp_vaddr);
+
+ tlbPort[tlbPort_index]->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB,
+ "CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
+ }
+ } else {
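+        // functional (non-timing) path: translate the address and perform
+        // the memory access immediately via functional accesses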
+ if (pkt->cmd == MemCmd::MemFenceReq) {
+ gpuDynInst->statusBitVector = VectorMask(0);
+ } else {
+ gpuDynInst->statusBitVector &= (~(1ll << index));
+ }
+
+ // New SenderState for the memory access
+ delete pkt->senderState;
+
+ // Because it's atomic operation, only need TLB translation state
+ pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
+ shader->gpuTc);
+
+ tlbPort[tlbPort_index]->sendFunctional(pkt);
+
+ // the addr of the packet is not modified, so we need to create a new
+ // packet, or otherwise the memory access will have the old virtual
+ // address sent in the translation packet, instead of the physical
+ // address returned by the translation.
+ PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
+ new_pkt->dataStatic(pkt->getPtr<uint8_t>());
+
+ // Translation is done. It is safe to send the packet to memory.
+ memPort[0]->sendFunctional(new_pkt);
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
+ gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
+ new_pkt->req->getPaddr());
+
+ // safe_cast the senderState
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete new_pkt;
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+ }
+}
+
+void
+ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
+{
+ ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+ new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
+ nullptr);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
+ cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
+ pkt->req->getPaddr());
+
+ schedule(mem_req_event, curTick() + req_tick_latency);
+}
+
+void
+ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
+ Request* req)
+{
+ if (!req) {
+ req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId, -1);
+ }
+ req->setPaddr(0);
+ if (kernelLaunch) {
+ req->setFlags(Request::KERNEL);
+ }
+
+ gpuDynInst->s_type = SEG_GLOBAL;
+
+ // for non-kernel MemFence operations, memorder flags are set depending
+ // on which type of request is currently being sent, so this
+ // should be set by the caller (e.g. if an inst has acq-rel
+    // semantics, it will send one acquire req and one release req)
+ gpuDynInst->setRequestFlags(req, kernelLaunch);
+
+ // a mem fence must correspond to an acquire/release request
+ assert(req->isAcquire() || req->isRelease());
+
+ // create packet
+ PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
+
+ // set packet's sender state
+ pkt->senderState =
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
+
+ // send the packet
+ sendSyncRequest(gpuDynInst, 0, pkt);
+}
+
+const char*
+ComputeUnit::DataPort::MemRespEvent::description() const
+{
+ return "ComputeUnit memory response event";
+}
+
+void
+ComputeUnit::DataPort::MemRespEvent::process()
+{
+ DataPort::SenderState *sender_state =
+ safe_cast<DataPort::SenderState*>(pkt->senderState);
+
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ ComputeUnit *compute_unit = dataPort->computeUnit;
+
+ assert(gpuDynInst);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
+ compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
+ pkt->req->getPaddr(), dataPort->index);
+
+ Addr paddr = pkt->req->getPaddr();
+
+ if (pkt->cmd != MemCmd::MemFenceResp) {
+ int index = gpuDynInst->memStatusVector[paddr].back();
+
+ DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
+ pkt->req->getPaddr(), index);
+
+ gpuDynInst->memStatusVector[paddr].pop_back();
+ gpuDynInst->pAddr = pkt->req->getPaddr();
+
+ if (pkt->isRead() || pkt->isWrite()) {
+
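+            // small accesses clear their lane's status bit directly; wider
+            // accesses first decrement a per-lane count of outstanding
+            // responses and clear the bit only when it reaches zero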
+ if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
+ gpuDynInst->statusBitVector &= (~(1ULL << index));
+ } else {
+ assert(gpuDynInst->statusVector[index] > 0);
+ gpuDynInst->statusVector[index]--;
+
+ if (!gpuDynInst->statusVector[index])
+ gpuDynInst->statusBitVector &= (~(1ULL << index));
+ }
+
+ DPRINTF(GPUMem, "bitvector is now %#x\n",
+ gpuDynInst->statusBitVector);
+
+ if (gpuDynInst->statusBitVector == VectorMask(0)) {
+ auto iter = gpuDynInst->memStatusVector.begin();
+ auto end = gpuDynInst->memStatusVector.end();
+
+ while (iter != end) {
+ assert(iter->second.empty());
+ ++iter;
+ }
+
+ gpuDynInst->memStatusVector.clear();
+
+ if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ gpuDynInst->statusVector.clear();
+
+ if (gpuDynInst->m_op == Enums::MO_LD || MO_A(gpuDynInst->m_op)
+ || MO_ANR(gpuDynInst->m_op)) {
+ assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
+
+ compute_unit->globalMemoryPipe.getGMLdRespFIFO()
+ .push(gpuDynInst);
+ } else {
+ assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
+
+ compute_unit->globalMemoryPipe.getGMStRespFIFO()
+ .push(gpuDynInst);
+ }
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
+
+ // after clearing the status vectors,
+ // see if there is a continuation to perform
+ // the continuation may generate more work for
+ // this memory request
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+ }
+ }
+ } else {
+ gpuDynInst->statusBitVector = VectorMask(0);
+
+ if (gpuDynInst->useContinuation) {
+ assert(gpuDynInst->scope != Enums::MEMORY_SCOPE_NONE);
+ gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
+ gpuDynInst);
+ }
+ }
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+}
+
+ComputeUnit*
+ComputeUnitParams::create()
+{
+ return new ComputeUnit(this);
+}
+
+bool
+ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
+{
+ Addr line = pkt->req->getPaddr();
+
+ DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
+ pkt->req->getVaddr(), line);
+
+ assert(pkt->senderState);
+ computeUnit->tlbCycles += curTick();
+
+ // pop off the TLB translation state
+ TheISA::GpuTLB::TranslationState *translation_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ // no PageFaults are permitted for data accesses
+ if (!translation_state->tlbEntry->valid) {
+ DTLBPort::SenderState *sender_state =
+ safe_cast<DTLBPort::SenderState*>(translation_state->saved);
+
+ Wavefront *w M5_VAR_USED =
+ computeUnit->wfList[sender_state->_gpuDynInst->simdId]
+ [sender_state->_gpuDynInst->wfSlotId];
+
+ DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId,
+ pkt->req->getVaddr());
+ }
+
+ assert(translation_state->tlbEntry->valid);
+
+ // update the hitLevel distribution
+ int hit_level = translation_state->hitLevel;
+ computeUnit->hitsPerTLBLevel[hit_level]++;
+
+ delete translation_state->tlbEntry;
+ assert(!translation_state->ports.size());
+ pkt->senderState = translation_state->saved;
+
+ // for prefetch pkt
+ BaseTLB::Mode TLB_mode = translation_state->tlbMode;
+
+ delete translation_state;
+
+ // use the original sender state to know how to close this transaction
+ DTLBPort::SenderState *sender_state =
+ safe_cast<DTLBPort::SenderState*>(pkt->senderState);
+
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ int mp_index = sender_state->portIndex;
+ Addr vaddr = pkt->req->getVaddr();
+ gpuDynInst->memStatusVector[line].push_back(mp_index);
+ gpuDynInst->tlbHitLevel[mp_index] = hit_level;
+
+ MemCmd requestCmd;
+
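+    // map the TLB response command back to the matching memory request
+    // command so the access can be (re)issued to memory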
+ if (pkt->cmd == MemCmd::ReadResp) {
+ requestCmd = MemCmd::ReadReq;
+ } else if (pkt->cmd == MemCmd::WriteResp) {
+ requestCmd = MemCmd::WriteReq;
+ } else if (pkt->cmd == MemCmd::SwapResp) {
+ requestCmd = MemCmd::SwapReq;
+ } else {
+ panic("unsupported response to request conversion %s\n",
+ pkt->cmd.toString());
+ }
+
+ if (computeUnit->prefetchDepth) {
+ int simdId = gpuDynInst->simdId;
+ int wfSlotId = gpuDynInst->wfSlotId;
+ Addr last = 0;
+
+ switch(computeUnit->prefetchType) {
+ case Enums::PF_CU:
+ last = computeUnit->lastVaddrCU[mp_index];
+ break;
+ case Enums::PF_PHASE:
+ last = computeUnit->lastVaddrPhase[simdId][mp_index];
+ break;
+ case Enums::PF_WF:
+            last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
+            break;
+        default:
+ break;
+ }
+
+ DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
+ computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
+
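+        // stride, in pages, between this access and the previously recorded
+        // access at the chosen prefetch granularity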
+ int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
+ roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
+ : 0;
+
+ DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
+
+ computeUnit->lastVaddrCU[mp_index] = vaddr;
+ computeUnit->lastVaddrPhase[simdId][mp_index] = vaddr;
+ computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
+
+ stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
+ computeUnit->prefetchStride: stride;
+
+ DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
+ computeUnit->cu_id, simdId, wfSlotId, mp_index);
+
+ DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
+
+ // Prefetch Next few pages atomically
+ for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
+ DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
+ vaddr+stride*pf*TheISA::PageBytes);
+
+ if (!stride)
+ break;
+
+ Request *prefetch_req = new Request(0, vaddr + stride * pf *
+ TheISA::PageBytes,
+ sizeof(uint8_t), 0,
+ computeUnit->masterId(),
+ 0, 0, 0);
+
+ PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
+ uint8_t foo = 0;
+ prefetch_pkt->dataStatic(&foo);
+
+ // Because it's atomic operation, only need TLB translation state
+ prefetch_pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(TLB_mode,
+ computeUnit->shader->gpuTc,
+ true);
+
+ // Currently prefetches are zero-latency, hence the sendFunctional
+ sendFunctional(prefetch_pkt);
+
+ /* safe_cast the senderState */
+ TheISA::GpuTLB::TranslationState *tlb_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(
+ prefetch_pkt->senderState);
+
+
+ delete tlb_state->tlbEntry;
+ delete tlb_state;
+ delete prefetch_pkt->req;
+ delete prefetch_pkt;
+ }
+ }
+
+ // First we must convert the response cmd back to a request cmd so that
+ // the request can be sent through the cu's master port
+ PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
+ new_pkt->dataStatic(pkt->getPtr<uint8_t>());
+ delete pkt->senderState;
+ delete pkt;
+
+ // New SenderState for the memory access
+ new_pkt->senderState =
+ new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
+ nullptr);
+
+ // translation is done. Schedule the mem_req_event at the appropriate
+ // cycle to send the timing memory request to ruby
+ ComputeUnit::DataPort::MemReqEvent *mem_req_event =
+ new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
+ new_pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
+
+ computeUnit->schedule(mem_req_event, curTick() +
+ computeUnit->req_tick_latency);
+
+ return true;
+}
+
+const char*
+ComputeUnit::DataPort::MemReqEvent::description() const
+{
+ return "ComputeUnit memory request event";
+}
+
+void
+ComputeUnit::DataPort::MemReqEvent::process()
+{
+ SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
+ GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
+ ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
+
+ if (!(dataPort->sendTimingReq(pkt))) {
+ dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
+
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, dataPort->index,
+ pkt->req->getPaddr());
+ } else {
+ DPRINTF(GPUPort,
+ "CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
+ compute_unit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, dataPort->index,
+ pkt->req->getPaddr());
+ }
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::DTLBPort::recvReqRetry()
+{
+ int len = retries.size();
+
+ DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
+ computeUnit->cu_id, len);
+
+ assert(len > 0);
+ assert(isStalled());
+ // recvReqRetry is an indication that the resource on which this
+ // port was stalling on is freed. So, remove the stall first
+ unstallPort();
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front();
+ Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+ DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr);
+
+ if (!sendTimingReq(pkt)) {
+ // Stall port
+ stallPort();
+ DPRINTF(GPUTLB, ": failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUTLB, ": successful\n");
+ retries.pop_front();
+ }
+ }
+}
+
+bool
+ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
+{
+ Addr line M5_VAR_USED = pkt->req->getPaddr();
+ DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
+ computeUnit->cu_id, pkt->req->getVaddr(), line);
+
+ assert(pkt->senderState);
+
+ // pop off the TLB translation state
+ TheISA::GpuTLB::TranslationState *translation_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ bool success = translation_state->tlbEntry->valid;
+ delete translation_state->tlbEntry;
+ assert(!translation_state->ports.size());
+ pkt->senderState = translation_state->saved;
+ delete translation_state;
+
+ // use the original sender state to know how to close this transaction
+ ITLBPort::SenderState *sender_state =
+ safe_cast<ITLBPort::SenderState*>(pkt->senderState);
+
+ // get the wavefront associated with this translation request
+ Wavefront *wavefront = sender_state->wavefront;
+ delete pkt->senderState;
+
+ if (success) {
+ // pkt is reused in fetch(), don't delete it here. However, we must
+ // reset the command to be a request so that it can be sent through
+ // the cu's master port
+ assert(pkt->cmd == MemCmd::ReadResp);
+ pkt->cmd = MemCmd::ReadReq;
+
+ computeUnit->fetchStage.fetch(pkt, wavefront);
+ } else {
+ if (wavefront->dropFetch) {
+ assert(wavefront->instructionBuffer.empty());
+ wavefront->dropFetch = false;
+ }
+
+ wavefront->pendingFetch = 0;
+ }
+
+ return true;
+}
+
+/*
+ * The initial translation request could have been rejected, if
+ * <retries> queue is not empty. Retry sending the translation
+ * request. sendRetry() is called from the peer port whenever
+ * a translation completes.
+ */
+void
+ComputeUnit::ITLBPort::recvReqRetry()
+{
+ int len = retries.size();
+ DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len);
+
+ assert(len > 0);
+ assert(isStalled());
+
+ // recvReqRetry is an indication that the resource on which this
+ // port was stalling on is freed. So, remove the stall first
+ unstallPort();
+
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = retries.front();
+ Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
+ DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr);
+
+ if (!sendTimingReq(pkt)) {
+ stallPort(); // Stall port
+ DPRINTF(GPUTLB, ": failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUTLB, ": successful\n");
+ retries.pop_front();
+ }
+ }
+}
+
+void
+ComputeUnit::regStats()
+{
+ tlbCycles
+ .name(name() + ".tlb_cycles")
+ .desc("total number of cycles for all uncoalesced requests")
+ ;
+
+ tlbRequests
+ .name(name() + ".tlb_requests")
+ .desc("number of uncoalesced requests")
+ ;
+
+ tlbLatency
+ .name(name() + ".avg_translation_latency")
+ .desc("Avg. translation latency for data translations")
+ ;
+
+ tlbLatency = tlbCycles / tlbRequests;
+
+ hitsPerTLBLevel
+ .init(4)
+ .name(name() + ".TLB_hits_distribution")
+ .desc("TLB hits distribution (0 for page table, x for Lx-TLB")
+ ;
+
+ // fixed number of TLB levels
+ for (int i = 0; i < 4; ++i) {
+ if (!i)
+ hitsPerTLBLevel.subname(i,"page_table");
+ else
+ hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
+ }
+
+ execRateDist
+ .init(0, 10, 2)
+ .name(name() + ".inst_exec_rate")
+ .desc("Instruction Execution Rate: Number of executed vector "
+ "instructions per cycle")
+ ;
+
+ ldsBankConflictDist
+ .init(0, VSZ, 2)
+ .name(name() + ".lds_bank_conflicts")
+ .desc("Number of bank conflicts per LDS memory packet")
+ ;
+
+ ldsBankAccesses
+ .name(name() + ".lds_bank_access_cnt")
+ .desc("Total number of LDS bank accesses")
+ ;
+
+ pageDivergenceDist
+ // A wavefront can touch 1 to VSZ pages per memory instruction.
+ // The number of pages per bin can be configured (here it's 4).
+ .init(1, VSZ, 4)
+ .name(name() + ".page_divergence_dist")
+ .desc("pages touched per wf (over all mem. instr.)")
+ ;
+
+ controlFlowDivergenceDist
+ .init(1, VSZ, 4)
+ .name(name() + ".warp_execution_dist")
+ .desc("number of lanes active per instruction (oval all instructions)")
+ ;
+
+ activeLanesPerGMemInstrDist
+ .init(1, VSZ, 4)
+ .name(name() + ".gmem_lanes_execution_dist")
+ .desc("number of active lanes per global memory instruction")
+ ;
+
+ activeLanesPerLMemInstrDist
+ .init(1, VSZ, 4)
+ .name(name() + ".lmem_lanes_execution_dist")
+ .desc("number of active lanes per local memory instruction")
+ ;
+
+ numInstrExecuted
+ .name(name() + ".num_instr_executed")
+ .desc("number of instructions executed")
+ ;
+
+ numVecOpsExecuted
+ .name(name() + ".num_vec_ops_executed")
+ .desc("number of vec ops executed (e.g. VSZ/inst)")
+ ;
+
+ totalCycles
+ .name(name() + ".num_total_cycles")
+ .desc("number of cycles the CU ran for")
+ ;
+
+ ipc
+ .name(name() + ".ipc")
+ .desc("Instructions per cycle (this CU only)")
+ ;
+
+ vpc
+ .name(name() + ".vpc")
+ .desc("Vector Operations per cycle (this CU only)")
+ ;
+
+ numALUInstsExecuted
+ .name(name() + ".num_alu_insts_executed")
+ .desc("Number of dynamic non-GM memory insts executed")
+ ;
+
+ wgBlockedDueLdsAllocation
+ .name(name() + ".wg_blocked_due_lds_alloc")
+ .desc("Workgroup blocked due to LDS capacity")
+ ;
+
+ ipc = numInstrExecuted / totalCycles;
+ vpc = numVecOpsExecuted / totalCycles;
+
+ numTimesWgBlockedDueVgprAlloc
+ .name(name() + ".times_wg_blocked_due_vgpr_alloc")
+ .desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
+ ;
+
+ dynamicGMemInstrCnt
+ .name(name() + ".global_mem_instr_cnt")
+ .desc("dynamic global memory instructions count")
+ ;
+
+ dynamicLMemInstrCnt
+ .name(name() + ".local_mem_instr_cnt")
+ .desc("dynamic local memory intruction count")
+ ;
+
+ numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
+ dynamicLMemInstrCnt;
+
+ completedWfs
+ .name(name() + ".num_completed_wfs")
+ .desc("number of completed wavefronts")
+ ;
+
+ numCASOps
+ .name(name() + ".num_CAS_ops")
+ .desc("number of compare and swap operations")
+ ;
+
+ numFailedCASOps
+ .name(name() + ".num_failed_CAS_ops")
+ .desc("number of compare and swap operations that failed")
+ ;
+
+ // register stats of pipeline stages
+ fetchStage.regStats();
+ scoreboardCheckStage.regStats();
+ scheduleStage.regStats();
+ execStage.regStats();
+
+ // register stats of memory pipeline
+ globalMemoryPipe.regStats();
+ localMemoryPipe.regStats();
+}
+
+void
+ComputeUnit::updatePageDivergenceDist(Addr addr)
+{
+ Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
+
+ if (!pagesTouched.count(virt_page_addr))
+ pagesTouched[virt_page_addr] = 1;
+ else
+ pagesTouched[virt_page_addr]++;
+}
+
+void
+ComputeUnit::CUExitCallback::process()
+{
+ if (computeUnit->countPages) {
+ std::ostream *page_stat_file =
+ simout.create(computeUnit->name().c_str());
+
+ *page_stat_file << "page, wavefront accesses, workitem accesses" <<
+ std::endl;
+
+ for (auto iter : computeUnit->pageAccesses) {
+ *page_stat_file << std::hex << iter.first << ",";
+ *page_stat_file << std::dec << iter.second.first << ",";
+ *page_stat_file << std::dec << iter.second.second << std::endl;
+ }
+ }
+}
+
+bool
+ComputeUnit::isDone() const
+{
+ for (int i = 0; i < numSIMDs; ++i) {
+ if (!isSimdDone(i)) {
+ return false;
+ }
+ }
+
+ bool glbMemBusRdy = true;
+ for (int j = 0; j < numGlbMemUnits; ++j) {
+ glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
+ }
+ bool locMemBusRdy = true;
+ for (int j = 0; j < numLocMemUnits; ++j) {
+ locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
+ }
+
+ if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
+ !globalMemoryPipe.isGMStRespFIFOWrRdy() ||
+ !globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
+ || !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
+ !glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
+ return false;
+ }
+
+ return true;
+}
+
+int32_t
+ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+{
+ return lds.getRefCounter(dispatchId, wgId);
+}
+
+bool
+ComputeUnit::isSimdDone(uint32_t simdId) const
+{
+ assert(simdId < numSIMDs);
+
+ for (int i=0; i < numGlbMemUnits; ++i) {
+ if (!vrfToGlobalMemPipeBus[i].rdy())
+ return false;
+ }
+ for (int i=0; i < numLocMemUnits; ++i) {
+ if (!vrfToLocalMemPipeBus[i].rdy())
+ return false;
+ }
+ if (!aluPipe[simdId].rdy()) {
+ return false;
+ }
+
+ for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
+ if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * send a general request to the LDS
+ * make sure to look at the return value here as your request might be
+ * NACK'd and returning false means that you have to have some backup plan
+ */
+bool
+ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
+{
+ // this is just a request to carry the GPUDynInstPtr
+ // back and forth
+ Request *newRequest = new Request();
+ newRequest->setPaddr(0x0);
+
+    // ReadReq is not evaluated by the LDS but the Packet ctor requires this
+ PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
+
+ // This is the SenderState needed upon return
+ newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
+
+ return ldsPort->sendTimingReq(newPacket);
+}
+
+/**
+ * get the result of packets sent to the LDS when they return
+ */
+bool
+ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
+{
+ const ComputeUnit::LDSPort::SenderState *senderState =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
+
+ fatal_if(!senderState, "did not get the right sort of sender state");
+
+ GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+ delete packet->senderState;
+ delete packet->req;
+ delete packet;
+
+ computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
+ return true;
+}
+
+/**
+ * Attempt to send this packet: either the port is already stalled, the
+ * request is NACK'd and the port must stall, or the request goes through.
+ * When a request cannot be sent, it is added to the retries queue.
+ */
+bool
+ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
+{
+ ComputeUnit::LDSPort::SenderState *sender_state =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
+ fatal_if(!sender_state, "packet without a valid sender state");
+
+ GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
+
+ if (isStalled()) {
+ fatal_if(retries.empty(), "must have retries waiting to be stalled");
+
+ retries.push(pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId);
+ return false;
+ } else if (!MasterPort::sendTimingReq(pkt)) {
+ // need to stall the LDS port until a recvReqRetry() is received
+ // this indicates that there is more space
+ stallPort();
+ retries.push(pkt);
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, pkt->req->getPaddr());
+ return false;
+ } else {
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
+ computeUnit->cu_id, gpuDynInst->simdId,
+ gpuDynInst->wfSlotId, pkt->req->getPaddr());
+ return true;
+ }
+}
+
+/**
+ * the bus is telling the port that there is now space so retrying stalled
+ * requests should work now
+ * this allows the port to have a request be nack'd and then have the receiver
+ * say when there is space, rather than simply retrying the send every cycle
+ */
+void
+ComputeUnit::LDSPort::recvReqRetry()
+{
+ auto queueSize = retries.size();
+
+ DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
+ computeUnit->cu_id, queueSize);
+
+ fatal_if(queueSize < 1,
+ "why was there a recvReqRetry() with no pending reqs?");
+ fatal_if(!isStalled(),
+ "recvReqRetry() happened when the port was not stalled");
+
+ unstallPort();
+
+ while (!retries.empty()) {
+ PacketPtr packet = retries.front();
+
+ DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
+
+ if (!MasterPort::sendTimingReq(packet)) {
+ // Stall port
+ stallPort();
+ DPRINTF(GPUPort, ": LDS send failed again\n");
+ break;
+ } else {
+ DPRINTF(GPUTLB, ": LDS send successful\n");
+ retries.pop();
+ }
+ }
+}
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
new file mode 100644
index 000000000..f47c27a0a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.hh
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#ifndef __COMPUTE_UNIT_HH__
+#define __COMPUTE_UNIT_HH__
+
+#include <deque>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "base/callback.hh"
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "enums/PrefetchType.hh"
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/fetch_stage.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/local_memory_pipeline.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/schedule_stage.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+
+static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
+static const int MAX_WIDTH_FOR_MEM_INST = 32;
+
+class NDRange;
+class Shader;
+class VectorRegisterFile;
+
+struct ComputeUnitParams;
+
+enum EXEC_POLICY
+{
+ OLDEST = 0,
+ RR
+};
+
+// List of execution units
+enum EXEC_UNIT
+{
+ SIMD0 = 0,
+ SIMD1,
+ SIMD2,
+ SIMD3,
+ GLBMEM_PIPE,
+ LDSMEM_PIPE,
+ NUM_UNITS
+};
+
+enum TLB_CACHE
+{
+ TLB_MISS_CACHE_MISS = 0,
+ TLB_MISS_CACHE_HIT,
+ TLB_HIT_CACHE_MISS,
+ TLB_HIT_CACHE_HIT
+};
+
+class ComputeUnit : public MemObject
+{
+ public:
+ FetchStage fetchStage;
+ ScoreboardCheckStage scoreboardCheckStage;
+ ScheduleStage scheduleStage;
+ ExecStage execStage;
+ GlobalMemPipeline globalMemoryPipe;
+ LocalMemPipeline localMemoryPipe;
+
+ // Buffers used to communicate between various pipeline stages
+
+ // List of waves which are ready to be scheduled.
+ // Each execution resource has a ready list. readyList is
+ // used to communicate between scoreboardCheck stage and
+ // schedule stage
+ // TODO: make enum to index readyList
+ std::vector<std::vector<Wavefront*>> readyList;
+
+ // Stores the status of waves. A READY implies the
+ // wave is ready to be scheduled this cycle and
+ // is already present in the readyList. waveStatusList is
+ // used to communicate between scoreboardCheck stage and
+ // schedule stage
+ // TODO: convert std::pair to a class to increase readability
+ std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
+
+ // List of waves which will be dispatched to
+ // each execution resource. A FILLED implies
+ // dispatch list is non-empty and
+ // execution unit has something to execute
+ // this cycle. Currently, the dispatch list of
+ // an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ // dispatchList is used to communicate between schedule
+ // and exec stage
+ // TODO: convert std::pair to a class to increase readability
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
+
+ int rrNextMemID; // used by RR WF exec policy to cycle through WF's
+ int rrNextALUWp;
+ typedef ComputeUnitParams Params;
+ std::vector<std::vector<Wavefront*>> wfList;
+ int cu_id;
+
+ // array of vector register files, one per SIMD
+ std::vector<VectorRegisterFile*> vrf;
+ // Number of vector ALU units (SIMDs) in CU
+ int numSIMDs;
+ // number of pipe stages for bypassing data to next dependent single
+ // precision vector instruction inside the vector ALU pipeline
+ int spBypassPipeLength;
+ // number of pipe stages for bypassing data to next dependent double
+ // precision vector instruction inside the vector ALU pipeline
+ int dpBypassPipeLength;
+ // number of cycles per issue period
+ int issuePeriod;
+
+ // Number of global and local memory execution resources in CU
+ int numGlbMemUnits;
+ int numLocMemUnits;
+ // tracks the last cycle a vector instruction was executed on a SIMD
+ std::vector<uint64_t> lastExecCycle;
+
+ // true if we allow a separate TLB per lane
+ bool perLaneTLB;
+ // if 0, TLB prefetching is off.
+ int prefetchDepth;
+ // if fixed-stride prefetching, this is the stride.
+ int prefetchStride;
+
+ class LastVaddrWave
+ {
+ public:
+ Addr vaddrs[VSZ];
+ Addr& operator[](int idx) {
+ return vaddrs[idx];
+ }
+
+ LastVaddrWave() {
+ for (int i = 0; i < VSZ; ++i)
+ vaddrs[i] = 0;
+ }
+ };
+
+ LastVaddrWave lastVaddrCU;
+ std::vector<LastVaddrWave> lastVaddrPhase;
+ std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
+ Enums::PrefetchType prefetchType;
+ EXEC_POLICY exec_policy;
+
+ bool xact_cas_mode;
+ bool debugSegFault;
+ bool functionalTLB;
+ bool localMemBarrier;
+
+ /*
+ * for Counting page accesses
+ *
+ * cuExitCallback inherits from Callback. When you register a callback
+ * function as an exit callback, it will get added to an exit callback
+ * queue, such that on simulation exit, all callbacks in the callback
+ * queue will have their process() function called.
+ */
+ bool countPages;
+
+ Shader *shader;
+ uint32_t barrier_id;
+ // vector of Vector ALU (MACC) pipelines
+ std::vector<WaitClass> aluPipe;
+ // minimum issue period per SIMD unit (in cycles)
+ std::vector<WaitClass> wfWait;
+
+ // Resource control for Vector Register File->Global Memory pipe buses
+ std::vector<WaitClass> vrfToGlobalMemPipeBus;
+ // Resource control for Vector Register File->Local Memory pipe buses
+ std::vector<WaitClass> vrfToLocalMemPipeBus;
+ int nextGlbMemBus;
+ int nextLocMemBus;
+ // Resource control for global memory to VRF data/address bus
+ WaitClass glbMemToVrfBus;
+ // Resource control for local memory to VRF data/address bus
+ WaitClass locMemToVrfBus;
+
+ uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+ uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+ uint32_t numCyclesPerStoreTransfer; // number of cycles per vector store
+ uint32_t numCyclesPerLoadTransfer; // number of cycles per vector load
+
+ Tick req_tick_latency;
+ Tick resp_tick_latency;
+
+ // number of vector registers being reserved for each SIMD unit
+ std::vector<int> vectorRegsReserved;
+ // number of vector registers per SIMD unit
+ uint32_t numVecRegsPerSimd;
+ // Support for scheduling VGPR status update events
+ std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
+ std::vector<uint64_t> timestampVec;
+ std::vector<uint8_t> statusVec;
+
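+    // schedule a VGPR status change to take effect at tick 'when'; operands
+    // wider than 32 bits also update the next (wrapped) register index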
+ void
+ registerEvent(uint32_t simdId,
+ uint32_t regIdx,
+ uint32_t operandSize,
+ uint64_t when,
+ uint8_t newStatus) {
+ regIdxVec.push_back(std::make_pair(simdId, regIdx));
+ timestampVec.push_back(when);
+ statusVec.push_back(newStatus);
+ if (operandSize > 4) {
+ regIdxVec.push_back(std::make_pair(simdId,
+ ((regIdx + 1) %
+ numVecRegsPerSimd)));
+ timestampVec.push_back(when);
+ statusVec.push_back(newStatus);
+ }
+ }
+
+ void updateEvents();
+
+ // this hash map will keep track of page divergence
+ // per memory instruction per wavefront. The hash map
+ // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
+ std::map<Addr, int> pagesTouched;
+
+ ComputeUnit(const Params *p);
+ ~ComputeUnit();
+ int spBypassLength() { return spBypassPipeLength; };
+ int dpBypassLength() { return dpBypassPipeLength; };
+ int storeBusLength() { return numCyclesPerStoreTransfer; };
+ int loadBusLength() { return numCyclesPerLoadTransfer; };
+ int wfSize() const { return wavefrontSize; };
+
+ void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+ void exec();
+ void initiateFetch(Wavefront *wavefront);
+ void fetch(PacketPtr pkt, Wavefront *wavefront);
+ void FillKernelState(Wavefront *w, NDRange *ndr);
+
+ void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+ int trueWgSizeTotal);
+
+ void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+ int trueWgSize[], int trueWgSizeTotal,
+ LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+
+ void StartWorkgroup(NDRange *ndr);
+ int ReadyWorkgroup(NDRange *ndr);
+
+ bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
+ bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
+ bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
+ int GlbMemUnitId() { return GLBMEM_PIPE; }
+ int ShrMemUnitId() { return LDSMEM_PIPE; }
+ int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
+ int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
+ /* This function cycles through all the wavefronts in all the phases to see
+ * if all of the wavefronts which should be associated with one barrier
+ * (denoted with _barrier_id), are all at the same barrier in the program
+ * (denoted by bcnt). When the number at the barrier matches bslots, then
+ * return true.
+ */
+ int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+ bool cedeSIMD(int simdId, int wfSlotId);
+
+ template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
+ virtual void init();
+ void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+ void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+ void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
+ bool kernelLaunch=true,
+ RequestPtr req=nullptr);
+ void handleMemPacket(PacketPtr pkt, int memport_index);
+ bool processTimingPacket(PacketPtr pkt);
+ void processFetchReturn(PacketPtr pkt);
+ void updatePageDivergenceDist(Addr addr);
+
+ MasterID masterId() { return _masterId; }
+
+ bool isDone() const;
+ bool isSimdDone(uint32_t) const;
+
+ protected:
+ MasterID _masterId;
+
+ LdsState &lds;
+
+ public:
+    // the following stats compute the avg. TLB access latency per
+ // uncoalesced request (only for data)
+ Stats::Scalar tlbRequests;
+ Stats::Scalar tlbCycles;
+ Stats::Formula tlbLatency;
+ // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
+ Stats::Vector hitsPerTLBLevel;
+
+ Stats::Scalar ldsBankAccesses;
+ Stats::Distribution ldsBankConflictDist;
+
+ // over all memory instructions executed over all wavefronts
+ // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+ Stats::Distribution pageDivergenceDist;
+ Stats::Scalar dynamicGMemInstrCnt;
+ Stats::Scalar dynamicLMemInstrCnt;
+
+ Stats::Scalar wgBlockedDueLdsAllocation;
+ // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
+ // when the instruction is committed, this number is still incremented by 1
+ Stats::Scalar numInstrExecuted;
+ // Number of cycles among successive instruction executions across all
+ // wavefronts of the same CU
+ Stats::Distribution execRateDist;
+ // number of individual vector operations executed
+ Stats::Scalar numVecOpsExecuted;
+ // Total cycles that something is running on the GPU
+ Stats::Scalar totalCycles;
+ Stats::Formula vpc; // vector ops per cycle
+ Stats::Formula ipc; // vector instructions per cycle
+ Stats::Distribution controlFlowDivergenceDist;
+ Stats::Distribution activeLanesPerGMemInstrDist;
+ Stats::Distribution activeLanesPerLMemInstrDist;
+ // number of vector ALU instructions received
+ Stats::Formula numALUInstsExecuted;
+ // number of times a WG can not start due to lack of free VGPRs in SIMDs
+ Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+ Stats::Scalar numCASOps;
+ Stats::Scalar numFailedCASOps;
+ Stats::Scalar completedWfs;
+ // flag per vector SIMD unit that is set when there is at least one
+ // WV that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer: Defined in the Scoreboard stage, consumed
+ // by the Execute stage.
+ std::vector<bool> vectorAluInstAvail;
+ // number of available (oldest) LDS instructions that could have
+ // been issued to the LDS at a specific issue slot
+ int shrMemInstAvail;
+ // number of available Global memory instructions that could have
+ // been issued to TCP at a specific issue slot
+ int glbMemInstAvail;
+
+ void
+ regStats();
+
+ LdsState &
+ getLds() const
+ {
+ return lds;
+ }
+
+ int32_t
+ getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
+
+ bool
+ sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
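+ // Note: warn_unused_result means callers are expected to check the
+ // return value; an illustrative pattern (not taken verbatim from the
+ // model) would be:
+ //   if (!sendToLds(gpuDynInst)) {
+ //       // LDS port could not accept the request; retry later
+ //   }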
+
+ typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
+ pageDataStruct pageAccesses;
+
+ class CUExitCallback : public Callback
+ {
+ private:
+ ComputeUnit *computeUnit;
+
+ public:
+ virtual ~CUExitCallback() { }
+
+ CUExitCallback(ComputeUnit *_cu)
+ {
+ computeUnit = _cu;
+ }
+
+ virtual void
+ process();
+ };
+
+ CUExitCallback *cuExitCallback;
+
+ /** Data access Port **/
+ class DataPort : public MasterPort
+ {
+ public:
+ DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+ : MasterPort(_name, _cu), computeUnit(_cu),
+ index(_index) { }
+
+ bool snoopRangeSent;
+
+ struct SenderState : public Packet::SenderState
+ {
+ GPUDynInstPtr _gpuDynInst;
+ int port_index;
+ Packet::SenderState *saved;
+
+ SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
+ Packet::SenderState *sender_state=nullptr)
+ : _gpuDynInst(gpuDynInst),
+ port_index(_port_index),
+ saved(sender_state) { }
+ };
+
+ class MemReqEvent : public Event
+ {
+ private:
+ DataPort *dataPort;
+ PacketPtr pkt;
+
+ public:
+ MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
+ : Event(), dataPort(_data_port), pkt(_pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void process();
+ const char *description() const;
+ };
+
+ class MemRespEvent : public Event
+ {
+ private:
+ DataPort *dataPort;
+ PacketPtr pkt;
+
+ public:
+ MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
+ : Event(), dataPort(_data_port), pkt(_pkt)
+ {
+ setFlags(Event::AutoDelete);
+ }
+
+ void process();
+ const char *description() const;
+ };
+
+ std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
+
+ protected:
+ ComputeUnit *computeUnit;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+ {
+ resp.clear();
+ snoop = true;
+ }
+
+ };
+
+ // Instruction cache access port
+ class SQCPort : public MasterPort
+ {
+ public:
+ SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+ : MasterPort(_name, _cu), computeUnit(_cu),
+ index(_index) { }
+
+ bool snoopRangeSent;
+
+ struct SenderState : public Packet::SenderState
+ {
+ Wavefront *wavefront;
+ Packet::SenderState *saved;
+
+ SenderState(Wavefront *_wavefront, Packet::SenderState
+ *sender_state=nullptr)
+ : wavefront(_wavefront), saved(sender_state) { }
+ };
+
+ std::deque<std::pair<PacketPtr, Wavefront*>> retries;
+
+ protected:
+ ComputeUnit *computeUnit;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+ {
+ resp.clear();
+ snoop = true;
+ }
+ };
+
+ /** Data TLB port **/
+ class DTLBPort : public MasterPort
+ {
+ public:
+ DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+ : MasterPort(_name, _cu), computeUnit(_cu),
+ index(_index), stalled(false)
+ { }
+
+ bool isStalled() { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ /**
+ * here we queue all the translation requests that were
+ * not successfully sent.
+ */
+ std::deque<PacketPtr> retries;
+
+ /** SenderState is information carried along with the packet
+ * throughout the TLB hierarchy
+ */
+ struct SenderState: public Packet::SenderState
+ {
+ // the memInst that this is associated with
+ GPUDynInstPtr _gpuDynInst;
+
+ // the lane in the memInst this is associated with, so we send
+ // the memory request down the right port
+ int portIndex;
+
+ // constructor used for packets involved in timing accesses
+ SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
+ : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
+
+ };
+
+ protected:
+ ComputeUnit *computeUnit;
+ int index;
+ bool stalled;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ class ITLBPort : public MasterPort
+ {
+ public:
+ ITLBPort(const std::string &_name, ComputeUnit *_cu)
+ : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
+
+
+ bool isStalled() { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ /**
+ * here we queue all the translation requests that were
+ * not successfully sent.
+ */
+ std::deque<PacketPtr> retries;
+
+ /** SenderState is information carried along with the packet
+ * throughout the TLB hierarchy
+ */
+ struct SenderState: public Packet::SenderState
+ {
+ // The wavefront associated with this request
+ Wavefront *wavefront;
+
+ SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
+ };
+
+ protected:
+ ComputeUnit *computeUnit;
+ bool stalled;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ /**
+ * the port intended to communicate between the CU and its LDS
+ */
+ class LDSPort : public MasterPort
+ {
+ public:
+ LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
+ : MasterPort(_name, _cu, _id), computeUnit(_cu)
+ {
+ }
+
+ bool isStalled() const { return stalled; }
+ void stallPort() { stalled = true; }
+ void unstallPort() { stalled = false; }
+
+ /**
+ * here we queue all the requests that were
+ * not successfully sent.
+ */
+ std::queue<PacketPtr> retries;
+
+ /**
+ * SenderState is information carried along with the packet, esp. the
+ * GPUDynInstPtr
+ */
+ class SenderState: public Packet::SenderState
+ {
+ protected:
+ // The actual read/write/atomic request that goes with this command
+ GPUDynInstPtr _gpuDynInst = nullptr;
+
+ public:
+ SenderState(GPUDynInstPtr gpuDynInst):
+ _gpuDynInst(gpuDynInst)
+ {
+ }
+
+ GPUDynInstPtr
+ getMemInst() const
+ {
+ return _gpuDynInst;
+ }
+ };
+
+ virtual bool
+ sendTimingReq(PacketPtr pkt);
+
+ protected:
+
+ bool stalled = false; ///< whether or not it is stalled
+
+ ComputeUnit *computeUnit;
+
+ virtual bool
+ recvTimingResp(PacketPtr pkt);
+
+ virtual Tick
+ recvAtomic(PacketPtr pkt) { return 0; }
+
+ virtual void
+ recvFunctional(PacketPtr pkt)
+ {
+ }
+
+ virtual void
+ recvRangeChange()
+ {
+ }
+
+ virtual void
+ recvReqRetry();
+ };
+
+ /** The port to access the Local Data Store
+ * Can be connected to a LDS object
+ */
+ LDSPort *ldsPort = nullptr;
+
+ LDSPort *
+ getLdsPort() const
+ {
+ return ldsPort;
+ }
+
+ /** The memory port for SIMD data accesses.
+ * Can be connected to PhysMem for Ruby for timing simulations
+ */
+ std::vector<DataPort*> memPort;
+ // port to the TLB hierarchy (i.e., the L1 TLB)
+ std::vector<DTLBPort*> tlbPort;
+ // port to the SQC (i.e. the I-cache)
+ SQCPort *sqcPort;
+ // port to the SQC TLB (there's a separate TLB for each I-cache)
+ ITLBPort *sqcTLBPort;
+
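+ // Descriptive note: ports are bound by name from the Python
+ // configuration (see GPU.py); e.g. requesting "memory_port" with idx 2
+ // allocates memPort[2], while "ldsPort" may be allocated only once.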
+ virtual BaseMasterPort&
+ getMasterPort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "memory_port") {
+ memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
+ this, idx);
+ return *memPort[idx];
+ } else if (if_name == "translation_port") {
+ tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
+ this, idx);
+ return *tlbPort[idx];
+ } else if (if_name == "sqc_port") {
+ sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
+ this, idx);
+ return *sqcPort;
+ } else if (if_name == "sqc_tlb_port") {
+ sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
+ return *sqcTLBPort;
+ } else if (if_name == "ldsPort") {
+ if (ldsPort) {
+ fatal("an LDS port was already allocated");
+ }
+ ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
+ return *ldsPort;
+ } else {
+ panic("incorrect port name");
+ }
+ }
+
+ // xact_cas_load()
+ class waveIdentifier
+ {
+ public:
+ waveIdentifier() { }
+ waveIdentifier(int _simdId, int _wfSlotId)
+ : simdId(_simdId), wfSlotId(_wfSlotId) { }
+
+ int simdId;
+ int wfSlotId;
+ };
+
+ class waveQueue
+ {
+ public:
+ std::list<waveIdentifier> waveIDQueue;
+ };
+ std::map<unsigned, waveQueue> xactCasLoadMap;
+
+ uint64_t getAndIncSeqNum() { return globalSeqNum++; }
+
+ private:
+ uint64_t globalSeqNum;
+ int wavefrontSize;
+};
+
+#endif // __COMPUTE_UNIT_HH__
diff --git a/src/gpu-compute/condition_register_state.cc b/src/gpu-compute/condition_register_state.cc
new file mode 100644
index 000000000..f3f2d2927
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/condition_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+ConditionRegisterState::ConditionRegisterState()
+{
+ computeUnit = nullptr;
+ c_reg.clear();
+ busy.clear();
+}
+
+void
+ConditionRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+ _name = computeUnit->name() + ".CondRegState";
+}
+
+void
+ConditionRegisterState::init(uint32_t _size)
+{
+ c_reg.resize(_size);
+ busy.resize(_size, 0);
+}
+
+void
+ConditionRegisterState::exec(GPUStaticInst *ii, Wavefront *w)
+{
+ // iterate over all operands
+ for (auto i = 0; i < ii->getNumOperands(); ++i) {
+ // is this a condition register destination operand?
+ if (ii->isCondRegister(i) && ii->isDstOperand(i)) {
+ // mark the register as busy
+ markReg(ii->getRegisterIndex(i), 1);
+ uint32_t pipeLen = w->computeUnit->spBypassLength();
+
+ // schedule an event for marking the register as ready
+ w->computeUnit->
+ registerEvent(w->simdId, ii->getRegisterIndex(i),
+ ii->getOperandSize(i),
+ w->computeUnit->shader->tick_cnt +
+ w->computeUnit->shader->ticks(pipeLen), 0);
+ }
+ }
+}
diff --git a/src/gpu-compute/condition_register_state.hh b/src/gpu-compute/condition_register_state.hh
new file mode 100644
index 000000000..139874a66
--- /dev/null
+++ b/src/gpu-compute/condition_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __CONDITION_REGISTER_STATE_HH__
+#define __CONDITION_REGISTER_STATE_HH__
+
+#include <string>
+#include <vector>
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+class GPUStaticInst;
+class Shader;
+class Wavefront;
+
+// Condition Register State (used only when executing HSAIL)
+class ConditionRegisterState
+{
+ public:
+ ConditionRegisterState();
+ void init(uint32_t _size);
+ const std::string name() const { return _name; }
+ void setParent(ComputeUnit *_computeUnit);
+ void regStats() { }
+
+ template<typename T>
+ T
+ read(int regIdx, int threadId)
+ {
+ bool tmp = c_reg[regIdx][threadId];
+ T *p0 = (T*)(&tmp);
+
+ return *p0;
+ }
+
+ template<typename T>
+ void
+ write(int regIdx, int threadId, T value)
+ {
+ c_reg[regIdx][threadId] = (bool)(value & 0x01);
+ }
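+ // Illustrative sketch (assumed usage; 'cr', 'lane', and 'cmpResult' are
+ // hypothetical names): a compare instruction could write one lane of a
+ // condition register, and a dependent instruction could read it back:
+ //   write<uint8_t>(cr, lane, cmpResult ? 1 : 0);
+ //   bool taken = read<bool>(cr, lane);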
+
+ void
+ markReg(int regIdx, uint8_t value)
+ {
+ busy.at(regIdx) = value;
+ }
+
+ uint8_t
+ regBusy(int idx)
+ {
+ uint8_t status = busy.at(idx);
+ return status;
+ }
+
+ int numRegs() { return c_reg.size(); }
+ void exec(GPUStaticInst *ii, Wavefront *w);
+
+ private:
+ ComputeUnit* computeUnit;
+ std::string _name;
+ // Condition Register state
+ std::vector<VectorMask> c_reg;
+ // flag indicating if a register is busy
+ std::vector<uint8_t> busy;
+};
+
+#endif
diff --git a/src/gpu-compute/dispatcher.cc b/src/gpu-compute/dispatcher.cc
new file mode 100644
index 000000000..55e4be72a
--- /dev/null
+++ b/src/gpu-compute/dispatcher.cc
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+
+#include "gpu-compute/dispatcher.hh"
+
+#include "cpu/base.hh"
+#include "debug/GPUDisp.hh"
+#include "gpu-compute/cl_driver.hh"
+#include "gpu-compute/cl_event.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet_access.hh"
+
+GpuDispatcher *GpuDispatcher::instance = nullptr;
+
+GpuDispatcher::GpuDispatcher(const Params *p)
+ : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
+ pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
+ dispatchCount(0), dispatchActive(false), cpu(p->cpu),
+ shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
+{
+ shader->handshake(this);
+ driver->handshake(this);
+
+ ndRange.wg_disp_rem = false;
+ ndRange.globalWgId = 0;
+
+ schedule(&tickEvent, 0);
+
+ // translation port for the dispatcher
+ tlbPort = new TLBPort(csprintf("%s-port", name()), this);
+
+ num_kernelLaunched
+ .name(name() + ".num_kernel_launched")
+ .desc("number of kernel launched")
+ ;
+}
+
+GpuDispatcher *GpuDispatcherParams::create()
+{
+ GpuDispatcher *dispatcher = new GpuDispatcher(this);
+ GpuDispatcher::setInstance(dispatcher);
+
+ return GpuDispatcher::getInstance();
+}
+
+void
+GpuDispatcher::serialize(CheckpointOut &cp) const
+{
+ Tick event_tick = 0;
+
+ if (ndRange.wg_disp_rem)
+ fatal("Checkpointing not supported during active workgroup execution");
+
+ if (tickEvent.scheduled())
+ event_tick = tickEvent.when();
+
+ SERIALIZE_SCALAR(event_tick);
+
+}
+
+void
+GpuDispatcher::unserialize(CheckpointIn &cp)
+{
+ Tick event_tick;
+
+ if (tickEvent.scheduled())
+ deschedule(&tickEvent);
+
+ UNSERIALIZE_SCALAR(event_tick);
+
+ if (event_tick)
+ schedule(&tickEvent, event_tick);
+}
+
+AddrRangeList
+GpuDispatcher::getAddrRanges() const
+{
+ AddrRangeList ranges;
+
+ DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
+ pioAddr, pioSize);
+
+ ranges.push_back(RangeSize(pioAddr, pioSize));
+
+ return ranges;
+}
+
+Tick
+GpuDispatcher::read(PacketPtr pkt)
+{
+ assert(pkt->getAddr() >= pioAddr);
+ assert(pkt->getAddr() < pioAddr + pioSize);
+
+ int offset = pkt->getAddr() - pioAddr;
+ pkt->allocate();
+
+ DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
+
+ if (offset < 8) {
+ assert(!offset);
+ assert(pkt->getSize() == 8);
+
+ uint64_t retval = dispatchActive;
+ pkt->set(retval);
+ } else {
+ offset -= 8;
+ assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
+ char *curTaskPtr = (char*)&curTask;
+
+ memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
+ }
+
+ pkt->makeAtomicResponse();
+
+ return pioDelay;
+}
+
+Tick
+GpuDispatcher::write(PacketPtr pkt)
+{
+ assert(pkt->getAddr() >= pioAddr);
+ assert(pkt->getAddr() < pioAddr + pioSize);
+
+ int offset = pkt->getAddr() - pioAddr;
+
+#if TRACING_ON
+ uint64_t data_val = 0;
+
+ switch (pkt->getSize()) {
+ case 1:
+ data_val = pkt->get<uint8_t>();
+ break;
+ case 2:
+ data_val = pkt->get<uint16_t>();
+ break;
+ case 4:
+ data_val = pkt->get<uint32_t>();
+ break;
+ case 8:
+ data_val = pkt->get<uint64_t>();
+ break;
+ default:
+ DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
+ }
+
+ DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
+ pkt->getSize());
+#endif
+ if (!offset) {
+ static int nextId = 0;
+
+ // The depends field of the qstruct, which was previously unused, is
+ // used to communicate with the simulated application.
+ if (curTask.depends) {
+ HostState hs;
+ shader->ReadMem((uint64_t)(curTask.depends), &hs,
+ sizeof(HostState), 0);
+
+ // update event start time (in nano-seconds)
+ uint64_t start = curTick() / 1000;
+
+ shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
+ &start, sizeof(uint64_t), 0);
+ }
+
+ // launch kernel
+ ++num_kernelLaunched;
+
+ NDRange *ndr = &(ndRangeMap[nextId]);
+ // copy dispatch info
+ ndr->q = curTask;
+
+ // update the numDispTask polled by the runtime
+ accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
+
+ ndr->numWgTotal = 1;
+
+ for (int i = 0; i < 3; ++i) {
+ ndr->wgId[i] = 0;
+ ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
+ ndr->numWgTotal *= ndr->numWg[i];
+ }
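+ // Worked example (illustrative values): for a 1-D grid with
+ // gdSize = {1024, 1, 1} and wgSize = {256, 1, 1}, divCeil yields
+ // numWg = {4, 1, 1}, so numWgTotal = 4 workgroups are dispatched.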
+
+ ndr->numWgCompleted = 0;
+ ndr->globalWgId = 0;
+ ndr->wg_disp_rem = true;
+ ndr->execDone = false;
+ ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
+ ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
+ ndr->dispatchId = nextId;
+ ndr->curTid = pkt->req->threadId();
+ DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
+ execIds.push(nextId);
+ ++nextId;
+
+ dispatchActive = true;
+
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->ticks(1));
+ }
+ } else {
+ // populate current task struct
+ // first 64 bits are launch reg
+ offset -= 8;
+ assert(offset < sizeof(HsaQueueEntry));
+ char *curTaskPtr = (char*)&curTask;
+ memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
+ }
+
+ pkt->makeAtomicResponse();
+
+ return pioDelay;
+}
+
+
+BaseMasterPort&
+GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
+{
+ if (if_name == "translation_port") {
+ return *tlbPort;
+ }
+
+ return DmaDevice::getMasterPort(if_name, idx);
+}
+
+void
+GpuDispatcher::exec()
+{
+ int fail_count = 0;
+
+ // There are potentially multiple outstanding kernel launches.
+ // It is possible that the workgroups in a different kernel
+ // can fit on the GPU even if another kernel's workgroups cannot
+ DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
+
+ while (execIds.size() > fail_count) {
+ int execId = execIds.front();
+
+ while (ndRangeMap[execId].wg_disp_rem) {
+ //update the thread context
+ shader->updateThreadContext(ndRangeMap[execId].curTid);
+
+ // attempt to dispatch_workgroup
+ if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
+ // if we failed, try the next kernel;
+ // it may have smaller workgroups.
+ // put this one back on the queue to retry later
+ DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
+ execIds.push(execId);
+ ++fail_count;
+ break;
+ }
+ }
+ // let's try the next kernel_id
+ execIds.pop();
+ }
+
+ DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
+
+ if (doneIds.size() && cpu) {
+ shader->hostWakeUp(cpu);
+ }
+
+ while (doneIds.size()) {
+ // wakeup the CPU if any Kernels completed this cycle
+ DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
+ doneIds.pop();
+ }
+}
+
+void
+GpuDispatcher::notifyWgCompl(Wavefront *w)
+{
+ int kern_id = w->kern_id;
+ DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
+ assert(ndRangeMap[kern_id].dispatchId == kern_id);
+ ndRangeMap[kern_id].numWgCompleted++;
+
+ if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
+ ndRangeMap[kern_id].execDone = true;
+ doneIds.push(kern_id);
+
+ if (ndRangeMap[kern_id].addrToNotify) {
+ accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
+ 0);
+ }
+
+ accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
+
+ // update event end time (in nano-seconds)
+ if (ndRangeMap[kern_id].q.depends) {
+ HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
+ uint64_t event;
+ shader->ReadMem((uint64_t)(&host_state->event), &event,
+ sizeof(uint64_t), 0);
+
+ uint64_t end = curTick() / 1000;
+
+ shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
+ sizeof(uint64_t), 0);
+ }
+ }
+
+ if (!tickEvent.scheduled()) {
+ schedule(&tickEvent, curTick() + shader->ticks(1));
+ }
+}
+
+void
+GpuDispatcher::scheduleDispatch()
+{
+ if (!tickEvent.scheduled())
+ schedule(&tickEvent, curTick() + shader->ticks(1));
+}
+
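+// Descriptive note: with off == 0, accessUserVar() performs a plain write
+// of val to addr in the host application's memory; with a non-zero off it
+// becomes a read-modify-write (read, add off, write back), which is how
+// notifyWgCompl() decrements the runtime's numDispLeft counter.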
+void
+GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
+{
+ if (cpu) {
+ if (off) {
+ shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
+ true);
+ val += off;
+ }
+
+ shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
+ } else {
+ panic("Cannot find host");
+ }
+}
+
+GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
+ : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
+{
+}
+
+void
+GpuDispatcher::TickEvent::process()
+{
+ dispatcher->exec();
+}
+
+const char*
+GpuDispatcher::TickEvent::description() const
+{
+ return "GPU Dispatcher tick";
+}
+
+// helper functions for driver to retrieve GPU attributes
+int
+GpuDispatcher::getNumCUs()
+{
+ return shader->cuList.size();
+}
+
+void
+GpuDispatcher::setFuncargsSize(int funcargs_size)
+{
+ shader->funcargs_size = funcargs_size;
+}
diff --git a/src/gpu-compute/dispatcher.hh b/src/gpu-compute/dispatcher.hh
new file mode 100644
index 000000000..76f932655
--- /dev/null
+++ b/src/gpu-compute/dispatcher.hh
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+#ifndef __GPU_DISPATCHER_HH__
+#define __GPU_DISPATCHER_HH__
+
+#include <queue>
+#include <vector>
+
+#include "base/statistics.hh"
+#include "dev/dma_device.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/ndrange.hh"
+#include "gpu-compute/qstruct.hh"
+#include "mem/port.hh"
+#include "params/GpuDispatcher.hh"
+
+class BaseCPU;
+class Shader;
+
+class GpuDispatcher : public DmaDevice
+{
+ public:
+ typedef GpuDispatcherParams Params;
+
+ class TickEvent : public Event
+ {
+ private:
+ GpuDispatcher *dispatcher;
+
+ public:
+ TickEvent(GpuDispatcher *);
+ void process();
+ const char *description() const;
+ };
+
+ MasterID masterId() { return _masterId; }
+
+ protected:
+ MasterID _masterId;
+
+ // Base and length of PIO register space
+ Addr pioAddr;
+ Addr pioSize;
+ Tick pioDelay;
+
+ HsaQueueEntry curTask;
+
+ std::unordered_map<int, NDRange> ndRangeMap;
+ NDRange ndRange;
+
+ // list of kernel_ids to launch
+ std::queue<int> execIds;
+ // list of kernel_ids that have finished
+ std::queue<int> doneIds;
+
+ uint64_t dispatchCount;
+ // is there a kernel in execution?
+ bool dispatchActive;
+
+ BaseCPU *cpu;
+ Shader *shader;
+ ClDriver *driver;
+ TickEvent tickEvent;
+
+ static GpuDispatcher *instance;
+
+ // syscall emulation mode can have only one application running(?),
+ // otherwise we would have to do some pid-based tagging
+ // currently unused
+ typedef std::unordered_map<uint64_t, uint64_t> TranslationBuffer;
+ TranslationBuffer tlb;
+
+ public:
+ /*statistics*/
+ Stats::Scalar num_kernelLaunched;
+ GpuDispatcher(const Params *p);
+
+ ~GpuDispatcher() { }
+
+ void exec();
+ virtual void serialize(CheckpointOut &cp) const;
+ virtual void unserialize(CheckpointIn &cp);
+ void notifyWgCompl(Wavefront *w);
+ void scheduleDispatch();
+ void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off);
+
+ // a singleton is used so that glue code can pass pointer locations
+ // to the dispatcher. When there are multiple dispatchers, this could
+ // become something like getInstance(index)
+ static void
+ setInstance(GpuDispatcher *_instance)
+ {
+ instance = _instance;
+ }
+
+ static GpuDispatcher* getInstance() { return instance; }
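+ // Illustrative sketch (hypothetical call site): glue code that only has
+ // access to static state can reach the dispatcher through the singleton:
+ //   GpuDispatcher *disp = GpuDispatcher::getInstance();
+ //   disp->scheduleDispatch();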
+
+ class TLBPort : public MasterPort
+ {
+ public:
+
+ TLBPort(const std::string &_name, GpuDispatcher *_dispatcher)
+ : MasterPort(_name, _dispatcher), dispatcher(_dispatcher) { }
+
+ protected:
+ GpuDispatcher *dispatcher;
+
+ virtual bool recvTimingResp(PacketPtr pkt) { return true; }
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry() { }
+
+ };
+
+ TLBPort *tlbPort;
+
+ virtual BaseMasterPort& getMasterPort(const std::string &if_name,
+ PortID idx);
+
+ AddrRangeList getAddrRanges() const;
+ Tick read(PacketPtr pkt);
+ Tick write(PacketPtr pkt);
+
+ // helper functions to retrieve/set GPU attributes
+ int getNumCUs();
+ void setFuncargsSize(int funcargs_size);
+};
+
+#endif // __GPU_DISPATCHER_HH__
diff --git a/src/gpu-compute/exec_stage.cc b/src/gpu-compute/exec_stage.cc
new file mode 100644
index 000000000..c2b95f85e
--- /dev/null
+++ b/src/gpu-compute/exec_stage.cc
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#include "gpu-compute/exec_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/wavefront.hh"
+
+ExecStage::ExecStage(const ComputeUnitParams *p) : numSIMDs(p->num_SIMDs),
+ numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+ vectorAluInstAvail(nullptr), glbMemInstAvail(nullptr),
+ shrMemInstAvail(nullptr), lastTimeInstExecuted(false),
+ thisTimeInstExecuted(false), instrExecuted(false),
+ executionResourcesUsed(0)
+{
+ numTransActiveIdle = 0;
+ idle_dur = 0;
+}
+
+void
+ExecStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ExecStage";
+ dispatchList = &computeUnit->dispatchList;
+ vectorAluInstAvail = &(computeUnit->vectorAluInstAvail);
+ glbMemInstAvail= &(computeUnit->glbMemInstAvail);
+ shrMemInstAvail= &(computeUnit->shrMemInstAvail);
+ idle_dur = 0;
+}
+
+void
+ExecStage::collectStatistics(enum STAT_STATUS stage, int unitId)
+{
+ if (stage == IdleExec) {
+ // count cycles in which no vector ALU instruction was executed even
+ // though one was the oldest in a WV of that vector SIMD unit
+ if (computeUnit->isVecAlu(unitId) && vectorAluInstAvail->at(unitId)) {
+ numCyclesWithNoInstrTypeIssued[unitId]++;
+ }
+
+ // count cycles in which no global memory (vector) instruction was
+ // executed even though at least one was available for issue
+ if (computeUnit->isGlbMem(unitId) && *glbMemInstAvail > 0) {
+ numCyclesWithNoInstrTypeIssued[unitId]++;
+ (*glbMemInstAvail)--;
+ }
+
+ // count cycles in which no shared memory (vector) instruction was
+ // executed even though at least one was available for issue
+ if (computeUnit->isShrMem(unitId) && *shrMemInstAvail > 0) {
+ numCyclesWithNoInstrTypeIssued[unitId]++;
+ (*shrMemInstAvail)--;
+ }
+ } else if (stage == BusyExec) {
+ // count the number of cycles in which an instruction was issued
+ // to this specific unit
+ numCyclesWithInstrTypeIssued[unitId]++;
+ thisTimeInstExecuted = true;
+ instrExecuted = true;
+ ++executionResourcesUsed;
+ } else if (stage == PostExec) {
+ // count the number of transitions from active to idle
+ if (lastTimeInstExecuted && !thisTimeInstExecuted) {
+ ++numTransActiveIdle;
+ }
+
+ if (!lastTimeInstExecuted && thisTimeInstExecuted) {
+ idleDur.sample(idle_dur);
+ idle_dur = 0;
+ } else if (!thisTimeInstExecuted) {
+ idle_dur++;
+ }
+
+ lastTimeInstExecuted = thisTimeInstExecuted;
+ // track the number of cycles we either issued one vector instruction
+ // or issued no instructions at all
+ if (instrExecuted) {
+ numCyclesWithInstrIssued++;
+ } else {
+ numCyclesWithNoIssue++;
+ }
+
+ spc.sample(executionResourcesUsed);
+ }
+}
+
+void
+ExecStage::initStatistics()
+{
+ instrExecuted = false;
+ executionResourcesUsed = 0;
+ thisTimeInstExecuted = false;
+}
+
+void
+ExecStage::exec()
+{
+ initStatistics();
+
+ for (int unitId = 0; unitId < (numSIMDs + numMemUnits); ++unitId) {
+ // if dispatch list for this execution resource is empty,
+ // skip this execution resource this cycle
+ if (dispatchList->at(unitId).second == EMPTY) {
+ collectStatistics(IdleExec, unitId);
+ continue;
+ }
+
+ collectStatistics(BusyExec, unitId);
+ // execute an instruction for the WF
+ dispatchList->at(unitId).first->exec();
+ // clear the dispatch list entry
+ dispatchList->at(unitId).second = EMPTY;
+ dispatchList->at(unitId).first = (Wavefront*)nullptr;
+ }
+
+ collectStatistics(PostExec, 0);
+}
+
+void
+ExecStage::regStats()
+{
+ numTransActiveIdle
+ .name(name() + ".num_transitions_active_to_idle")
+ .desc("number of CU transitions from active to idle")
+ ;
+
+ numCyclesWithNoIssue
+ .name(name() + ".num_cycles_with_no_issue")
+ .desc("number of cycles the CU issues nothing")
+ ;
+
+ numCyclesWithInstrIssued
+ .name(name() + ".num_cycles_with_instr_issued")
+ .desc("number of cycles the CU issued at least one instruction")
+ ;
+
+ spc
+ .init(0, numSIMDs + numMemUnits, 1)
+ .name(name() + ".spc")
+ .desc("Execution units active per cycle (Exec unit=SIMD,MemPipe)")
+ ;
+
+ idleDur
+ .init(0,75,5)
+ .name(name() + ".idle_duration_in_cycles")
+ .desc("duration of idle periods in cycles")
+ ;
+
+ numCyclesWithInstrTypeIssued
+ .init(numSIMDs + numMemUnits)
+ .name(name() + ".num_cycles_with_instrtype_issue")
+ .desc("Number of cycles at least one instruction of specific type "
+ "issued")
+ ;
+
+ numCyclesWithNoInstrTypeIssued
+ .init(numSIMDs + numMemUnits)
+ .name(name() + ".num_cycles_with_instr_type_no_issue")
+ .desc("Number of cycles no instruction of specific type issued")
+ ;
+
+ for (int i = 0; i < numSIMDs; ++i) {
+ numCyclesWithInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+ numCyclesWithNoInstrTypeIssued.subname(i, csprintf("ALU%d",i));
+ }
+
+ numCyclesWithInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
+ numCyclesWithNoInstrTypeIssued.subname(numSIMDs, csprintf("GM"));
+ numCyclesWithInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+ numCyclesWithNoInstrTypeIssued.subname(numSIMDs + 1, csprintf("LM"));
+}
diff --git a/src/gpu-compute/exec_stage.hh b/src/gpu-compute/exec_stage.hh
new file mode 100644
index 000000000..2de74366b
--- /dev/null
+++ b/src/gpu-compute/exec_stage.hh
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#ifndef __EXEC_STAGE_HH__
+#define __EXEC_STAGE_HH__
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "sim/stats.hh"
+
+class ComputeUnit;
+class Wavefront;
+struct ComputeUnitParams;
+
+enum STAT_STATUS
+{
+ IdleExec,
+ BusyExec,
+ PostExec
+};
+
+enum DISPATCH_STATUS
+{
+ EMPTY = 0,
+ FILLED
+};
+
+// Execution stage.
+// Each execution resource executes the wave that is in its dispatch list.
+// The schedule stage is responsible for adding a wave to each execution
+// resource's dispatch list.
+
+class ExecStage
+{
+ public:
+ ExecStage(const ComputeUnitParams* params);
+ ~ExecStage() { }
+ void init(ComputeUnit *cu);
+ void exec();
+
+ std::string name() { return _name; }
+ void regStats();
+ // number of idle cycles
+ Stats::Scalar numCyclesWithNoIssue;
+ // number of busy cycles
+ Stats::Scalar numCyclesWithInstrIssued;
+ // number of cycles (per execution unit) during which at least one
+ // instruction was issued to that unit
+ Stats::Vector numCyclesWithInstrTypeIssued;
+ // number of idle cycles (per execution unit) during which no instruction
+ // was issued to that unit, even though at least one wavefront had such
+ // an instruction as the oldest in its Instruction Buffer
+ Stats::Vector numCyclesWithNoInstrTypeIssued;
+ // number of execution units (SIMDs and memory pipes) active per cycle
+ Stats::Distribution spc;
+
+ private:
+ void collectStatistics(enum STAT_STATUS stage, int unitId);
+ void initStatistics();
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+
+ // Number of memory execution resources;
+ // both global and local memory execution resources in CU
+ uint32_t numMemUnits;
+
+ // List of waves which will be dispatched to each execution resource.
+ // A FILLED entry implies the dispatch list is non-empty and the
+ // execution unit has something to execute this cycle. Currently, the
+ // dispatch list of an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ // dispatchList is used to communicate between the schedule and exec
+ // stages.
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+ // flag per vector SIMD unit that is set when there is at least one
+ // WV that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer
+ std::vector<bool> *vectorAluInstAvail;
+ int *glbMemInstAvail;
+ int *shrMemInstAvail;
+ bool lastTimeInstExecuted;
+ bool thisTimeInstExecuted;
+ bool instrExecuted;
+ Stats::Scalar numTransActiveIdle;
+ Stats::Distribution idleDur;
+ uint32_t executionResourcesUsed;
+ uint64_t idle_dur;
+ std::string _name;
+};
+
+#endif // __EXEC_STAGE_HH__
diff --git a/src/gpu-compute/fetch_stage.cc b/src/gpu-compute/fetch_stage.cc
new file mode 100644
index 000000000..1f5e6ded3
--- /dev/null
+++ b/src/gpu-compute/fetch_stage.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/wavefront.hh"
+
+FetchStage::FetchStage(const ComputeUnitParams* p) : numSIMDs(p->num_SIMDs),
+ computeUnit(nullptr)
+{
+ for (int j = 0; j < numSIMDs; ++j) {
+ FetchUnit newFetchUnit(p);
+ fetchUnit.push_back(newFetchUnit);
+ }
+}
+
+FetchStage::~FetchStage()
+{
+ fetchUnit.clear();
+}
+
+void
+FetchStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".FetchStage";
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ fetchUnit[j].bindWaveList(&computeUnit->wfList[j]);
+ fetchUnit[j].init(computeUnit);
+ }
+}
+
+void
+FetchStage::exec()
+{
+ for (int j = 0; j < numSIMDs; ++j) {
+ fetchUnit[j].exec();
+ }
+}
+
+void
+FetchStage::processFetchReturn(PacketPtr pkt)
+{
+ ComputeUnit::SQCPort::SenderState *sender_state =
+ safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
+
+ Wavefront *wavefront = sender_state->wavefront;
+
+ const unsigned num_instructions = pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst);
+
+ instFetchInstReturned.sample(num_instructions);
+ uint32_t simdId = wavefront->simdId;
+ fetchUnit[simdId].processFetchReturn(pkt);
+}
+
+void
+FetchStage::fetch(PacketPtr pkt, Wavefront *wavefront)
+{
+ fetchUnit[wavefront->simdId].fetch(pkt, wavefront);
+}
+
+void
+FetchStage::regStats()
+{
+ instFetchInstReturned
+ .init(1, 32, 1)
+ .name(name() + ".inst_fetch_instr_returned")
+ .desc("For each instruction fetch request recieved record how many "
+ "instructions you got from it")
+ ;
+}
diff --git a/src/gpu-compute/fetch_stage.hh b/src/gpu-compute/fetch_stage.hh
new file mode 100644
index 000000000..ce7faa8ac
--- /dev/null
+++ b/src/gpu-compute/fetch_stage.hh
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez, Sooraj Puthoor
+ */
+
+#ifndef __FETCH_STAGE_HH__
+#define __FETCH_STAGE_HH__
+
+#include <string>
+#include <vector>
+
+#include "gpu-compute/fetch_unit.hh"
+
+// Instruction fetch stage.
+// All dispatched wavefronts for all SIMDs are analyzed for the
+// need to fetch instructions. From the fetch-eligible waves,
+// one wave is selected from each SIMD and fetch is initiated
+// for the selected waves.
+
+class ComputeUnit;
+class Wavefront;
+
+class FetchStage
+{
+ public:
+ FetchStage(const ComputeUnitParams* params);
+ ~FetchStage();
+ void init(ComputeUnit *cu);
+ void exec();
+ void processFetchReturn(PacketPtr pkt);
+ void fetch(PacketPtr pkt, Wavefront *wave);
+
+ // Stats related variables and methods
+ std::string name() { return _name; }
+ void regStats();
+ Stats::Distribution instFetchInstReturned;
+
+ private:
+ uint32_t numSIMDs;
+ ComputeUnit *computeUnit;
+
+ // List of fetch units. A fetch unit is
+ // instantiated per SIMD
+ std::vector<FetchUnit> fetchUnit;
+ std::string _name;
+};
+
+#endif // __FETCH_STAGE_HH__
diff --git a/src/gpu-compute/fetch_unit.cc b/src/gpu-compute/fetch_unit.cc
new file mode 100644
index 000000000..1f0a7d78e
--- /dev/null
+++ b/src/gpu-compute/fetch_unit.cc
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#include "gpu-compute/fetch_unit.hh"
+
+#include "debug/GPUFetch.hh"
+#include "debug/GPUPort.hh"
+#include "debug/GPUTLB.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/ruby/system/RubySystem.hh"
+
+uint32_t FetchUnit::globalFetchUnitID;
+
+FetchUnit::FetchUnit(const ComputeUnitParams* params) :
+ timingSim(true),
+ computeUnit(nullptr),
+ fetchScheduler(params),
+ waveList(nullptr)
+{
+}
+
+FetchUnit::~FetchUnit()
+{
+ fetchQueue.clear();
+ fetchStatusQueue.clear();
+}
+
+void
+FetchUnit::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ timingSim = computeUnit->shader->timingSim;
+ fetchQueue.clear();
+ fetchStatusQueue.resize(computeUnit->shader->n_wf);
+
+ for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+ fetchStatusQueue[j] = std::make_pair(waveList->at(j), false);
+ }
+
+ fetchScheduler.bindList(&fetchQueue);
+}
+
+void
+FetchUnit::exec()
+{
+ // re-evaluate waves which are marked as not ready for fetch
+ for (int j = 0; j < computeUnit->shader->n_wf; ++j) {
+ // The following code assumes 64-bit operation and that all insts are
+ // represented by 64-bit pointers to inst objects.
+ Wavefront *curWave = fetchStatusQueue[j].first;
+ assert (curWave);
+
+ // The wavefront has to be active, the instruction buffer occupancy
+ // has to be 4 instructions or fewer, and it cannot have any branches,
+ // to prevent speculative instruction fetches
+ if (!fetchStatusQueue[j].second) {
+ if (curWave->status == Wavefront::S_RUNNING &&
+ curWave->instructionBuffer.size() <= 4 &&
+ !curWave->instructionBufferHasBranch() &&
+ !curWave->pendingFetch) {
+ fetchQueue.push_back(curWave);
+ fetchStatusQueue[j].second = true;
+ }
+ }
+ }
+
+ // Fetch only if there is some wave ready to be fetched
+ // An empty fetchQueue will cause the scheduler to panic
+ if (fetchQueue.size()) {
+ Wavefront *waveToBeFetched = fetchScheduler.chooseWave();
+ waveToBeFetched->pendingFetch = true;
+ fetchStatusQueue[waveToBeFetched->wfSlotId].second = false;
+ initiateFetch(waveToBeFetched);
+ }
+}
+
+void
+FetchUnit::initiateFetch(Wavefront *wavefront)
+{
+ // calculate the virtual address to fetch from the SQC
+ Addr vaddr = wavefront->pc() + wavefront->instructionBuffer.size();
+ vaddr = wavefront->base_ptr + vaddr * sizeof(GPUStaticInst*);
+
+ DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId, vaddr);
+
+ // Since this is an instruction prefetch, if the access is split across
+ // cache lines just finish out the current line.
+ unsigned block_size = RubySystem::getBlockSizeBytes();
+ // check for split accesses
+ Addr split_addr = roundDown(vaddr + block_size - 1, block_size);
+ unsigned size = block_size;
+
+ if (split_addr > vaddr) {
+ // misaligned access, just grab the rest of the line
+ size = split_addr - vaddr;
+ }
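+ // Worked example (illustrative numbers): with a 64-byte block size and
+ // vaddr = 0x1038, split_addr = 0x1040, so size becomes 8 and only the
+ // remaining 8 bytes of the current line are fetched.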
+
+ // set up virtual request
+ Request *req = new Request(0, vaddr, size, Request::INST_FETCH,
+ computeUnit->masterId(), 0, 0, 0);
+
+ PacketPtr pkt = new Packet(req, MemCmd::ReadReq);
+ // This fetchBlock is a placeholder for now, because the translations so
+ // far don't actually return data
+ uint64_t fetchBlock;
+ pkt->dataStatic(&fetchBlock);
+
+ if (timingSim) {
+ // SenderState needed on Return
+ pkt->senderState = new ComputeUnit::ITLBPort::SenderState(wavefront);
+
+ // Sender State needed by TLB hierarchy
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
+ computeUnit->shader->gpuTc,
+ false, pkt->senderState);
+
+ if (computeUnit->sqcTLBPort->isStalled()) {
+ assert(computeUnit->sqcTLBPort->retries.size() > 0);
+
+ DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
+ vaddr);
+
+ computeUnit->sqcTLBPort->retries.push_back(pkt);
+ } else if (!computeUnit->sqcTLBPort->sendTimingReq(pkt)) {
+ // Stall the port; no more packets are issued until
+ // Ruby indicates resources have been freed by a
+ // recvReqRetry() callback on this port.
+ computeUnit->sqcTLBPort->stallPort();
+
+ DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
+ vaddr);
+
+ computeUnit->sqcTLBPort->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);
+ }
+ } else {
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(BaseTLB::Execute,
+ computeUnit->shader->gpuTc);
+
+ computeUnit->sqcTLBPort->sendFunctional(pkt);
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete sender_state;
+ // fetch the instructions from the SQC when we operate in
+ // functional mode only
+ fetch(pkt, wavefront);
+ }
+}
+
+void
+FetchUnit::fetch(PacketPtr pkt, Wavefront *wavefront)
+{
+ assert(pkt->req->hasPaddr());
+ assert(pkt->req->hasSize());
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+
+ // This is necessary because the GPU TLB receives packets instead of
+ // requests. When the translation is complete, all relevant fields in the
+ // request will be populated, but not in the packet. Here we create the
+ // new packet so we can set the size, addr, and proper flags.
+ PacketPtr oldPkt = pkt;
+ pkt = new Packet(oldPkt->req, oldPkt->cmd);
+ delete oldPkt;
+
+ TheGpuISA::RawMachInst *data =
+ new TheGpuISA::RawMachInst[pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst)];
+
+ pkt->dataDynamic<TheGpuISA::RawMachInst>(data);
+
+ // New SenderState for the memory access
+ pkt->senderState = new ComputeUnit::SQCPort::SenderState(wavefront);
+
+ if (timingSim) {
+ // translation is done. Send the appropriate timing memory request.
+
+ if (!computeUnit->sqcPort->sendTimingReq(pkt)) {
+ computeUnit->sqcPort->retries.push_back(std::make_pair(pkt,
+ wavefront));
+
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ } else {
+ DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
+ computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
+ pkt->req->getPaddr());
+ }
+ } else {
+ computeUnit->sqcPort->sendFunctional(pkt);
+ processFetchReturn(pkt);
+ }
+}
+
+void
+FetchUnit::processFetchReturn(PacketPtr pkt)
+{
+ ComputeUnit::SQCPort::SenderState *sender_state =
+ safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
+
+ Wavefront *wavefront = sender_state->wavefront;
+
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
+ "%d bytes, %d instructions!\n", computeUnit->cu_id,
+ wavefront->simdId, wavefront->wfSlotId, pkt->req->getPaddr(),
+ pkt->req->getSize(), pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst));
+
+ if (wavefront->dropFetch) {
+ assert(wavefront->instructionBuffer.empty());
+ wavefront->dropFetch = false;
+ } else {
+ TheGpuISA::RawMachInst *inst_index_ptr =
+ (TheGpuISA::RawMachInst*)pkt->getPtr<uint8_t>();
+
+ assert(wavefront->instructionBuffer.size() <= 4);
+
+ for (int i = 0; i < pkt->req->getSize() /
+ sizeof(TheGpuISA::RawMachInst); ++i) {
+ GPUStaticInst *inst_ptr = decoder.decode(inst_index_ptr[i]);
+
+ assert(inst_ptr);
+ DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: added %s\n",
+ computeUnit->cu_id, wavefront->simdId,
+ wavefront->wfSlotId, inst_ptr->disassemble());
+
+ GPUDynInstPtr gpuDynInst =
+ std::make_shared<GPUDynInst>(computeUnit, wavefront, inst_ptr,
+ computeUnit->getAndIncSeqNum());
+
+ wavefront->instructionBuffer.push_back(gpuDynInst);
+ }
+ }
+
+ wavefront->pendingFetch = false;
+
+ delete pkt->senderState;
+ delete pkt->req;
+ delete pkt;
+}
+
+void
+FetchUnit::bindWaveList(std::vector<Wavefront*> *wave_list)
+{
+ waveList = wave_list;
+}
diff --git a/src/gpu-compute/fetch_unit.hh b/src/gpu-compute/fetch_unit.hh
new file mode 100644
index 000000000..c7c6afb3c
--- /dev/null
+++ b/src/gpu-compute/fetch_unit.hh
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Sooraj Puthoor
+ */
+
+#ifndef __FETCH_UNIT_HH__
+#define __FETCH_UNIT_HH__
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arch/gpu_decoder.hh"
+#include "base/statistics.hh"
+#include "config/the_gpu_isa.hh"
+#include "gpu-compute/scheduler.hh"
+#include "mem/packet.hh"
+
+class ComputeUnit;
+class Wavefront;
+
+class FetchUnit
+{
+ public:
+ FetchUnit(const ComputeUnitParams* params);
+ ~FetchUnit();
+ void init(ComputeUnit *cu);
+ void exec();
+ void bindWaveList(std::vector<Wavefront*> *list);
+ void initiateFetch(Wavefront *wavefront);
+ void fetch(PacketPtr pkt, Wavefront *wavefront);
+ void processFetchReturn(PacketPtr pkt);
+ static uint32_t globalFetchUnitID;
+
+ private:
+ bool timingSim;
+ ComputeUnit *computeUnit;
+ TheGpuISA::Decoder decoder;
+
+    // Fetch scheduler: selects one wave from the fetch queue
+    // for instruction fetching. The selection is made according
+    // to a scheduling policy.
+ Scheduler fetchScheduler;
+
+ // Stores the list of waves that are
+ // ready to be fetched this cycle
+ std::vector<Wavefront*> fetchQueue;
+
+ // Stores the fetch status of all waves dispatched to this SIMD.
+    // TRUE implies the wave is ready to fetch and has already been
+    // moved to the fetchQueue
+ std::vector<std::pair<Wavefront*, bool>> fetchStatusQueue;
+
+ // Pointer to list of waves dispatched on to this SIMD unit
+ std::vector<Wavefront*> *waveList;
+};
+
+#endif // __FETCH_UNIT_HH__
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
new file mode 100644
index 000000000..913327412
--- /dev/null
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#include "gpu-compute/global_memory_pipeline.hh"
+
+#include "debug/GPUMem.hh"
+#include "debug/GPUReg.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
+ computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
+ inflightStores(0), inflightLoads(0)
+{
+}
+
+void
+GlobalMemPipeline::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ globalMemSize = computeUnit->shader->globalMemSize;
+ _name = computeUnit->name() + ".GlobalMemPipeline";
+}
+
+void
+GlobalMemPipeline::exec()
+{
+ // apply any returned global memory operations
+ GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
+ !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+
+ bool accessVrf = true;
+ // check the VRF to see if the operands of a load (or load component
+ // of an atomic) are accessible
+    if (m && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ accessVrf =
+ w->computeUnit->vrf[m->simdId]->
+ vrfOperandAccessReady(m->seqNum(), w, m,
+ VrfAccessType::WRITE);
+ }
+
+ if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
+ m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+ accessVrf && m->statusBitVector == VectorMask(0) &&
+ (computeUnit->shader->coissue_return ||
+ computeUnit->wfWait.at(m->pipeId).rdy())) {
+
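+        // dispatch to the doGmReturn<>() instantiation matching the
+        // destination VGPR width (VT_32 or VT_64) and the memory type
+        // of the returned access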
+ if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+ doGmReturn<uint32_t, uint8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+ doGmReturn<uint32_t, uint16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+ doGmReturn<uint32_t, uint32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+ doGmReturn<int32_t, int8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+ doGmReturn<int32_t, int16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+ doGmReturn<int32_t, int32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+ doGmReturn<float, Float16>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+ doGmReturn<float, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+ doGmReturn<uint64_t, uint8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+ doGmReturn<uint64_t, uint16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+ doGmReturn<uint64_t, uint32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+ doGmReturn<uint64_t, uint64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+ doGmReturn<int64_t, int8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+ doGmReturn<int64_t, int16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+ doGmReturn<int64_t, int32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+ doGmReturn<int64_t, int64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+ doGmReturn<double, Float16>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+ doGmReturn<double, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+ doGmReturn<double, double>(m);
+ }
+
+ // If pipeline has executed a global memory instruction
+ // execute global memory packets and issue global
+ // memory packets to DTLB
+ if (!gmIssuedRequests.empty()) {
+ GPUDynInstPtr mp = gmIssuedRequests.front();
+ if (mp->m_op == Enums::MO_LD ||
+ (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
+ (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {
+
+ if (inflightLoads >= gmQueueSize) {
+ return;
+ } else {
+ ++inflightLoads;
+ }
+ } else {
+ if (inflightStores >= gmQueueSize) {
+ return;
+ } else {
+ ++inflightStores;
+ }
+ }
+
+ mp->initiateAcc(mp);
+ gmIssuedRequests.pop();
+
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
+ computeUnit->cu_id, mp->simdId, mp->wfSlotId,
+ Enums::MemOpTypeStrings[mp->m_op]);
+ }
+}
+
+template<typename c0, typename c1>
+void
+GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
+{
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ // Return data to registers
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ gmReturnedLoads.pop();
+ assert(inflightLoads > 0);
+ --inflightLoads;
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+ std::vector<uint32_t> regVec;
+ // iterate over number of destination register operands since
+ // this is a load or atomic operation
+ for (int k = 0; k < m->n_reg; ++k) {
+ assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
+ int dst = m->dst_reg + k;
+
+ if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ dst = m->dst_reg_vec[k];
+ // virtual->physical VGPR mapping
+ int physVgpr = w->remap(dst, sizeof(c0), 1);
+ // save the physical VGPR index
+ regVec.push_back(physVgpr);
+ c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (m->exec_mask[i]) {
+ DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
+ "$%s%d <- %d global ld done (src = wavefront "
+ "ld inst)\n", w->computeUnit->cu_id, w->simdId,
+ w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
+ dst, *p1);
+ // write the value into the physical VGPR. This is a
+ // purely functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+ *p1, i);
+ }
+ ++p1;
+ }
+ }
+
+ // Schedule the write operation of the load data on the VRF.
+ // This simply models the timing aspect of the VRF write operation.
+ // It does not modify the physical VGPR.
+ loadVrfBankConflictCycles +=
+ w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
+ w, regVec, sizeof(c0),
+ m->time);
+ }
+ } else {
+ gmReturnedStores.pop();
+ assert(inflightStores > 0);
+ --inflightStores;
+ }
+
+ // Decrement outstanding register count
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
+
+ if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
+ MO_H(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
+ -1);
+ }
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
+ -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->glbMemToVrfBus.set(m->time);
+ if (!computeUnit->shader->coissue_return)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+}
+
+void
+GlobalMemPipeline::regStats()
+{
+ loadVrfBankConflictCycles
+ .name(name() + ".load_vrf_bank_conflict_cycles")
+ .desc("total number of cycles GM data are delayed before updating "
+ "the VRF")
+ ;
+}
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
new file mode 100644
index 000000000..ed49f6f6b
--- /dev/null
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Sooraj Puthoor
+ */
+
+#ifndef __GLOBAL_MEMORY_PIPELINE_HH__
+#define __GLOBAL_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file global_memory_pipeline.hh
+ *
+ * The global memory pipeline issues newly created global memory packets
+ * from the pipeline to the DTLB. The exec() method of this stage issues
+ * a packet to the DTLB only if there is space available in the return FIFO.
+ * This stage also retires previously issued loads and stores that have
+ * returned from the memory sub-system.
+ */
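+
+/*
+ * Illustrative usage sketch (hypothetical, not part of this change; the
+ * accessors named below are declared in GlobalMemPipeline): a producer
+ * typically checks for space before enqueueing a request, e.g.,
+ *
+ *     if (gmPipe.isGMReqFIFOWrRdy())
+ *         gmPipe.getGMReqFIFO().push(gpuDynInst);
+ *
+ * while the response path pushes completed loads and stores into
+ * getGMLdRespFIFO()/getGMStRespFIFO(), which exec() later retires.
+ */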
+
+class ComputeUnit;
+
+class GlobalMemPipeline
+{
+ public:
+ GlobalMemPipeline(const ComputeUnitParams *params);
+ void init(ComputeUnit *cu);
+ void exec();
+
+ template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
+
+ std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
+ std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
+ std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
+
+ bool
+ isGMLdRespFIFOWrRdy() const
+ {
+ return gmReturnedLoads.size() < gmQueueSize;
+ }
+
+ bool
+ isGMStRespFIFOWrRdy() const
+ {
+ return gmReturnedStores.size() < gmQueueSize;
+ }
+
+ bool
+ isGMReqFIFOWrRdy(uint32_t pendReqs=0) const
+ {
+ return (gmIssuedRequests.size() + pendReqs) < gmQueueSize;
+ }
+
+ const std::string &name() const { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ int gmQueueSize;
+
+    // number of cycles the update of a VGPR that is the target of a
+    // load instruction (or the load component of an atomic) is delayed.
+    // The delay is due to VRF bank conflicts.
+ Stats::Scalar loadVrfBankConflictCycles;
+ // Counters to track the inflight loads and stores
+ // so that we can provide the proper backpressure
+ // on the number of inflight memory operations.
+ int inflightStores;
+ int inflightLoads;
+
+ // The size of global memory.
+ int globalMemSize;
+
+ // Global Memory Request FIFO: all global memory requests
+ // are issued to this FIFO from the memory pipelines
+ std::queue<GPUDynInstPtr> gmIssuedRequests;
+
+    // Global Store Response FIFO: all responses of global memory
+ // stores are sent to this FIFO from TCP
+ std::queue<GPUDynInstPtr> gmReturnedStores;
+
+ // Global Load Response FIFO: all responses of global memory
+ // loads are sent to this FIFO from TCP
+ std::queue<GPUDynInstPtr> gmReturnedLoads;
+};
+
+#endif // __GLOBAL_MEMORY_PIPELINE_HH__
diff --git a/src/gpu-compute/gpu_dyn_inst.cc b/src/gpu-compute/gpu_dyn_inst.cc
new file mode 100644
index 000000000..83e348dbe
--- /dev/null
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_dyn_inst.hh"
+
+#include "debug/GPUMem.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+
+GPUDynInst::GPUDynInst(ComputeUnit *_cu, Wavefront *_wf,
+ GPUStaticInst *_staticInst, uint64_t instSeqNum)
+ : GPUExecContext(_cu, _wf), m_op(Enums::MO_UNDEF),
+ memoryOrder(Enums::MEMORY_ORDER_NONE), useContinuation(false),
+ statusBitVector(0), staticInst(_staticInst), _seqNum(instSeqNum)
+{
+ tlbHitLevel.assign(VSZ, -1);
+}
+
+void
+GPUDynInst::execute()
+{
+ GPUDynInstPtr gpuDynInst = std::make_shared<GPUDynInst>(cu, wf, staticInst,
+ _seqNum);
+ staticInst->execute(gpuDynInst);
+}
+
+int
+GPUDynInst::numSrcRegOperands()
+{
+ return staticInst->numSrcRegOperands();
+}
+
+int
+GPUDynInst::numDstRegOperands()
+{
+ return staticInst->numDstRegOperands();
+}
+
+int
+GPUDynInst::getNumOperands()
+{
+ return staticInst->getNumOperands();
+}
+
+bool
+GPUDynInst::isVectorRegister(int operandIdx)
+{
+ return staticInst->isVectorRegister(operandIdx);
+}
+
+bool
+GPUDynInst::isScalarRegister(int operandIdx)
+{
+    return staticInst->isScalarRegister(operandIdx);
+}
+
+int
+GPUDynInst::getRegisterIndex(int operandIdx)
+{
+ return staticInst->getRegisterIndex(operandIdx);
+}
+
+int
+GPUDynInst::getOperandSize(int operandIdx)
+{
+ return staticInst->getOperandSize(operandIdx);
+}
+
+bool
+GPUDynInst::isDstOperand(int operandIdx)
+{
+ return staticInst->isDstOperand(operandIdx);
+}
+
+bool
+GPUDynInst::isSrcOperand(int operandIdx)
+{
+ return staticInst->isSrcOperand(operandIdx);
+}
+
+bool
+GPUDynInst::isArgLoad()
+{
+ return staticInst->isArgLoad();
+}
+
+const std::string&
+GPUDynInst::disassemble() const
+{
+ return staticInst->disassemble();
+}
+
+uint64_t
+GPUDynInst::seqNum() const
+{
+ return _seqNum;
+}
+
+Enums::OpType
+GPUDynInst::opType()
+{
+ return staticInst->o_type;
+}
+
+Enums::StorageClassType
+GPUDynInst::executedAs()
+{
+ return staticInst->executed_as;
+}
+
+// Process a memory instruction and (if necessary) submit timing request
+void
+GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
+{
+ DPRINTF(GPUMem, "CU%d: WF[%d][%d]: mempacket status bitvector=%#x\n",
+ cu->cu_id, simdId, wfSlotId, exec_mask);
+
+ staticInst->initiateAcc(gpuDynInst);
+ time = 0;
+}
+
+bool
+GPUDynInst::scalarOp() const
+{
+ return staticInst->scalarOp();
+}
+
+void
+GPUDynInst::updateStats()
+{
+ if (staticInst->isLocalMem()) {
+ // access to LDS (shared) memory
+ cu->dynamicLMemInstrCnt++;
+ } else {
+ // access to global memory
+
+ // update PageDivergence histogram
+ int number_pages_touched = cu->pagesTouched.size();
+ assert(number_pages_touched);
+ cu->pageDivergenceDist.sample(number_pages_touched);
+
+ std::pair<ComputeUnit::pageDataStruct::iterator, bool> ret;
+
+ for (auto it : cu->pagesTouched) {
+ // see if this page has been touched before. if not, this also
+ // inserts the page into the table.
+ ret = cu->pageAccesses
+ .insert(ComputeUnit::pageDataStruct::value_type(it.first,
+ std::make_pair(1, it.second)));
+
+ // if yes, then update the stats
+ if (!ret.second) {
+ ret.first->second.first++;
+ ret.first->second.second += it.second;
+ }
+ }
+
+ cu->pagesTouched.clear();
+
+ // total number of memory instructions (dynamic)
+ // Atomics are counted as a single memory instruction.
+        // this is # memory instructions per wavefront, not per work-item
+ cu->dynamicGMemInstrCnt++;
+ }
+}
diff --git a/src/gpu-compute/gpu_dyn_inst.hh b/src/gpu-compute/gpu_dyn_inst.hh
new file mode 100644
index 000000000..e44d8f80d
--- /dev/null
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_DYN_INST_HH__
+#define __GPU_DYN_INST_HH__
+
+#include <cstdint>
+#include <string>
+
+#include "enums/GenericMemoryOrder.hh"
+#include "enums/GenericMemoryScope.hh"
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "enums/OpType.hh"
+#include "enums/StorageClassType.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_exec_context.hh"
+
+class GPUStaticInst;
+
+template<typename T>
+class AtomicOpAnd : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+
+ AtomicOpAnd(T _a) : a(_a) { }
+ void execute(T *b) { *b &= a; }
+};
+
+template<typename T>
+class AtomicOpOr : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpOr(T _a) : a(_a) { }
+ void execute(T *b) { *b |= a; }
+};
+
+template<typename T>
+class AtomicOpXor : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpXor(T _a) : a(_a) {}
+ void execute(T *b) { *b ^= a; }
+};
+
+template<typename T>
+class AtomicOpCAS : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T c;
+ T s;
+
+ ComputeUnit *computeUnit;
+
+ AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
+ : c(_c), s(_s), computeUnit(compute_unit) { }
+
+ void
+ execute(T *b)
+ {
+ computeUnit->numCASOps++;
+
+ if (*b == c) {
+ *b = s;
+ } else {
+ computeUnit->numFailedCASOps++;
+ }
+
+ if (computeUnit->xact_cas_mode) {
+ computeUnit->xactCasLoadMap.clear();
+ }
+ }
+};
+
+template<typename T>
+class AtomicOpExch : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpExch(T _a) : a(_a) { }
+ void execute(T *b) { *b = a; }
+};
+
+template<typename T>
+class AtomicOpAdd : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpAdd(T _a) : a(_a) { }
+ void execute(T *b) { *b += a; }
+};
+
+template<typename T>
+class AtomicOpSub : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpSub(T _a) : a(_a) { }
+ void execute(T *b) { *b -= a; }
+};
+
+template<typename T>
+class AtomicOpInc : public TypedAtomicOpFunctor<T>
+{
+ public:
+ AtomicOpInc() { }
+ void execute(T *b) { *b += 1; }
+};
+
+template<typename T>
+class AtomicOpDec : public TypedAtomicOpFunctor<T>
+{
+ public:
+ AtomicOpDec() {}
+ void execute(T *b) { *b -= 1; }
+};
+
+template<typename T>
+class AtomicOpMax : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpMax(T _a) : a(_a) { }
+
+ void
+ execute(T *b)
+ {
+ if (a > *b)
+ *b = a;
+ }
+};
+
+template<typename T>
+class AtomicOpMin : public TypedAtomicOpFunctor<T>
+{
+ public:
+ T a;
+ AtomicOpMin(T _a) : a(_a) {}
+
+ void
+ execute(T *b)
+ {
+ if (a < *b)
+ *b = a;
+ }
+};
+
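+// convenience predicates over Enums::MemOpType: MO_A() matches the
+// contiguous range of atomic (return) ops, MO_ANR() the atomic no-return
+// ops, and MO_H() the MO_HAND..MO_HMIN range; all three rely on the enum
+// values being declared contiguously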
+#define MO_A(a) ((a)>=Enums::MO_AAND && (a)<=Enums::MO_AMIN)
+#define MO_ANR(a) ((a)>=Enums::MO_ANRAND && (a)<=Enums::MO_ANRMIN)
+#define MO_H(a) ((a)>=Enums::MO_HAND && (a)<=Enums::MO_HMIN)
+
+typedef enum
+{
+ VT_32,
+ VT_64,
+} vgpr_type;
+
+typedef enum
+{
+ SEG_PRIVATE,
+ SEG_SPILL,
+ SEG_GLOBAL,
+ SEG_SHARED,
+ SEG_READONLY,
+ SEG_FLAT
+} seg_type;
+
+class GPUDynInst : public GPUExecContext
+{
+ public:
+ GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *_staticInst,
+ uint64_t instSeqNum);
+
+ void execute();
+ int numSrcRegOperands();
+ int numDstRegOperands();
+ int getNumOperands();
+ bool isVectorRegister(int operandIdx);
+ bool isScalarRegister(int operandIdx);
+ int getRegisterIndex(int operandIdx);
+ int getOperandSize(int operandIdx);
+ bool isDstOperand(int operandIdx);
+ bool isSrcOperand(int operandIdx);
+ bool isArgLoad();
+
+ const std::string &disassemble() const;
+
+ uint64_t seqNum() const;
+
+ Enums::OpType opType();
+ Enums::StorageClassType executedAs();
+
+ // The address of the memory operation
+ Addr addr[VSZ];
+ Addr pAddr;
+
+ // The data to get written
+ uint8_t d_data[VSZ * 16];
+ // Additional data (for atomics)
+ uint8_t a_data[VSZ * 8];
+ // Additional data (for atomics)
+ uint8_t x_data[VSZ * 8];
+ // The execution mask
+ VectorMask exec_mask;
+
+ // The memory type (M_U32, M_S32, ...)
+ Enums::MemType m_type;
+ // The memory operation (MO_LD, MO_ST, ...)
+ Enums::MemOpType m_op;
+ Enums::GenericMemoryOrder memoryOrder;
+
+ // Scope of the request
+ Enums::GenericMemoryScope scope;
+ // The memory segment (SEG_SHARED, SEG_GLOBAL, ...)
+ seg_type s_type;
+ // The equivalency class
+ int equiv;
+ // The return VGPR type (VT_32 or VT_64)
+ vgpr_type v_type;
+    // Number of VGPRs accessed (1, 2, or 4)
+ int n_reg;
+ // The return VGPR index
+ int dst_reg;
+    // There can be at most 4 dest regs
+ int dst_reg_vec[4];
+ // SIMD where the WF of the memory instruction has been mapped to
+ int simdId;
+ // unique id of the WF where the memory instruction belongs to
+ int wfDynId;
+ // The kernel id of the requesting wf
+ int kern_id;
+ // The CU id of the requesting wf
+ int cu_id;
+ // HW slot id where the WF is mapped to inside a SIMD unit
+ int wfSlotId;
+ // execution pipeline id where the memory instruction has been scheduled
+ int pipeId;
+ // The execution time of this operation
+ Tick time;
+ // The latency of this operation
+ WaitClass latency;
+ // A list of bank conflicts for the 4 cycles.
+ uint32_t bc[4];
+
+ // A pointer to ROM
+ uint8_t *rom;
+ // The size of the READONLY segment
+ int sz_rom;
+
+ // Initiate the specified memory operation, by creating a
+ // memory request and sending it off to the memory system.
+ void initiateAcc(GPUDynInstPtr gpuDynInst);
+
+ void updateStats();
+
+ GPUStaticInst* staticInstruction() { return staticInst; }
+
+ // Is the instruction a scalar or vector op?
+ bool scalarOp() const;
+
+ /*
+ * Loads/stores/atomics may have acquire/release semantics associated
+     * with them. Some protocols want to see the acquire/release as separate
+ * requests from the load/store/atomic. We implement that separation
+ * using continuations (i.e., a function pointer with an object associated
+ * with it). When, for example, the front-end generates a store with
+ * release semantics, we will first issue a normal store and set the
+     * continuation in the GPUDynInst to a function that generates a
+ * release request. That continuation will be called when the normal
+ * store completes (in ComputeUnit::DataPort::recvTimingResponse). The
+ * continuation will be called in the context of the same GPUDynInst
+ * that generated the initial store.
+ */
+ std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;
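+
+    /*
+     * Illustrative sketch (hypothetical names, not part of this change):
+     * a release continuation could be installed and later invoked roughly
+     * as follows, assuming a helper issueRelease() exists:
+     *
+     *     gpuDynInst->useContinuation = true;
+     *     gpuDynInst->execContinuation =
+     *         [](GPUStaticInst *si, GPUDynInstPtr di) { issueRelease(di); };
+     *     ...
+     *     // later, when the normal store's response arrives:
+     *     if (gpuDynInst->useContinuation)
+     *         gpuDynInst->execContinuation(
+     *             gpuDynInst->staticInstruction(), gpuDynInst);
+     */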
+
+ // when true, call execContinuation when response arrives
+ bool useContinuation;
+
+ template<typename c0> AtomicOpFunctor*
+ makeAtomicOpFunctor(c0 *reg0, c0 *reg1, Enums::MemOpType op)
+ {
+ using namespace Enums;
+
+ switch(op) {
+ case MO_AAND:
+ case MO_ANRAND:
+ return new AtomicOpAnd<c0>(*reg0);
+ case MO_AOR:
+ case MO_ANROR:
+ return new AtomicOpOr<c0>(*reg0);
+ case MO_AXOR:
+ case MO_ANRXOR:
+ return new AtomicOpXor<c0>(*reg0);
+ case MO_ACAS:
+ case MO_ANRCAS:
+ return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
+ case MO_AEXCH:
+ case MO_ANREXCH:
+ return new AtomicOpExch<c0>(*reg0);
+ case MO_AADD:
+ case MO_ANRADD:
+ return new AtomicOpAdd<c0>(*reg0);
+ case MO_ASUB:
+ case MO_ANRSUB:
+ return new AtomicOpSub<c0>(*reg0);
+ case MO_AINC:
+ case MO_ANRINC:
+ return new AtomicOpInc<c0>();
+ case MO_ADEC:
+ case MO_ANRDEC:
+ return new AtomicOpDec<c0>();
+ case MO_AMAX:
+ case MO_ANRMAX:
+ return new AtomicOpMax<c0>(*reg0);
+ case MO_AMIN:
+ case MO_ANRMIN:
+ return new AtomicOpMin<c0>(*reg0);
+ default:
+ panic("Unrecognized atomic operation");
+ }
+ }
+
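+    // translate this instruction's memory segment, scope, memory order,
+    // and atomic kind into the corresponding Request flags; setMemOrder
+    // can be set to false when the caller does not want the acquire/
+    // release flags applied (e.g., if they are issued as separate
+    // requests, as described for execContinuation above)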
+ void
+ setRequestFlags(Request *req, bool setMemOrder=true)
+ {
+ // currently these are the easy scopes to deduce
+ switch (s_type) {
+ case SEG_PRIVATE:
+ req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
+ break;
+ case SEG_SPILL:
+ req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
+ break;
+ case SEG_GLOBAL:
+ req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
+ break;
+ case SEG_READONLY:
+ req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
+ break;
+ case SEG_SHARED:
+ req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
+ break;
+ case SEG_FLAT:
+ // TODO: translate to correct scope
+ assert(false);
+ default:
+ panic("Bad segment type");
+ break;
+ }
+
+ switch (scope) {
+ case Enums::MEMORY_SCOPE_NONE:
+ case Enums::MEMORY_SCOPE_WORKITEM:
+ break;
+ case Enums::MEMORY_SCOPE_WAVEFRONT:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::WAVEFRONT_SCOPE);
+ break;
+ case Enums::MEMORY_SCOPE_WORKGROUP:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::WORKGROUP_SCOPE);
+ break;
+ case Enums::MEMORY_SCOPE_DEVICE:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::DEVICE_SCOPE);
+ break;
+ case Enums::MEMORY_SCOPE_SYSTEM:
+ req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
+ Request::SYSTEM_SCOPE);
+ break;
+ default:
+ panic("Bad scope type");
+ break;
+ }
+
+ if (setMemOrder) {
+ // set acquire and release flags
+ switch (memoryOrder){
+ case Enums::MEMORY_ORDER_SC_ACQUIRE:
+ req->setFlags(Request::ACQUIRE);
+ break;
+ case Enums::MEMORY_ORDER_SC_RELEASE:
+ req->setFlags(Request::RELEASE);
+ break;
+ case Enums::MEMORY_ORDER_SC_ACQUIRE_RELEASE:
+ req->setFlags(Request::ACQUIRE | Request::RELEASE);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // set atomic type
+        // currently, the instruction generator only produces atomic-return
+        // operations, but a magic instruction can produce atomic no-return
+ if (m_op == Enums::MO_AADD || m_op == Enums::MO_ASUB ||
+ m_op == Enums::MO_AAND || m_op == Enums::MO_AOR ||
+ m_op == Enums::MO_AXOR || m_op == Enums::MO_AMAX ||
+ m_op == Enums::MO_AMIN || m_op == Enums::MO_AINC ||
+ m_op == Enums::MO_ADEC || m_op == Enums::MO_AEXCH ||
+ m_op == Enums::MO_ACAS) {
+ req->setFlags(Request::ATOMIC_RETURN_OP);
+ } else if (m_op == Enums::MO_ANRADD || m_op == Enums::MO_ANRSUB ||
+ m_op == Enums::MO_ANRAND || m_op == Enums::MO_ANROR ||
+ m_op == Enums::MO_ANRXOR || m_op == Enums::MO_ANRMAX ||
+ m_op == Enums::MO_ANRMIN || m_op == Enums::MO_ANRINC ||
+ m_op == Enums::MO_ANRDEC || m_op == Enums::MO_ANREXCH ||
+ m_op == Enums::MO_ANRCAS) {
+ req->setFlags(Request::ATOMIC_NO_RETURN_OP);
+ }
+ }
+
+ // Map returned packets and the addresses they satisfy with which lane they
+ // were requested from
+ typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
+ StatusVector memStatusVector;
+
+ // Track the status of memory requests per lane, a bit per lane
+ VectorMask statusBitVector;
+ // for ld_v# or st_v#
+ std::vector<int> statusVector;
+ std::vector<int> tlbHitLevel;
+
+ private:
+ GPUStaticInst *staticInst;
+ uint64_t _seqNum;
+};
+
+#endif // __GPU_DYN_INST_HH__
diff --git a/src/gpu-compute/gpu_exec_context.cc b/src/gpu-compute/gpu_exec_context.cc
new file mode 100644
index 000000000..4af69c41e
--- /dev/null
+++ b/src/gpu-compute/gpu_exec_context.cc
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_exec_context.hh"
+
+GPUExecContext::GPUExecContext(ComputeUnit *_cu, Wavefront *_wf)
+ : cu(_cu), wf(_wf)
+{
+}
+
+ComputeUnit*
+GPUExecContext::computeUnit()
+{
+ return cu;
+}
+
+Wavefront*
+GPUExecContext::wavefront()
+{
+ return wf;
+}
diff --git a/src/gpu-compute/gpu_exec_context.hh b/src/gpu-compute/gpu_exec_context.hh
new file mode 100644
index 000000000..a3deb9b8f
--- /dev/null
+++ b/src/gpu-compute/gpu_exec_context.hh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_EXEC_CONTEXT_HH__
+#define __GPU_EXEC_CONTEXT_HH__
+
+class ComputeUnit;
+class Wavefront;
+
+class GPUExecContext
+{
+ public:
+ GPUExecContext(ComputeUnit *_cu, Wavefront *_wf);
+ Wavefront* wavefront();
+ ComputeUnit* computeUnit();
+
+ protected:
+ ComputeUnit *cu;
+ Wavefront *wf;
+};
+
+#endif // __GPU_EXEC_CONTEXT_HH__
diff --git a/src/gpu-compute/gpu_static_inst.cc b/src/gpu-compute/gpu_static_inst.cc
new file mode 100644
index 000000000..bcb8a5f3d
--- /dev/null
+++ b/src/gpu-compute/gpu_static_inst.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+GPUStaticInst::GPUStaticInst(const std::string &opcode)
+ : o_type(Enums::OT_ALU), executed_as(Enums::SC_NONE), opcode(opcode),
+ _instNum(0), _scalarOp(false)
+{
+}
diff --git a/src/gpu-compute/gpu_static_inst.hh b/src/gpu-compute/gpu_static_inst.hh
new file mode 100644
index 000000000..c1de28427
--- /dev/null
+++ b/src/gpu-compute/gpu_static_inst.hh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __GPU_STATIC_INST_HH__
+#define __GPU_STATIC_INST_HH__
+
+/*
+ * @file gpu_static_inst.hh
+ *
+ * Defines the base class representing static instructions for the GPU. The
+ * instructions are "static" because they contain no dynamic instruction
+ * information. GPUStaticInst corresponds to the StaticInst class for the CPU
+ * models.
+ */
+
+#include <cstdint>
+#include <string>
+
+#include "enums/OpType.hh"
+#include "enums/StorageClassType.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/misc.hh"
+
+class BaseOperand;
+class BaseRegOperand;
+class Wavefront;
+
+class GPUStaticInst
+{
+ public:
+ GPUStaticInst(const std::string &opcode);
+
+ void instNum(int num) { _instNum = num; }
+
+ int instNum() { return _instNum; }
+
+ void ipdInstNum(int num) { _ipdInstNum = num; }
+
+ int ipdInstNum() const { return _ipdInstNum; }
+
+ virtual void execute(GPUDynInstPtr gpuDynInst) = 0;
+ virtual void generateDisassembly() = 0;
+ virtual const std::string &disassemble() = 0;
+ virtual int getNumOperands() = 0;
+ virtual bool isCondRegister(int operandIndex) = 0;
+ virtual bool isScalarRegister(int operandIndex) = 0;
+ virtual bool isVectorRegister(int operandIndex) = 0;
+ virtual bool isSrcOperand(int operandIndex) = 0;
+ virtual bool isDstOperand(int operandIndex) = 0;
+ virtual int getOperandSize(int operandIndex) = 0;
+ virtual int getRegisterIndex(int operandIndex) = 0;
+ virtual int numDstRegOperands() = 0;
+ virtual int numSrcRegOperands() = 0;
+
+ /*
+ * Most instructions (including all HSAIL instructions)
+ * are vector ops, so _scalarOp will be false by default.
+ * Derived instruction objects that are scalar ops must
+ * set _scalarOp to true in their constructors.
+ */
+ bool scalarOp() const { return _scalarOp; }
+
+ virtual bool isLocalMem() const
+ {
+ fatal("calling isLocalMem() on non-memory instruction.\n");
+
+ return false;
+ }
+
+ bool isArgLoad() { return false; }
+ virtual uint32_t instSize() = 0;
+
+ // only used for memory instructions
+ virtual void
+ initiateAcc(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling initiateAcc() on a non-memory instruction.\n");
+ }
+
+ virtual uint32_t getTargetPc() { return 0; }
+
+ /**
+ * Query whether the instruction is an unconditional jump i.e., the jump
+ * is always executed because there is no condition to be evaluated.
+ *
+ * If the instruction is not of branch type, the result is always false.
+ *
+ * @return True if the instruction is an unconditional jump.
+ */
+ virtual bool unconditionalJumpInstruction() { return false; }
+
+ static uint64_t dynamic_id_count;
+
+ Enums::OpType o_type;
+ // For flat memory accesses
+ Enums::StorageClassType executed_as;
+
+ protected:
+ virtual void
+ execLdAcq(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execLdAcq() on a non-load instruction.\n");
+ }
+
+ virtual void
+ execSt(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execLdAcq() on a non-load instruction.\n");
+ }
+
+ virtual void
+ execAtomic(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execAtomic() on a non-atomic instruction.\n");
+ }
+
+ virtual void
+ execAtomicAcq(GPUDynInstPtr gpuDynInst)
+ {
+ fatal("calling execAtomicAcq() on a non-atomic instruction.\n");
+ }
+
+ const std::string opcode;
+ std::string disassembly;
+ int _instNum;
+ /**
+ * Identifier of the immediate post-dominator instruction.
+ */
+ int _ipdInstNum;
+
+ bool _scalarOp;
+};
+
+#endif // __GPU_STATIC_INST_HH__
diff --git a/src/gpu-compute/gpu_tlb.cc b/src/gpu-compute/gpu_tlb.cc
new file mode 100644
index 000000000..de005fd04
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.cc
@@ -0,0 +1,1801 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/gpu_tlb.hh"
+
+#include <cmath>
+#include <cstring>
+
+#include "arch/x86/faults.hh"
+#include "arch/x86/insts/microldstop.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/misc.hh"
+#include "arch/x86/x86_traits.hh"
+#include "base/bitfield.hh"
+#include "base/output.hh"
+#include "base/trace.hh"
+#include "cpu/base.hh"
+#include "cpu/thread_context.hh"
+#include "debug/GPUPrefetch.hh"
+#include "debug/GPUTLB.hh"
+#include "mem/packet_access.hh"
+#include "mem/page_table.hh"
+#include "mem/request.hh"
+#include "sim/process.hh"
+
+namespace X86ISA
+{
+
+ GpuTLB::GpuTLB(const Params *p)
+ : MemObject(p), configAddress(0), size(p->size),
+ cleanupEvent(this, false, Event::Maximum_Pri), exitEvent(this)
+ {
+ assoc = p->assoc;
+ assert(assoc <= size);
+ numSets = size/assoc;
+ allocationPolicy = p->allocationPolicy;
+ hasMemSidePort = false;
+ accessDistance = p->accessDistance;
+ clock = p->clk_domain->clockPeriod();
+
+ tlb = new GpuTlbEntry[size];
+ std::memset(tlb, 0, sizeof(GpuTlbEntry) * size);
+
+ freeList.resize(numSets);
+ entryList.resize(numSets);
+
+ for (int set = 0; set < numSets; ++set) {
+ for (int way = 0; way < assoc; ++way) {
+ int x = set*assoc + way;
+ freeList[set].push_back(&tlb[x]);
+ }
+ }
+
+ FA = (size == assoc);
+
+ /**
+ * @warning: the set-associative version assumes you have a
+ * fixed page size of 4KB.
+         * If the page size is greater than 4KB (as defined in
+         * TheISA::PageBytes), then there are various issues with the
+         * current implementation (e.g., the same 8KB page would be
+         * replicated across different sets).
+ */
+ setMask = numSets - 1;
+
+ #if 0
+ // GpuTLB doesn't yet support full system
+ walker = p->walker;
+ walker->setTLB(this);
+ #endif
+
+ maxCoalescedReqs = p->maxOutstandingReqs;
+
+ // Do not allow maxCoalescedReqs to be more than the TLB associativity
+ if (maxCoalescedReqs > assoc) {
+ maxCoalescedReqs = assoc;
+ cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
+ }
+
+ outstandingReqs = 0;
+ hitLatency = p->hitLatency;
+ missLatency1 = p->missLatency1;
+ missLatency2 = p->missLatency2;
+
+ // create the slave ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
+ cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d",
+ name(), i), this, i));
+ }
+
+ // create the master ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_master_connection_count; ++i) {
+ memSidePort.push_back(new MemSidePort(csprintf("%s-port%d",
+ name(), i), this, i));
+ }
+ }
+
+ // fixme: this is never called?
+ GpuTLB::~GpuTLB()
+ {
+ // make sure all the hash-maps are empty
+ assert(translationReturnEvent.empty());
+
+ // delete the TLB
+ delete[] tlb;
+ }
+
+ BaseSlavePort&
+ GpuTLB::getSlavePort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "slave") {
+ if (idx >= static_cast<PortID>(cpuSidePort.size())) {
+ panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
+ }
+
+ return *cpuSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
+ }
+ }
+
+ BaseMasterPort&
+ GpuTLB::getMasterPort(const std::string &if_name, PortID idx)
+ {
+ if (if_name == "master") {
+ if (idx >= static_cast<PortID>(memSidePort.size())) {
+ panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
+ }
+
+ hasMemSidePort = true;
+
+ return *memSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
+ }
+ }
+
+ GpuTlbEntry*
+ GpuTLB::insert(Addr vpn, GpuTlbEntry &entry)
+ {
+ GpuTlbEntry *newEntry = nullptr;
+
+ /**
+ * vpn holds the virtual page address
+ * The least significant bits are simply masked
+ */
+ int set = (vpn >> TheISA::PageShift) & setMask;
+
+ if (!freeList[set].empty()) {
+ newEntry = freeList[set].front();
+ freeList[set].pop_front();
+ } else {
+ newEntry = entryList[set].back();
+ entryList[set].pop_back();
+ }
+
+ *newEntry = entry;
+ newEntry->vaddr = vpn;
+ entryList[set].push_front(newEntry);
+
+ return newEntry;
+ }
+
+ GpuTLB::EntryList::iterator
+ GpuTLB::lookupIt(Addr va, bool update_lru)
+ {
+ int set = (va >> TheISA::PageShift) & setMask;
+
+ if (FA) {
+ assert(!set);
+ }
+
+ auto entry = entryList[set].begin();
+ for (; entry != entryList[set].end(); ++entry) {
+ int page_size = (*entry)->size();
+
+ if ((*entry)->vaddr <= va && (*entry)->vaddr + page_size > va) {
+ DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
+ "with size %#x.\n", va, (*entry)->vaddr, page_size);
+
+ if (update_lru) {
+ entryList[set].push_front(*entry);
+ entryList[set].erase(entry);
+ entry = entryList[set].begin();
+ }
+
+ break;
+ }
+ }
+
+ return entry;
+ }
+
+ GpuTlbEntry*
+ GpuTLB::lookup(Addr va, bool update_lru)
+ {
+ int set = (va >> TheISA::PageShift) & setMask;
+
+ auto entry = lookupIt(va, update_lru);
+
+ if (entry == entryList[set].end())
+ return nullptr;
+ else
+ return *entry;
+ }
+
+ void
+ GpuTLB::invalidateAll()
+ {
+ DPRINTF(GPUTLB, "Invalidating all entries.\n");
+
+ for (int i = 0; i < numSets; ++i) {
+ while (!entryList[i].empty()) {
+ GpuTlbEntry *entry = entryList[i].front();
+ entryList[i].pop_front();
+ freeList[i].push_back(entry);
+ }
+ }
+ }
+
+ void
+ GpuTLB::setConfigAddress(uint32_t addr)
+ {
+ configAddress = addr;
+ }
+
+ void
+ GpuTLB::invalidateNonGlobal()
+ {
+ DPRINTF(GPUTLB, "Invalidating all non global entries.\n");
+
+ for (int i = 0; i < numSets; ++i) {
+ for (auto entryIt = entryList[i].begin();
+ entryIt != entryList[i].end();) {
+ if (!(*entryIt)->global) {
+ freeList[i].push_back(*entryIt);
+ entryList[i].erase(entryIt++);
+ } else {
+ ++entryIt;
+ }
+ }
+ }
+ }
+
+ void
+ GpuTLB::demapPage(Addr va, uint64_t asn)
+ {
+
+ int set = (va >> TheISA::PageShift) & setMask;
+ auto entry = lookupIt(va, false);
+
+ if (entry != entryList[set].end()) {
+ freeList[set].push_back(*entry);
+ entryList[set].erase(entry);
+ }
+ }
+
+ Fault
+ GpuTLB::translateInt(RequestPtr req, ThreadContext *tc)
+ {
+ DPRINTF(GPUTLB, "Addresses references internal memory.\n");
+ Addr vaddr = req->getVaddr();
+ Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;
+
+ if (prefix == IntAddrPrefixCPUID) {
+ panic("CPUID memory space not yet implemented!\n");
+ } else if (prefix == IntAddrPrefixMSR) {
+ vaddr = vaddr >> 3;
+ req->setFlags(Request::MMAPPED_IPR);
+ Addr regNum = 0;
+
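+            // decode the MSR number encoded in the vaddr into the
+            // corresponding misc register index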
+ switch (vaddr & ~IntAddrPrefixMask) {
+ case 0x10:
+ regNum = MISCREG_TSC;
+ break;
+ case 0x1B:
+ regNum = MISCREG_APIC_BASE;
+ break;
+ case 0xFE:
+ regNum = MISCREG_MTRRCAP;
+ break;
+ case 0x174:
+ regNum = MISCREG_SYSENTER_CS;
+ break;
+ case 0x175:
+ regNum = MISCREG_SYSENTER_ESP;
+ break;
+ case 0x176:
+ regNum = MISCREG_SYSENTER_EIP;
+ break;
+ case 0x179:
+ regNum = MISCREG_MCG_CAP;
+ break;
+ case 0x17A:
+ regNum = MISCREG_MCG_STATUS;
+ break;
+ case 0x17B:
+ regNum = MISCREG_MCG_CTL;
+ break;
+ case 0x1D9:
+ regNum = MISCREG_DEBUG_CTL_MSR;
+ break;
+ case 0x1DB:
+ regNum = MISCREG_LAST_BRANCH_FROM_IP;
+ break;
+ case 0x1DC:
+ regNum = MISCREG_LAST_BRANCH_TO_IP;
+ break;
+ case 0x1DD:
+ regNum = MISCREG_LAST_EXCEPTION_FROM_IP;
+ break;
+ case 0x1DE:
+ regNum = MISCREG_LAST_EXCEPTION_TO_IP;
+ break;
+ case 0x200:
+ regNum = MISCREG_MTRR_PHYS_BASE_0;
+ break;
+ case 0x201:
+ regNum = MISCREG_MTRR_PHYS_MASK_0;
+ break;
+ case 0x202:
+ regNum = MISCREG_MTRR_PHYS_BASE_1;
+ break;
+ case 0x203:
+ regNum = MISCREG_MTRR_PHYS_MASK_1;
+ break;
+ case 0x204:
+ regNum = MISCREG_MTRR_PHYS_BASE_2;
+ break;
+ case 0x205:
+ regNum = MISCREG_MTRR_PHYS_MASK_2;
+ break;
+ case 0x206:
+ regNum = MISCREG_MTRR_PHYS_BASE_3;
+ break;
+ case 0x207:
+ regNum = MISCREG_MTRR_PHYS_MASK_3;
+ break;
+ case 0x208:
+ regNum = MISCREG_MTRR_PHYS_BASE_4;
+ break;
+ case 0x209:
+ regNum = MISCREG_MTRR_PHYS_MASK_4;
+ break;
+ case 0x20A:
+ regNum = MISCREG_MTRR_PHYS_BASE_5;
+ break;
+ case 0x20B:
+ regNum = MISCREG_MTRR_PHYS_MASK_5;
+ break;
+ case 0x20C:
+ regNum = MISCREG_MTRR_PHYS_BASE_6;
+ break;
+ case 0x20D:
+ regNum = MISCREG_MTRR_PHYS_MASK_6;
+ break;
+ case 0x20E:
+ regNum = MISCREG_MTRR_PHYS_BASE_7;
+ break;
+ case 0x20F:
+ regNum = MISCREG_MTRR_PHYS_MASK_7;
+ break;
+ case 0x250:
+ regNum = MISCREG_MTRR_FIX_64K_00000;
+ break;
+ case 0x258:
+ regNum = MISCREG_MTRR_FIX_16K_80000;
+ break;
+ case 0x259:
+ regNum = MISCREG_MTRR_FIX_16K_A0000;
+ break;
+ case 0x268:
+ regNum = MISCREG_MTRR_FIX_4K_C0000;
+ break;
+ case 0x269:
+ regNum = MISCREG_MTRR_FIX_4K_C8000;
+ break;
+ case 0x26A:
+ regNum = MISCREG_MTRR_FIX_4K_D0000;
+ break;
+ case 0x26B:
+ regNum = MISCREG_MTRR_FIX_4K_D8000;
+ break;
+ case 0x26C:
+ regNum = MISCREG_MTRR_FIX_4K_E0000;
+ break;
+ case 0x26D:
+ regNum = MISCREG_MTRR_FIX_4K_E8000;
+ break;
+ case 0x26E:
+ regNum = MISCREG_MTRR_FIX_4K_F0000;
+ break;
+ case 0x26F:
+ regNum = MISCREG_MTRR_FIX_4K_F8000;
+ break;
+ case 0x277:
+ regNum = MISCREG_PAT;
+ break;
+ case 0x2FF:
+ regNum = MISCREG_DEF_TYPE;
+ break;
+ case 0x400:
+ regNum = MISCREG_MC0_CTL;
+ break;
+ case 0x404:
+ regNum = MISCREG_MC1_CTL;
+ break;
+ case 0x408:
+ regNum = MISCREG_MC2_CTL;
+ break;
+ case 0x40C:
+ regNum = MISCREG_MC3_CTL;
+ break;
+ case 0x410:
+ regNum = MISCREG_MC4_CTL;
+ break;
+ case 0x414:
+ regNum = MISCREG_MC5_CTL;
+ break;
+ case 0x418:
+ regNum = MISCREG_MC6_CTL;
+ break;
+ case 0x41C:
+ regNum = MISCREG_MC7_CTL;
+ break;
+ case 0x401:
+ regNum = MISCREG_MC0_STATUS;
+ break;
+ case 0x405:
+ regNum = MISCREG_MC1_STATUS;
+ break;
+ case 0x409:
+ regNum = MISCREG_MC2_STATUS;
+ break;
+ case 0x40D:
+ regNum = MISCREG_MC3_STATUS;
+ break;
+ case 0x411:
+ regNum = MISCREG_MC4_STATUS;
+ break;
+ case 0x415:
+ regNum = MISCREG_MC5_STATUS;
+ break;
+ case 0x419:
+ regNum = MISCREG_MC6_STATUS;
+ break;
+ case 0x41D:
+ regNum = MISCREG_MC7_STATUS;
+ break;
+ case 0x402:
+ regNum = MISCREG_MC0_ADDR;
+ break;
+ case 0x406:
+ regNum = MISCREG_MC1_ADDR;
+ break;
+ case 0x40A:
+ regNum = MISCREG_MC2_ADDR;
+ break;
+ case 0x40E:
+ regNum = MISCREG_MC3_ADDR;
+ break;
+ case 0x412:
+ regNum = MISCREG_MC4_ADDR;
+ break;
+ case 0x416:
+ regNum = MISCREG_MC5_ADDR;
+ break;
+ case 0x41A:
+ regNum = MISCREG_MC6_ADDR;
+ break;
+ case 0x41E:
+ regNum = MISCREG_MC7_ADDR;
+ break;
+ case 0x403:
+ regNum = MISCREG_MC0_MISC;
+ break;
+ case 0x407:
+ regNum = MISCREG_MC1_MISC;
+ break;
+ case 0x40B:
+ regNum = MISCREG_MC2_MISC;
+ break;
+ case 0x40F:
+ regNum = MISCREG_MC3_MISC;
+ break;
+ case 0x413:
+ regNum = MISCREG_MC4_MISC;
+ break;
+ case 0x417:
+ regNum = MISCREG_MC5_MISC;
+ break;
+ case 0x41B:
+ regNum = MISCREG_MC6_MISC;
+ break;
+ case 0x41F:
+ regNum = MISCREG_MC7_MISC;
+ break;
+ case 0xC0000080:
+ regNum = MISCREG_EFER;
+ break;
+ case 0xC0000081:
+ regNum = MISCREG_STAR;
+ break;
+ case 0xC0000082:
+ regNum = MISCREG_LSTAR;
+ break;
+ case 0xC0000083:
+ regNum = MISCREG_CSTAR;
+ break;
+ case 0xC0000084:
+ regNum = MISCREG_SF_MASK;
+ break;
+ case 0xC0000100:
+ regNum = MISCREG_FS_BASE;
+ break;
+ case 0xC0000101:
+ regNum = MISCREG_GS_BASE;
+ break;
+ case 0xC0000102:
+ regNum = MISCREG_KERNEL_GS_BASE;
+ break;
+ case 0xC0000103:
+ regNum = MISCREG_TSC_AUX;
+ break;
+ case 0xC0010000:
+ regNum = MISCREG_PERF_EVT_SEL0;
+ break;
+ case 0xC0010001:
+ regNum = MISCREG_PERF_EVT_SEL1;
+ break;
+ case 0xC0010002:
+ regNum = MISCREG_PERF_EVT_SEL2;
+ break;
+ case 0xC0010003:
+ regNum = MISCREG_PERF_EVT_SEL3;
+ break;
+ case 0xC0010004:
+ regNum = MISCREG_PERF_EVT_CTR0;
+ break;
+ case 0xC0010005:
+ regNum = MISCREG_PERF_EVT_CTR1;
+ break;
+ case 0xC0010006:
+ regNum = MISCREG_PERF_EVT_CTR2;
+ break;
+ case 0xC0010007:
+ regNum = MISCREG_PERF_EVT_CTR3;
+ break;
+ case 0xC0010010:
+ regNum = MISCREG_SYSCFG;
+ break;
+ case 0xC0010016:
+ regNum = MISCREG_IORR_BASE0;
+ break;
+ case 0xC0010017:
+ regNum = MISCREG_IORR_BASE1;
+ break;
+ case 0xC0010018:
+ regNum = MISCREG_IORR_MASK0;
+ break;
+ case 0xC0010019:
+ regNum = MISCREG_IORR_MASK1;
+ break;
+ case 0xC001001A:
+ regNum = MISCREG_TOP_MEM;
+ break;
+ case 0xC001001D:
+ regNum = MISCREG_TOP_MEM2;
+ break;
+ case 0xC0010114:
+ regNum = MISCREG_VM_CR;
+ break;
+ case 0xC0010115:
+ regNum = MISCREG_IGNNE;
+ break;
+ case 0xC0010116:
+ regNum = MISCREG_SMM_CTL;
+ break;
+ case 0xC0010117:
+ regNum = MISCREG_VM_HSAVE_PA;
+ break;
+ default:
+ return std::make_shared<GeneralProtection>(0);
+ }
+ //The index is multiplied by the size of a MiscReg so that
+ //any memory dependence calculations will not see these as
+ //overlapping.
+ req->setPaddr(regNum * sizeof(MiscReg));
+ return NoFault;
+ } else if (prefix == IntAddrPrefixIO) {
+ // TODO If CPL > IOPL or in virtual mode, check the I/O permission
+ // bitmap in the TSS.
+
+ Addr IOPort = vaddr & ~IntAddrPrefixMask;
+ // Make sure the address fits in the expected 16 bit IO address
+ // space.
+ assert(!(IOPort & ~0xFFFF));
+
+ if (IOPort == 0xCF8 && req->getSize() == 4) {
+ req->setFlags(Request::MMAPPED_IPR);
+ req->setPaddr(MISCREG_PCI_CONFIG_ADDRESS * sizeof(MiscReg));
+ } else if ((IOPort & ~mask(2)) == 0xCFC) {
+ req->setFlags(Request::UNCACHEABLE);
+
+ Addr configAddress =
+ tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);
+
+ if (bits(configAddress, 31, 31)) {
+ req->setPaddr(PhysAddrPrefixPciConfig |
+ mbits(configAddress, 30, 2) |
+ (IOPort & mask(2)));
+ } else {
+ req->setPaddr(PhysAddrPrefixIO | IOPort);
+ }
+ } else {
+ req->setFlags(Request::UNCACHEABLE);
+ req->setPaddr(PhysAddrPrefixIO | IOPort);
+ }
+ return NoFault;
+ } else {
+ panic("Access to unrecognized internal address space %#x.\n",
+ prefix);
+ }
+ }
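+
+    // A worked example of the MSR path above: for a request whose
+    // (vaddr >> 3) carries the IntAddrPrefixMSR prefix together with MSR
+    // number 0x10, the switch selects MISCREG_TSC and the request's paddr
+    // becomes MISCREG_TSC * sizeof(MiscReg), so different MSRs never appear
+    // to overlap in memory-dependence checks.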
+
+ /**
+     * tlbLookup only performs a TLB lookup, returning true on a TLB hit
+     * and false on a TLB miss.
+     * Many of the checks for the different modes have been converted to
+     * assertions, since those parts of the code are not really used.
+     * On a hit it also updates the LRU stack.
+ */
+ bool
+ GpuTLB::tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats)
+ {
+ bool tlb_hit = false;
+ #ifndef NDEBUG
+ uint32_t flags = req->getFlags();
+ int seg = flags & SegmentFlagMask;
+ #endif
+
+ assert(seg != SEGMENT_REG_MS);
+ Addr vaddr = req->getVaddr();
+ DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+
+ if (m5Reg.prot) {
+ DPRINTF(GPUTLB, "In protected mode.\n");
+ // make sure we are in 64-bit mode
+ assert(m5Reg.mode == LongMode);
+
+ // If paging is enabled, do the translation.
+ if (m5Reg.paging) {
+ DPRINTF(GPUTLB, "Paging enabled.\n");
+ //update LRU stack on a hit
+ GpuTlbEntry *entry = lookup(vaddr, true);
+
+ if (entry)
+ tlb_hit = true;
+
+ if (!update_stats) {
+ // functional tlb access for memory initialization
+ // i.e., memory seeding or instr. seeding -> don't update
+ // TLB and stats
+ return tlb_hit;
+ }
+
+ localNumTLBAccesses++;
+
+ if (!entry) {
+ localNumTLBMisses++;
+ } else {
+ localNumTLBHits++;
+ }
+ }
+ }
+
+ return tlb_hit;
+ }
+
+ Fault
+ GpuTLB::translate(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode,
+ bool &delayedResponse, bool timing, int &latency)
+ {
+ uint32_t flags = req->getFlags();
+ int seg = flags & SegmentFlagMask;
+ bool storeCheck = flags & (StoreCheck << FlagShift);
+
+ // If this is true, we're dealing with a request
+ // to a non-memory address space.
+ if (seg == SEGMENT_REG_MS) {
+ return translateInt(req, tc);
+ }
+
+ delayedResponse = false;
+ Addr vaddr = req->getVaddr();
+ DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);
+
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+
+ // If protected mode has been enabled...
+ if (m5Reg.prot) {
+ DPRINTF(GPUTLB, "In protected mode.\n");
+ // If we're not in 64-bit mode, do protection/limit checks
+ if (m5Reg.mode != LongMode) {
+ DPRINTF(GPUTLB, "Not in long mode. Checking segment "
+ "protection.\n");
+
+ // Check for a null segment selector.
+ if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
+ seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
+ && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
+ return std::make_shared<GeneralProtection>(0);
+ }
+
+ bool expandDown = false;
+ SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));
+
+ if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
+ if (!attr.writable && (mode == BaseTLB::Write ||
+ storeCheck))
+ return std::make_shared<GeneralProtection>(0);
+
+ if (!attr.readable && mode == BaseTLB::Read)
+ return std::make_shared<GeneralProtection>(0);
+
+ expandDown = attr.expandDown;
+
+ }
+
+ Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
+ Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
+ // This assumes we're not in 64 bit mode. If we were, the
+ // default address size is 64 bits, overridable to 32.
+ int size = 32;
+ bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
+ SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);
+
+ if ((csAttr.defaultSize && sizeOverride) ||
+ (!csAttr.defaultSize && !sizeOverride)) {
+ size = 16;
+ }
+
+ Addr offset = bits(vaddr - base, size - 1, 0);
+ Addr endOffset = offset + req->getSize() - 1;
+
+ if (expandDown) {
+ DPRINTF(GPUTLB, "Checking an expand down segment.\n");
+ warn_once("Expand down segments are untested.\n");
+
+ if (offset <= limit || endOffset <= limit)
+ return std::make_shared<GeneralProtection>(0);
+ } else {
+ if (offset > limit || endOffset > limit)
+ return std::make_shared<GeneralProtection>(0);
+ }
+ }
+
+ // If paging is enabled, do the translation.
+ if (m5Reg.paging) {
+ DPRINTF(GPUTLB, "Paging enabled.\n");
+ // The vaddr already has the segment base applied.
+ GpuTlbEntry *entry = lookup(vaddr);
+ localNumTLBAccesses++;
+
+ if (!entry) {
+ localNumTLBMisses++;
+ if (timing) {
+ latency = missLatency1;
+ }
+
+ if (FullSystem) {
+ fatal("GpuTLB doesn't support full-system mode\n");
+ } else {
+ DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
+ "at pc %#x.\n", vaddr, tc->instAddr());
+
+ Process *p = tc->getProcessPtr();
+ GpuTlbEntry newEntry;
+ bool success = p->pTable->lookup(vaddr, newEntry);
+
+ if (!success && mode != BaseTLB::Execute) {
+ // penalize a "page fault" more
+ if (timing) {
+ latency += missLatency2;
+ }
+
+ if (p->fixupStackFault(vaddr))
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+
+ if (!success) {
+ return std::make_shared<PageFault>(vaddr, true,
+ mode, true,
+ false);
+ } else {
+ newEntry.valid = success;
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
+ alignedVaddr, newEntry.pageStart());
+
+ entry = insert(alignedVaddr, newEntry);
+ }
+
+ DPRINTF(GPUTLB, "Miss was serviced.\n");
+ }
+ } else {
+ localNumTLBHits++;
+
+ if (timing) {
+ latency = hitLatency;
+ }
+ }
+
+ // Do paging protection checks.
+ bool inUser = (m5Reg.cpl == 3 &&
+ !(flags & (CPL0FlagBit << FlagShift)));
+
+ CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
+ bool badWrite = (!entry->writable && (inUser || cr0.wp));
+
+ if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
+ badWrite)) {
+ // The page must have been present to get into the TLB in
+ // the first place. We'll assume the reserved bits are
+ // fine even though we're not checking them.
+ return std::make_shared<PageFault>(vaddr, true, mode,
+ inUser, false);
+ }
+
+ if (storeCheck && badWrite) {
+ // This would fault if this were a write, so return a page
+ // fault that reflects that happening.
+ return std::make_shared<PageFault>(vaddr, true,
+ BaseTLB::Write,
+ inUser, false);
+ }
+
+
+ DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
+ "checks.\n", entry->paddr);
+
+ int page_size = entry->size();
+ Addr paddr = entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+ req->setPaddr(paddr);
+
+ if (entry->uncacheable)
+ req->setFlags(Request::UNCACHEABLE);
+ } else {
+ //Use the address which already has segmentation applied.
+ DPRINTF(GPUTLB, "Paging disabled.\n");
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
+ req->setPaddr(vaddr);
+ }
+ } else {
+ // Real mode
+ DPRINTF(GPUTLB, "In real mode.\n");
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
+ req->setPaddr(vaddr);
+ }
+
+ // Check for an access to the local APIC
+ if (FullSystem) {
+ LocalApicBase localApicBase =
+ tc->readMiscRegNoEffect(MISCREG_APIC_BASE);
+
+ Addr baseAddr = localApicBase.base * PageBytes;
+ Addr paddr = req->getPaddr();
+
+ if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
+ // Force the access to be uncacheable.
+ req->setFlags(Request::UNCACHEABLE);
+ req->setPaddr(x86LocalAPICAddress(tc->contextId(),
+ paddr - baseAddr));
+ }
+ }
+
+ return NoFault;
+ };
+
+ Fault
+ GpuTLB::translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+ int &latency)
+ {
+ bool delayedResponse;
+
+ return GpuTLB::translate(req, tc, nullptr, mode, delayedResponse, false,
+ latency);
+ }
+
+ void
+ GpuTLB::translateTiming(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode, int &latency)
+ {
+ bool delayedResponse;
+ assert(translation);
+
+ Fault fault = GpuTLB::translate(req, tc, translation, mode,
+ delayedResponse, true, latency);
+
+ if (!delayedResponse)
+ translation->finish(fault, req, tc, mode);
+ }
+
+ Walker*
+ GpuTLB::getWalker()
+ {
+ return walker;
+ }
+
+
+ void
+ GpuTLB::serialize(CheckpointOut &cp) const
+ {
+ }
+
+ void
+ GpuTLB::unserialize(CheckpointIn &cp)
+ {
+ }
+
+ void
+ GpuTLB::regStats()
+ {
+ localNumTLBAccesses
+ .name(name() + ".local_TLB_accesses")
+ .desc("Number of TLB accesses")
+ ;
+
+ localNumTLBHits
+ .name(name() + ".local_TLB_hits")
+ .desc("Number of TLB hits")
+ ;
+
+ localNumTLBMisses
+ .name(name() + ".local_TLB_misses")
+ .desc("Number of TLB misses")
+ ;
+
+ localTLBMissRate
+ .name(name() + ".local_TLB_miss_rate")
+ .desc("TLB miss rate")
+ ;
+
+ accessCycles
+ .name(name() + ".access_cycles")
+ .desc("Cycles spent accessing this TLB level")
+ ;
+
+ pageTableCycles
+ .name(name() + ".page_table_cycles")
+ .desc("Cycles spent accessing the page table")
+ ;
+
+ localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;
+
+ numUniquePages
+ .name(name() + ".unique_pages")
+ .desc("Number of unique pages touched")
+ ;
+
+ localCycles
+ .name(name() + ".local_cycles")
+ .desc("Number of cycles spent in queue for all incoming reqs")
+ ;
+
+ localLatency
+ .name(name() + ".local_latency")
+ .desc("Avg. latency over incoming coalesced reqs")
+ ;
+
+ localLatency = localCycles / localNumTLBAccesses;
+
+ globalNumTLBAccesses
+ .name(name() + ".global_TLB_accesses")
+ .desc("Number of TLB accesses")
+ ;
+
+ globalNumTLBHits
+ .name(name() + ".global_TLB_hits")
+ .desc("Number of TLB hits")
+ ;
+
+ globalNumTLBMisses
+ .name(name() + ".global_TLB_misses")
+ .desc("Number of TLB misses")
+ ;
+
+ globalTLBMissRate
+ .name(name() + ".global_TLB_miss_rate")
+ .desc("TLB miss rate")
+ ;
+
+ globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;
+
+ avgReuseDistance
+ .name(name() + ".avg_reuse_distance")
+ .desc("avg. reuse distance over all pages (in ticks)")
+ ;
+
+ }
+
+ /**
+ * Do the TLB lookup for this coalesced request and schedule
+ * another event <TLB access latency> cycles later.
+ */
+
+ void
+ GpuTLB::issueTLBLookup(PacketPtr pkt)
+ {
+ assert(pkt);
+ assert(pkt->senderState);
+
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ bool update_stats = !sender_state->prefetch;
+ ThreadContext * tmp_tc = sender_state->tc;
+
+ DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
+ virt_page_addr);
+
+ int req_cnt = sender_state->reqCnt.back();
+
+ if (update_stats) {
+ accessCycles -= (curTick() * req_cnt);
+ localCycles -= curTick();
+ updatePageFootprint(virt_page_addr);
+ globalNumTLBAccesses += req_cnt;
+ }
+
+ tlbOutcome lookup_outcome = TLB_MISS;
+ RequestPtr tmp_req = pkt->req;
+
+ // Access the TLB and figure out if it's a hit or a miss.
+ bool success = tlbLookup(tmp_req, tmp_tc, update_stats);
+
+ if (success) {
+ lookup_outcome = TLB_HIT;
+ // Put the entry in SenderState
+ GpuTlbEntry *entry = lookup(tmp_req->getVaddr(), false);
+ assert(entry);
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+
+ if (update_stats) {
+ // the reqCnt has an entry per level, so its size tells us
+ // which level we are in
+ sender_state->hitLevel = sender_state->reqCnt.size();
+ globalNumTLBHits += req_cnt;
+ }
+ } else {
+ if (update_stats)
+ globalNumTLBMisses += req_cnt;
+ }
+
+ /*
+ * We now know the TLB lookup outcome (if it's a hit or a miss), as well
+ * as the TLB access latency.
+ *
+ * We create and schedule a new TLBEvent which will help us take the
+ * appropriate actions (e.g., update TLB on a hit, send request to lower
+ * level TLB on a miss, or start a page walk if this was the last-level
+ * TLB)
+ */
+ TLBEvent *tlb_event =
+ new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);
+
+ if (translationReturnEvent.count(virt_page_addr)) {
+ panic("Virtual Page Address %#x already has a return event\n",
+ virt_page_addr);
+ }
+
+ translationReturnEvent[virt_page_addr] = tlb_event;
+ assert(tlb_event);
+
+ DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
+ curTick() + this->ticks(hitLatency));
+
+ schedule(tlb_event, curTick() + this->ticks(hitLatency));
+ }
+
+ GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr, tlbOutcome tlb_outcome,
+ PacketPtr _pkt)
+ : Event(CPU_Tick_Pri), tlb(_tlb), virtPageAddr(_addr),
+ outcome(tlb_outcome), pkt(_pkt)
+ {
+ }
+
+ /**
+     * Do paging protection checks. If we encounter a page fault,
+     * an assertion is fired.
+ */
+ void
+ GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+ GpuTlbEntry * tlb_entry, Mode mode)
+ {
+ HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
+ uint32_t flags = pkt->req->getFlags();
+ bool storeCheck = flags & (StoreCheck << FlagShift);
+
+ // Do paging protection checks.
+ bool inUser = (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
+ CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
+
+ bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));
+
+ if ((inUser && !tlb_entry->user) ||
+ (mode == BaseTLB::Write && badWrite)) {
+ // The page must have been present to get into the TLB in
+ // the first place. We'll assume the reserved bits are
+ // fine even though we're not checking them.
+ assert(false);
+ }
+
+ if (storeCheck && badWrite) {
+ // This would fault if this were a write, so return a page
+ // fault that reflects that happening.
+ assert(false);
+ }
+ }
+
+ /**
+     * handleTranslationReturn is called on a TLB hit,
+     * when a TLB miss returns, or when a page fault returns.
+     * In the latter two cases it is invoked with TLB_MISS as the tlbOutcome.
+ */
+ void
+ GpuTLB::handleTranslationReturn(Addr virt_page_addr, tlbOutcome tlb_outcome,
+ PacketPtr pkt)
+ {
+
+ assert(pkt);
+ Addr vaddr = pkt->req->getVaddr();
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ Mode mode = sender_state->tlbMode;
+
+ GpuTlbEntry *local_entry, *new_entry;
+
+ if (tlb_outcome == TLB_HIT) {
+ DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n", vaddr);
+ local_entry = sender_state->tlbEntry;
+ } else {
+ DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
+ vaddr);
+
+ // We are returning either from a page walk or from a hit at a lower
+ // TLB level. The senderState should be "carrying" a pointer to the
+ // correct TLBEntry.
+ new_entry = sender_state->tlbEntry;
+ assert(new_entry);
+ local_entry = new_entry;
+
+ if (allocationPolicy) {
+ DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
+ virt_page_addr);
+
+ local_entry = insert(virt_page_addr, *new_entry);
+ }
+
+ assert(local_entry);
+ }
+
+ /**
+ * At this point the packet carries an up-to-date tlbEntry pointer
+ * in its senderState.
+ * Next step is to do the paging protection checks.
+ */
+ DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
+ "while paddr was %#x.\n", local_entry->vaddr,
+ local_entry->paddr);
+
+ pagingProtectionChecks(tc, pkt, local_entry, mode);
+ int page_size = local_entry->size();
+ Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+
+ // Since this packet will be sent through the cpu side slave port,
+ // it must be converted to a response pkt if it is not one already
+ if (pkt->isRequest()) {
+ pkt->makeTimingResponse();
+ }
+
+ pkt->req->setPaddr(paddr);
+
+ if (local_entry->uncacheable) {
+ pkt->req->setFlags(Request::UNCACHEABLE);
+ }
+
+ //send packet back to coalescer
+ cpuSidePort[0]->sendTimingResp(pkt);
+ //schedule cleanup event
+ cleanupQueue.push(virt_page_addr);
+
+        // Schedule this only once per cycle.
+        // The check is required because we might have multiple translations
+        // returning in the same cycle.
+        // This is a maximum priority event and must be on the same cycle
+        // as the cleanup event in TLBCoalescer to avoid a race with
+        // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry.
+ if (!cleanupEvent.scheduled())
+ schedule(cleanupEvent, curTick());
+ }
+
+ /**
+ * Here we take the appropriate actions based on the result of the
+ * TLB lookup.
+ */
+ void
+ GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+ PacketPtr pkt)
+ {
+ DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);
+
+ assert(translationReturnEvent[virtPageAddr]);
+ assert(pkt);
+
+ TranslationState *tmp_sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ int req_cnt = tmp_sender_state->reqCnt.back();
+ bool update_stats = !tmp_sender_state->prefetch;
+
+
+ if (outcome == TLB_HIT) {
+ handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);
+
+ if (update_stats) {
+ accessCycles += (req_cnt * curTick());
+ localCycles += curTick();
+ }
+
+ } else if (outcome == TLB_MISS) {
+
+ DPRINTF(GPUTLB, "This is a TLB miss\n");
+ if (update_stats) {
+ accessCycles += (req_cnt*curTick());
+ localCycles += curTick();
+ }
+
+ if (hasMemSidePort) {
+                // the one cycle added here represents the delay from when we
+                // get the reply back until when we propagate it to the
+                // coalescer above.
+ if (update_stats) {
+ accessCycles += (req_cnt * 1);
+ localCycles += 1;
+ }
+
+ /**
+ * There is a TLB below. Send the coalesced request.
+ * We actually send the very first packet of all the
+ * pending packets for this virtual page address.
+ */
+ if (!memSidePort[0]->sendTimingReq(pkt)) {
+ DPRINTF(GPUTLB, "Failed sending translation request to "
+ "lower level TLB for addr %#x\n", virtPageAddr);
+
+ memSidePort[0]->retries.push_back(pkt);
+ } else {
+ DPRINTF(GPUTLB, "Sent translation request to lower level "
+ "TLB for addr %#x\n", virtPageAddr);
+ }
+ } else {
+ //this is the last level TLB. Start a page walk
+ DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
+ "addr %#x\n", virtPageAddr);
+
+ if (update_stats)
+ pageTableCycles -= (req_cnt*curTick());
+
+ TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
+ assert(tlb_event);
+ tlb_event->updateOutcome(PAGE_WALK);
+ schedule(tlb_event, curTick() + ticks(missLatency2));
+ }
+ } else if (outcome == PAGE_WALK) {
+ if (update_stats)
+ pageTableCycles += (req_cnt*curTick());
+
+ // Need to access the page table and update the TLB
+ DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+ virtPageAddr);
+
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ Process *p = sender_state->tc->getProcessPtr();
+ TlbEntry newEntry;
+ Addr vaddr = pkt->req->getVaddr();
+ #ifndef NDEBUG
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+ assert(alignedVaddr == virtPageAddr);
+ #endif
+ bool success;
+ success = p->pTable->lookup(vaddr, newEntry);
+ if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+ if (p->fixupStackFault(vaddr)) {
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+ }
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, newEntry.vaddr, newEntry.paddr, success);
+
+ handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
+ } else if (outcome == MISS_RETURN) {
+ /** we add an extra cycle in the return path of the translation
+ * requests in between the various TLB levels.
+ */
+ handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
+ } else {
+ assert(false);
+ }
+ }
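+
+    // To summarize the outcome handling above (assuming a multi-level TLB
+    // hierarchy):
+    //   TLB_HIT     -> handleTranslationReturn(..., TLB_HIT, pkt)
+    //   TLB_MISS    -> forward the packet to the lower-level TLB if
+    //                  hasMemSidePort, otherwise reschedule this event as
+    //                  PAGE_WALK after missLatency2
+    //   PAGE_WALK   -> consult the page table, then
+    //                  handleTranslationReturn(..., TLB_MISS, pkt)
+    //   MISS_RETURN -> a lower level replied;
+    //                  handleTranslationReturn(..., TLB_MISS, pkt)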
+
+ void
+ GpuTLB::TLBEvent::process()
+ {
+ tlb->translationReturn(virtPageAddr, outcome, pkt);
+ }
+
+ const char*
+ GpuTLB::TLBEvent::description() const
+ {
+ return "trigger translationDoneEvent";
+ }
+
+ void
+ GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
+ {
+ outcome = _outcome;
+ }
+
+ Addr
+ GpuTLB::TLBEvent::getTLBEventVaddr()
+ {
+ return virtPageAddr;
+ }
+
+ /*
+     * recvTimingReq receives a coalesced timing request from a TLBCoalescer
+     * and calls issueTLBLookup().
+     * It only rejects the packet if we have exceeded the maximum number
+     * of outstanding requests for the TLB.
+ */
+ bool
+ GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
+ {
+ if (tlb->outstandingReqs < tlb->maxCoalescedReqs) {
+ tlb->issueTLBLookup(pkt);
+ // update number of outstanding translation requests
+ tlb->outstandingReqs++;
+ return true;
+ } else {
+ DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
+ tlb->outstandingReqs);
+ return false;
+ }
+ }
+
+ /**
+     * handleFuncTranslationReturn is called on a TLB hit,
+     * when a TLB miss returns, or when a page fault returns.
+     * It updates the LRU stack, inserts the TLB entry on a miss
+     * (depending on the allocation policy), and does the required
+     * protection checks. It does NOT create a new packet to
+     * update the packet's addr; this is done in the hsail-gpu code.
+ */
+ void
+ GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
+ {
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ Mode mode = sender_state->tlbMode;
+ Addr vaddr = pkt->req->getVaddr();
+
+ GpuTlbEntry *local_entry, *new_entry;
+
+ if (tlb_outcome == TLB_HIT) {
+ DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
+ "%#x\n", vaddr);
+
+ local_entry = sender_state->tlbEntry;
+ } else {
+ DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
+ "%#x\n", vaddr);
+
+ // We are returning either from a page walk or from a hit at a lower
+ // TLB level. The senderState should be "carrying" a pointer to the
+ // correct TLBEntry.
+ new_entry = sender_state->tlbEntry;
+ assert(new_entry);
+ local_entry = new_entry;
+
+ if (allocationPolicy) {
+ Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);
+
+ DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
+ virt_page_addr);
+
+ local_entry = insert(virt_page_addr, *new_entry);
+ }
+
+ assert(local_entry);
+ }
+
+ DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
+ "while paddr was %#x.\n", local_entry->vaddr,
+ local_entry->paddr);
+
+ // Do paging checks if it's a normal functional access. If it's for a
+ // prefetch, then sometimes you can try to prefetch something that won't
+        // pass protection. We don't actually want to fault because there is no
+ // demand access to deem this a violation. Just put it in the TLB and
+ // it will fault if indeed a future demand access touches it in
+ // violation.
+ if (!sender_state->prefetch && sender_state->tlbEntry->valid)
+ pagingProtectionChecks(tc, pkt, local_entry, mode);
+
+ int page_size = local_entry->size();
+ Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
+ DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
+
+ pkt->req->setPaddr(paddr);
+
+ if (local_entry->uncacheable)
+ pkt->req->setFlags(Request::UNCACHEABLE);
+ }
+
+ // This is used for atomic translations. Need to
+ // make it all happen during the same cycle.
+ void
+ GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
+ {
+ TranslationState *sender_state =
+ safe_cast<TranslationState*>(pkt->senderState);
+
+ ThreadContext *tc = sender_state->tc;
+ bool update_stats = !sender_state->prefetch;
+
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ if (update_stats)
+ tlb->updatePageFootprint(virt_page_addr);
+
+ // do the TLB lookup without updating the stats
+ bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
+ tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;
+
+ // functional mode means no coalescing
+ // global metrics are the same as the local metrics
+ if (update_stats) {
+ tlb->globalNumTLBAccesses++;
+
+ if (success) {
+ sender_state->hitLevel = sender_state->reqCnt.size();
+ tlb->globalNumTLBHits++;
+ }
+ }
+
+ if (!success) {
+ if (update_stats)
+ tlb->globalNumTLBMisses++;
+ if (tlb->hasMemSidePort) {
+ // there is a TLB below -> propagate down the TLB hierarchy
+ tlb->memSidePort[0]->sendFunctional(pkt);
+ // If no valid translation from a prefetch, then just return
+ if (sender_state->prefetch && !pkt->req->hasPaddr())
+ return;
+ } else {
+ // Need to access the page table and update the TLB
+ DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
+ virt_page_addr);
+
+ Process *p = tc->getProcessPtr();
+ TlbEntry newEntry;
+
+ Addr vaddr = pkt->req->getVaddr();
+ #ifndef NDEBUG
+ Addr alignedVaddr = p->pTable->pageAlign(vaddr);
+ assert(alignedVaddr == virt_page_addr);
+ #endif
+
+ bool success = p->pTable->lookup(vaddr, newEntry);
+ if (!success && sender_state->tlbMode != BaseTLB::Execute) {
+ if (p->fixupStackFault(vaddr))
+ success = p->pTable->lookup(vaddr, newEntry);
+ }
+
+ if (!sender_state->prefetch) {
+ // no PageFaults are permitted after
+ // the second page table lookup
+ assert(success);
+
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry = new GpuTlbEntry(0, newEntry.vaddr,
+ newEntry.paddr,
+ success);
+ } else {
+                    // If this was a prefetch, then do the normal thing if the
+                    // translation succeeded. Otherwise, send an empty
+                    // TLB entry back so that it can be recognized as empty
+                    // and handled accordingly.
+ if (success) {
+ DPRINTF(GPUTLB, "Mapping %#x to %#x\n", alignedVaddr,
+ newEntry.pageStart());
+
+ sender_state->tlbEntry = new GpuTlbEntry(0,
+ newEntry.vaddr,
+ newEntry.paddr,
+ success);
+ } else {
+ DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
+ alignedVaddr);
+
+ sender_state->tlbEntry = new GpuTlbEntry();
+
+ return;
+ }
+ }
+ }
+ } else {
+ DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n",
+ tlb->lookup(pkt->req->getVaddr()));
+
+ GpuTlbEntry *entry = tlb->lookup(pkt->req->getVaddr(),
+ update_stats);
+
+ assert(entry);
+
+ sender_state->tlbEntry =
+ new GpuTlbEntry(0, entry->vaddr, entry->paddr, entry->valid);
+ }
+ // This is the function that would populate pkt->req with the paddr of
+        // the translation. But if no translation happens (i.e., the prefetch
+        // fails), then the early returns in the above code will keep this
+        // function from executing.
+ tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
+ }
+
+ void
+ GpuTLB::CpuSidePort::recvReqRetry()
+ {
+ // The CPUSidePort never sends anything but replies. No retries
+ // expected.
+ assert(false);
+ }
+
+ AddrRangeList
+ GpuTLB::CpuSidePort::getAddrRanges() const
+ {
+ // currently not checked by the master
+ AddrRangeList ranges;
+
+ return ranges;
+ }
+
+ /**
+ * MemSidePort receives the packet back.
+     * We need to call handleTranslationReturn
+     * and propagate the packet up the hierarchy.
+ */
+ bool
+ GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
+ {
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
+ virt_page_addr);
+
+ TLBEvent *tlb_event = tlb->translationReturnEvent[virt_page_addr];
+ assert(tlb_event);
+ assert(virt_page_addr == tlb_event->getTLBEventVaddr());
+
+ tlb_event->updateOutcome(MISS_RETURN);
+ tlb->schedule(tlb_event, curTick()+tlb->ticks(1));
+
+ return true;
+ }
+
+ void
+ GpuTLB::MemSidePort::recvReqRetry()
+ {
+ // No retries should reach the TLB. The retries
+ // should only reach the TLBCoalescer.
+ assert(false);
+ }
+
+ void
+ GpuTLB::cleanup()
+ {
+ while (!cleanupQueue.empty()) {
+ Addr cleanup_addr = cleanupQueue.front();
+ cleanupQueue.pop();
+
+ // delete TLBEvent
+ TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];
+ delete old_tlb_event;
+ translationReturnEvent.erase(cleanup_addr);
+
+ // update number of outstanding requests
+ outstandingReqs--;
+ }
+
+ /** the higher level coalescer should retry if it has
+ * any pending requests.
+ */
+ for (int i = 0; i < cpuSidePort.size(); ++i) {
+ cpuSidePort[i]->sendRetryReq();
+ }
+ }
+
+ void
+ GpuTLB::updatePageFootprint(Addr virt_page_addr)
+ {
+
+ std::pair<AccessPatternTable::iterator, bool> ret;
+
+ AccessInfo tmp_access_info;
+ tmp_access_info.lastTimeAccessed = 0;
+ tmp_access_info.accessesPerPage = 0;
+ tmp_access_info.totalReuseDistance = 0;
+ tmp_access_info.sumDistance = 0;
+ tmp_access_info.meanDistance = 0;
+
+ ret = TLBFootprint.insert(AccessPatternTable::value_type(virt_page_addr,
+ tmp_access_info));
+
+ bool first_page_access = ret.second;
+
+ if (first_page_access) {
+ numUniquePages++;
+ } else {
+ int accessed_before;
+ accessed_before = curTick() - ret.first->second.lastTimeAccessed;
+ ret.first->second.totalReuseDistance += accessed_before;
+ }
+
+ ret.first->second.accessesPerPage++;
+ ret.first->second.lastTimeAccessed = curTick();
+
+ if (accessDistance) {
+ ret.first->second.localTLBAccesses
+ .push_back(localNumTLBAccesses.value());
+ }
+ }
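+
+    // For example, if a page is first touched at tick 1000 and touched again
+    // at tick 5000, the second call finds the existing entry, adds
+    // 5000 - 1000 = 4000 to totalReuseDistance, increments accessesPerPage
+    // to 2, and records 5000 as lastTimeAccessed; exitCallback() later
+    // divides totalReuseDistance by accessesPerPage to obtain the per-page
+    // average reuse distance.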
+
+ void
+ GpuTLB::exitCallback()
+ {
+ std::ostream *page_stat_file = nullptr;
+
+ if (accessDistance) {
+
+ // print per page statistics to a separate file (.csv format)
+            // simout is the gem5 output directory (default is m5out or the
+            // one specified with -d).
+ page_stat_file = simout.create(name().c_str());
+
+ // print header
+ *page_stat_file << "page,max_access_distance,mean_access_distance, "
+ << "stddev_distance" << std::endl;
+ }
+
+ // update avg. reuse distance footprint
+ AccessPatternTable::iterator iter, iter_begin, iter_end;
+ unsigned int sum_avg_reuse_distance_per_page = 0;
+
+ // iterate through all pages seen by this TLB
+ for (iter = TLBFootprint.begin(); iter != TLBFootprint.end(); iter++) {
+ sum_avg_reuse_distance_per_page += iter->second.totalReuseDistance /
+ iter->second.accessesPerPage;
+
+ if (accessDistance) {
+ unsigned int tmp = iter->second.localTLBAccesses[0];
+ unsigned int prev = tmp;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ if (i) {
+ tmp = prev + 1;
+ }
+
+ prev = iter->second.localTLBAccesses[i];
+ // update the localTLBAccesses value
+                    // with the actual difference
+ iter->second.localTLBAccesses[i] -= tmp;
+ // compute the sum of AccessDistance per page
+ // used later for mean
+ iter->second.sumDistance +=
+ iter->second.localTLBAccesses[i];
+ }
+
+ iter->second.meanDistance =
+ iter->second.sumDistance / iter->second.accessesPerPage;
+
+ // compute std_dev and max (we need a second round because we
+                // need to know the mean value)
+ unsigned int max_distance = 0;
+ unsigned int stddev_distance = 0;
+
+ for (int i = 0; i < iter->second.localTLBAccesses.size(); ++i) {
+ unsigned int tmp_access_distance =
+ iter->second.localTLBAccesses[i];
+
+ if (tmp_access_distance > max_distance) {
+ max_distance = tmp_access_distance;
+ }
+
+ unsigned int diff =
+ tmp_access_distance - iter->second.meanDistance;
+ stddev_distance += pow(diff, 2);
+
+ }
+
+ stddev_distance =
+ sqrt(stddev_distance/iter->second.accessesPerPage);
+
+ if (page_stat_file) {
+ *page_stat_file << std::hex << iter->first << ",";
+ *page_stat_file << std::dec << max_distance << ",";
+ *page_stat_file << std::dec << iter->second.meanDistance
+ << ",";
+ *page_stat_file << std::dec << stddev_distance;
+ *page_stat_file << std::endl;
+ }
+
+ // erase the localTLBAccesses array
+ iter->second.localTLBAccesses.clear();
+ }
+ }
+
+ if (!TLBFootprint.empty()) {
+ avgReuseDistance =
+ sum_avg_reuse_distance_per_page / TLBFootprint.size();
+ }
+
+ //clear the TLBFootprint map
+ TLBFootprint.clear();
+ }
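+
+    // Access-distance example for the loop above: if localTLBAccesses for a
+    // page holds {3, 7, 12} (the value of localNumTLBAccesses at each touch
+    // of the page), it is rewritten in place as {0, 3, 4}, i.e., the number
+    // of other coalesced TLB accesses between consecutive touches; the
+    // per-page mean and standard deviation are then computed over these
+    // distances and dumped to the .csv file.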
+} // namespace X86ISA
+
+X86ISA::GpuTLB*
+X86GPUTLBParams::create()
+{
+ return new X86ISA::GpuTLB(this);
+}
+
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __GPU_TLB_HH__
+#define __GPU_TLB_HH__
+
+#include <fstream>
+#include <list>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/callback.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/X86GPUTLB.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+namespace X86ISA
+{
+ class GpuTlbEntry : public TlbEntry
+ {
+ public:
+ GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
+ : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
+
+ GpuTlbEntry() : TlbEntry() { }
+
+ bool valid;
+ };
+
+ class GpuTLB : public MemObject
+ {
+ protected:
+ friend class Walker;
+
+ typedef std::list<GpuTlbEntry*> EntryList;
+
+ uint32_t configAddress;
+
+        // TLB clock: will inherit the clock from the shader's clock period,
+        // expressed as a number of ticks of the global simulation clock
+        // (curTick).
+        // The assignment of the TLB clock from the shader clock is done in
+        // the python config files.
+ int clock;
+
+ public:
+        // clock-related functions; map to and from simulation ticks and
+        // object cycles.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+
+ Tick
+ ticks(int numCycles) const
+ {
+ return (Tick)clock * numCycles;
+ }
+
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock;}
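+
+        // For instance, with the default 1 ps tick resolution
+        // (SimClock::Frequency == 10^12 ticks/s) and clock == 1000 ticks
+        // (a 1 GHz TLB), frequency() returns 10^9, ticks(4) returns 4000,
+        // and curCycle() is curTick() / 1000.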
+
+ typedef X86GPUTLBParams Params;
+ GpuTLB(const Params *p);
+ ~GpuTLB();
+
+ typedef enum BaseTLB::Mode Mode;
+
+ class Translation
+ {
+ public:
+ virtual ~Translation() { }
+
+ /**
+ * Signal that the translation has been delayed due to a hw page
+ * table walk.
+ */
+ virtual void markDelayed() = 0;
+
+ /**
+ * The memory for this object may be dynamically allocated, and it
+             * may be responsible for cleaning itself up, which will happen
+             * in this function. Once it's called, the object is no longer
+             * valid.
+ */
+ virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
+ Mode mode) = 0;
+ };
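+
+        // A minimal sketch of a Translation subclass (GpuFooTranslation is
+        // purely illustrative):
+        //
+        //     class GpuFooTranslation : public GpuTLB::Translation
+        //     {
+        //         void markDelayed() { /* note that the walk started */ }
+        //
+        //         void
+        //         finish(Fault fault, RequestPtr req, ThreadContext *tc,
+        //                Mode mode)
+        //         {
+        //             // consume req->getPaddr() or handle the fault, then
+        //             // clean up; the object may delete itself here
+        //         }
+        //     };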
+
+ void dumpAll();
+ GpuTlbEntry *lookup(Addr va, bool update_lru=true);
+ void setConfigAddress(uint32_t addr);
+
+ protected:
+ EntryList::iterator lookupIt(Addr va, bool update_lru=true);
+ Walker *walker;
+
+ public:
+ Walker *getWalker();
+ void invalidateAll();
+ void invalidateNonGlobal();
+ void demapPage(Addr va, uint64_t asn);
+
+ protected:
+ int size;
+ int assoc;
+ int numSets;
+
+ /**
+ * true if this is a fully-associative TLB
+ */
+ bool FA;
+ Addr setMask;
+
+ /**
+ * Allocation Policy: true if we always allocate on a hit, false
+ * otherwise. Default is true.
+ */
+ bool allocationPolicy;
+
+ /**
+ * if true, then this is not the last level TLB
+ */
+ bool hasMemSidePort;
+
+ /**
+ * Print out accessDistance stats. One stat file
+ * per TLB.
+ */
+ bool accessDistance;
+
+ GpuTlbEntry *tlb;
+
+ /*
+ * It's a per-set list. As long as we have not reached
+ * the full capacity of the given set, grab an entry from
+ * the freeList.
+ */
+ std::vector<EntryList> freeList;
+
+ /**
+ * An entryList per set is the equivalent of an LRU stack;
+ * it's used to guide replacement decisions. The head of the list
+ * contains the MRU TLB entry of the given set. If the freeList
+ * for this set is empty, the last element of the list
+ * is evicted (i.e., dropped on the floor).
+ */
+ std::vector<EntryList> entryList;
+
+ Fault translateInt(RequestPtr req, ThreadContext *tc);
+
+ Fault translate(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode, bool &delayedResponse,
+ bool timing, int &latency);
+
+ public:
+ // latencies for a TLB hit, miss and page fault
+ int hitLatency;
+ int missLatency1;
+ int missLatency2;
+
+ // local_stats are as seen from the TLB
+ // without taking into account coalescing
+ Stats::Scalar localNumTLBAccesses;
+ Stats::Scalar localNumTLBHits;
+ Stats::Scalar localNumTLBMisses;
+ Stats::Formula localTLBMissRate;
+
+ // global_stats are as seen from the
+ // CU's perspective taking into account
+ // all coalesced requests.
+ Stats::Scalar globalNumTLBAccesses;
+ Stats::Scalar globalNumTLBHits;
+ Stats::Scalar globalNumTLBMisses;
+ Stats::Formula globalTLBMissRate;
+
+ // from the CU perspective (global)
+ Stats::Scalar accessCycles;
+ // from the CU perspective (global)
+ Stats::Scalar pageTableCycles;
+ Stats::Scalar numUniquePages;
+ // from the perspective of this TLB
+ Stats::Scalar localCycles;
+ // from the perspective of this TLB
+ Stats::Formula localLatency;
+ // I take the avg. per page and then
+ // the avg. over all pages.
+ Stats::Scalar avgReuseDistance;
+
+ void regStats();
+ void updatePageFootprint(Addr virt_page_addr);
+ void printAccessPattern();
+
+
+ Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+ int &latency);
+
+ void translateTiming(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode,
+ int &latency);
+
+ Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
+ Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
+
+ GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
+
+ // Checkpointing
+ virtual void serialize(CheckpointOut& cp) const;
+ virtual void unserialize(CheckpointIn& cp);
+ void issueTranslation();
+ enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
+ bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
+
+ void handleTranslationReturn(Addr addr, tlbOutcome outcome,
+ PacketPtr pkt);
+
+ void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
+
+ void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+ GpuTlbEntry *tlb_entry, Mode mode);
+
+ void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
+ Addr phys_page_addr);
+
+ void issueTLBLookup(PacketPtr pkt);
+
+ // CpuSidePort is the TLB Port closer to the CPU/CU side
+ class CpuSidePort : public SlavePort
+ {
+ public:
+ CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+ PortID _index)
+ : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+ protected:
+ GpuTLB *tlb;
+ int index;
+
+ virtual bool recvTimingReq(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ virtual void recvRespRetry() { assert(false); }
+ virtual AddrRangeList getAddrRanges() const;
+ };
+
+ /**
+ * MemSidePort is the TLB Port closer to the memory side
+ * If this is a last level TLB then this port will not be connected.
+ *
+ * Future action item: if we ever do real page walks, then this port
+ * should be connected to a RubyPort.
+ */
+ class MemSidePort : public MasterPort
+ {
+ public:
+ MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+ PortID _index)
+ : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+ std::deque<PacketPtr> retries;
+
+ protected:
+ GpuTLB *tlb;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ // TLB ports on the cpu Side
+ std::vector<CpuSidePort*> cpuSidePort;
+ // TLB ports on the memory side
+ std::vector<MemSidePort*> memSidePort;
+
+ BaseMasterPort &getMasterPort(const std::string &if_name,
+ PortID idx=InvalidPortID);
+
+ BaseSlavePort &getSlavePort(const std::string &if_name,
+ PortID idx=InvalidPortID);
+
+ /**
+         * TLB TranslationState: this is currently somewhat of a
+         * bastardization of the usage of SenderState, whereby the receiver
+         * of a packet is not usually supposed to need to look at the
+         * contents of the senderState; you're really only supposed to look
+         * at what you pushed on, pop it off, and send it back.
+ *
+ * However, since there is state that we want to pass to the TLBs using
+ * the send/recv Timing/Functional/etc. APIs, which don't allow for new
+ * arguments, we need a common TLB senderState to pass between TLBs,
+ * both "forwards" and "backwards."
+ *
+ * So, basically, the rule is that any packet received by a TLB port
+ * (cpuside OR memside) must be safely castable to a TranslationState.
+ */
+
+ struct TranslationState : public Packet::SenderState
+ {
+ // TLB mode, read or write
+ Mode tlbMode;
+ // Thread context associated with this req
+ ThreadContext *tc;
+
+ /*
+             * TLB entry to be populated and passed back to fill in
+             * previous TLB levels. Equivalent to the data cache concept of
+ * "data return."
+ */
+ GpuTlbEntry *tlbEntry;
+ // Is this a TLB prefetch request?
+ bool prefetch;
+ // When was the req for this translation issued
+ uint64_t issueTime;
+ // Remember where this came from
+            std::vector<SlavePort*> ports;
+
+ // keep track of #uncoalesced reqs per packet per TLB level;
+ // reqCnt per level >= reqCnt higher level
+ std::vector<int> reqCnt;
+ // TLB level this packet hit in; 0 if it hit in the page table
+ int hitLevel;
+ Packet::SenderState *saved;
+
+ TranslationState(Mode tlb_mode, ThreadContext *_tc,
+ bool _prefetch=false,
+ Packet::SenderState *_saved=nullptr)
+ : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
+ prefetch(_prefetch), issueTime(0),
+                  hitLevel(0), saved(_saved) { }
+ };
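+
+        // Illustrative sender-side usage (tc, num_reqs, and port are assumed
+        // to exist on the sender): allocate a TranslationState, attach it to
+        // the packet, and record the uncoalesced request count before
+        // sending the packet to this TLB's CpuSidePort:
+        //
+        //     TranslationState *state =
+        //         new TranslationState(BaseTLB::Read, tc);
+        //     state->reqCnt.push_back(num_reqs);
+        //     pkt->senderState = state;
+        //     port->sendTimingReq(pkt);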
+
+ // maximum number of permitted coalesced requests per cycle
+ int maxCoalescedReqs;
+
+        // Current number of outstanding coalesced requests.
+ // Should be <= maxCoalescedReqs
+ int outstandingReqs;
+
+ /**
+ * A TLBEvent is scheduled after the TLB lookup and helps us take the
+ * appropriate actions:
+ * (e.g., update TLB on a hit,
+ * send request to lower level TLB on a miss,
+ * or start a page walk if this was the last-level TLB).
+ */
+ void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+ PacketPtr pkt);
+
+ class TLBEvent : public Event
+ {
+ private:
+ GpuTLB *tlb;
+ Addr virtPageAddr;
+ /**
+ * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
+ */
+ tlbOutcome outcome;
+ PacketPtr pkt;
+
+ public:
+ TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
+ PacketPtr _pkt);
+
+ void process();
+ const char *description() const;
+
+ // updateOutcome updates the tlbOutcome of a TLBEvent
+ void updateOutcome(tlbOutcome _outcome);
+ Addr getTLBEventVaddr();
+ };
+
+ std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
+
+ // this FIFO queue keeps track of the virt. page addresses
+ // that are pending cleanup
+ std::queue<Addr> cleanupQueue;
+
+ // the cleanupEvent is scheduled after a TLBEvent triggers in order to
+ // free memory and do the required clean-up
+ void cleanup();
+
+ EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
+
+ /**
+ * This hash map will use the virtual page address as a key
+ * and will keep track of total number of accesses per page
+ */
+
+ struct AccessInfo
+ {
+ unsigned int lastTimeAccessed; // last access to this page
+ unsigned int accessesPerPage;
+ // need to divide it by accessesPerPage at the end
+ unsigned int totalReuseDistance;
+
+ /**
+ * The field below will help us compute the access distance,
+ * that is the number of (coalesced) TLB accesses that
+ * happened in between each access to this page
+ *
+             * localTLBAccesses[x] is the value of localNumTLBAccesses
+ * when the page <Addr> was accessed for the <x>th time
+ */
+ std::vector<unsigned int> localTLBAccesses;
+ unsigned int sumDistance;
+ unsigned int meanDistance;
+ };
+
+ typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
+ AccessPatternTable TLBFootprint;
+
+ // Called at the end of simulation to dump page access stats.
+ void exitCallback();
+
+ EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
+ };
+}
+
+#endif // __GPU_TLB_HH__
diff --git a/src/gpu-compute/hsa_code.hh b/src/gpu-compute/hsa_code.hh
new file mode 100644
index 000000000..9f358e23c
--- /dev/null
+++ b/src/gpu-compute/hsa_code.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __HSA_CODE_HH__
+#define __HSA_CODE_HH__
+
+#include <string>
+#include <vector>
+
+#include "arch/gpu_types.hh"
+#include "config/the_gpu_isa.hh"
+
+class HsaKernelInfo;
+
+/* @class HsaCode
+ * Base code object for the set of HSA kernels associated
+ * with a single application. This class provides the common
+ * methods for creating, accessing, and storing information
+ * about kernel and variable symbols, symbol names, memory
+ * segment sizes, instruction count, etc.
+ */
+
+class HsaCode
+{
+ public:
+ HsaCode(const std::string &name) : readonly_data(nullptr), funcarg_size(0),
+ _name(name)
+ {
+ }
+
+ enum class MemorySegment {
+ NONE,
+ FLAT,
+ GLOBAL,
+ READONLY,
+ KERNARG,
+ GROUP,
+ PRIVATE,
+ SPILL,
+ ARG,
+ EXTSPACE0
+ };
+
+ const std::string& name() const { return _name; }
+ int numInsts() const { return _insts.size(); }
+ std::vector<TheGpuISA::RawMachInst>* insts() { return &_insts; }
+
+ void
+ setReadonlyData(uint8_t *_readonly_data)
+ {
+ readonly_data = _readonly_data;
+ }
+
+ virtual int getSize(MemorySegment segment) const = 0;
+ virtual void generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const = 0;
+
+ uint8_t *readonly_data;
+ int funcarg_size;
+
+ protected:
+ // An array that stores instruction indices (0 through kernel size)
+    // for a kernel passed to the code object constructor as an argument.
+ std::vector<TheGpuISA::RawMachInst> _insts;
+
+ private:
+ const std::string _name;
+};
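+
+// A concrete code object is expected to subclass HsaCode and implement the
+// two pure virtuals; a minimal sketch (MyCode is an illustrative name only):
+//
+//     class MyCode : public HsaCode
+//     {
+//       public:
+//         MyCode(const std::string &name) : HsaCode(name) { }
+//
+//         int getSize(MemorySegment segment) const { return 0; }
+//
+//         void
+//         generateHsaKernelInfo(HsaKernelInfo *info) const
+//         {
+//             // fill in per-kernel segment sizes, register counts, etc.
+//         }
+//     };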
+
+#endif // __HSA_CODE_HH__
diff --git a/src/gpu-compute/hsa_kernel_info.hh b/src/gpu-compute/hsa_kernel_info.hh
new file mode 100644
index 000000000..396913dac
--- /dev/null
+++ b/src/gpu-compute/hsa_kernel_info.hh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __HSA_KERNEL_INFO_HH__
+#define __HSA_KERNEL_INFO_HH__
+
+// This file defines the public interface between the HSA emulated
+// driver and application programs.
+
+#include <cstdint>
+
+static const int HSA_GET_SIZES = 0x4801;
+static const int HSA_GET_KINFO = 0x4802;
+static const int HSA_GET_STRINGS = 0x4803;
+static const int HSA_GET_CODE = 0x4804;
+static const int HSA_GET_READONLY_DATA = 0x4805;
+static const int HSA_GET_CU_CNT = 0x4806;
+static const int HSA_GET_VSZ = 0x4807;
+
+// Return value (via buffer ptr) for HSA_GET_SIZES
+struct HsaDriverSizes
+{
+ uint32_t num_kernels;
+ uint32_t string_table_size;
+ uint32_t code_size;
+ uint32_t readonly_size;
+};
+
+// HSA_GET_KINFO returns an array of num_kernels of these structs
+struct HsaKernelInfo
+{
+ // byte offset into string table
+ uint32_t name_offs;
+ // byte offset into code array
+ uint32_t code_offs;
+ uint32_t static_lds_size;
+ uint32_t private_mem_size;
+ uint32_t spill_mem_size;
+ // Number of s registers
+ uint32_t sRegCount;
+ // Number of d registers
+ uint32_t dRegCount;
+ // Number of c registers
+ uint32_t cRegCount;
+};
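+
+// Illustrative (assumed) application-side usage: the request codes above are
+// passed to ioctl() on the emulated driver's file descriptor (hsa_fd below is
+// assumed):
+//
+//     HsaDriverSizes sizes;
+//     ioctl(hsa_fd, HSA_GET_SIZES, &sizes);
+//
+//     std::vector<HsaKernelInfo> kinfo(sizes.num_kernels);
+//     ioctl(hsa_fd, HSA_GET_KINFO, kinfo.data());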
+
+#endif // __HSA_KERNEL_INFO_HH__
diff --git a/src/gpu-compute/hsa_object.cc b/src/gpu-compute/hsa_object.cc
new file mode 100644
index 000000000..91dfb160e
--- /dev/null
+++ b/src/gpu-compute/hsa_object.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#include "gpu-compute/hsa_object.hh"
+
+#include <fstream>
+
+#include "gpu-compute/brig_object.hh"
+
+HsaObject::HsaObject(const std::string &fname)
+ : readonlyData(nullptr), filename(fname)
+{
+}
+
+HsaObject*
+HsaObject::createHsaObject(const std::string &fname)
+{
+ HsaObject *hsaObj = nullptr;
+ uint8_t *file_data = nullptr;
+ int file_length = 0;
+
+ std::ifstream code_file(fname, std::ifstream::ate | std::ifstream::in |
+ std::ifstream::binary);
+
+ assert(code_file.is_open());
+ assert(code_file.good());
+
+ file_length = code_file.tellg();
+ code_file.seekg(0, code_file.beg);
+ file_data = new uint8_t[file_length];
+ code_file.read((char*)file_data, file_length);
+ code_file.close();
+
+ for (const auto &tryFile : tryFileFuncs) {
+ if ((hsaObj = tryFile(fname, file_length, file_data))) {
+ return hsaObj;
+ }
+ }
+
+ delete[] file_data;
+ fatal("Unknown HSA object type for file: %s.\n", fname);
+
+ return nullptr;
+}
diff --git a/src/gpu-compute/hsa_object.hh b/src/gpu-compute/hsa_object.hh
new file mode 100644
index 000000000..1f08f5d80
--- /dev/null
+++ b/src/gpu-compute/hsa_object.hh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Anthony Gutierrez
+ */
+
+#ifndef __HSA_OBJECT_HH__
+#define __HSA_OBJECT_HH__
+
+#include <functional>
+#include <string>
+#include <vector>
+
+class HsaCode;
+
+/* @class HsaObject
+ * base loader object for HSA kernels. this class provides
+ * the base method definitions for loading HSA kernel objects
+ * into the simulator and for storing and accessing them.
+ */
+
+class HsaObject
+{
+ public:
+ HsaObject(const std::string &fileName);
+
+ static HsaObject* createHsaObject(const std::string &fname);
+ static std::vector<std::function<HsaObject*(const std::string&, int,
+ uint8_t*)>> tryFileFuncs;
+
+ virtual HsaCode* getKernel(const std::string &name) const = 0;
+ virtual HsaCode* getKernel(int i) const = 0;
+ virtual HsaCode* getFunction(const std::string &name) const = 0;
+ virtual int numKernels() const = 0;
+
+ const std::string& name() const { return filename; }
+
+ uint8_t *readonlyData;
+
+ protected:
+ const std::string filename;
+};
+
+#endif // __HSA_OBJECT_HH__
diff --git a/src/gpu-compute/hsail_code.cc b/src/gpu-compute/hsail_code.cc
new file mode 100644
index 000000000..b0ddf0161
--- /dev/null
+++ b/src/gpu-compute/hsail_code.cc
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/hsail_code.hh"
+
+#include "arch/gpu_types.hh"
+#include "arch/hsail/Brig.h"
+#include "arch/hsail/operand.hh"
+#include "config/the_gpu_isa.hh"
+#include "debug/BRIG.hh"
+#include "debug/HSAILObject.hh"
+#include "gpu-compute/brig_object.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/kernel_cfg.hh"
+
+using namespace Brig;
+
+int getBrigDataTypeBytes(BrigType16_t t);
+
+HsailCode::HsailCode(const std::string &name_str)
+ : HsaCode(name_str), private_size(-1), readonly_size(-1)
+{
+}
+
+void
+HsailCode::init(const BrigDirectiveExecutable *code_dir, const BrigObject *obj,
+ StorageMap *objStorageMap)
+{
+ storageMap = objStorageMap;
+
+ // set pointer so that decoding process can find this kernel context when
+ // needed
+ obj->currentCode = this;
+
+ if (code_dir->base.kind != BRIG_KIND_DIRECTIVE_FUNCTION &&
+ code_dir->base.kind != BRIG_KIND_DIRECTIVE_KERNEL) {
+ fatal("unexpected directive kind %d inside kernel/function init\n",
+ code_dir->base.kind);
+ }
+
+ DPRINTF(HSAILObject, "Initializing code, first code block entry is: %d\n",
+ code_dir->firstCodeBlockEntry);
+
+ // clear these static vars so we can properly track the max index
+ // for this kernel
+ SRegOperand::maxRegIdx = 0;
+ DRegOperand::maxRegIdx = 0;
+ CRegOperand::maxRegIdx = 0;
+ setPrivateSize(0);
+
+ const BrigBase *entryPtr = brigNext((BrigBase*)code_dir);
+ const BrigBase *endPtr =
+ obj->getCodeSectionEntry(code_dir->nextModuleEntry);
+
+ int inst_idx = 0;
+ std::vector<GPUStaticInst*> instructions;
+ int funcarg_size_scope = 0;
+
+ // walk through instructions in code section and directives in
+ // directive section in parallel, processing directives that apply
+ // when we reach the relevant code point.
+ while (entryPtr < endPtr) {
+ switch (entryPtr->kind) {
+ case BRIG_KIND_DIRECTIVE_VARIABLE:
+ {
+ const BrigDirectiveVariable *sym =
+ (const BrigDirectiveVariable*)entryPtr;
+
+ DPRINTF(HSAILObject,"Initializing code, directive is "
+ "kind_variable, symbol is: %s\n",
+ obj->getString(sym->name));
+
+ StorageElement *se = storageMap->addSymbol(sym, obj);
+
+ if (sym->segment == BRIG_SEGMENT_PRIVATE) {
+ setPrivateSize(se->size);
+ } else { // spill
+ funcarg_size_scope += se->size;
+ }
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_LABEL:
+ {
+ const BrigDirectiveLabel *lbl =
+ (const BrigDirectiveLabel*)entryPtr;
+
+ DPRINTF(HSAILObject,"Initializing code, directive is "
+ "kind_label, label is: %s \n",
+ obj->getString(lbl->name));
+
+ labelMap.addLabel(lbl, inst_idx, obj);
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_PRAGMA:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive "
+ "is kind_pragma\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_COMMENT:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_comment\n");
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_START:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_arg_block_start\n");
+
+ storageMap->resetOffset(BRIG_SEGMENT_ARG);
+ funcarg_size_scope = 0;
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_ARG_BLOCK_END:
+ {
+ DPRINTF(HSAILObject, "Initializing code, directive is "
+ "kind_arg_block_end\n");
+
+ funcarg_size = funcarg_size < funcarg_size_scope ?
+ funcarg_size_scope : funcarg_size;
+ }
+ break;
+
+ case BRIG_KIND_DIRECTIVE_END:
+ DPRINTF(HSAILObject, "Initializing code, dircetive is "
+ "kind_end\n");
+
+ break;
+
+ default:
+ if (entryPtr->kind >= BRIG_KIND_INST_BEGIN &&
+ entryPtr->kind <= BRIG_KIND_INST_END) {
+
+ BrigInstBase *instPtr = (BrigInstBase*)entryPtr;
+ TheGpuISA::MachInst machInst = { instPtr, obj };
+ GPUStaticInst *iptr = decoder.decode(machInst);
+
+ if (iptr) {
+ DPRINTF(HSAILObject, "Initializing code, processing inst "
+ "#%d idx %d: OPCODE=%d\n",
+ inst_idx, _insts.size(), instPtr->opcode);
+
+ TheGpuISA::RawMachInst inst_num = decoder.saveInst(iptr);
+ iptr->instNum(inst_idx);
+ _insts.push_back(inst_num);
+ instructions.push_back(iptr);
+ }
+ ++inst_idx;
+ } else if (entryPtr->kind >= BRIG_KIND_OPERAND_BEGIN &&
+ entryPtr->kind < BRIG_KIND_OPERAND_END) {
+ warn("unexpected operand entry in code segment\n");
+ } else {
+ // there are surely some more cases we will need to handle,
+ // but we'll deal with them as we find them.
+ fatal("unexpected directive kind %d inside kernel scope\n",
+ entryPtr->kind);
+ }
+ }
+
+ entryPtr = brigNext(entryPtr);
+ }
+
+ // compute Control Flow Graph for current kernel
+ ControlFlowInfo::assignImmediatePostDominators(instructions);
+
+ max_sreg = SRegOperand::maxRegIdx;
+ max_dreg = DRegOperand::maxRegIdx;
+ max_creg = CRegOperand::maxRegIdx;
+
+ obj->currentCode = nullptr;
+}
+
+HsailCode::HsailCode(const std::string &name_str,
+ const BrigDirectiveExecutable *code_dir,
+ const BrigObject *obj, StorageMap *objStorageMap)
+ : HsaCode(name_str), private_size(-1), readonly_size(-1)
+{
+ init(code_dir, obj, objStorageMap);
+}
+
+void
+LabelMap::addLabel(const Brig::BrigDirectiveLabel *lblDir, int inst_index,
+ const BrigObject *obj)
+{
+ std::string lbl_name = obj->getString(lblDir->name);
+ Label &lbl = map[lbl_name];
+
+ if (lbl.defined()) {
+ fatal("Attempt to redefine existing label %s\n", lbl_name);
+ }
+
+ lbl.define(lbl_name, inst_index);
+ DPRINTF(HSAILObject, "label %s = %d\n", lbl_name, inst_index);
+}
+
+Label*
+LabelMap::refLabel(const Brig::BrigDirectiveLabel *lblDir,
+ const BrigObject *obj)
+{
+ std::string name = obj->getString(lblDir->name);
+ Label &lbl = map[name];
+ lbl.checkName(name);
+
+ return &lbl;
+}
+
+int
+getBrigDataTypeBytes(BrigType16_t t)
+{
+ switch (t) {
+ case BRIG_TYPE_S8:
+ case BRIG_TYPE_U8:
+ case BRIG_TYPE_B8:
+ return 1;
+
+ case BRIG_TYPE_S16:
+ case BRIG_TYPE_U16:
+ case BRIG_TYPE_B16:
+ case BRIG_TYPE_F16:
+ return 2;
+
+ case BRIG_TYPE_S32:
+ case BRIG_TYPE_U32:
+ case BRIG_TYPE_B32:
+ case BRIG_TYPE_F32:
+ return 4;
+
+ case BRIG_TYPE_S64:
+ case BRIG_TYPE_U64:
+ case BRIG_TYPE_B64:
+ case BRIG_TYPE_F64:
+ return 8;
+
+ case BRIG_TYPE_B1:
+
+ default:
+ fatal("unhandled symbol data type %d", t);
+ return 0;
+ }
+}
+
+StorageElement*
+StorageSpace::addSymbol(const BrigDirectiveVariable *sym,
+ const BrigObject *obj)
+{
+ const char *sym_name = obj->getString(sym->name);
+ uint64_t size = 0;
+ uint64_t offset = 0;
+
+ if (sym->type & BRIG_TYPE_ARRAY) {
+ size = getBrigDataTypeBytes(sym->type & ~BRIG_TYPE_ARRAY);
+ size *= (((uint64_t)sym->dim.hi) << 32 | (uint64_t)sym->dim.lo);
+
+ offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type &
+ ~BRIG_TYPE_ARRAY));
+ } else {
+ size = getBrigDataTypeBytes(sym->type);
+ offset = roundUp(nextOffset, getBrigDataTypeBytes(sym->type));
+ }
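+ // symbols are allocated at the next offset aligned to their element
+ // size; e.g., a 4-byte scalar declared after a 1-byte symbol lands at
+ // offset roundUp(1, 4) == 4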
+
+ nextOffset = offset + size;
+
+ DPRINTF(HSAILObject, "Adding %s SYMBOL %s size %d offset 0x%x, init: %d\n",
+ segmentNames[segment], sym_name, size, offset, sym->init);
+
+ StorageElement* se = new StorageElement(sym_name, offset, size, sym);
+ elements.push_back(se);
+ elements_by_addr.insert(AddrRange(offset, offset + size - 1), se);
+ elements_by_brigptr[sym] = se;
+
+ return se;
+}
+
+StorageElement*
+StorageSpace::findSymbol(std::string name)
+{
+ for (auto it : elements) {
+ if (it->name == name) {
+ return it;
+ }
+ }
+
+ return nullptr;
+}
+
+StorageElement*
+StorageSpace::findSymbol(uint64_t addr)
+{
+ assert(elements_by_addr.size() > 0);
+
+ auto se = elements_by_addr.find(addr);
+
+ if (se == elements_by_addr.end()) {
+ return nullptr;
+ } else {
+ return se->second;
+ }
+}
+
+StorageElement*
+StorageSpace::findSymbol(const BrigDirectiveVariable *brigptr)
+{
+ assert(elements_by_brigptr.size() > 0);
+
+ auto se = elements_by_brigptr.find(brigptr);
+
+ if (se == elements_by_brigptr.end()) {
+ return nullptr;
+ } else {
+ return se->second;
+ }
+}
+
+StorageMap::StorageMap(StorageMap *outerScope)
+ : outerScopeMap(outerScope)
+{
+ for (int i = 0; i < NumSegments; ++i)
+ space[i] = new StorageSpace((BrigSegment)i);
+}
+
+StorageElement*
+StorageMap::addSymbol(const BrigDirectiveVariable *sym, const BrigObject *obj)
+{
+ BrigSegment8_t segment = sym->segment;
+
+ assert(segment >= Brig::BRIG_SEGMENT_FLAT);
+ assert(segment < NumSegments);
+
+ return space[segment]->addSymbol(sym, obj);
+}
+
+int
+StorageMap::getSize(Brig::BrigSegment segment)
+{
+ assert(segment > Brig::BRIG_SEGMENT_GLOBAL);
+ assert(segment < NumSegments);
+
+ if (segment != Brig::BRIG_SEGMENT_GROUP &&
+ segment != Brig::BRIG_SEGMENT_READONLY) {
+ return space[segment]->getSize();
+ } else {
+ int ret = space[segment]->getSize();
+
+ if (outerScopeMap) {
+ ret += outerScopeMap->getSize(segment);
+ }
+
+ return ret;
+ }
+}
+
+void
+StorageMap::resetOffset(Brig::BrigSegment segment)
+{
+ space[segment]->resetOffset();
+}
+
+StorageElement*
+StorageMap::findSymbol(BrigSegment segment, std::string name)
+{
+ StorageElement *se = space[segment]->findSymbol(name);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, name);
+
+ return nullptr;
+}
+
+StorageElement*
+StorageMap::findSymbol(Brig::BrigSegment segment, uint64_t addr)
+{
+ StorageSpace *sp = space[segment];
+
+ if (!sp) {
+ // no storage was allocated in this segment
+ return nullptr;
+ }
+
+ StorageElement *se = sp->findSymbol(addr);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, addr);
+
+ return nullptr;
+
+}
+
+StorageElement*
+StorageMap::findSymbol(Brig::BrigSegment segment,
+ const BrigDirectiveVariable *brigptr)
+{
+ StorageSpace *sp = space[segment];
+
+ if (!sp) {
+ // no storage was allocated in this segment
+ return nullptr;
+ }
+
+ StorageElement *se = sp->findSymbol(brigptr);
+
+ if (se)
+ return se;
+
+ if (outerScopeMap)
+ return outerScopeMap->findSymbol(segment, brigptr);
+
+ return nullptr;
+
+}
diff --git a/src/gpu-compute/hsail_code.hh b/src/gpu-compute/hsail_code.hh
new file mode 100644
index 000000000..d9fbcc577
--- /dev/null
+++ b/src/gpu-compute/hsail_code.hh
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __HSAIL_CODE_HH__
+#define __HSAIL_CODE_HH__
+
+#include <cassert>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "arch/gpu_decoder.hh"
+#include "arch/hsail/Brig.h"
+#include "base/addr_range_map.hh"
+#include "base/intmath.hh"
+#include "config/the_gpu_isa.hh"
+#include "gpu-compute/hsa_code.hh"
+#include "gpu-compute/hsa_kernel_info.hh"
+#include "gpu-compute/misc.hh"
+
+class BrigObject;
+class GPUStaticInst;
+
+inline int
+popcount(uint64_t src, int sz)
+{
+ int cnt = 0;
+
+ for (int i = 0; i < sz; ++i) {
+ if (src & 1)
+ ++cnt;
+ src >>= 1;
+ }
+
+ return cnt;
+}
+
+inline int
+firstbit(uint64_t src, int sz)
+{
+ int i;
+
+ for (i = 0; i < sz; ++i) {
+ if (src & 1)
+ break;
+ src >>= 1;
+ }
+
+ return i;
+}
+
+inline int
+lastbit(uint64_t src, int sz)
+{
+ int i0 = -1;
+
+ for (int i = 0; i < sz; ++i) {
+ if (src & 1)
+ i0 = i;
+ src >>= 1;
+ }
+
+ return i0;
+}
+
+inline int
+signbit(uint64_t src, int sz)
+{
+ int i0 = -1;
+
+ if (src & (1 << (sz - 1))) {
+ for (int i = 0; i < sz - 1; ++i) {
+ if (!(src & 1))
+ i0 = i;
+ src >>= 1;
+ }
+ } else {
+ for (int i = 0; i < sz - 1; ++i) {
+ if (src & 1)
+ i0 = i;
+ src >>= 1;
+ }
+ }
+
+ return i0;
+}
+
+inline uint64_t
+bitrev(uint64_t src, int sz)
+{
+ uint64_t r = 0;
+
+ for (int i = 0; i < sz; ++i) {
+ r <<= 1;
+ if (src & 1)
+ r |= 1;
+ src >>= 1;
+ }
+
+ return r;
+}
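+// illustrative examples of the helpers above: popcount(0b1011, 4) == 3,
+// firstbit(0b0100, 4) == 2, lastbit(0b0100, 4) == 2, and
+// bitrev(0b0011, 4) == 0b1100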
+
+inline uint64_t
+mul_hi(uint32_t a, uint32_t b)
+{
+ return ((uint64_t)a * (uint64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(int32_t a, int32_t b)
+{
+ return ((int64_t)a * (int64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(uint64_t a, uint64_t b)
+{
+ return ((uint64_t)a * (uint64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(int64_t a, int64_t b)
+{
+ return ((int64_t)a * (int64_t)b) >> 32;
+}
+
+inline uint64_t
+mul_hi(double a, double b)
+{
+ return 0;
+}
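+// mul_hi shifts the product right by 32 bits, e.g.
+// mul_hi(uint32_t(0x80000000), uint32_t(2)) == 1; the floating-point
+// overload is a stub that always returns 0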
+
+class Label
+{
+ public:
+ std::string name;
+ int value;
+
+ Label() : value(-1)
+ {
+ }
+
+ bool defined() { return value != -1; }
+
+ void
+ checkName(std::string &_name)
+ {
+ if (name.empty()) {
+ name = _name;
+ } else {
+ assert(name == _name);
+ }
+ }
+
+ void
+ define(std::string &_name, int _value)
+ {
+ assert(!defined());
+ assert(_value != -1);
+ value = _value;
+ checkName(_name);
+ }
+
+ int
+ get()
+ {
+ assert(defined());
+ return value;
+ }
+};
+
+class LabelMap
+{
+ std::map<std::string, Label> map;
+
+ public:
+ LabelMap() { }
+
+ void addLabel(const Brig::BrigDirectiveLabel *lbl, int inst_index,
+ const BrigObject *obj);
+
+ Label *refLabel(const Brig::BrigDirectiveLabel *lbl,
+ const BrigObject *obj);
+};
+
+const int NumSegments = Brig::BRIG_SEGMENT_AMD_GCN;
+
+extern const char *segmentNames[];
+
+class StorageElement
+{
+ public:
+ std::string name;
+ uint64_t offset;
+
+ uint64_t size;
+ const Brig::BrigDirectiveVariable *brigSymbol;
+ StorageElement(const char *_name, uint64_t _offset, int _size,
+ const Brig::BrigDirectiveVariable *sym)
+ : name(_name), offset(_offset), size(_size), brigSymbol(sym)
+ {
+ }
+};
+
+class StorageSpace
+{
+ typedef std::map<const Brig::BrigDirectiveVariable*, StorageElement*>
+ DirVarToSE_map;
+
+ std::list<StorageElement*> elements;
+ AddrRangeMap<StorageElement*> elements_by_addr;
+ DirVarToSE_map elements_by_brigptr;
+
+ uint64_t nextOffset;
+ Brig::BrigSegment segment;
+
+ public:
+ StorageSpace(Brig::BrigSegment _class)
+ : nextOffset(0), segment(_class)
+ {
+ }
+
+ StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
+ const BrigObject *obj);
+
+ StorageElement* findSymbol(std::string name);
+ StorageElement* findSymbol(uint64_t addr);
+ StorageElement* findSymbol(const Brig::BrigDirectiveVariable *brigptr);
+
+ int getSize() { return nextOffset; }
+ void resetOffset() { nextOffset = 0; }
+};
+
+class StorageMap
+{
+ StorageMap *outerScopeMap;
+ StorageSpace *space[NumSegments];
+
+ public:
+ StorageMap(StorageMap *outerScope = nullptr);
+
+ StorageElement *addSymbol(const Brig::BrigDirectiveVariable *sym,
+ const BrigObject *obj);
+
+ StorageElement* findSymbol(Brig::BrigSegment segment, std::string name);
+ StorageElement* findSymbol(Brig::BrigSegment segment, uint64_t addr);
+
+ StorageElement* findSymbol(Brig::BrigSegment segment,
+ const Brig::BrigDirectiveVariable *brigptr);
+
+ // overloaded version to avoid casting
+ StorageElement*
+ findSymbol(Brig::BrigSegment8_t segment, std::string name)
+ {
+ return findSymbol((Brig::BrigSegment)segment, name);
+ }
+
+ int getSize(Brig::BrigSegment segment);
+ void resetOffset(Brig::BrigSegment segment);
+};
+
+typedef enum
+{
+ BT_DEFAULT,
+ BT_B8,
+ BT_U8,
+ BT_U16,
+ BT_U32,
+ BT_U64,
+ BT_S8,
+ BT_S16,
+ BT_S32,
+ BT_S64,
+ BT_F16,
+ BT_F32,
+ BT_F64,
+ BT_NULL
+} base_type_e;
+
+/* @class HsailCode
+ * the HsailCode class is used to store information
+ * about HSA kernels stored in the BRIG format. it holds
+ * all information about a kernel, function, or variable
+ * symbol and provides methods for accessing that
+ * information.
+ */
+
+class HsailCode final : public HsaCode
+{
+ public:
+ TheGpuISA::Decoder decoder;
+
+ StorageMap *storageMap;
+ LabelMap labelMap;
+ uint32_t kernarg_start;
+ uint32_t kernarg_end;
+ int32_t private_size;
+
+ int32_t readonly_size;
+
+ // We track the maximum register index used for each register
+ // class when we load the code so we can size the register files
+ // appropriately (i.e., one more than the max index).
+ uint32_t max_creg; // maximum c-register index
+ uint32_t max_sreg; // maximum s-register index
+ uint32_t max_dreg; // maximum d-register index
+
+ HsailCode(const std::string &name_str,
+ const Brig::BrigDirectiveExecutable *code_dir,
+ const BrigObject *obj,
+ StorageMap *objStorageMap);
+
+ // this version is used to create a placeholder when
+ // we encounter a kernel-related directive before the
+ // kernel itself
+ HsailCode(const std::string &name_str);
+
+ void init(const Brig::BrigDirectiveExecutable *code_dir,
+ const BrigObject *obj, StorageMap *objStorageMap);
+
+ void
+ generateHsaKernelInfo(HsaKernelInfo *hsaKernelInfo) const
+ {
+ hsaKernelInfo->sRegCount = max_sreg + 1;
+ hsaKernelInfo->dRegCount = max_dreg + 1;
+ hsaKernelInfo->cRegCount = max_creg + 1;
+
+ hsaKernelInfo->static_lds_size = getSize(Brig::BRIG_SEGMENT_GROUP);
+
+ hsaKernelInfo->private_mem_size =
+ roundUp(getSize(Brig::BRIG_SEGMENT_PRIVATE), 8);
+
+ hsaKernelInfo->spill_mem_size =
+ roundUp(getSize(Brig::BRIG_SEGMENT_SPILL), 8);
+ }
+
+ int
+ getSize(MemorySegment segment) const
+ {
+ Brig::BrigSegment brigSeg;
+
+ switch (segment) {
+ case MemorySegment::NONE:
+ brigSeg = Brig::BRIG_SEGMENT_NONE;
+ break;
+ case MemorySegment::FLAT:
+ brigSeg = Brig::BRIG_SEGMENT_FLAT;
+ break;
+ case MemorySegment::GLOBAL:
+ brigSeg = Brig::BRIG_SEGMENT_GLOBAL;
+ break;
+ case MemorySegment::READONLY:
+ brigSeg = Brig::BRIG_SEGMENT_READONLY;
+ break;
+ case MemorySegment::KERNARG:
+ brigSeg = Brig::BRIG_SEGMENT_KERNARG;
+ break;
+ case MemorySegment::GROUP:
+ brigSeg = Brig::BRIG_SEGMENT_GROUP;
+ break;
+ case MemorySegment::PRIVATE:
+ brigSeg = Brig::BRIG_SEGMENT_PRIVATE;
+ break;
+ case MemorySegment::SPILL:
+ brigSeg = Brig::BRIG_SEGMENT_SPILL;
+ break;
+ case MemorySegment::ARG:
+ brigSeg = Brig::BRIG_SEGMENT_ARG;
+ break;
+ case MemorySegment::EXTSPACE0:
+ brigSeg = Brig::BRIG_SEGMENT_AMD_GCN;
+ break;
+ default:
+ fatal("Unknown BrigSegment type.\n");
+ }
+
+ return getSize(brigSeg);
+ }
+
+ private:
+ int
+ getSize(Brig::BrigSegment segment) const
+ {
+ if (segment == Brig::BRIG_SEGMENT_PRIVATE) {
+ // with the code generated by new HSA compiler the assertion
+ // does not hold anymore..
+ //assert(private_size != -1);
+ return private_size;
+ } else {
+ return storageMap->getSize(segment);
+ }
+ }
+
+ public:
+ StorageElement*
+ findSymbol(Brig::BrigSegment segment, uint64_t addr)
+ {
+ return storageMap->findSymbol(segment, addr);
+ }
+
+ void
+ setPrivateSize(int32_t _private_size)
+ {
+ private_size = _private_size;
+ }
+
+ Label*
+ refLabel(const Brig::BrigDirectiveLabel *lbl, const BrigObject *obj)
+ {
+ return labelMap.refLabel(lbl, obj);
+ }
+};
+
+#endif // __HSAIL_CODE_HH__
diff --git a/src/gpu-compute/kernel_cfg.cc b/src/gpu-compute/kernel_cfg.cc
new file mode 100644
index 000000000..7e0e10912
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.cc
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/kernel_cfg.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <string>
+
+#include "gpu-compute/gpu_static_inst.hh"
+
+void
+ControlFlowInfo::assignImmediatePostDominators(
+ const std::vector<GPUStaticInst*>& instructions)
+{
+ ControlFlowInfo cfg(instructions);
+ cfg.findImmediatePostDominators();
+}
+
+
+ControlFlowInfo::ControlFlowInfo(const std::vector<GPUStaticInst*>& insts) :
+ instructions(insts)
+{
+ createBasicBlocks();
+ connectBasicBlocks();
+}
+
+BasicBlock*
+ControlFlowInfo::basicBlock(int inst_num) const {
+ for (auto& block: basicBlocks) {
+ int first_block_id = block->firstInstruction->instNum();
+ if (inst_num >= first_block_id &&
+ inst_num < first_block_id + block->size) {
+ return block.get();
+ }
+ }
+ return nullptr;
+}
+
+
+GPUStaticInst*
+ControlFlowInfo::lastInstruction(const BasicBlock* block) const
+{
+ if (block->isExit()) {
+ return nullptr;
+ }
+
+ return instructions.at(block->firstInstruction->instNum() +
+ block->size - 1);
+}
+
+BasicBlock*
+ControlFlowInfo::postDominator(const BasicBlock* block) const
+{
+ if (block->isExit()) {
+ return nullptr;
+ }
+ return basicBlock(lastInstruction(block)->ipdInstNum());
+}
+
+void
+ControlFlowInfo::createBasicBlocks()
+{
+ assert(!instructions.empty());
+ std::set<int> leaders;
+ // first instruction is a leader
+ leaders.insert(0);
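+ // branch targets and the instructions immediately following branches
+ // also start new basic blocks (identified in the loop below)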
+ for (int i = 1; i < instructions.size(); i++) {
+ GPUStaticInst* instruction = instructions[i];
+ if (instruction->o_type == Enums::OT_BRANCH) {
+ const int target_pc = instruction->getTargetPc();
+ leaders.insert(target_pc);
+ leaders.insert(i + 1);
+ }
+ }
+
+ size_t block_size = 0;
+ for (int i = 0; i < instructions.size(); i++) {
+ if (leaders.find(i) != leaders.end()) {
+ uint32_t id = basicBlocks.size();
+ if (id > 0) {
+ basicBlocks.back()->size = block_size;
+ }
+ block_size = 0;
+ basicBlocks.emplace_back(new BasicBlock(id, instructions[i]));
+ }
+ block_size++;
+ }
+ basicBlocks.back()->size = block_size;
+ // exit basic block
+ basicBlocks.emplace_back(new BasicBlock(basicBlocks.size(), nullptr));
+}
+
+void
+ControlFlowInfo::connectBasicBlocks()
+{
+ BasicBlock* exit_bb = basicBlocks.back().get();
+ for (auto& bb : basicBlocks) {
+ if (bb->isExit()) {
+ break;
+ }
+ GPUStaticInst* last = lastInstruction(bb.get());
+ if (last->o_type == Enums::OT_RET) {
+ bb->successorIds.insert(exit_bb->id);
+ break;
+ }
+ if (last->o_type == Enums::OT_BRANCH) {
+ const uint32_t target_pc = last->getTargetPc();
+ BasicBlock* target_bb = basicBlock(target_pc);
+ bb->successorIds.insert(target_bb->id);
+ }
+
+ // Unconditional jump instructions have a unique successor
+ if (!last->unconditionalJumpInstruction()) {
+ BasicBlock* next_bb = basicBlock(last->instNum() + 1);
+ bb->successorIds.insert(next_bb->id);
+ }
+ }
+}
+
+
+// In-place set intersection
+static void
+intersect(std::set<uint32_t>& a, const std::set<uint32_t>& b)
+{
+ std::set<uint32_t>::iterator it = a.begin();
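+ // std::set::erase(it) returns an iterator to the element after the one
+ // erased, so the iterator advances correctly in both branches below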
+ while (it != a.end()) {
+ it = b.find(*it) != b.end() ? ++it : a.erase(it);
+ }
+}
+
+
+void
+ControlFlowInfo::findPostDominators()
+{
+ // the only postdominator of the exit block is itself
+ basicBlocks.back()->postDominatorIds.insert(basicBlocks.back()->id);
+ //copy all basic blocks to all postdominator lists except for exit block
+ for (auto& block : basicBlocks) {
+ if (!block->isExit()) {
+ for (uint32_t i = 0; i < basicBlocks.size(); i++) {
+ block->postDominatorIds.insert(i);
+ }
+ }
+ }
+
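+ // iterate to a fixed point over the standard dataflow equation:
+ // PD(b) = {b} U intersection of PD(s) over all successors s of b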
+ bool change = true;
+ while (change) {
+ change = false;
+ for (int h = basicBlocks.size() - 2; h >= 0; --h) {
+ size_t num_postdominators =
+ basicBlocks[h]->postDominatorIds.size();
+ for (int s : basicBlocks[h]->successorIds) {
+ intersect(basicBlocks[h]->postDominatorIds,
+ basicBlocks[s]->postDominatorIds);
+ }
+ basicBlocks[h]->postDominatorIds.insert(h);
+ change |= (num_postdominators
+ != basicBlocks[h]->postDominatorIds.size());
+ }
+ }
+}
+
+
+// In-place set difference
+static void
+setDifference(std::set<uint32_t>&a,
+ const std::set<uint32_t>& b, uint32_t exception)
+{
+ for (uint32_t b_elem : b) {
+ if (b_elem != exception) {
+ a.erase(b_elem);
+ }
+ }
+}
+
+void
+ControlFlowInfo::findImmediatePostDominators()
+{
+ assert(basicBlocks.size() > 1); // Entry and exit blocks must be present
+
+ findPostDominators();
+
+ for (auto& basicBlock : basicBlocks) {
+ if (basicBlock->isExit()) {
+ continue;
+ }
+ std::set<uint32_t> candidates = basicBlock->postDominatorIds;
+ candidates.erase(basicBlock->id);
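+ // a candidate that post-dominates another candidate cannot be the
+ // immediate post-dominator; after eliminating those, exactly one
+ // candidate (the closest post-dominator) remains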
+ for (uint32_t postDominatorId : basicBlock->postDominatorIds) {
+ if (postDominatorId != basicBlock->id) {
+ setDifference(candidates,
+ basicBlocks[postDominatorId]->postDominatorIds,
+ postDominatorId);
+ }
+ }
+ assert(candidates.size() == 1);
+ GPUStaticInst* last_instruction = lastInstruction(basicBlock.get());
+ BasicBlock* ipd_block = basicBlocks[*(candidates.begin())].get();
+ if (!ipd_block->isExit()) {
+ GPUStaticInst* ipd_first_inst = ipd_block->firstInstruction;
+ last_instruction->ipdInstNum(ipd_first_inst->instNum());
+ } else {
+ last_instruction->ipdInstNum(last_instruction->instNum() + 1);
+ }
+ }
+}
+
+void
+ControlFlowInfo::printPostDominators() const
+{
+ for (auto& block : basicBlocks) {
+ std::cout << "PD(" << block->id << ") = {";
+ std::copy(block->postDominatorIds.begin(),
+ block->postDominatorIds.end(),
+ std::ostream_iterator<uint32_t>(std::cout, ", "));
+ std::cout << "}" << std::endl;
+ }
+}
+
+void
+ControlFlowInfo::printImmediatePostDominators() const
+{
+ for (const auto& block : basicBlocks) {
+ if (block->isExit()) {
+ continue;
+ }
+ std::cout << "IPD(" << block->id << ") = ";
+ std::cout << postDominator(block.get())->id << ", ";
+ }
+ std::cout << std::endl;
+}
+
+void
+ControlFlowInfo::printBasicBlocks() const
+{
+ for (GPUStaticInst* inst : instructions) {
+ int inst_num = inst->instNum();
+ std::cout << inst_num << " [" << basicBlock(inst_num)->id
+ << "]: " << inst->disassemble();
+ if (inst->o_type == Enums::OT_BRANCH) {
+ std::cout << ", PC = " << inst->getTargetPc();
+ }
+ std::cout << std::endl;
+ }
+}
+
+void
+ControlFlowInfo::printBasicBlockDot() const
+{
+ printf("digraph {\n");
+ for (const auto& basic_block : basicBlocks) {
+ printf("\t");
+ for (uint32_t successorId : basic_block->successorIds) {
+ printf("%d -> %d; ", basic_block->id, successorId);
+ }
+ printf("\n");
+ }
+ printf("}\n");
+}
diff --git a/src/gpu-compute/kernel_cfg.hh b/src/gpu-compute/kernel_cfg.hh
new file mode 100644
index 000000000..74ea861d8
--- /dev/null
+++ b/src/gpu-compute/kernel_cfg.hh
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __KERNEL_CFG_HH__
+#define __KERNEL_CFG_HH__
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <vector>
+
+
+class GPUStaticInst;
+class HsailCode;
+
+struct BasicBlock
+{
+ BasicBlock(uint32_t num, GPUStaticInst* begin) :
+ id(num), size(0), firstInstruction(begin)
+ {
+ }
+
+ bool
+ isEntry() const
+ {
+ return !id;
+ }
+
+ bool
+ isExit() const
+ {
+ return !size;
+ }
+
+ /**
+ * Unique identifier for the block within a given kernel.
+ */
+ const uint32_t id;
+
+ /**
+ * Number of instructions contained in the block
+ */
+ size_t size;
+
+ /**
+ * Pointer to first instruction of the block.
+ */
+ GPUStaticInst* firstInstruction;
+
+ /**
+ * Identifiers of the blocks that follow (are reachable from) this block.
+ */
+ std::set<uint32_t> successorIds;
+
+ /**
+ * Identifiers of the blocks that post-dominate this block, i.e., blocks
+ * through which every path from this block to the exit must pass.
+ */
+ std::set<uint32_t> postDominatorIds;
+};
+
+class ControlFlowInfo
+{
+public:
+
+ /**
+ * Compute immediate post-dominator instruction for kernel instructions.
+ */
+ static void assignImmediatePostDominators(
+ const std::vector<GPUStaticInst*>& instructions);
+
+private:
+ ControlFlowInfo(const std::vector<GPUStaticInst*>& instructions);
+
+ GPUStaticInst* lastInstruction(const BasicBlock* block) const;
+
+ BasicBlock* basicBlock(int inst_num) const;
+
+ BasicBlock* postDominator(const BasicBlock* block) const;
+
+ void createBasicBlocks();
+
+ void connectBasicBlocks();
+
+ void findPostDominators();
+
+ void findImmediatePostDominators();
+
+ void printBasicBlocks() const;
+
+ void printBasicBlockDot() const;
+
+ void printPostDominators() const;
+
+ void printImmediatePostDominators() const;
+
+ std::vector<std::unique_ptr<BasicBlock>> basicBlocks;
+ std::vector<GPUStaticInst*> instructions;
+};
+
+#endif // __KERNEL_CFG_HH__
diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc
new file mode 100644
index 000000000..91ee8009a
--- /dev/null
+++ b/src/gpu-compute/lds_state.cc
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#include "gpu-compute/lds_state.hh"
+
+#include <array>
+#include <cstdio>
+#include <cstdlib>
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+
+/**
+ * the default constructor that works with SWIG
+ */
+LdsState::LdsState(const Params *params) :
+ MemObject(params),
+ tickEvent(this),
+ cuPort(name() + ".port", this),
+ maximumSize(params->size),
+ range(params->range),
+ bankConflictPenalty(params->bankConflictPenalty),
+ banks(params->banks)
+{
+ fatal_if(params->banks <= 0,
+ "Number of LDS banks should be positive number");
+ fatal_if((params->banks & (params->banks - 1)) != 0,
+ "Number of LDS banks should be a power of 2");
+ fatal_if(params->size <= 0,
+ "cannot allocate an LDS with a size less than 1");
+ fatal_if(params->size % 2,
+ "the LDS should be an even number");
+}
+
+/**
+ * Needed by the SWIG compiler
+ */
+LdsState *
+LdsStateParams::create()
+{
+ return new LdsState(this);
+}
+
+/**
+ * set the parent and name based on the parent
+ */
+void
+LdsState::setParent(ComputeUnit *x_parent)
+{
+ // check that this gets assigned to the same thing each time
+ fatal_if(!x_parent, "x_parent should not be nullptr");
+ fatal_if(x_parent == parent,
+ "should not be setting the parent twice");
+
+ parent = x_parent;
+ _name = x_parent->name() + ".LdsState";
+}
+
+/**
+ * derive the gpu mem packet from the packet and then count the bank conflicts
+ */
+unsigned
+LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
+{
+ Packet::SenderState *baseSenderState = packet->senderState;
+ while (baseSenderState->predecessor) {
+ baseSenderState = baseSenderState->predecessor;
+ }
+ const ComputeUnit::LDSPort::SenderState *senderState =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);
+
+ fatal_if(!senderState,
+ "did not get the right sort of sender state");
+
+ GPUDynInstPtr gpuDynInst = senderState->getMemInst();
+
+ return countBankConflicts(gpuDynInst, bankAccesses);
+}
+
+// Count the total number of bank conflicts for the local memory packet
+unsigned
+LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
+ unsigned *numBankAccesses)
+{
+ int bank_conflicts = 0;
+ std::vector<int> bank;
+ // the number of LDS banks being touched by the memory instruction
+ int numBanks = std::min(parent->wfSize(), banks);
+ // if the wavefront size is larger than the number of LDS banks, we
+ // need to iterate over all work items to calculate the total
+ // number of bank conflicts
+ int groups = (parent->wfSize() > numBanks) ?
+ (parent->wfSize() / numBanks) : 1;
+ for (int i = 0; i < groups; i++) {
+ // Address Array holding all the work item addresses of an instruction
+ std::vector<Addr> addr_array;
+ addr_array.resize(numBanks, 0);
+ bank.clear();
+ bank.resize(banks, 0);
+ int max_bank = 0;
+
+ // populate the address array for all active work items
+ for (int j = 0; j < numBanks; j++) {
+ if (gpuDynInst->exec_mask[(i*numBanks)+j]) {
+ addr_array[j] = gpuDynInst->addr[(i*numBanks)+j];
+ } else {
+ addr_array[j] = std::numeric_limits<Addr>::max();
+ }
+ }
+
+ if (gpuDynInst->m_op == Enums::MO_LD ||
+ gpuDynInst->m_op == Enums::MO_ST) {
+ // mask identical addresses
+ for (int j = 0; j < numBanks; ++j) {
+ for (int j0 = 0; j0 < j; j0++) {
+ if (addr_array[j] != std::numeric_limits<Addr>::max()
+ && addr_array[j] == addr_array[j0]) {
+ addr_array[j] = std::numeric_limits<Addr>::max();
+ }
+ }
+ }
+ }
+ // calculate bank conflicts
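+ // addresses map to banks via (addr % banks), so with 32 banks the
+ // addresses 0 and 32 would collide on bank 0 and be serialized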
+ for (int j = 0; j < numBanks; ++j) {
+ if (addr_array[j] != std::numeric_limits<Addr>::max()) {
+ int bankId = addr_array[j] % banks;
+ bank[bankId]++;
+ max_bank = std::max(max_bank, bank[bankId]);
+ // Count the number of LDS banks accessed.
+ // Since we have masked identical addresses all remaining
+ // accesses will need to be serialized if they access
+ // the same bank (bank conflict).
+ (*numBankAccesses)++;
+ }
+ }
+ bank_conflicts += max_bank;
+ }
+ panic_if(bank_conflicts > parent->wfSize(),
+ "Max bank conflicts should match num of work items per instr");
+ return bank_conflicts;
+}
+
+/**
+ * receive the packet from the CU
+ */
+bool
+LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
+{
+ return ownerLds->processPacket(packet);
+}
+
+GPUDynInstPtr
+LdsState::getDynInstr(PacketPtr packet)
+{
+ ComputeUnit::LDSPort::SenderState *ss =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+ packet->senderState);
+ return ss->getMemInst();
+}
+
+/**
+ * process an incoming packet, add it to the return queue
+ */
+bool
+LdsState::processPacket(PacketPtr packet)
+{
+ unsigned bankAccesses = 0;
+ // the number of conflicts this packet will have when accessing the LDS
+ unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);
+ // count the total number of physical LDS bank accessed
+ parent->ldsBankAccesses += bankAccesses;
+ // record the LDS bank conflict distribution. A bankConflicts value of
+ // 1 means at most one access per bank, i.e., no bank conflicts
+ parent->ldsBankConflictDist.sample(bankConflicts-1);
+
+ GPUDynInstPtr dynInst = getDynInstr(packet);
+ // account for the LDS bank conflict overhead
+ int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() :
+ (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() :
+ parent->loadBusLength();
+ // delay for accessing the LDS
+ Tick processingTime =
+ parent->shader->ticks(bankConflicts * bankConflictPenalty) +
+ parent->shader->ticks(busLength);
+ // choose (delay + last packet in queue) or (now + delay) as the time to
+ // return this
+ Tick doneAt = earliestReturnTime() + processingTime;
+ // then store it for processing
+ return returnQueuePush(std::make_pair(doneAt, packet));
+}
+
+/**
+ * add this to the queue of packets to be returned
+ */
+bool
+LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
+{
+ // TODO add time limits (e.g. one packet per cycle) and queue size limits
+ // and implement flow control
+ returnQueue.push(thePair);
+
+ // if there is no set wakeup time, look through the queue
+ if (!tickEvent.scheduled()) {
+ process();
+ }
+
+ return true;
+}
+
+/**
+ * receive a packet in functional mode
+ */
+void
+LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
+{
+ fatal("not implemented");
+}
+
+/**
+ * receive a retry for a response
+ */
+void
+LdsState::CuSidePort::recvRespRetry()
+{
+ // TODO verify that this is the right way to do this
+ assert(ownerLds->isRetryResp());
+ ownerLds->setRetryResp(false);
+ ownerLds->process();
+}
+
+/**
+ * receive a retry
+ */
+void
+LdsState::CuSidePort::recvRetry()
+{
+ fatal("not implemented");
+}
+
+/**
+ * look for packets to return at this time
+ */
+bool
+LdsState::process()
+{
+ Tick now = clockEdge();
+
+ // send back completed packets
+ while (!returnQueue.empty() && returnQueue.front().first <= now) {
+ PacketPtr packet = returnQueue.front().second;
+
+ ComputeUnit::LDSPort::SenderState *ss =
+ dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
+ packet->senderState);
+
+ GPUDynInstPtr gpuDynInst = ss->getMemInst();
+
+ gpuDynInst->initiateAcc(gpuDynInst);
+
+ packet->makeTimingResponse();
+
+ returnQueue.pop();
+
+ bool success = cuPort.sendTimingResp(packet);
+
+ if (!success) {
+ retryResp = true;
+ panic("have not handled timing responses being NACK'd when sent"
+ "back");
+ }
+ }
+
+ // determine the next wakeup time
+ if (!returnQueue.empty()) {
+
+ Tick next = returnQueue.front().first;
+
+ if (tickEvent.scheduled()) {
+
+ if (next < tickEvent.when()) {
+
+ tickEvent.deschedule();
+ tickEvent.schedule(next);
+ }
+ } else {
+ tickEvent.schedule(next);
+ }
+ }
+
+ return true;
+}
+
+/**
+ * wake up at this time and perform specified actions
+ */
+void
+LdsState::TickEvent::process()
+{
+ ldsState->process();
+}
+
+/**
+ * register statistics; the LDS does not currently register any of its own
+ */
+void
+LdsState::regStats()
+{
+}
diff --git a/src/gpu-compute/lds_state.hh b/src/gpu-compute/lds_state.hh
new file mode 100644
index 000000000..89f08a1d3
--- /dev/null
+++ b/src/gpu-compute/lds_state.hh
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Joe Gross
+ */
+
+#ifndef __LDS_STATE_HH__
+#define __LDS_STATE_HH__
+
+#include <array>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/misc.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "params/LdsState.hh"
+
+class ComputeUnit;
+
+/**
+ * this represents a slice of the overall LDS, intended to be associated with an
+ * individual workgroup
+ */
+class LdsChunk
+{
+ public:
+ LdsChunk(const uint32_t x_size):
+ chunk(x_size)
+ {
+ }
+
+ LdsChunk() {}
+
+ /**
+ * a read operation
+ */
+ template<class T>
+ T
+ read(const uint32_t index)
+ {
+ fatal_if(!chunk.size(), "cannot read from an LDS chunk of size 0");
+ fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+ T *p0 = (T *) (&(chunk.at(index)));
+ return *p0;
+ }
+
+ /**
+ * a write operation
+ */
+ template<class T>
+ void
+ write(const uint32_t index, const T value)
+ {
+ fatal_if(!chunk.size(), "cannot write to an LDS chunk of size 0");
+ fatal_if(index >= chunk.size(), "out-of-bounds access to an LDS chunk");
+ T *p0 = (T *) (&(chunk.at(index)));
+ *p0 = value;
+ }
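+ // note: the index is a byte offset into the chunk; read<T>/write<T>
+ // reinterpret sizeof(T) bytes starting at that offset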
+
+ /**
+ * get the size of this chunk
+ */
+ std::vector<uint8_t>::size_type
+ size() const
+ {
+ return chunk.size();
+ }
+
+ protected:
+ // the actual data store for this slice of the LDS
+ std::vector<uint8_t> chunk;
+};
+
+// Local Data Share (LDS) State per Wavefront (contents of the LDS region
+// allocated to the WorkGroup of this Wavefront)
+class LdsState: public MemObject
+{
+ protected:
+
+ /**
+ * an event to allow event-driven execution
+ */
+ class TickEvent: public Event
+ {
+ protected:
+
+ LdsState *ldsState = nullptr;
+
+ Tick nextTick = 0;
+
+ public:
+
+ TickEvent(LdsState *_ldsState) :
+ ldsState(_ldsState)
+ {
+ }
+
+ virtual void
+ process();
+
+ void
+ schedule(Tick when)
+ {
+ mainEventQueue[0]->schedule(this, when);
+ }
+
+ void
+ deschedule()
+ {
+ mainEventQueue[0]->deschedule(this);
+ }
+ };
+
+ /**
+ * CuSidePort is the LDS Port closer to the CU side
+ */
+ class CuSidePort: public SlavePort
+ {
+ public:
+ CuSidePort(const std::string &_name, LdsState *_ownerLds) :
+ SlavePort(_name, _ownerLds), ownerLds(_ownerLds)
+ {
+ }
+
+ protected:
+ LdsState *ownerLds;
+
+ virtual bool
+ recvTimingReq(PacketPtr pkt);
+
+ virtual Tick
+ recvAtomic(PacketPtr pkt)
+ {
+ return 0;
+ }
+
+ virtual void
+ recvFunctional(PacketPtr pkt);
+
+ virtual void
+ recvRangeChange()
+ {
+ }
+
+ virtual void
+ recvRetry();
+
+ virtual void
+ recvRespRetry();
+
+ virtual AddrRangeList
+ getAddrRanges() const
+ {
+ AddrRangeList ranges;
+ ranges.push_back(ownerLds->getAddrRange());
+ return ranges;
+ }
+
+ template<typename T>
+ void
+ loadData(PacketPtr packet);
+
+ template<typename T>
+ void
+ storeData(PacketPtr packet);
+
+ template<typename T>
+ void
+ atomicOperation(PacketPtr packet);
+ };
+
+ protected:
+
+ // the lds reference counter
+ // The key is the workgroup ID and dispatch ID
+ // The value is the number of wavefronts that reference this LDS, as
+ // wavefronts are launched, the counter goes up for that workgroup and when
+ // they return it decreases, once it reaches 0 then this chunk of the LDS is
+ // returned to the available pool. However, it is deallocated on the 1->0
+ // transition, not whenever the counter is 0 as it always starts with 0 when
+ // the workgroup asks for space
+ std::unordered_map<uint32_t,
+ std::unordered_map<uint32_t, int32_t>> refCounter;
+
+ // the map that allows workgroups to access their own chunk of the LDS
+ std::unordered_map<uint32_t,
+ std::unordered_map<uint32_t, LdsChunk>> chunkMap;
+
+ // an event to allow the LDS to wake up at a specified time
+ TickEvent tickEvent;
+
+ // the queue of packets that are going back to the CU after a
+ // read/write/atomic op
+ // TODO need to make this have a maximum size to create flow control
+ std::queue<std::pair<Tick, PacketPtr>> returnQueue;
+
+ // whether or not there are pending responses
+ bool retryResp = false;
+
+ bool
+ process();
+
+ GPUDynInstPtr
+ getDynInstr(PacketPtr packet);
+
+ bool
+ processPacket(PacketPtr packet);
+
+ unsigned
+ countBankConflicts(PacketPtr packet, unsigned *bankAccesses);
+
+ unsigned
+ countBankConflicts(GPUDynInstPtr gpuDynInst,
+ unsigned *numBankAccesses);
+
+ public:
+ typedef LdsStateParams Params;
+
+ LdsState(const Params *params);
+
+ // prevent copy construction
+ LdsState(const LdsState&) = delete;
+
+ ~LdsState()
+ {
+ parent = nullptr;
+ }
+
+ const Params *
+ params() const
+ {
+ return dynamic_cast<const Params *>(_params);
+ }
+
+ bool
+ isRetryResp() const
+ {
+ return retryResp;
+ }
+
+ void
+ setRetryResp(const bool value)
+ {
+ retryResp = value;
+ }
+
+ // prevent assignment
+ LdsState &
+ operator=(const LdsState &) = delete;
+
+ /**
+ * use the dynamic wave id to create or just increase the reference count
+ */
+ int
+ increaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
+ {
+ int refCount = getRefCounter(dispatchId, wgId);
+ fatal_if(refCount < 0,
+ "reference count should not be below zero");
+ return ++refCounter[dispatchId][wgId];
+ }
+
+ /**
+ * decrease the reference count after making sure it is in the list
+ * give back this chunk if the ref counter has reached 0
+ */
+ int
+ decreaseRefCounter(const uint32_t dispatchId, const uint32_t wgId)
+ {
+ int refCount = getRefCounter(dispatchId, wgId);
+
+ fatal_if(refCount <= 0,
+ "reference count should not be below zero or at zero to"
+ "decrement");
+
+ refCounter[dispatchId][wgId]--;
+
+ if (refCounter[dispatchId][wgId] == 0) {
+ releaseSpace(dispatchId, wgId);
+ return 0;
+ } else {
+ return refCounter[dispatchId][wgId];
+ }
+ }
+
+ /**
+ * return the current reference count for this workgroup id
+ */
+ int
+ getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
+ {
+ auto dispatchIter = chunkMap.find(dispatchId);
+ fatal_if(dispatchIter == chunkMap.end(),
+ "could not locate this dispatch id [%d]", dispatchId);
+
+ auto workgroup = dispatchIter->second.find(wgId);
+ fatal_if(workgroup == dispatchIter->second.end(),
+ "could not find this workgroup id within this dispatch id"
+ " did[%d] wgid[%d]", dispatchId, wgId);
+
+ auto refCountIter = refCounter.find(dispatchId);
+ if (refCountIter == refCounter.end()) {
+ fatal("could not locate this dispatch id [%d]", dispatchId);
+ } else {
+ auto workgroup = refCountIter->second.find(wgId);
+ if (workgroup == refCountIter->second.end()) {
+ fatal("could not find this workgroup id within this dispatch id"
+ " did[%d] wgid[%d]", dispatchId, wgId);
+ } else {
+ return refCounter.at(dispatchId).at(wgId);
+ }
+ }
+
+ fatal("should not reach this point");
+ return 0;
+ }
+
+ /**
+     * request that this amount of space be set aside
+     * for this wgid
+ */
+ LdsChunk *
+ reserveSpace(const uint32_t dispatchId, const uint32_t wgId,
+ const uint32_t size)
+ {
+ if (chunkMap.find(dispatchId) != chunkMap.end()) {
+ fatal_if(
+ chunkMap[dispatchId].find(wgId) != chunkMap[dispatchId].end(),
+ "duplicate workgroup ID asking for space in the LDS "
+ "did[%d] wgid[%d]", dispatchId, wgId);
+ }
+
+ fatal_if(bytesAllocated + size > maximumSize,
+ "request would ask for more space than is available");
+
+ bytesAllocated += size;
+
+ chunkMap[dispatchId].emplace(wgId, LdsChunk(size));
+ // make an entry for this workgroup
+ refCounter[dispatchId][wgId] = 0;
+
+ return &chunkMap[dispatchId][wgId];
+ }
+
+ bool
+ returnQueuePush(std::pair<Tick, PacketPtr> thePair);
+
+ Tick
+ earliestReturnTime() const
+ {
+ // TODO set to max(lastCommand+1, curTick())
+ return returnQueue.empty() ? curTick() : returnQueue.back().first;
+ }
+
+ void
+ setParent(ComputeUnit *x_parent);
+
+ void
+ regStats();
+
+ // accessors
+ ComputeUnit *
+ getParent() const
+ {
+ return parent;
+ }
+
+ std::string
+ getName()
+ {
+ return _name;
+ }
+
+ int
+ getBanks() const
+ {
+ return banks;
+ }
+
+ ComputeUnit *
+ getComputeUnit() const
+ {
+ return parent;
+ }
+
+ int
+ getBankConflictPenalty() const
+ {
+ return bankConflictPenalty;
+ }
+
+ /**
+ * get the allocated size for this workgroup
+ */
+ std::size_t
+ ldsSize(const uint32_t x_wgId)
+ {
+ return chunkMap[x_wgId].size();
+ }
+
+ AddrRange
+ getAddrRange() const
+ {
+ return range;
+ }
+
+ virtual BaseSlavePort &
+ getSlavePort(const std::string& if_name, PortID idx)
+ {
+ if (if_name == "cuPort") {
+ // TODO need to set name dynamically at this point?
+ return cuPort;
+ } else {
+ fatal("cannot resolve the port name " + if_name);
+ }
+ }
+
+ /**
+ * can this much space be reserved for a workgroup?
+ */
+ bool
+ canReserve(uint32_t x_size) const
+ {
+ return bytesAllocated + x_size <= maximumSize;
+ }
+
+ private:
+ /**
+ * give back the space
+ */
+ bool
+ releaseSpace(const uint32_t x_dispatchId, const uint32_t x_wgId)
+ {
+ auto dispatchIter = chunkMap.find(x_dispatchId);
+
+ if (dispatchIter == chunkMap.end()) {
+ fatal("dispatch id not found [%d]", x_dispatchId);
+ } else {
+ auto workgroupIter = dispatchIter->second.find(x_wgId);
+ if (workgroupIter == dispatchIter->second.end()) {
+ fatal("workgroup id [%d] not found in dispatch id [%d]",
+ x_wgId, x_dispatchId);
+ }
+ }
+
+ fatal_if(bytesAllocated < chunkMap[x_dispatchId][x_wgId].size(),
+ "releasing more space than was allocated");
+
+ bytesAllocated -= chunkMap[x_dispatchId][x_wgId].size();
+ chunkMap[x_dispatchId].erase(chunkMap[x_dispatchId].find(x_wgId));
+ return true;
+ }
+
+ // the port that connects this LDS to its owner CU
+ CuSidePort cuPort;
+
+ ComputeUnit* parent = nullptr;
+
+ std::string _name;
+
+ // the number of bytes currently reserved by all workgroups
+ int bytesAllocated = 0;
+
+ // the size of the LDS, the most bytes available
+ int maximumSize;
+
+ // Address range of this memory
+ AddrRange range;
+
+ // the penalty, in cycles, for each LDS bank conflict
+ int bankConflictPenalty = 0;
+
+ // the number of banks in the LDS underlying data store
+ int banks = 0;
+};
+
+#endif // __LDS_STATE_HH__
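
A minimal sketch of how a dispatcher-side caller might drive the LdsState
reference-counting interface declared above; the names lds, dispId, wgId,
wavesInWg and ldsBytes are hypothetical placeholders and not part of the patch.

    if (lds.canReserve(ldsBytes)) {
        // carve out a chunk for this workgroup; its counter starts at 0
        LdsChunk *chunk = lds.reserveSpace(dispId, wgId, ldsBytes);
        // one increment per wavefront launched for the workgroup
        for (int wf = 0; wf < wavesInWg; ++wf)
            lds.increaseRefCounter(dispId, wgId);
        // ... wavefronts run and access chunk through the CU-side port ...
        // one decrement per wavefront that completes; the 1->0 transition
        // inside decreaseRefCounter() calls releaseSpace() for the chunk
        for (int wf = 0; wf < wavesInWg; ++wf)
            lds.decreaseRefCounter(dispId, wgId);
    }
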
diff --git a/src/gpu-compute/local_memory_pipeline.cc b/src/gpu-compute/local_memory_pipeline.cc
new file mode 100644
index 000000000..7f919c5f4
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/local_memory_pipeline.hh"
+
+#include "debug/GPUPort.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+LocalMemPipeline::LocalMemPipeline(const ComputeUnitParams* p) :
+ computeUnit(nullptr), lmQueueSize(p->local_mem_queue_size)
+{
+}
+
+void
+LocalMemPipeline::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".LocalMemPipeline";
+}
+
+void
+LocalMemPipeline::exec()
+{
+ // apply any returned shared (LDS) memory operations
+ GPUDynInstPtr m = !lmReturnedRequests.empty() ?
+ lmReturnedRequests.front() : nullptr;
+
+ bool accessVrf = true;
+    if (m && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ accessVrf =
+ w->computeUnit->vrf[m->simdId]->
+ vrfOperandAccessReady(m->seqNum(), w, m,
+ VrfAccessType::WRITE);
+ }
+
+ if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
+ computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
+ || computeUnit->wfWait.at(m->pipeId).rdy())) {
+ if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+ doSmReturn<uint32_t, uint8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+ doSmReturn<uint32_t, uint16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+ doSmReturn<uint32_t, uint32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+ doSmReturn<int32_t, int8_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+ doSmReturn<int32_t, int16_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+ doSmReturn<int32_t, int32_t>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+ doSmReturn<float, Float16>(m);
+ else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+ doSmReturn<float, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+ doSmReturn<uint64_t, uint8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+ doSmReturn<uint64_t, uint16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+ doSmReturn<uint64_t, uint32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+ doSmReturn<uint64_t, uint64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+ doSmReturn<int64_t, int8_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+ doSmReturn<int64_t, int16_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+ doSmReturn<int64_t, int32_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+ doSmReturn<int64_t, int64_t>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
+ doSmReturn<double, Float16>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
+ doSmReturn<double, float>(m);
+ else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
+ doSmReturn<double, double>(m);
+ }
+
+    // If a local memory instruction is waiting to be issued and the
+    // response queue has room, send its packet to the LDS
+ if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
+
+ GPUDynInstPtr m = lmIssuedRequests.front();
+
+ bool returnVal = computeUnit->sendToLds(m);
+ if (!returnVal) {
+            DPRINTF(GPUPort, "packet was nack'd and put in retry queue\n");
+ }
+ lmIssuedRequests.pop();
+ }
+}
+
+template<typename c0, typename c1>
+void
+LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
+{
+ lmReturnedRequests.pop();
+ Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+
+ // Return data to registers
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
+ std::vector<uint32_t> regVec;
+ for (int k = 0; k < m->n_reg; ++k) {
+ int dst = m->dst_reg+k;
+
+ if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
+ dst = m->dst_reg_vec[k];
+ // virtual->physical VGPR mapping
+            int physVgpr = w->remap(dst, sizeof(c0), 1);
+ // save the physical VGPR index
+ regVec.push_back(physVgpr);
+ c1 *p1 = &((c1*)m->d_data)[k * VSZ];
+
+ for (int i = 0; i < VSZ; ++i) {
+ if (m->exec_mask[i]) {
+ // write the value into the physical VGPR. This is a purely
+ // functional operation. No timing is modeled.
+ w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
+ *p1, i);
+ }
+ ++p1;
+ }
+ }
+
+ // Schedule the write operation of the load data on the VRF. This simply
+ // models the timing aspect of the VRF write operation. It does not
+ // modify the physical VGPR.
+ loadVrfBankConflictCycles +=
+ w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
+ regVec, sizeof(c0), m->time);
+ }
+
+ // Decrement outstanding request count
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);
+
+ if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op)
+ || MO_H(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_lm,
+ m->time, -1);
+ }
+
+ if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
+ computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_lm,
+ m->time, -1);
+ }
+
+ // Mark write bus busy for appropriate amount of time
+ computeUnit->locMemToVrfBus.set(m->time);
+ if (computeUnit->shader->coissue_return == 0)
+ w->computeUnit->wfWait.at(m->pipeId).set(m->time);
+}
+
+void
+LocalMemPipeline::regStats()
+{
+ loadVrfBankConflictCycles
+ .name(name() + ".load_vrf_bank_conflict_cycles")
+ .desc("total number of cycles LDS data are delayed before updating "
+ "the VRF")
+ ;
+}
diff --git a/src/gpu-compute/local_memory_pipeline.hh b/src/gpu-compute/local_memory_pipeline.hh
new file mode 100644
index 000000000..a63d867d0
--- /dev/null
+++ b/src/gpu-compute/local_memory_pipeline.hh
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __LOCAL_MEMORY_PIPELINE_HH__
+#define __LOCAL_MEMORY_PIPELINE_HH__
+
+#include <queue>
+#include <string>
+
+#include "gpu-compute/misc.hh"
+#include "params/ComputeUnit.hh"
+#include "sim/stats.hh"
+
+/*
+ * @file local_memory_pipeline.hh
+ *
+ * The local memory pipeline issues newly created local memory packets
+ * from the pipeline to the LDS. This stage also retires previously issued
+ * loads and stores that have returned from the LDS.
+ */
+
+class ComputeUnit;
+class Wavefront;
+
+class LocalMemPipeline
+{
+ public:
+ LocalMemPipeline(const ComputeUnitParams *params);
+ void init(ComputeUnit *cu);
+ void exec();
+
+ template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
+
+ std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
+ std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
+
+ bool
+ isLMRespFIFOWrRdy() const
+ {
+ return lmReturnedRequests.size() < lmQueueSize;
+ }
+
+ bool
+ isLMReqFIFOWrRdy(uint32_t pendReqs=0) const
+ {
+ return (lmIssuedRequests.size() + pendReqs) < lmQueueSize;
+ }
+
+ const std::string& name() const { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ int lmQueueSize;
+ Stats::Scalar loadVrfBankConflictCycles;
+ // Local Memory Request Fifo: all shared memory requests
+ // are issued to this FIFO from the memory pipelines
+ std::queue<GPUDynInstPtr> lmIssuedRequests;
+
+ // Local Memory Response Fifo: all responses of shared memory
+ // requests are sent to this FIFO from LDS
+ std::queue<GPUDynInstPtr> lmReturnedRequests;
+};
+
+#endif // __LOCAL_MEMORY_PIPELINE_HH__
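
A short sketch of how the two FIFOs above are meant to be fed and drained;
localMemPipe and gpuDynInst are hypothetical names for an instance owned by a
compute unit and an LDS instruction it wants to issue.

    // issue side: only enqueue when the request FIFO has room
    if (localMemPipe.isLMReqFIFOWrRdy()) {
        localMemPipe.getLMReqFIFO().push(gpuDynInst);
    }

    // once per cycle the compute unit ticks the pipeline; exec() forwards the
    // front of the request FIFO to the LDS and retires returned loads, stores
    // and atomics from the response FIFO back into the VRF
    localMemPipe.exec();
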
diff --git a/src/gpu-compute/misc.hh b/src/gpu-compute/misc.hh
new file mode 100644
index 000000000..4f8032832
--- /dev/null
+++ b/src/gpu-compute/misc.hh
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __MISC_HH__
+#define __MISC_HH__
+
+#include <bitset>
+#include <memory>
+
+#include "base/misc.hh"
+
+class GPUDynInst;
+
+// wavefront size of the machine
+static const int VSZ = 64;
+
+/*
+ This check is necessary because std::bitset only provides conversion to
+ unsigned long or unsigned long long via to_ulong() or to_ullong(). There
+ are a few places in the code where to_ullong() is used; if VSZ is larger
+ than the widest integer type the host supports, bitset will throw a
+ runtime exception.
+
+ We should remove all use of to_ulong() and to_ullong() so we can have VSZ
+ greater than 64 bits, but until that is done this assert is required.
+ */
+static_assert(VSZ <= sizeof(unsigned long long) * 8,
+ "VSZ is larger than the host can support");
+
+typedef std::bitset<VSZ> VectorMask;
+typedef std::shared_ptr<GPUDynInst> GPUDynInstPtr;
+
+class WaitClass
+{
+ public:
+ WaitClass() : nxtAvail(0), lookAheadAvail(0), tcnt(0) { }
+ void init(uint64_t *_tcnt, uint32_t _numStages=0)
+ {
+ tcnt = _tcnt;
+ numStages = _numStages;
+ }
+
+ void set(uint32_t i)
+ {
+ fatal_if(nxtAvail > *tcnt,
+ "Can't allocate resource because it is busy!!!");
+ nxtAvail = *tcnt + i;
+ }
+ void preset(uint32_t delay)
+ {
+ lookAheadAvail = std::max(lookAheadAvail, delay + (*tcnt) - numStages);
+ }
+ bool rdy() const { return *tcnt >= nxtAvail; }
+ bool prerdy() const { return *tcnt >= lookAheadAvail; }
+
+ private:
+ // timestamp indicating when resource will be available
+ uint64_t nxtAvail;
+ // timestamp indicating when resource will be available including
+ // pending uses of the resource (when there is a cycle gap between
+    // rdy() and set())
+ uint64_t lookAheadAvail;
+ // current timestamp
+ uint64_t *tcnt;
+ // number of stages between checking if a resource is ready and
+ // setting the resource's utilization
+ uint32_t numStages;
+};
+
+class Float16
+{
+ public:
+ uint16_t val;
+
+ Float16() { val = 0; }
+
+ Float16(const Float16 &x) : val(x.val) { }
+
+ Float16(float x)
+ {
+ uint32_t ai = *(uint32_t *)&x;
+
+ uint32_t s = (ai >> 31) & 0x1;
+ uint32_t exp = (ai >> 23) & 0xff;
+ uint32_t mant = (ai >> 0) & 0x7fffff;
+
+ if (exp == 0 || exp <= 0x70) {
+ exp = 0;
+ mant = 0;
+ } else if (exp == 0xff) {
+ exp = 0x1f;
+ } else if (exp >= 0x8f) {
+ exp = 0x1f;
+ mant = 0;
+ } else {
+ exp = exp - 0x7f + 0x0f;
+ }
+
+ mant = mant >> 13;
+
+ val = 0;
+ val |= (s << 15);
+ val |= (exp << 10);
+ val |= (mant << 0);
+ }
+
+ operator float() const
+ {
+ uint32_t s = (val >> 15) & 0x1;
+ uint32_t exp = (val >> 10) & 0x1f;
+ uint32_t mant = (val >> 0) & 0x3ff;
+
+ if (!exp) {
+ exp = 0;
+ mant = 0;
+ } else if (exp == 0x1f) {
+ exp = 0xff;
+ } else {
+ exp = exp - 0x0f + 0x7f;
+ }
+
+ uint32_t val1 = 0;
+ val1 |= (s << 31);
+ val1 |= (exp << 23);
+ val1 |= (mant << 13);
+
+ return *(float*)&val1;
+ }
+};
+
+#endif // __MISC_HH__
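
Two small sketches of the helpers defined above, assuming a stand-alone caller
with <cassert> available; the local variables are hypothetical.

    // Float16: 1.5f is exactly representable in half precision, so the
    // float -> Float16 -> float round trip preserves it
    Float16 h(1.5f);
    float back = (float)h;
    assert(back == 1.5f);

    // WaitClass: the resource becomes ready again once the shared tick
    // counter reaches the point that set() reserved
    uint64_t ticks = 0;
    WaitClass bus;
    bus.init(&ticks);
    bus.set(4);          // busy for the next 4 ticks
    assert(!bus.rdy());
    ticks += 4;
    assert(bus.rdy());
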
diff --git a/src/gpu-compute/ndrange.hh b/src/gpu-compute/ndrange.hh
new file mode 100644
index 000000000..d1ad35d4b
--- /dev/null
+++ b/src/gpu-compute/ndrange.hh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __NDRANGE_HH__
+#define __NDRANGE_HH__
+
+#include "base/types.hh"
+#include "gpu-compute/qstruct.hh"
+
+struct NDRange
+{
+ // copy of the queue entry provided at dispatch
+ HsaQueueEntry q;
+
+ // The current workgroup id (3 dimensions)
+ int wgId[3];
+ // The number of workgroups in each dimension
+ int numWg[3];
+ // The total number of workgroups
+ int numWgTotal;
+
+ // The number of completed work groups
+ int numWgCompleted;
+ // The global workgroup ID
+ uint32_t globalWgId;
+
+ // flag indicating whether all work groups have been launched
+ bool wg_disp_rem;
+ // kernel complete
+ bool execDone;
+ bool userDoorBellSet;
+ volatile bool *addrToNotify;
+ volatile uint32_t *numDispLeft;
+ int dispatchId;
+ int curTid; // Current thread id
+};
+
+#endif // __NDRANGE_HH__
diff --git a/src/gpu-compute/of_scheduling_policy.cc b/src/gpu-compute/of_scheduling_policy.cc
new file mode 100644
index 000000000..7f114706a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/of_scheduling_policy.hh"
+
+#include "gpu-compute/wavefront.hh"
+
+Wavefront*
+OFSchedulingPolicy::chooseWave()
+{
+    // Set when the policy chooses a wave to schedule
+ bool waveChosen = false;
+ Wavefront *selectedWave = nullptr;
+ int selectedWaveID = -1;
+ uint32_t selectedPosition = 0;
+
+ for (int position = 0; position < scheduleList->size(); ++position) {
+ Wavefront *curWave = scheduleList->at(position);
+ uint32_t curWaveID = curWave->wfDynId;
+
+        // Choose the wave with the lowest wave ID
+ if (selectedWaveID == -1 || curWaveID < selectedWaveID) {
+ waveChosen = true;
+ selectedWaveID = curWaveID;
+ selectedWave = curWave;
+ selectedPosition = position;
+ }
+ }
+
+    // Check to make sure the ready list had at least one schedulable wave
+ if (waveChosen) {
+ scheduleList->erase(scheduleList->begin() + selectedPosition);
+ } else {
+ panic("Empty ready list");
+ }
+
+ return selectedWave;
+}
+
+void
+OFSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
+{
+ scheduleList = list;
+}
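
The selection loop in chooseWave() above is equivalent to taking the minimum
wfDynId; a compact alternative sketch, assuming the same scheduleList vector
and <algorithm>, would be:

    if (scheduleList->empty()) {
        panic("Empty ready list");
    }
    auto oldest = std::min_element(scheduleList->begin(), scheduleList->end(),
        [](const Wavefront *a, const Wavefront *b) {
            return a->wfDynId < b->wfDynId;
        });
    Wavefront *selectedWave = *oldest;
    scheduleList->erase(oldest);
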
diff --git a/src/gpu-compute/of_scheduling_policy.hh b/src/gpu-compute/of_scheduling_policy.hh
new file mode 100644
index 000000000..684e51a3a
--- /dev/null
+++ b/src/gpu-compute/of_scheduling_policy.hh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __OF_SCHEDULING_POLICY_HH__
+#define __OF_SCHEDULING_POLICY_HH__
+
+#include <cstddef>
+#include <vector>
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Oldest First where age is marked by the wave id
+class OFSchedulingPolicy
+{
+ public:
+ OFSchedulingPolicy() : scheduleList(nullptr) { }
+
+ Wavefront* chooseWave();
+ void bindList(std::vector<Wavefront*> *list);
+
+ private:
+ // List of waves which are participating in scheduling.
+ // This scheduler selects the oldest wave from this list
+ std::vector<Wavefront*> *scheduleList;
+};
+
+#endif // __OF_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/pool_manager.cc b/src/gpu-compute/pool_manager.cc
new file mode 100644
index 000000000..b1bc6b1f3
--- /dev/null
+++ b/src/gpu-compute/pool_manager.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/pool_manager.hh"
+
+PoolManager::PoolManager(uint32_t minAlloc, uint32_t poolSize)
+ : _minAllocation(minAlloc), _poolSize(poolSize)
+{
+ assert(poolSize > 0);
+}
diff --git a/src/gpu-compute/pool_manager.hh b/src/gpu-compute/pool_manager.hh
new file mode 100644
index 000000000..2cb53ce72
--- /dev/null
+++ b/src/gpu-compute/pool_manager.hh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __POOL_MANAGER_HH__
+#define __POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+// Pool Manager Logic
+class PoolManager
+{
+ public:
+ PoolManager(uint32_t minAlloc, uint32_t poolSize);
+ uint32_t minAllocation() { return _minAllocation; }
+ virtual std::string printRegion() = 0;
+ virtual uint32_t regionSize(std::pair<uint32_t,uint32_t> &region) = 0;
+ virtual bool canAllocate(uint32_t numRegions, uint32_t size) = 0;
+
+ virtual uint32_t allocateRegion(const uint32_t size,
+ uint32_t *reserved) = 0;
+
+ virtual void freeRegion(uint32_t firstIdx, uint32_t lastIdx) = 0;
+ uint32_t poolSize() { return _poolSize; }
+
+ private:
+ // minimum size that can be reserved per allocation
+ uint32_t _minAllocation;
+ // pool size in number of elements
+ uint32_t _poolSize;
+};
+
+#endif // __POOL_MANAGER_HH__
diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh
new file mode 100644
index 000000000..092303c00
--- /dev/null
+++ b/src/gpu-compute/qstruct.hh
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Brad Beckmann, Marc Orr
+ */
+
+#ifndef __Q_STRUCT_HH__
+#define __Q_STRUCT_HH__
+
+#include <bitset>
+#include <cstdint>
+
+// Maximum number of arguments
+static const int KER_NUM_ARGS = 32;
+// Kernel argument buffer size
+static const int KER_ARGS_LENGTH = 512;
+
+class LdsChunk;
+struct NDRange;
+
+// Be very careful of alignment in this structure. The structure
+// must compile to the same layout in both 32-bit and 64-bit mode.
+struct HsaQueueEntry
+{
+ // Base pointer for array of instruction pointers
+ uint64_t code_ptr;
+ // Grid Size (3 dimensions)
+ uint32_t gdSize[3];
+ // Workgroup Size (3 dimensions)
+ uint32_t wgSize[3];
+ uint16_t sRegCount;
+ uint16_t dRegCount;
+ uint16_t cRegCount;
+ uint64_t privMemStart;
+ uint32_t privMemPerItem;
+ uint32_t privMemTotal;
+ uint64_t spillMemStart;
+ uint32_t spillMemPerItem;
+ uint32_t spillMemTotal;
+ uint64_t roMemStart;
+ uint32_t roMemTotal;
+ // Size (in bytes) of LDS
+ uint32_t ldsSize;
+ // Virtual Memory Id (unused right now)
+ uint32_t vmId;
+
+ // Pointer to dependency chain (unused now)
+ uint64_t depends;
+
+ // pointer to bool
+ uint64_t addrToNotify;
+ // pointer to uint32_t
+ uint64_t numDispLeft;
+
+ // variables to pass arguments when running in standalone mode,
+ // will be removed when run.py and sh.cpp have been updated to
+ // use args and offset arrays
+ uint64_t arg1;
+ uint64_t arg2;
+ uint64_t arg3;
+ uint64_t arg4;
+
+ // variables to pass arguments when running in cpu+gpu mode
+ uint8_t args[KER_ARGS_LENGTH];
+ uint16_t offsets[KER_NUM_ARGS];
+ uint16_t num_args;
+};
+
+// State used to start (or restart) a WF
+struct WFContext
+{
+ // 32 bit values
+ // barrier state
+ int bar_cnt[VSZ];
+
+ // id (which WF in the WG)
+ int cnt;
+
+ // more barrier state
+ int max_bar_cnt;
+ int old_barrier_cnt;
+ int barrier_cnt;
+
+ // More Program Counter Stuff
+ uint32_t pc;
+
+ // Program counter of the immediate post-dominator instruction
+ uint32_t rpc;
+
+ // WG wide state (I don't see how to avoid redundancy here)
+ int cu_id;
+ uint32_t wg_id;
+ uint32_t barrier_id;
+
+ // 64 bit values (these values depend on the wavefront size)
+ // masks
+ uint64_t init_mask;
+ uint64_t exec_mask;
+
+ // private memory;
+ Addr privBase;
+ Addr spillBase;
+
+ LdsChunk *ldsChunk;
+
+ /*
+ * Kernel wide state
+ * This is a hack. This state should be moved through simulated memory
+     * during a yield. Though not much is being used here, so it's probably
+     * not a big deal.
+ *
+ * Just to add to this comment... The ndr is derived from simulated
+ * memory when the cl-runtime allocates an HsaQueueEntry and populates it
+ * for a kernel launch. So in theory the runtime should be able to keep
+ * that state around. Then a WF can reference it upon restart to derive
+ * kernel wide state. The runtime can deallocate the state when the
+ * kernel completes.
+ */
+ NDRange *ndr;
+};
+
+// State that needs to be passed between the simulation and simulated app, a
+// pointer to this struct can be passed through the depends field in the
+// HsaQueueEntry struct
+struct HostState
+{
+ // cl_event* has original HsaQueueEntry for init
+ uint64_t event;
+};
+
+// Total number of HSA queues
+static const int HSAQ_NQUEUES = 8;
+
+// These values will eventually live in memory mapped registers
+// and be settable by the kernel mode driver.
+
+// Number of entries in each HSA queue
+static const int HSAQ_SIZE = 64;
+// Address of first HSA queue index
+static const int HSAQ_INDX_BASE = 0x10000ll;
+// Address of first HSA queue
+static const int HSAQ_BASE = 0x11000ll;
+// Suggested start of HSA code
+static const int HSA_CODE_BASE = 0x18000ll;
+
+// These are shortcuts for deriving the address of a specific
+// HSA queue or queue index
+#define HSAQ(n) (HSAQ_BASE + HSAQ_SIZE * sizeof(struct fsaQueue) * n)
+#define HSAQE(n,i) (HSAQ_BASE + (HSAQ_SIZE * n + i) * sizeof(struct fsaQueue))
+#define HSAQ_RI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 0))
+#define HSAQ_WI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 1))
+#define HSAQ_CI(n) (HSAQ_INDX_BASE + sizeof(int) * (n * 3 + 2))
+
+/*
+ * Example code for writing to a queue
+ *
+ * void
+ * ToQueue(int n,struct fsaQueue *val)
+ * {
+ * int wi = *(int*)HSAQ_WI(n);
+ * int ri = *(int*)HSAQ_RI(n);
+ * int ci = *(int*)HSAQ_CI(n);
+ *
+ * if (ci - ri < HSAQ_SIZE) {
+ * (*(int*)HSAQ_CI(n))++;
+ * *(HsaQueueEntry*)(HSAQE(n, (wi % HSAQ_SIZE))) = *val;
+ * (*(int*)HSAQ_WI(n))++;
+ * }
+ * }
+ */
+
+#endif // __Q_STRUCT_HH__
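
Because HsaQueueEntry must compile to the same layout in 32-bit and 64-bit
mode, one defensive option (a sketch only, not part of the patch) is to pin a
few field offsets with static_asserts; the expected values below assume the
usual alignment rules for the fixed-width types used in the struct.

    #include <cstddef>

    static_assert(offsetof(HsaQueueEntry, gdSize) == 8,
                  "code_ptr must occupy exactly the first 8 bytes");
    static_assert(offsetof(HsaQueueEntry, sRegCount) == 32,
                  "gdSize and wgSize must pack without padding");
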
diff --git a/src/gpu-compute/rr_scheduling_policy.cc b/src/gpu-compute/rr_scheduling_policy.cc
new file mode 100644
index 000000000..5d3591901
--- /dev/null
+++ b/src/gpu-compute/rr_scheduling_policy.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/rr_scheduling_policy.hh"
+
+#include "gpu-compute/wavefront.hh"
+
+Wavefront*
+RRSchedulingPolicy::chooseWave()
+{
+ Wavefront *selectedWave = nullptr;
+
+    // Check to make sure the ready list had at least one schedulable wave
+ if (scheduleList->size()) {
+ // For RR policy, select the wave which is at the
+ // front of the list. The selected wave is popped
+ // out from the schedule list immediately after selection
+ // to avoid starvation. It is the responsibility of the
+        // module invoking the RR scheduler to make sure schedulable
+        // waves are added to the back of the schedule
+ // list
+ selectedWave = scheduleList->front();
+        scheduleList->erase(scheduleList->begin());
+ } else {
+ panic("Empty ready list");
+ }
+
+ return selectedWave;
+}
+
+void
+RRSchedulingPolicy::bindList(std::vector<Wavefront*> *list)
+{
+ scheduleList = list;
+}
diff --git a/src/gpu-compute/rr_scheduling_policy.hh b/src/gpu-compute/rr_scheduling_policy.hh
new file mode 100644
index 000000000..780f294aa
--- /dev/null
+++ b/src/gpu-compute/rr_scheduling_policy.hh
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __RR_SCHEDULING_POLICY_HH__
+#define __RR_SCHEDULING_POLICY_HH__
+
+#include <inttypes.h>
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "base/misc.hh"
+
+class Wavefront;
+
+// Round-Robin pick among the list of ready waves
+class RRSchedulingPolicy
+{
+ public:
+ RRSchedulingPolicy() : scheduleList(nullptr) { }
+
+ Wavefront* chooseWave();
+ void bindList(std::vector<Wavefront*> *list);
+
+ private:
+ // List of waves which are participating in scheduling.
+ // This scheduler selects one wave from this list based on
+ // round robin policy
+ std::vector<Wavefront*> *scheduleList;
+};
+
+#endif // __RR_SCHEDULING_POLICY_HH__
diff --git a/src/gpu-compute/schedule_stage.cc b/src/gpu-compute/schedule_stage.cc
new file mode 100644
index 000000000..068136026
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.cc
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/schedule_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/vector_register_file.hh"
+#include "gpu-compute/wavefront.hh"
+
+ScheduleStage::ScheduleStage(const ComputeUnitParams *p)
+ : numSIMDs(p->num_SIMDs),
+ numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes)
+{
+ for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ Scheduler newScheduler(p);
+ scheduler.push_back(newScheduler);
+ }
+}
+
+ScheduleStage::~ScheduleStage()
+{
+ scheduler.clear();
+ waveStatusList.clear();
+}
+
+void
+ScheduleStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ScheduleStage";
+
+ for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ scheduler[j].bindList(&computeUnit->readyList[j]);
+ }
+
+ for (int j = 0; j < numSIMDs; ++j) {
+ waveStatusList.push_back(&computeUnit->waveStatusList[j]);
+ }
+
+ dispatchList = &computeUnit->dispatchList;
+}
+
+void
+ScheduleStage::arbitrate()
+{
+ // iterate over all Memory pipelines
+ for (int j = numSIMDs; j < numSIMDs + numMemUnits; ++j) {
+ if (dispatchList->at(j).first) {
+ Wavefront *waveToMemPipe = dispatchList->at(j).first;
+ // iterate over all execution pipelines
+ for (int i = 0; i < numSIMDs + numMemUnits; ++i) {
+ if ((i != j) && (dispatchList->at(i).first)) {
+ Wavefront *waveToExePipe = dispatchList->at(i).first;
+ // if the two selected wavefronts are mapped to the same
+ // SIMD unit then they share the VRF
+ if (waveToMemPipe->simdId == waveToExePipe->simdId) {
+ int simdId = waveToMemPipe->simdId;
+ // Read VRF port arbitration:
+                        // If there is a read VRF port conflict between a
+                        // memory instruction and another instruction, we drop
+                        // the other instruction. We don't need to check for write VRF
+ // port conflicts because the memory instruction either
+ // does not need to write to the VRF (store) or will
+ // write to the VRF when the data comes back (load) in
+ // which case the arbiter of the memory pipes will
+ // resolve any conflicts
+ if (computeUnit->vrf[simdId]->
+ isReadConflict(waveToMemPipe->wfSlotId,
+ waveToExePipe->wfSlotId)) {
+ // FIXME: The "second" member variable is never
+ // used in the model. I am setting it to READY
+ // simply to follow the protocol of setting it
+ // when the WF has an instruction ready to issue
+ waveStatusList[simdId]->at(waveToExePipe->wfSlotId)
+ .second = READY;
+
+ dispatchList->at(i).first = nullptr;
+ dispatchList->at(i).second = EMPTY;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void
+ScheduleStage::exec()
+{
+ for (int j = 0; j < numSIMDs + numMemUnits; ++j) {
+ uint32_t readyListSize = computeUnit->readyList[j].size();
+
+ // If no wave is ready to be scheduled on the execution resource
+ // then skip scheduling for this execution resource
+ if (!readyListSize) {
+ continue;
+ }
+
+ Wavefront *waveToBeDispatched = scheduler[j].chooseWave();
+ dispatchList->at(j).first = waveToBeDispatched;
+ waveToBeDispatched->updateResources();
+ dispatchList->at(j).second = FILLED;
+
+ waveStatusList[waveToBeDispatched->simdId]->at(
+ waveToBeDispatched->wfSlotId).second = BLOCKED;
+
+ assert(computeUnit->readyList[j].size() == readyListSize - 1);
+ }
+ // arbitrate over all shared resources among instructions being issued
+ // simultaneously
+ arbitrate();
+}
+
+void
+ScheduleStage::regStats()
+{
+}
diff --git a/src/gpu-compute/schedule_stage.hh b/src/gpu-compute/schedule_stage.hh
new file mode 100644
index 000000000..26eb9a25b
--- /dev/null
+++ b/src/gpu-compute/schedule_stage.hh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULE_STAGE_HH__
+#define __SCHEDULE_STAGE_HH__
+
+#include <utility>
+#include <vector>
+
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/scheduler.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+// Schedule or execution arbitration stage.
+// From the pool of ready waves in the ready list,
+// one wave is selected for each execution resource.
+// The selection is made based on a scheduling policy
+
+class ComputeUnit;
+class Wavefront;
+
+struct ComputeUnitParams;
+
+class ScheduleStage
+{
+ public:
+ ScheduleStage(const ComputeUnitParams *params);
+ ~ScheduleStage();
+ void init(ComputeUnit *cu);
+ void exec();
+ void arbitrate();
+ // Stats related variables and methods
+ std::string name() { return _name; }
+ void regStats();
+
+ private:
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+ uint32_t numMemUnits;
+
+ // Each execution resource will have its own
+ // scheduler and a dispatch list
+ std::vector<Scheduler> scheduler;
+
+ // Stores the status of waves. A READY implies the
+ // wave is ready to be scheduled this cycle and
+ // is already present in the readyList
+ std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
+ waveStatusList;
+
+ // List of waves which will be dispatched to
+ // each execution resource. A FILLED implies
+ // dispatch list is non-empty and
+ // execution unit has something to execute
+ // this cycle. Currently, the dispatch list of
+ // an execution resource can hold only one wave because
+ // an execution resource can execute only one wave in a cycle.
+ std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> *dispatchList;
+
+ std::string _name;
+};
+
+#endif // __SCHEDULE_STAGE_HH__
diff --git a/src/gpu-compute/scheduler.cc b/src/gpu-compute/scheduler.cc
new file mode 100644
index 000000000..1cd0bfe55
--- /dev/null
+++ b/src/gpu-compute/scheduler.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scheduler.hh"
+
+Scheduler::Scheduler(const ComputeUnitParams *p)
+{
+ if (p->execPolicy == "OLDEST-FIRST") {
+ schedPolicy = SCHED_POLICY::OF_POLICY;
+ } else if (p->execPolicy == "ROUND-ROBIN") {
+ schedPolicy = SCHED_POLICY::RR_POLICY;
+ } else {
+ fatal("Unimplemented scheduling policy");
+ }
+}
+
+Wavefront*
+Scheduler::chooseWave()
+{
+ if (schedPolicy == SCHED_POLICY::OF_POLICY) {
+ return OFSchedPolicy.chooseWave();
+ } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
+ return RRSchedPolicy.chooseWave();
+ } else {
+ fatal("Unimplemented scheduling policy");
+ }
+}
+
+void
+Scheduler::bindList(std::vector<Wavefront*> *list)
+{
+ if (schedPolicy == SCHED_POLICY::OF_POLICY) {
+ OFSchedPolicy.bindList(list);
+ } else if (schedPolicy == SCHED_POLICY::RR_POLICY) {
+ RRSchedPolicy.bindList(list);
+ } else {
+ fatal("Unimplemented scheduling policy");
+ }
+}
diff --git a/src/gpu-compute/scheduler.hh b/src/gpu-compute/scheduler.hh
new file mode 100644
index 000000000..148ec9425
--- /dev/null
+++ b/src/gpu-compute/scheduler.hh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULER_HH__
+#define __SCHEDULER_HH__
+
+#include "gpu-compute/of_scheduling_policy.hh"
+#include "gpu-compute/rr_scheduling_policy.hh"
+#include "gpu-compute/scheduling_policy.hh"
+#include "params/ComputeUnit.hh"
+
+enum SCHED_POLICY
+{
+ OF_POLICY = 0,
+ RR_POLICY
+};
+
+class Scheduler
+{
+ public:
+ Scheduler(const ComputeUnitParams *params);
+ Wavefront *chooseWave();
+ void bindList(std::vector<Wavefront*> *list);
+
+ private:
+ SCHED_POLICY schedPolicy;
+ SchedulingPolicy<RRSchedulingPolicy> RRSchedPolicy;
+ SchedulingPolicy<OFSchedulingPolicy> OFSchedPolicy;
+};
+
+#endif // __SCHEDULER_HH__
diff --git a/src/gpu-compute/scheduling_policy.hh b/src/gpu-compute/scheduling_policy.hh
new file mode 100644
index 000000000..b5e923c62
--- /dev/null
+++ b/src/gpu-compute/scheduling_policy.hh
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCHEDULING_POLICY_HH__
+#define __SCHEDULING_POLICY_HH__
+
+#include <vector>
+
+class Wavefront;
+
+template<typename Impl>
+class SchedulingPolicy
+{
+ public:
+ Wavefront* chooseWave() { return policyImpl.chooseWave(); }
+
+ void
+ bindList(std::vector<Wavefront*> *list)
+ {
+ return policyImpl.bindList(list);
+ }
+
+ private:
+ Impl policyImpl;
+};
+
+#endif // __SCHEDULING_POLICY_HH__
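
SchedulingPolicy<Impl> is a thin static-polymorphism wrapper: it owns an Impl by value and forwards both calls, so each instantiation is resolved at compile time and no virtual dispatch is involved; the only runtime branch lives in Scheduler. Below is a minimal compilable sketch of the same idea, using hypothetical Wave/PickFirst stand-ins rather than the real Wavefront and policy classes.

#include <vector>

struct Wave { int id; };   // stand-in for Wavefront

// Same shape as SchedulingPolicy<Impl>: hold the policy by value and
// forward both calls, so dispatch is resolved at compile time.
template<typename Impl>
class PolicyWrapper
{
  public:
    Wave* chooseWave() { return policyImpl.chooseWave(); }
    void bindList(std::vector<Wave*> *list) { policyImpl.bindList(list); }

  private:
    Impl policyImpl;
};

// Any Impl that provides these two members works; no base class needed.
struct PickFirst
{
    std::vector<Wave*> *waves = nullptr;
    void bindList(std::vector<Wave*> *l) { waves = l; }
    Wave* chooseWave() { return (waves && !waves->empty()) ? waves->front() : nullptr; }
};

int main()
{
    Wave w{7};
    std::vector<Wave*> ready{&w};
    PolicyWrapper<PickFirst> policy;   // analogous to SchedulingPolicy<OFSchedulingPolicy>
    policy.bindList(&ready);
    return policy.chooseWave()->id == 7 ? 0 : 1;
}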
diff --git a/src/gpu-compute/scoreboard_check_stage.cc b/src/gpu-compute/scoreboard_check_stage.cc
new file mode 100644
index 000000000..0d856a9b0
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "gpu-compute/scoreboard_check_stage.hh"
+
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/ComputeUnit.hh"
+
+ScoreboardCheckStage::ScoreboardCheckStage(const ComputeUnitParams *p)
+ : numSIMDs(p->num_SIMDs),
+ numMemUnits(p->num_global_mem_pipes + p->num_shared_mem_pipes),
+ numGlbMemPipes(p->num_global_mem_pipes),
+ numShrMemPipes(p->num_shared_mem_pipes),
+ vectorAluInstAvail(nullptr),
+ lastGlbMemSimd(-1),
+ lastShrMemSimd(-1), glbMemInstAvail(nullptr),
+ shrMemInstAvail(nullptr)
+{
+}
+
+ScoreboardCheckStage::~ScoreboardCheckStage()
+{
+ readyList.clear();
+ waveStatusList.clear();
+ shrMemInstAvail = nullptr;
+ glbMemInstAvail = nullptr;
+}
+
+void
+ScoreboardCheckStage::init(ComputeUnit *cu)
+{
+ computeUnit = cu;
+ _name = computeUnit->name() + ".ScoreboardCheckStage";
+
+ for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+ readyList.push_back(&computeUnit->readyList[unitId]);
+ }
+
+ for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+ waveStatusList.push_back(&computeUnit->waveStatusList[unitId]);
+ }
+
+ vectorAluInstAvail = &computeUnit->vectorAluInstAvail;
+    glbMemInstAvail = &computeUnit->glbMemInstAvail;
+    shrMemInstAvail = &computeUnit->shrMemInstAvail;
+}
+
+void
+ScoreboardCheckStage::initStatistics()
+{
+ lastGlbMemSimd = -1;
+ lastShrMemSimd = -1;
+ *glbMemInstAvail = 0;
+ *shrMemInstAvail = 0;
+
+ for (int unitId = 0; unitId < numSIMDs; ++unitId)
+ vectorAluInstAvail->at(unitId) = false;
+}
+
+void
+ScoreboardCheckStage::collectStatistics(Wavefront *curWave, int unitId)
+{
+ if (curWave->instructionBuffer.empty())
+ return;
+
+ // track which vector SIMD unit has at least one WV with a vector
+    // ALU instruction as the oldest instruction in its Instruction buffer
+ vectorAluInstAvail->at(unitId) = vectorAluInstAvail->at(unitId) ||
+ curWave->isOldestInstALU();
+
+ // track how many vector SIMD units have at least one WV with a
+ // vector Global memory instruction as the oldest instruction
+ // in its Instruction buffer
+ if ((curWave->isOldestInstGMem() || curWave->isOldestInstPrivMem() ||
+ curWave->isOldestInstFlatMem()) && lastGlbMemSimd != unitId &&
+ *glbMemInstAvail <= 1) {
+ (*glbMemInstAvail)++;
+ lastGlbMemSimd = unitId;
+ }
+
+ // track how many vector SIMD units have at least one WV with a
+ // vector shared memory (LDS) instruction as the oldest instruction
+ // in its Instruction buffer
+ // TODO: parametrize the limit of the LDS units
+ if (curWave->isOldestInstLMem() && (*shrMemInstAvail <= numShrMemPipes) &&
+ lastShrMemSimd != unitId) {
+ (*shrMemInstAvail)++;
+ lastShrMemSimd = unitId;
+ }
+}
+
+void
+ScoreboardCheckStage::exec()
+{
+ initStatistics();
+
+ // reset the ready list for all execution units; it will be
+ // constructed every cycle since resource availability may change
+ for (int unitId = 0; unitId < numSIMDs + numMemUnits; ++unitId) {
+ readyList[unitId]->clear();
+ }
+
+ // iterate over the Wavefronts of all SIMD units
+ for (int unitId = 0; unitId < numSIMDs; ++unitId) {
+ for (int wvId = 0; wvId < computeUnit->shader->n_wf; ++wvId) {
+ // reset the ready status of each wavefront
+ waveStatusList[unitId]->at(wvId).second = BLOCKED;
+ Wavefront *curWave = waveStatusList[unitId]->at(wvId).first;
+ collectStatistics(curWave, unitId);
+
+ if (curWave->ready(Wavefront::I_ALU)) {
+ readyList[unitId]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_GLOBAL)) {
+ if (computeUnit->cedeSIMD(unitId, wvId)) {
+ continue;
+ }
+
+ readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_SHARED)) {
+ readyList[computeUnit->ShrMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_FLAT)) {
+ readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ } else if (curWave->ready(Wavefront::I_PRIVATE)) {
+ readyList[computeUnit->GlbMemUnitId()]->push_back(curWave);
+ waveStatusList[unitId]->at(wvId).second = READY;
+ }
+ }
+ }
+}
+
+void
+ScoreboardCheckStage::regStats()
+{
+}
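
ScoreboardCheckStage::exec() rebuilds every ready list from scratch each cycle: a wave starts out BLOCKED, and the first readiness test that passes routes it either to its own SIMD's list (ALU work) or to the shared global-memory or LDS list. The sketch below reproduces just that routing step with toy types; the real hazard checks (Wavefront::ready()) and the cedeSIMD() special case are deliberately elided.

#include <cstdio>
#include <utility>
#include <vector>

enum Status { BLOCKED, READY };
enum Kind { ALU, GLOBAL, SHARED };

struct Wave { Kind oldest; };   // stand-in: kind of the oldest buffered instruction

int main()
{
    const int numSimds = 2;
    // one ready list per SIMD, plus one global-memory list and one LDS list
    std::vector<std::vector<Wave*>> readyList(numSimds + 2);
    const int glbMemUnit = numSimds;        // index of the global memory pipe
    const int shrMemUnit = numSimds + 1;    // index of the LDS pipe

    std::vector<std::vector<std::pair<Wave*, Status>>> waveStatus(numSimds);
    Wave w0{ALU}, w1{GLOBAL}, w2{SHARED};
    waveStatus[0] = {{&w0, BLOCKED}, {&w1, BLOCKED}};
    waveStatus[1] = {{&w2, BLOCKED}};

    for (auto &lst : readyList)
        lst.clear();                        // rebuilt every cycle

    for (int simd = 0; simd < numSimds; ++simd) {
        for (auto &entry : waveStatus[simd]) {
            entry.second = BLOCKED;         // reset, then re-evaluate
            Wave *w = entry.first;
            // readiness/hazard checks elided; only the routing is shown
            int target = (w->oldest == ALU) ? simd
                       : (w->oldest == GLOBAL) ? glbMemUnit : shrMemUnit;
            readyList[target].push_back(w);
            entry.second = READY;
        }
    }

    std::printf("simd0=%zu simd1=%zu glb=%zu lds=%zu\n",
                readyList[0].size(), readyList[1].size(),
                readyList[glbMemUnit].size(), readyList[shrMemUnit].size());
    // prints: simd0=1 simd1=0 glb=1 lds=1
}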
diff --git a/src/gpu-compute/scoreboard_check_stage.hh b/src/gpu-compute/scoreboard_check_stage.hh
new file mode 100644
index 000000000..099597afb
--- /dev/null
+++ b/src/gpu-compute/scoreboard_check_stage.hh
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __SCOREBOARD_CHECK_STAGE_HH__
+#define __SCOREBOARD_CHECK_STAGE_HH__
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+class ComputeUnit;
+class Wavefront;
+
+struct ComputeUnitParams;
+
+enum WAVE_STATUS
+{
+ BLOCKED = 0,
+ READY
+};
+
+/*
+ * Scoreboard check stage.
+ * All wavefronts are analyzed to see if they are ready
+ * to be executed this cycle. Both structural and data
+ * hazards are considered while marking a wave "ready"
+ * for execution. After analysis, the ready waves are
+ * added to readyList.
+ */
+class ScoreboardCheckStage
+{
+ public:
+ ScoreboardCheckStage(const ComputeUnitParams* params);
+ ~ScoreboardCheckStage();
+ void init(ComputeUnit *cu);
+ void exec();
+
+ // Stats related variables and methods
+ const std::string& name() const { return _name; }
+ void regStats();
+
+ private:
+ void collectStatistics(Wavefront *curWave, int unitId);
+ void initStatistics();
+ ComputeUnit *computeUnit;
+ uint32_t numSIMDs;
+ uint32_t numMemUnits;
+ uint32_t numGlbMemPipes;
+ uint32_t numShrMemPipes;
+
+ // flag per vector SIMD unit that is set when there is at least one
+ // WF that has a vector ALU instruction as the oldest in its
+ // Instruction Buffer
+ std::vector<bool> *vectorAluInstAvail;
+ int lastGlbMemSimd;
+ int lastShrMemSimd;
+
+ int *glbMemInstAvail;
+ int *shrMemInstAvail;
+ // List of waves which are ready to be scheduled.
+ // Each execution resource has a ready list
+ std::vector<std::vector<Wavefront*>*> readyList;
+
+ // Stores the status of waves. A READY implies the
+ // wave is ready to be scheduled this cycle and
+ // is already present in the readyList
+ std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>*>
+ waveStatusList;
+
+ std::string _name;
+};
+
+#endif // __SCOREBOARD_CHECK_STAGE_HH__
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
new file mode 100644
index 000000000..e8d7946ff
--- /dev/null
+++ b/src/gpu-compute/shader.cc
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/shader.hh"
+
+#include <limits>
+
+#include "arch/x86/linux/linux.hh"
+#include "base/chunk_generator.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUMem.hh"
+#include "debug/HSAIL.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "sim/sim_exit.hh"
+
+Shader::Shader(const Params *p) : SimObject(p),
+ clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
+ cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
+ hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+ separate_acquire_release(p->separate_acquire_release), coissue_return(1),
+ trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
+ globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
+ box_tick_cnt(0), start_tick_cnt(0)
+{
+
+ cuList.resize(n_cu);
+
+ for (int i = 0; i < n_cu; ++i) {
+ cuList[i] = p->CUs[i];
+ assert(i == cuList[i]->cu_id);
+ cuList[i]->shader = this;
+ }
+}
+
+Addr
+Shader::mmap(int length)
+{
+
+ Addr start;
+
+ // round up length to the next page
+ length = roundUp(length, TheISA::PageBytes);
+
+ if (X86Linux64::mmapGrowsDown()) {
+ DPRINTF(HSAIL, "GROWS DOWN");
+        start = gpuTc->getProcessPtr()->mmap_end - length;
+ gpuTc->getProcessPtr()->mmap_end = start;
+ } else {
+ DPRINTF(HSAIL, "GROWS UP");
+ start = gpuTc->getProcessPtr()->mmap_end;
+ gpuTc->getProcessPtr()->mmap_end += length;
+
+ // assertion to make sure we don't overwrite the stack (it grows down)
+ assert(gpuTc->getProcessPtr()->mmap_end <
+ gpuTc->getProcessPtr()->stack_base -
+ gpuTc->getProcessPtr()->max_stack_size);
+
+ }
+
+ DPRINTF(HSAIL,"Shader::mmap start= %#x, %#x\n", start, length);
+
+ gpuTc->getProcessPtr()->allocateMem(start,length);
+
+ return start;
+}
+
+void
+Shader::init()
+{
+ // grab the threadContext of the thread running on the CPU
+ assert(cpuPointer);
+ gpuTc = cpuPointer->getContext(0);
+ assert(gpuTc);
+}
+
+Shader::~Shader()
+{
+ for (int j = 0; j < n_cu; ++j)
+ delete cuList[j];
+}
+
+void
+Shader::updateThreadContext(int tid) {
+ // thread context of the thread which dispatched work
+ assert(cpuPointer);
+ gpuTc = cpuPointer->getContext(tid);
+ assert(gpuTc);
+}
+
+void
+Shader::hostWakeUp(BaseCPU *cpu) {
+ if (cpuPointer == cpu) {
+ if (gpuTc->status() == ThreadContext::Suspended)
+ cpu->activateContext(gpuTc->threadId());
+ } else {
+        // Make sure both the dispatcher and the shader are trying to
+        // wake up the same host. This is a hack to enable kernel launch
+        // from multiple CPUs.
+ panic("Dispatcher wants to wakeup a different host");
+ }
+}
+
+Shader*
+ShaderParams::create()
+{
+ return new Shader(this);
+}
+
+void
+Shader::exec()
+{
+ tick_cnt = curTick();
+ box_tick_cnt = curTick() - start_tick_cnt;
+
+ // apply any scheduled adds
+ for (int i = 0; i < sa_n; ++i) {
+ if (sa_when[i] <= tick_cnt) {
+ *sa_val[i] += sa_x[i];
+ sa_val.erase(sa_val.begin() + i);
+ sa_x.erase(sa_x.begin() + i);
+ sa_when.erase(sa_when.begin() + i);
+ --sa_n;
+ --i;
+ }
+ }
+
+ // clock all of the cu's
+ for (int i = 0; i < n_cu; ++i)
+ cuList[i]->exec();
+}
+
+bool
+Shader::dispatch_workgroups(NDRange *ndr)
+{
+ bool scheduledSomething = false;
+ int cuCount = 0;
+ int curCu = nextSchedCu;
+
+ while (cuCount < n_cu) {
+        // Every time we try a CU, update nextSchedCu
+ nextSchedCu = (nextSchedCu + 1) % n_cu;
+
+        // dispatch a workgroup iff the following two conditions are met:
+        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
+        // (b) cuList[curCu] has enough free slots for this workgroup
+ if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
+ scheduledSomething = true;
+ DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
+
+ // ticks() member function translates cycles to simulation ticks.
+ if (!tickEvent.scheduled()) {
+ schedule(tickEvent, curTick() + this->ticks(1));
+ }
+
+ cuList[curCu]->StartWorkgroup(ndr);
+ ndr->wgId[0]++;
+ ndr->globalWgId++;
+ if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
+ ndr->wgId[0] = 0;
+ ndr->wgId[1]++;
+
+ if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
+ ndr->wgId[1] = 0;
+ ndr->wgId[2]++;
+
+ if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
+ ndr->wg_disp_rem = false;
+ break;
+ }
+ }
+ }
+ }
+
+ ++cuCount;
+ curCu = nextSchedCu;
+ }
+
+ return scheduledSomething;
+}
+
+void
+Shader::handshake(GpuDispatcher *_dispatcher)
+{
+ dispatcher = _dispatcher;
+}
+
+void
+Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+ bool suppress_func_errors, int cu_id)
+{
+ unsigned block_size = RubySystem::getBlockSizeBytes();
+ unsigned size = req->getSize();
+
+ Addr tmp_addr;
+ BaseTLB::Mode trans_mode;
+
+ if (cmd == MemCmd::ReadReq) {
+ trans_mode = BaseTLB::Read;
+ } else if (cmd == MemCmd::WriteReq) {
+ trans_mode = BaseTLB::Write;
+ } else {
+ fatal("unexcepted MemCmd\n");
+ }
+
+ tmp_addr = req->getVaddr();
+ Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
+
+ assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
+
+ // Misaligned access
+ if (split_addr > tmp_addr) {
+ RequestPtr req1, req2;
+ req->splitOnVaddr(split_addr, req1, req2);
+
+
+        PacketPtr pkt1 = new Packet(req1, cmd);
+        PacketPtr pkt2 = new Packet(req2, cmd);
+
+ functionalTLBAccess(pkt1, cu_id, trans_mode);
+ functionalTLBAccess(pkt2, cu_id, trans_mode);
+
+ PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
+ PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
+
+ new_pkt1->dataStatic(data);
+ new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
+
+ if (suppress_func_errors) {
+ new_pkt1->setSuppressFuncError();
+ new_pkt2->setSuppressFuncError();
+ }
+
+ // fixme: this should be cuList[cu_id] if cu_id != n_cu
+ // The latter requires a memPort in the dispatcher
+ cuList[0]->memPort[0]->sendFunctional(new_pkt1);
+ cuList[0]->memPort[0]->sendFunctional(new_pkt2);
+
+ delete new_pkt1;
+ delete new_pkt2;
+ delete pkt1;
+ delete pkt2;
+ } else {
+ PacketPtr pkt = new Packet(req, cmd);
+ functionalTLBAccess(pkt, cu_id, trans_mode);
+ PacketPtr new_pkt = new Packet(pkt->req, cmd);
+ new_pkt->dataStatic(data);
+
+ if (suppress_func_errors) {
+ new_pkt->setSuppressFuncError();
+        }
+
+ // fixme: this should be cuList[cu_id] if cu_id != n_cu
+ // The latter requires a memPort in the dispatcher
+ cuList[0]->memPort[0]->sendFunctional(new_pkt);
+
+ delete new_pkt;
+ delete pkt;
+ }
+}
+
+bool
+Shader::busy()
+{
+ for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
+ if (!cuList[i_cu]->isDone()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
+{
+ sa_val.push_back(val);
+ sa_when.push_back(tick_cnt + when);
+ sa_x.push_back(x);
+ ++sa_n;
+}
+
+Shader::TickEvent::TickEvent(Shader *_shader)
+ : Event(CPU_Tick_Pri), shader(_shader)
+{
+}
+
+
+void
+Shader::TickEvent::process()
+{
+ if (shader->busy()) {
+ shader->exec();
+ shader->schedule(this, curTick() + shader->ticks(1));
+ }
+}
+
+const char*
+Shader::TickEvent::description() const
+{
+ return "Shader tick";
+}
+
+void
+Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ MemCmd cmd, bool suppress_func_errors)
+{
+ uint8_t *data_buf = (uint8_t*)ptr;
+
+ for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
+ !gen.done(); gen.next()) {
+ Request *req = new Request(0, gen.addr(), gen.size(), 0,
+ cuList[0]->masterId(), 0, 0, 0);
+
+ doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
+ data_buf += gen.size();
+ delete req;
+ }
+}
+
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
+}
+
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ bool suppress_func_errors)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
+}
+
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
+}
+
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ bool suppress_func_errors)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
+ suppress_func_errors);
+}
+
+/*
+ * Send a packet through the appropriate TLB functional port.
+ * If cu_id=n_cu, then this is the dispatcher's TLB.
+ * Otherwise it's the TLB of the cu_id compute unit.
+ */
+void
+Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
+{
+ // update senderState. Need to know the gpuTc and the TLB mode
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
+
+ if (cu_id == n_cu) {
+ dispatcher->tlbPort->sendFunctional(pkt);
+ } else {
+ // even when the perLaneTLB flag is turned on
+        // it's ok to send all accesses through lane 0
+        // since the lane # is not known here.
+ // This isn't important since these are functional accesses.
+ cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
+ }
+
+ /* safe_cast the senderState */
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete pkt->senderState;
+}
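
Shader::ScheduleAdd() records a deferred increment as three parallel entries (target pointer, due tick, amount), and Shader::exec() applies and erases every entry whose due tick has passed, stepping the index back after each erase. Here is a self-contained sketch of that queue, with hypothetical names standing in for the sa_* members.

#include <cassert>
#include <cstdint>
#include <vector>

// Deferred-increment queue in the style of Shader::ScheduleAdd()/exec():
// three parallel vectors plus an explicit element count.
struct DeferredAdds {
    std::vector<uint32_t*> vals;    // what to increment (like sa_val)
    std::vector<uint64_t>  when;    // tick at which the add becomes due (sa_when)
    std::vector<int32_t>   amount;  // how much to add (sa_x)
    int n = 0;                      // like sa_n

    void scheduleAdd(uint32_t *val, uint64_t now, uint64_t delay, int32_t x) {
        vals.push_back(val);
        when.push_back(now + delay);
        amount.push_back(x);
        ++n;
    }

    // apply and erase every entry that is due at tick "now"
    void apply(uint64_t now) {
        for (int i = 0; i < n; ++i) {
            if (when[i] <= now) {
                *vals[i] += amount[i];
                vals.erase(vals.begin() + i);
                amount.erase(amount.begin() + i);
                when.erase(when.begin() + i);
                --n;
                --i;   // re-examine the element that shifted into slot i
            }
        }
    }
};

int main() {
    uint32_t counter = 0;
    DeferredAdds q;
    q.scheduleAdd(&counter, /*now=*/100, /*delay=*/10, /*x=*/3);   // due at 110
    q.scheduleAdd(&counter, /*now=*/100, /*delay=*/50, /*x=*/5);   // due at 150
    q.apply(120);                        // only the first add has come due
    assert(counter == 3 && q.n == 1);
    q.apply(200);                        // now the second fires as well
    assert(counter == 8 && q.n == 0);
}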
diff --git a/src/gpu-compute/shader.hh b/src/gpu-compute/shader.hh
new file mode 100644
index 000000000..91ea8aae0
--- /dev/null
+++ b/src/gpu-compute/shader.hh
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#ifndef __SHADER_HH__
+#define __SHADER_HH__
+
+#include <functional>
+#include <string>
+
+#include "arch/isa.hh"
+#include "arch/isa_traits.hh"
+#include "base/types.hh"
+#include "cpu/simple/atomic.hh"
+#include "cpu/simple/timing.hh"
+#include "cpu/simple_thread.hh"
+#include "cpu/thread_context.hh"
+#include "cpu/thread_state.hh"
+#include "enums/MemOpType.hh"
+#include "enums/MemType.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_tlb.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/qstruct.hh"
+#include "mem/page_table.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/Shader.hh"
+#include "sim/faults.hh"
+#include "sim/process.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class GpuDispatcher;
+
+namespace TheISA
+{
+ class GpuTLB;
+}
+
+static const int LDS_SIZE = 65536;
+
+// Class Shader: This describes a single shader instance. Most
+// configurations will only have a single shader.
+
+class Shader : public SimObject
+{
+ protected:
+    // Shader's clock period in terms of number of ticks of curTick(),
+    // aka the global simulation clock
+ Tick clock;
+
+ public:
+ typedef ShaderParams Params;
+ enum hsail_mode_e {SIMT,VECTOR_SCALAR};
+
+    // clock-related functions; map to and from
+    // simulation ticks and shader clock cycles.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+
+ Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
+
+ Tick getClock() const { return clock; }
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock;}
+
+
+ SimpleThread *cpuThread;
+ ThreadContext *gpuTc;
+ BaseCPU *cpuPointer;
+
+ class TickEvent : public Event
+ {
+ private:
+ Shader *shader;
+
+ public:
+ TickEvent(Shader*);
+ void process();
+ const char* description() const;
+ };
+
+ TickEvent tickEvent;
+
+ // is this simulation going to be timing mode in the memory?
+ bool timingSim;
+ hsail_mode_e hsail_mode;
+
+ // If set, issue acq packet @ kernel launch
+ int impl_kern_boundary_sync;
+ // If set, generate a separate packet for acquire/release on
+ // ld_acquire/st_release/atomic operations
+ int separate_acquire_release;
+ // If set, fetch returns may be coissued with instructions
+ int coissue_return;
+ // If set, always dump all 64 gprs to trace
+ int trace_vgpr_all;
+ // Number of cu units in the shader
+ int n_cu;
+ // Number of wavefront slots per cu
+ int n_wf;
+ // The size of global memory
+ int globalMemSize;
+
+ /*
+ * Bytes/work-item for call instruction
+ * The number of arguments for an hsail function will
+ * vary. We simply determine the maximum # of arguments
+ * required by any hsail function up front before the
+ * simulation (during parsing of the Brig) and record
+ * that number here.
+ */
+ int funcargs_size;
+
+    // Tracks the next CU that the round-robin dispatcher
+    // should attempt to schedule a workgroup on
+ int nextSchedCu;
+
+ // Size of scheduled add queue
+ uint32_t sa_n;
+
+    // Pointers to the values to be incremented
+ std::vector<uint32_t*> sa_val;
+ // When to do the increment
+ std::vector<uint64_t> sa_when;
+ // Amount to increment by
+ std::vector<int32_t> sa_x;
+
+    // List of Compute Units (CUs)
+ std::vector<ComputeUnit*> cuList;
+
+ uint64_t tick_cnt;
+ uint64_t box_tick_cnt;
+ uint64_t start_tick_cnt;
+
+ GpuDispatcher *dispatcher;
+
+ Shader(const Params *p);
+ ~Shader();
+ virtual void init();
+
+ // Run shader
+ void exec();
+
+ // Check to see if shader is busy
+ bool busy();
+
+ // Schedule a 32-bit value to be incremented some time in the future
+ void ScheduleAdd(uint32_t *val, Tick when, int x);
+ bool processTimingPacket(PacketPtr pkt);
+
+ void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ MemCmd cmd, bool suppress_func_errors);
+
+ void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
+
+ void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
+ bool suppress_func_errors);
+
+ void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id);
+
+ void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id,
+ bool suppress_func_errors);
+
+ void doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+ bool suppress_func_errors, int cu_id);
+
+ void
+ registerCU(int cu_id, ComputeUnit *compute_unit)
+ {
+ cuList[cu_id] = compute_unit;
+ }
+
+ void handshake(GpuDispatcher *dispatcher);
+ bool dispatch_workgroups(NDRange *ndr);
+ Addr mmap(int length);
+ void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode);
+ void updateThreadContext(int tid);
+ void hostWakeUp(BaseCPU *cpu);
+};
+
+#endif // __SHADER_HH__
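
The clock helpers in this header convert between global simulation ticks and shader cycles by multiplying or dividing by the stored clock period. The sketch below shows the same arithmetic; the 1 THz tick rate (1 tick = 1 ps) mirrors gem5's usual SimClock::Frequency but is stated here as an assumption of the sketch.

#include <cassert>
#include <cstdint>

using Tick = uint64_t;

// Ticks per simulated second; 1 THz is assumed (1 tick = 1 ps).
constexpr Tick kTicksPerSecond = 1000000000000ULL;

struct ShaderClock {
    Tick period;   // clock period in ticks, e.g. 1000 ticks = 1 ns = 1 GHz

    Tick frequency() const { return kTicksPerSecond / period; }    // cycles/second
    Tick ticks(int numCycles) const { return period * numCycles; } // cycles -> ticks
    Tick tickToCycles(Tick t) const { return t / period; }         // ticks -> cycles
};

int main() {
    ShaderClock clk{1000};                    // 1 ns period
    assert(clk.frequency() == 1000000000ULL); // 1 GHz
    assert(clk.ticks(4) == 4000);             // 4 cycles -> 4000 ticks
    assert(clk.tickToCycles(12345) == 12);    // integer division, like curCycle()
}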
diff --git a/src/gpu-compute/simple_pool_manager.cc b/src/gpu-compute/simple_pool_manager.cc
new file mode 100644
index 000000000..0e35ab9cc
--- /dev/null
+++ b/src/gpu-compute/simple_pool_manager.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/simple_pool_manager.hh"
+
+#include "base/misc.hh"
+
+// return the min number of elements that the manager can reserve given
+// a request for "size" elements
+uint32_t
+SimplePoolManager::minAllocatedElements(uint32_t size)
+{
+ fatal_if(size <= 0 || size > poolSize(), "Illegal VGPR region size=%d\n",
+ size);
+
+ return size % minAllocation() > 0 ?
+ (minAllocation() - (size % minAllocation())) + size : size;
+}
+
+std::string
+SimplePoolManager::printRegion()
+{
+ std::string _cout;
+ if (_reservedGroups == 0)
+ _cout = "VRF is empty\n";
+ else if (_reservedGroups > 0) {
+ uint32_t reservedEntries = _reservedGroups * _regionSize;
+ _cout = "VRF reserves " + std::to_string(reservedEntries) + " VGPRs\n";
+ }
+
+ return _cout;
+}
+
+bool
+SimplePoolManager::canAllocate(uint32_t numRegions, uint32_t size)
+{
+ assert(numRegions * minAllocatedElements(size) <= poolSize());
+
+ return _reservedGroups == 0;
+}
+
+void
+SimplePoolManager::freeRegion(uint32_t firstIdx, uint32_t lastIdx)
+{
+ assert(_reservedGroups > 0);
+ --_reservedGroups;
+
+ if (!_reservedGroups)
+ _nxtFreeIdx = 0;
+}
+
+uint32_t
+SimplePoolManager::allocateRegion(const uint32_t size,
+ uint32_t *reservedPoolSize)
+{
+ uint32_t actualSize = minAllocatedElements(size);
+ uint32_t startIdx = _nxtFreeIdx;
+ _nxtFreeIdx += actualSize;
+ _regionSize = actualSize;
+ assert(_nxtFreeIdx < poolSize());
+ *reservedPoolSize = actualSize;
+ ++_reservedGroups;
+
+ return startIdx;
+}
+
+uint32_t
+SimplePoolManager::regionSize(std::pair<uint32_t, uint32_t> &region)
+{
+ bool wrapAround = (region.first > region.second);
+ if (!wrapAround) {
+ return region.second - region.first + 1;
+ } else {
+ return region.second + poolSize() - region.first + 1;
+ }
+}
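
minAllocatedElements() rounds a request up to the next multiple of the minimum allocation unit, and allocateRegion() is then a bump allocator: it advances _nxtFreeIdx by the rounded size and tracks a single outstanding region that freeRegion() releases wholesale. A toy sketch of both behaviours (not the gem5 class):

#include <cassert>
#include <cstdint>

// Round "size" up to the next multiple of "minAlloc", as in
// SimplePoolManager::minAllocatedElements().
uint32_t roundUpToMin(uint32_t size, uint32_t minAlloc) {
    uint32_t rem = size % minAlloc;
    return rem ? size + (minAlloc - rem) : size;
}

// A stripped-down bump allocator with the same shape as SimplePoolManager:
// one outstanding region kind at a time, freed wholesale.
struct ToyPool {
    uint32_t minAlloc;
    uint32_t poolSz;
    uint32_t nxtFreeIdx;
    uint32_t reservedGroups;

    bool canAllocate() const { return reservedGroups == 0; }

    uint32_t allocate(uint32_t size, uint32_t *reserved) {
        uint32_t actual = roundUpToMin(size, minAlloc);
        uint32_t start = nxtFreeIdx;
        nxtFreeIdx += actual;
        assert(nxtFreeIdx <= poolSz);
        *reserved = actual;
        ++reservedGroups;
        return start;
    }

    void freeRegion() {
        assert(reservedGroups > 0);
        if (--reservedGroups == 0)
            nxtFreeIdx = 0;       // pool becomes empty again
    }
};

int main() {
    assert(roundUpToMin(5, 4) == 8);   // 5 VGPRs with minAllocation 4 -> 8
    assert(roundUpToMin(8, 4) == 8);   // already a multiple -> unchanged

    ToyPool pool{4, 64, 0, 0};
    uint32_t reserved = 0;
    uint32_t start = pool.allocate(5, &reserved);
    assert(start == 0 && reserved == 8 && !pool.canAllocate());
    pool.freeRegion();
    assert(pool.canAllocate() && pool.nxtFreeIdx == 0);
}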
diff --git a/src/gpu-compute/simple_pool_manager.hh b/src/gpu-compute/simple_pool_manager.hh
new file mode 100644
index 000000000..1d4174da8
--- /dev/null
+++ b/src/gpu-compute/simple_pool_manager.hh
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __SIMPLE_POOL_MANAGER_HH__
+#define __SIMPLE_POOL_MANAGER_HH__
+
+#include <cassert>
+#include <cstdint>
+
+#include "gpu-compute/pool_manager.hh"
+
+// Simple Pool Manager: allows one region per pool. No region merging is
+// supported.
+class SimplePoolManager : public PoolManager
+{
+ public:
+ SimplePoolManager(uint32_t minAlloc, uint32_t poolSize)
+ : PoolManager(minAlloc, poolSize), _regionSize(0), _nxtFreeIdx(0),
+ _reservedGroups(0)
+ {
+ }
+
+ uint32_t minAllocatedElements(uint32_t size);
+ std::string printRegion();
+ bool canAllocate(uint32_t numRegions, uint32_t size);
+ uint32_t allocateRegion(const uint32_t size, uint32_t *reservedPoolSize);
+ void freeRegion(uint32_t firstIdx, uint32_t lastIdx);
+ uint32_t regionSize(std::pair<uint32_t,uint32_t> &region);
+
+ private:
+ // actual size of a region (normalized to the minimum size that can
+ // be reserved)
+ uint32_t _regionSize;
+ // next index to allocate a region
+ uint8_t _nxtFreeIdx;
+ // number of groups that reserve a region
+ uint32_t _reservedGroups;
+};
+
+#endif // __SIMPLE_POOL_MANAGER_HH__
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc
new file mode 100644
index 000000000..835d7b740
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/tlb_coalescer.hh"
+
+#include <cstring>
+
+#include "debug/GPUTLB.hh"
+
+TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p),
+ clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle),
+ coalescingWindow(p->coalescingWindow),
+ disableCoalescing(p->disableCoalescing), probeTLBEvent(this),
+ cleanupEvent(this)
+{
+ // create the slave ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
+ cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
+ this, i));
+ }
+
+ // create the master ports based on the number of connected ports
+ for (size_t i = 0; i < p->port_master_connection_count; ++i) {
+ memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
+ this, i));
+ }
+}
+
+BaseSlavePort&
+TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
+{
+ if (if_name == "slave") {
+ if (idx >= static_cast<PortID>(cpuSidePort.size())) {
+ panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
+ }
+
+ return *cpuSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
+ }
+}
+
+BaseMasterPort&
+TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
+{
+ if (if_name == "master") {
+ if (idx >= static_cast<PortID>(memSidePort.size())) {
+ panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
+ }
+
+ return *memSidePort[idx];
+ } else {
+ panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
+ }
+}
+
+/*
+ * This method returns true if the <incoming_pkt>
+ * can be coalesced with <coalesced_pkt> and false otherwise.
+ * A given set of rules is checked.
+ * The rules can potentially be modified based on the TLB level.
+ */
+bool
+TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
+{
+ if (disableCoalescing)
+ return false;
+
+ TheISA::GpuTLB::TranslationState *incoming_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
+
+ TheISA::GpuTLB::TranslationState *coalesced_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
+
+ // Rule 1: Coalesce requests only if they
+ // fall within the same virtual page
+ Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
+ TheISA::PageBytes);
+
+ if (incoming_virt_page_addr != coalesced_virt_page_addr)
+ return false;
+
+    // Rule 2: Coalesce requests only if they
+    // share a TLB Mode, i.e., they are both read
+    // or both write requests.
+ BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
+ BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;
+
+ if (incoming_mode != coalesced_mode)
+ return false;
+
+ // when we can coalesce a packet update the reqCnt
+ // that is the number of packets represented by
+ // this coalesced packet
+ if (!incoming_state->prefetch)
+ coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
+
+ return true;
+}
+
+/*
+ * We need to update the physical addresses of all the translation requests
+ * that were coalesced into the one that just returned.
+ */
+void
+TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
+{
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+
+ DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
+ issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry;
+ assert(tlb_entry);
+ Addr first_entry_vaddr = tlb_entry->vaddr;
+ Addr first_entry_paddr = tlb_entry->paddr;
+ int page_size = tlb_entry->size();
+ bool uncacheable = tlb_entry->uncacheable;
+ int first_hit_level = sender_state->hitLevel;
+ bool valid = tlb_entry->valid;
+
+ // Get the physical page address of the translated request
+ // Using the page_size specified in the TLBEntry allows us
+ // to support different page sizes.
+ Addr phys_page_paddr = pkt->req->getPaddr();
+ phys_page_paddr &= ~(page_size - 1);
+
+ for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
+ PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(
+ local_pkt->senderState);
+
+ // we are sending the packet back, so pop the reqCnt associated
+        // with this level in the TLB hierarchy
+ if (!sender_state->prefetch)
+ sender_state->reqCnt.pop_back();
+
+ /*
+ * Only the first packet from this coalesced request has been
+ * translated. Grab the translated phys. page addr and update the
+ * physical addresses of the remaining packets with the appropriate
+ * page offsets.
+ */
+ if (i) {
+ Addr paddr = phys_page_paddr;
+ paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
+ local_pkt->req->setPaddr(paddr);
+
+ if (uncacheable)
+ local_pkt->req->setFlags(Request::UNCACHEABLE);
+
+ // update senderState->tlbEntry, so we can insert
+            // the correct TLBEntry in the TLBs above.
+ sender_state->tlbEntry =
+ new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr,
+ valid);
+
+ // update the hitLevel for all uncoalesced reqs
+ // so that each packet knows where it hit
+ // (used for statistics in the CUs)
+ sender_state->hitLevel = first_hit_level;
+ }
+
+ SlavePort *return_port = sender_state->ports.back();
+ sender_state->ports.pop_back();
+
+ // Translation is done - Convert to a response pkt if necessary and
+ // send the translation back
+ if (local_pkt->isRequest()) {
+ local_pkt->makeTimingResponse();
+ }
+
+ return_port->sendTimingResp(local_pkt);
+ }
+
+ // schedule clean up for end of this cycle
+ // This is a maximum priority event and must be on
+ // the same cycle as GPUTLB cleanup event to prevent
+ // race conditions with an IssueProbeEvent caused by
+ // MemSidePort::recvReqRetry
+ cleanupQueue.push(virt_page_addr);
+
+ if (!cleanupEvent.scheduled())
+ schedule(cleanupEvent, curTick());
+}
+
+// Receive translation requests, create a coalesced request,
+// and send them to the TLB (TLBProbesPerCycle)
+bool
+TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
+{
+ // first packet of a coalesced request
+ PacketPtr first_packet = nullptr;
+ // true if we are able to do coalescing
+ bool didCoalesce = false;
+ // number of coalesced reqs for a given window
+ int coalescedReq_cnt = 0;
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ // push back the port to remember the path back
+ sender_state->ports.push_back(this);
+
+ bool update_stats = !sender_state->prefetch;
+
+ if (update_stats) {
+ // if reqCnt is empty then this packet does not represent
+        // multiple uncoalesced reqs (pkts) but just a single pkt.
+ // If it does though then the reqCnt for each level in the
+ // hierarchy accumulates the total number of reqs this packet
+ // represents
+ int req_cnt = 1;
+
+ if (!sender_state->reqCnt.empty())
+ req_cnt = sender_state->reqCnt.back();
+
+ sender_state->reqCnt.push_back(req_cnt);
+
+ // update statistics
+ coalescer->uncoalescedAccesses++;
+ req_cnt = sender_state->reqCnt.back();
+ DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
+ coalescer->queuingCycles -= (curTick() * req_cnt);
+ coalescer->localqueuingCycles -= curTick();
+ }
+
+ // FIXME if you want to coalesce not based on the issueTime
+ // of the packets (i.e., from the compute unit's perspective)
+ // but based on when they reached this coalescer then
+ // remove the following if statement and use curTick() or
+ // coalescingWindow for the tick_index.
+ if (!sender_state->issueTime)
+ sender_state->issueTime = curTick();
+
+ // The tick index is used as a key to the coalescerFIFO hashmap.
+ // It is shared by all candidates that fall within the
+ // given coalescingWindow.
+ int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;
+
+ if (coalescer->coalescerFIFO.count(tick_index)) {
+ coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
+ }
+
+ // see if we can coalesce the incoming pkt with another
+ // coalesced request with the same tick_index
+ for (int i = 0; i < coalescedReq_cnt; ++i) {
+ first_packet = coalescer->coalescerFIFO[tick_index][i][0];
+
+ if (coalescer->canCoalesce(pkt, first_packet)) {
+ coalescer->coalescerFIFO[tick_index][i].push_back(pkt);
+
+ DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
+ i, tick_index,
+ coalescer->coalescerFIFO[tick_index][i].size());
+
+ didCoalesce = true;
+ break;
+ }
+ }
+
+ // if this is the first request for this tick_index
+ // or we did not manage to coalesce, update stats
+ // and make necessary allocations.
+ if (!coalescedReq_cnt || !didCoalesce) {
+ if (update_stats)
+ coalescer->coalescedAccesses++;
+
+ std::vector<PacketPtr> new_array;
+ new_array.push_back(pkt);
+ coalescer->coalescerFIFO[tick_index].push_back(new_array);
+
+ DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
+ "push\n", tick_index,
+ coalescer->coalescerFIFO[tick_index].size());
+ }
+
+    // schedule probeTLBEvent next cycle to send the
+    // coalesced requests to the TLB
+ if (!coalescer->probeTLBEvent.scheduled()) {
+ coalescer->schedule(coalescer->probeTLBEvent,
+ curTick() + coalescer->ticks(1));
+ }
+
+ return true;
+}
+
+void
+TLBCoalescer::CpuSidePort::recvReqRetry()
+{
+ assert(false);
+}
+
+void
+TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
+{
+
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ bool update_stats = !sender_state->prefetch;
+
+ if (update_stats)
+ coalescer->uncoalescedAccesses++;
+
+ // If there is a pending timing request for this virtual address
+ // print a warning message. This is a temporary caveat of
+ // the current simulator where atomic and timing requests can
+ // coexist. FIXME remove this check/warning in the future.
+ Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+ int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);
+
+ if (map_count) {
+ DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
+ "req. pending\n", virt_page_addr);
+ }
+
+ coalescer->memSidePort[0]->sendFunctional(pkt);
+}
+
+AddrRangeList
+TLBCoalescer::CpuSidePort::getAddrRanges() const
+{
+ // currently not checked by the master
+ AddrRangeList ranges;
+
+ return ranges;
+}
+
+bool
+TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
+{
+ // a translation completed and returned
+ coalescer->updatePhysAddresses(pkt);
+
+ return true;
+}
+
+void
+TLBCoalescer::MemSidePort::recvReqRetry()
+{
+    // we've received a retry. Schedule a probeTLBEvent
+ if (!coalescer->probeTLBEvent.scheduled())
+ coalescer->schedule(coalescer->probeTLBEvent,
+ curTick() + coalescer->ticks(1));
+}
+
+void
+TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
+{
+ fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
+}
+
+TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer)
+ : Event(CPU_Tick_Pri), coalescer(_coalescer)
+{
+}
+
+const char*
+TLBCoalescer::IssueProbeEvent::description() const
+{
+ return "Probe the TLB below";
+}
+
+/*
+ * Here we scan the coalescer FIFO and issue the max
+ * number of permitted probes to the TLB below. We
+ * permit bypassing of coalesced requests for the same
+ * tick_index.
+ *
+ * We do not access the next tick_index unless we've
+ * drained the previous one. The coalesced requests
+ * that are successfully sent are moved to the
+ * issuedTranslationsTable table (the table which keeps
+ * track of the outstanding reqs)
+ */
+void
+TLBCoalescer::IssueProbeEvent::process()
+{
+ // number of TLB probes sent so far
+ int sent_probes = 0;
+    // rejected denotes a blocking event: it is set to true either when
+    // the recvTiming of the TLB below returns false or when there is
+    // another outstanding request for the same virt. page.
+    bool rejected = false;
+
+ DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n");
+
+ for (auto iter = coalescer->coalescerFIFO.begin();
+ iter != coalescer->coalescerFIFO.end() && !rejected; ) {
+ int coalescedReq_cnt = iter->second.size();
+ int i = 0;
+ int vector_index = 0;
+
+ DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
+ coalescedReq_cnt, iter->first);
+
+ while (i < coalescedReq_cnt) {
+ ++i;
+ PacketPtr first_packet = iter->second[vector_index][0];
+
+ // compute virtual page address for this request
+ Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
+ TheISA::PageBytes);
+
+ // is there another outstanding request for the same page addr?
+ int pending_reqs =
+ coalescer->issuedTranslationsTable.count(virt_page_addr);
+
+ if (pending_reqs) {
+ DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
+ "page %#x\n", virt_page_addr);
+
+ ++vector_index;
+ rejected = true;
+
+ continue;
+ }
+
+ // send the coalesced request for virt_page_addr
+ if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) {
+ DPRINTF(GPUTLB, "Failed to send TLB request for page %#x",
+ virt_page_addr);
+
+ // No need for a retries queue since we are already buffering
+ // the coalesced request in coalescerFIFO.
+ rejected = true;
+ ++vector_index;
+ } else {
+ TheISA::GpuTLB::TranslationState *tmp_sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>
+ (first_packet->senderState);
+
+ bool update_stats = !tmp_sender_state->prefetch;
+
+ if (update_stats) {
+ // req_cnt is total number of packets represented
+ // by the one we just sent counting all the way from
+                    // the top of TLB hierarchy (i.e., from the CU)
+ int req_cnt = tmp_sender_state->reqCnt.back();
+ coalescer->queuingCycles += (curTick() * req_cnt);
+
+ DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
+ coalescer->name(), req_cnt);
+
+ // pkt_cnt is number of packets we coalesced into the one
+ // we just sent but only at this coalescer level
+ int pkt_cnt = iter->second[vector_index].size();
+ coalescer->localqueuingCycles += (curTick() * pkt_cnt);
+ }
+
+ DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x",
+ virt_page_addr);
+
+                // copy coalescedReq to issuedTranslationsTable
+ coalescer->issuedTranslationsTable[virt_page_addr]
+ = iter->second[vector_index];
+
+                // erase the entry of this coalesced req
+ iter->second.erase(iter->second.begin() + vector_index);
+
+ if (iter->second.empty())
+ assert(i == coalescedReq_cnt);
+
+ sent_probes++;
+ if (sent_probes == coalescer->TLBProbesPerCycle)
+ return;
+ }
+ }
+
+        // if there are no more coalesced reqs for this tick_index
+        // erase the hash map entry using the current iterator
+ if (iter->second.empty()) {
+ coalescer->coalescerFIFO.erase(iter++);
+ } else {
+ ++iter;
+ }
+ }
+}
+
+TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer)
+ : Event(Maximum_Pri), coalescer(_coalescer)
+{
+}
+
+const char*
+TLBCoalescer::CleanupEvent::description() const
+{
+ return "Cleanup issuedTranslationsTable hashmap";
+}
+
+void
+TLBCoalescer::CleanupEvent::process()
+{
+ while (!coalescer->cleanupQueue.empty()) {
+ Addr cleanup_addr = coalescer->cleanupQueue.front();
+ coalescer->cleanupQueue.pop();
+ coalescer->issuedTranslationsTable.erase(cleanup_addr);
+
+ DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
+ cleanup_addr);
+ }
+}
+
+void
+TLBCoalescer::regStats()
+{
+ uncoalescedAccesses
+ .name(name() + ".uncoalesced_accesses")
+ .desc("Number of uncoalesced TLB accesses")
+ ;
+
+ coalescedAccesses
+ .name(name() + ".coalesced_accesses")
+ .desc("Number of coalesced TLB accesses")
+ ;
+
+ queuingCycles
+ .name(name() + ".queuing_cycles")
+ .desc("Number of cycles spent in queue")
+ ;
+
+ localqueuingCycles
+ .name(name() + ".local_queuing_cycles")
+ .desc("Number of cycles spent in queue for all incoming reqs")
+ ;
+
+ localLatency
+ .name(name() + ".local_latency")
+ .desc("Avg. latency over all incoming pkts")
+ ;
+
+ localLatency = localqueuingCycles / uncoalescedAccesses;
+}
+
+
+TLBCoalescer*
+TLBCoalescerParams::create()
+{
+ return new TLBCoalescer(this);
+}
+
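
canCoalesce() admits a packet into an existing group only if it targets the same virtual page with the same translation mode, and CpuSidePort::recvTimingReq() buckets candidates by tick_index = issueTime / coalescingWindow. The sketch below reproduces just that grouping decision with simplified request records; the 4 KiB page size and the field names are assumptions of the sketch, not gem5 definitions.

#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

enum Mode { Read, Write };

struct Req {
    uint64_t vaddr;
    Mode mode;
    uint64_t issueTime;   // tick at which the compute unit issued it
};

constexpr uint64_t kPageBytes = 4096;                 // assumed page size
constexpr uint64_t pageOf(uint64_t va) { return va & ~(kPageBytes - 1); }

// Rule 1 + Rule 2 from TLBCoalescer::canCoalesce(): same virtual page,
// same access mode.
bool canCoalesce(const Req &a, const Req &b) {
    return pageOf(a.vaddr) == pageOf(b.vaddr) && a.mode == b.mode;
}

int main() {
    const uint64_t window = 8;   // coalescingWindow in ticks
    // tick_index buckets, each holding coalesced groups of requests
    std::map<int64_t, std::vector<std::vector<Req>>> fifo;

    std::vector<Req> incoming = {
        {0x1000, Read, 100}, {0x1f80, Read, 101},   // same 4 KiB page, same mode
        {0x1f80, Write, 102},                       // same page, different mode
        {0x9000, Read, 120},                        // later window, different page
    };

    for (const Req &r : incoming) {
        int64_t tickIndex = r.issueTime / window;
        auto &groups = fifo[tickIndex];
        bool merged = false;
        for (auto &g : groups) {
            if (canCoalesce(r, g.front())) { g.push_back(r); merged = true; break; }
        }
        if (!merged)
            groups.push_back({r});
    }

    assert(fifo[100 / window].size() == 2);   // one Read group (2 reqs) + one Write group
    assert(fifo[120 / window].size() == 1);   // the late request lands in its own bucket
}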
diff --git a/src/gpu-compute/tlb_coalescer.hh b/src/gpu-compute/tlb_coalescer.hh
new file mode 100644
index 000000000..09210148b
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.hh
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __TLB_COALESCER_HH__
+#define __TLB_COALESCER_HH__
+
+#include <list>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/isa.hh"
+#include "arch/isa_traits.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/gpu_tlb.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/TLBCoalescer.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+/**
+ * The TLBCoalescer is a MemObject sitting on the front side (CPUSide) of
+ * each TLB. It receives packets and issues coalesced requests to the
+ * TLB below it. It controls how requests are coalesced (the rules)
+ * and the permitted number of TLB probes per cycle (i.e., how many
+ * coalesced requests it feeds the TLB per cycle).
+ */
+class TLBCoalescer : public MemObject
+{
+ protected:
+ // TLB clock: inherits the shader's clock period, expressed as a
+ // number of ticks of the global simulation clock (i.e., curTick()).
+ // The assignment of the TLB clock from the shader clock is done in
+ // the python config files.
+ int clock;
+
+ public:
+ typedef TLBCoalescerParams Params;
+ TLBCoalescer(const Params *p);
+ ~TLBCoalescer() { }
+
+ // Number of TLB probes per cycle. Parameterizable - default 2.
+ int TLBProbesPerCycle;
+
+ // Consider coalescing requests that fall within that many ticks.
+ // Parameterizable - default 1.
+ int coalescingWindow;
+
+ // Each coalesced request consists of multiple packets
+ // that all fall within the same virtual page
+ typedef std::vector<PacketPtr> coalescedReq;
+
+ // disables coalescing when true
+ bool disableCoalescing;
+
+ /*
+ * This is a hash map with <tick_index> as a key.
+ * It contains a vector of coalescedReqs per <tick_index>.
+ * Requests are buffered here until they can be issued to
+ * the TLB, at which point they are copied to the
+ * issuedTranslationsTable hash map.
+ *
+ * In terms of coalescing, we coalesce requests in a given
+ * window of x cycles by using tick_index = issueTime/x as a
+ * key, where x = coalescingWindow. issueTime is the issueTime
+ * of the pkt from the ComputeUnit's perspective, but another
+ * option is to change it to curTick(), so we coalesce based
+ * on the receive time.
+ */
+ typedef std::unordered_map<int64_t, std::vector<coalescedReq>> CoalescingFIFO;
+
+ CoalescingFIFO coalescerFIFO;
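+
+ // Illustrative example (hypothetical numbers; pkt_issue_time and
+ // my_coalesced_req are placeholder names): with coalescingWindow == 4,
+ // packets issued at ticks 8 and 11 both map to tick_index
+ // 8/4 == 11/4 == 2 and are buffered in the same vector entry, whereas
+ // a packet issued at tick 16 (tick_index 4) starts a new entry:
+ //
+ //     int64_t tick_index = pkt_issue_time / coalescingWindow;
+ //     coalescerFIFO[tick_index].push_back(my_coalesced_req);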
+
+ /*
+ * issuedTranslationsTable: a hash_map indexed by virtual page
+ * address. Each hash_map entry has a vector of PacketPtr associated
+ * with it denoting the different packets that share an outstanding
+ * coalesced translation request for the same virtual page.
+ *
+ * The rules that determine which requests we can coalesce are
+ * specified in the canCoalesce() method.
+ */
+ typedef std::unordered_map<Addr, coalescedReq> CoalescingTable;
+
+ CoalescingTable issuedTranslationsTable;
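+
+ // Assumed response flow (sketch, not verbatim from the .cc file): when
+ // the translation for virt_page_addr comes back, every PacketPtr in
+ // issuedTranslationsTable[virt_page_addr] is given the same physical
+ // page and completed, and the entry is then queued for cleanup via
+ // cleanupQueue/cleanupEvent.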
+
+ // number of packets the coalescer receives
+ Stats::Scalar uncoalescedAccesses;
+ // number of packets the coalescer sends to the TLB
+ Stats::Scalar coalescedAccesses;
+
+ // Number of cycles the coalesced requests spend waiting in the
+ // coalescerFIFO. For each packet the coalescer receives, we account
+ // for the number of uncoalesced requests that packet "represents"
+ Stats::Scalar queuingCycles;
+
+ // On average, how long does a request counted in
+ // uncoalescedAccesses wait before it reaches the TLB?
+ Stats::Scalar localqueuingCycles;
+ // localqueuingCycles/uncoalescedAccesses
+ Stats::Formula localLatency;
+
+ bool canCoalesce(PacketPtr pkt1, PacketPtr pkt2);
+ void updatePhysAddresses(PacketPtr pkt);
+ void regStats();
+
+ // Clock related functions. Maps to-and-from
+ // Simulation ticks and object clocks.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+ Tick ticks(int numCycles) const { return (Tick)clock * numCycles; }
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock;}
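+
+ // Worked example (hypothetical clock of 500 ticks per cycle):
+ // ticks(4) == 2000, curCycle() at tick 10000 == 20,
+ // tickToCycles(2000) == 4, and frequency() == SimClock::Frequency / 500.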
+
+ class CpuSidePort : public SlavePort
+ {
+ public:
+ CpuSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
+ PortID _index)
+ : SlavePort(_name, tlb_coalescer), coalescer(tlb_coalescer),
+ index(_index) { }
+
+ protected:
+ TLBCoalescer *coalescer;
+ int index;
+
+ virtual bool recvTimingReq(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ recvRespRetry()
+ {
+ fatal("recvRespRetry() is not implemented in the TLB coalescer.\n");
+ }
+
+ virtual AddrRangeList getAddrRanges() const;
+ };
+
+ class MemSidePort : public MasterPort
+ {
+ public:
+ MemSidePort(const std::string &_name, TLBCoalescer *tlb_coalescer,
+ PortID _index)
+ : MasterPort(_name, tlb_coalescer), coalescer(tlb_coalescer),
+ index(_index) { }
+
+ std::deque<PacketPtr> retries;
+
+ protected:
+ TLBCoalescer *coalescer;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+
+ virtual void
+ recvRespRetry()
+ {
+ fatal("recvRespRetry() not implemented in TLB coalescer");
+ }
+ };
+
+ // Coalescer slave ports on the CPU side
+ std::vector<CpuSidePort*> cpuSidePort;
+ // Coalescer master ports on the memory side
+ std::vector<MemSidePort*> memSidePort;
+
+ BaseMasterPort& getMasterPort(const std::string &if_name, PortID idx);
+ BaseSlavePort& getSlavePort(const std::string &if_name, PortID idx);
+
+ class IssueProbeEvent : public Event
+ {
+ private:
+ TLBCoalescer *coalescer;
+
+ public:
+ IssueProbeEvent(TLBCoalescer *_coalescer);
+ void process();
+ const char *description() const;
+ };
+
+ // this event issues the TLB probes
+ IssueProbeEvent probeTLBEvent;
+
+ // the cleanupEvent is scheduled after a TLBEvent triggers
+ // in order to free memory and do the required clean-up
+ class CleanupEvent : public Event
+ {
+ private:
+ TLBCoalescer *coalescer;
+
+ public:
+ CleanupEvent(TLBCoalescer *_coalescer);
+ void process();
+ const char* description() const;
+ };
+
+ // schedule cleanup
+ CleanupEvent cleanupEvent;
+
+ // this FIFO queue keeps track of the virt. page
+ // addresses that are pending cleanup
+ std::queue<Addr> cleanupQueue;
+};
+
+#endif // __TLB_COALESCER_HH__
diff --git a/src/gpu-compute/vector_register_file.cc b/src/gpu-compute/vector_register_file.cc
new file mode 100644
index 000000000..8b7dc0691
--- /dev/null
+++ b/src/gpu-compute/vector_register_file.cc
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/vector_register_file.hh"
+
+#include <string>
+
+#include "base/misc.hh"
+#include "gpu-compute/code_enums.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/simple_pool_manager.hh"
+#include "gpu-compute/wavefront.hh"
+#include "params/VectorRegisterFile.hh"
+
+VectorRegisterFile::VectorRegisterFile(const VectorRegisterFileParams *p)
+ : SimObject(p),
+ manager(new SimplePoolManager(p->min_alloc, p->num_regs_per_simd)),
+ simdId(p->simd_id), numRegsPerSimd(p->num_regs_per_simd),
+ vgprState(new VecRegisterState())
+{
+ fatal_if(numRegsPerSimd % 2, "VRF size is illegal\n");
+ fatal_if(simdId < 0, "Illegal SIMD id for VRF");
+
+ fatal_if(numRegsPerSimd % p->min_alloc, "Min VGPR region allocation is not "
+ "multiple of VRF size\n");
+
+ busy.clear();
+ busy.resize(numRegsPerSimd, 0);
+ nxtBusy.clear();
+ nxtBusy.resize(numRegsPerSimd, 0);
+
+ vgprState->init(numRegsPerSimd);
+}
+
+void
+VectorRegisterFile::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+ vgprState->setParent(computeUnit);
+}
+
+uint8_t
+VectorRegisterFile::regNxtBusy(int idx, uint32_t operandSize) const
+{
+ uint8_t status = nxtBusy.at(idx);
+
+ if (operandSize > 4) {
+ status = status | (nxtBusy.at((idx + 1) % numRegs()));
+ }
+
+ return status;
+}
+
+uint8_t
+VectorRegisterFile::regBusy(int idx, uint32_t operandSize) const
+{
+ uint8_t status = busy.at(idx);
+
+ if (operandSize > 4) {
+ status = status | (busy.at((idx + 1) % numRegs()));
+ }
+
+ return status;
+}
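+
+// Note on operand sizes (illustrative; the 16-register bank is
+// hypothetical): a 64-bit operand occupies two consecutive VGPRs, so a
+// regBusy()/regNxtBusy() query for idx 7 with operandSize 8 in a
+// 16-register bank also checks register (7 + 1) % 16 == 8.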
+
+void
+VectorRegisterFile::preMarkReg(int regIdx, uint32_t operandSize, uint8_t value)
+{
+ nxtBusy.at(regIdx) = value;
+
+ if (operandSize > 4) {
+ nxtBusy.at((regIdx + 1) % numRegs()) = value;
+ }
+}
+
+void
+VectorRegisterFile::markReg(int regIdx, uint32_t operandSize, uint8_t value)
+{
+ busy.at(regIdx) = value;
+
+ if (operandSize > 4) {
+ busy.at((regIdx + 1) % numRegs()) = value;
+ }
+}
+
+bool
+VectorRegisterFile::operandsReady(Wavefront *w, GPUDynInstPtr ii) const
+{
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isVectorRegister(i)) {
+ uint32_t vgprIdx = ii->getRegisterIndex(i);
+ uint32_t pVgpr = w->remap(vgprIdx, ii->getOperandSize(i), 1);
+
+ if (regBusy(pVgpr, ii->getOperandSize(i)) == 1) {
+ if (ii->isDstOperand(i)) {
+ w->numTimesBlockedDueWAXDependencies++;
+ } else if (ii->isSrcOperand(i)) {
+ w->numTimesBlockedDueRAWDependencies++;
+ }
+
+ return false;
+ }
+
+ if (regNxtBusy(pVgpr, ii->getOperandSize(i)) == 1) {
+ if (ii->isDstOperand(i)) {
+ w->numTimesBlockedDueWAXDependencies++;
+ } else if (ii->isSrcOperand(i)) {
+ w->numTimesBlockedDueRAWDependencies++;
+ }
+
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+void
+VectorRegisterFile::exec(GPUDynInstPtr ii, Wavefront *w)
+{
+ bool loadInstr = IS_OT_READ(ii->opType());
+ bool atomicInstr = IS_OT_ATOMIC(ii->opType());
+
+ bool loadNoArgInstr = loadInstr && !ii->isArgLoad();
+
+ // iterate over all register destination operands
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
+ uint32_t physReg = w->remap(ii->getRegisterIndex(i),
+ ii->getOperandSize(i), 1);
+
+ // mark the destination vector register as busy
+ markReg(physReg, ii->getOperandSize(i), 1);
+ // clear the in-flight status of the destination vector register
+ preMarkReg(physReg, ii->getOperandSize(i), 0);
+
+ // FIXME: if we ever model correct timing behavior
+ // for load argument instructions then we should not
+ // set the destination register as busy now but when
+ // the data returns. Loads and Atomics should free
+ // their destination registers when the data returns,
+ // not now
+ if (!atomicInstr && !loadNoArgInstr) {
+ uint32_t pipeLen = ii->getOperandSize(i) <= 4 ?
+ computeUnit->spBypassLength() :
+ computeUnit->dpBypassLength();
+
+ // schedule an event for marking the register as ready
+ computeUnit->registerEvent(w->simdId, physReg,
+ ii->getOperandSize(i),
+ computeUnit->shader->tick_cnt +
+ computeUnit->shader->ticks(pipeLen),
+ 0);
+ }
+ }
+ }
+}
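+
+// Timeline sketch (assumed behavior): a 32-bit ALU destination is marked
+// busy here and freed by a registerEvent() spBypassLength() shader cycles
+// later; 64-bit destinations use dpBypassLength() instead. Loads and
+// atomics keep their destinations busy until the data returns (see the
+// FIXME above).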
+
+int
+VectorRegisterFile::exec(uint64_t dynamic_id, Wavefront *w,
+ std::vector<uint32_t> &regVec, uint32_t operandSize,
+ uint64_t timestamp)
+{
+ int delay = 0;
+
+ panic_if(regVec.size() <= 0, "Illegal VGPR vector size=%d\n",
+ regVec.size());
+
+ for (int i = 0; i < regVec.size(); ++i) {
+ // mark the destination VGPR as free when the timestamp expires
+ computeUnit->registerEvent(w->simdId, regVec[i], operandSize,
+ computeUnit->shader->tick_cnt + timestamp +
+ computeUnit->shader->ticks(delay), 0);
+ }
+
+ return delay;
+}
+
+void
+VectorRegisterFile::updateResources(Wavefront *w, GPUDynInstPtr ii)
+{
+ // iterate over all register destination operands
+ for (int i = 0; i < ii->getNumOperands(); ++i) {
+ if (ii->isVectorRegister(i) && ii->isDstOperand(i)) {
+ uint32_t physReg = w->remap(ii->getRegisterIndex(i),
+ ii->getOperandSize(i), 1);
+ // set the in-flight status of the destination vector register
+ preMarkReg(physReg, ii->getOperandSize(i), 1);
+ }
+ }
+}
+
+bool
+VectorRegisterFile::vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
+ GPUDynInstPtr ii,
+ VrfAccessType accessType)
+{
+ bool ready = true;
+
+ return ready;
+}
+
+bool
+VectorRegisterFile::vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
+ VrfAccessType accessType)
+{
+ bool ready = true;
+
+ return ready;
+}
+
+VectorRegisterFile*
+VectorRegisterFileParams::create()
+{
+ return new VectorRegisterFile(this);
+}
diff --git a/src/gpu-compute/vector_register_file.hh b/src/gpu-compute/vector_register_file.hh
new file mode 100644
index 000000000..1cb011a1e
--- /dev/null
+++ b/src/gpu-compute/vector_register_file.hh
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __VECTOR_REGISTER_FILE_HH__
+#define __VECTOR_REGISTER_FILE_HH__
+
+#include <list>
+
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "gpu-compute/vector_register_state.hh"
+#include "sim/sim_object.hh"
+
+class ComputeUnit;
+class Shader;
+class SimplePoolManager;
+class Wavefront;
+
+struct VectorRegisterFileParams;
+
+enum class VrfAccessType : uint8_t
+{
+ READ = 0x01,
+ WRITE = 0x02,
+ RD_WR = READ | WRITE
+};
+
+// Vector Register File
+class VectorRegisterFile : public SimObject
+{
+ public:
+ VectorRegisterFile(const VectorRegisterFileParams *p);
+
+ void setParent(ComputeUnit *_computeUnit);
+
+ // Read a register
+ template<typename T>
+ T
+ read(int regIdx, int threadId=0)
+ {
+ T p0 = vgprState->read<T>(regIdx, threadId);
+
+ return p0;
+ }
+
+ // Write a register
+ template<typename T>
+ void
+ write(int regIdx, T value, int threadId=0)
+ {
+ vgprState->write<T>(regIdx, value, threadId);
+ }
+
+ uint8_t regBusy(int idx, uint32_t operandSize) const;
+ uint8_t regNxtBusy(int idx, uint32_t operandSize) const;
+
+ int numRegs() const { return numRegsPerSimd; }
+
+ void markReg(int regIdx, uint32_t operandSize, uint8_t value);
+ void preMarkReg(int regIdx, uint32_t operandSize, uint8_t value);
+
+ virtual void exec(GPUDynInstPtr ii, Wavefront *w);
+
+ virtual int exec(uint64_t dynamic_id, Wavefront *w,
+ std::vector<uint32_t> &regVec, uint32_t operandSize,
+ uint64_t timestamp);
+
+ bool operandsReady(Wavefront *w, GPUDynInstPtr ii) const;
+ virtual void updateEvents() { }
+ virtual void updateResources(Wavefront *w, GPUDynInstPtr ii);
+
+ virtual bool
+ isReadConflict(int memWfId, int exeWfId) const
+ {
+ return false;
+ }
+
+ virtual bool
+ isWriteConflict(int memWfId, int exeWfId) const
+ {
+ return false;
+ }
+
+ virtual bool vrfOperandAccessReady(uint64_t dynamic_id, Wavefront *w,
+ GPUDynInstPtr ii,
+ VrfAccessType accessType);
+
+ virtual bool vrfOperandAccessReady(Wavefront *w, GPUDynInstPtr ii,
+ VrfAccessType accessType);
+
+ SimplePoolManager *manager;
+
+ protected:
+ ComputeUnit* computeUnit;
+ int simdId;
+
+ // flag indicating if a register is busy
+ std::vector<uint8_t> busy;
+ // flag indicating if a register will be busy (by instructions
+ // in the SIMD pipeline)
+ std::vector<uint8_t> nxtBusy;
+
+ // number of registers (bank size) per SIMD unit (bank)
+ int numRegsPerSimd;
+
+ // vector register state
+ VecRegisterState *vgprState;
+};
+
+#endif // __VECTOR_REGISTER_FILE_HH__
diff --git a/src/gpu-compute/vector_register_state.cc b/src/gpu-compute/vector_register_state.cc
new file mode 100644
index 000000000..f231b0579
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#include "gpu-compute/vector_register_state.hh"
+
+#include "gpu-compute/compute_unit.hh"
+
+VecRegisterState::VecRegisterState() : computeUnit(nullptr)
+{
+ s_reg.clear();
+ d_reg.clear();
+}
+
+void
+VecRegisterState::setParent(ComputeUnit *_computeUnit)
+{
+ computeUnit = _computeUnit;
+ _name = computeUnit->name() + ".VecRegState";
+}
+
+void
+VecRegisterState::init(uint32_t _size)
+{
+ s_reg.resize(_size);
+ d_reg.resize(_size);
+}
diff --git a/src/gpu-compute/vector_register_state.hh b/src/gpu-compute/vector_register_state.hh
new file mode 100644
index 000000000..a233b9acc
--- /dev/null
+++ b/src/gpu-compute/vector_register_state.hh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos
+ */
+
+#ifndef __VECTOR_REGISTER_STATE_HH__
+#define __VECTOR_REGISTER_STATE_HH__
+
+#include <array>
+#include <cassert>
+#include <string>
+#include <vector>
+
+#include "gpu-compute/misc.hh"
+
+class ComputeUnit;
+
+// Vector Register State per SIMD unit (contents of the vector
+// registers in the VRF of the SIMD)
+class VecRegisterState
+{
+ public:
+ VecRegisterState();
+ void init(uint32_t _size);
+
+ const std::string& name() const { return _name; }
+ void setParent(ComputeUnit *_computeUnit);
+ void regStats() { }
+
+ // Access methods
+ template<typename T>
+ T
+ read(int regIdx, int threadId=0) {
+ T *p0;
+ assert(sizeof(T) == 4 || sizeof(T) == 8);
+ if (sizeof(T) == 4) {
+ p0 = (T*)(&s_reg[regIdx][threadId]);
+ } else {
+ p0 = (T*)(&d_reg[regIdx][threadId]);
+ }
+
+ return *p0;
+ }
+
+ template<typename T>
+ void
+ write(unsigned int regIdx, T value, int threadId=0) {
+ T *p0;
+ assert(sizeof(T) == 4 || sizeof(T) == 8);
+ if (sizeof(T) == 4) {
+ p0 = (T*)(&s_reg[regIdx][threadId]);
+ } else {
+ p0 = (T*)(&d_reg[regIdx][threadId]);
+ }
+
+ *p0 = value;
+ }
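+
+ // Usage sketch (illustrative): sizeof(T) selects the backing store, e.g.
+ //     vgprState->write<uint64_t>(regIdx, val, lane); // goes to d_reg
+ //     uint32_t v = vgprState->read<uint32_t>(regIdx, lane); // from s_reg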
+
+ // (Single Precision) Vector Register File size.
+ int regSize() { return s_reg.size(); }
+
+ private:
+ ComputeUnit *computeUnit;
+ std::string _name;
+ // 32-bit Single Precision Vector Register State
+ std::vector<std::array<uint32_t, VSZ>> s_reg;
+ // 64-bit Double Precision Vector Register State
+ std::vector<std::array<uint64_t, VSZ>> d_reg;
+};
+
+#endif // __VECTOR_REGISTER_STATE_HH__
diff --git a/src/gpu-compute/wavefront.cc b/src/gpu-compute/wavefront.cc
new file mode 100644
index 000000000..0aa033db1
--- /dev/null
+++ b/src/gpu-compute/wavefront.cc
@@ -0,0 +1,925 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/wavefront.hh"
+
+#include "debug/GPUExec.hh"
+#include "debug/WavefrontStack.hh"
+#include "gpu-compute/code_enums.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "gpu-compute/gpu_dyn_inst.hh"
+#include "gpu-compute/shader.hh"
+#include "gpu-compute/vector_register_file.hh"
+
+Wavefront*
+WavefrontParams::create()
+{
+ return new Wavefront(this);
+}
+
+Wavefront::Wavefront(const Params *p)
+ : SimObject(p), callArgMem(nullptr)
+{
+ last_trace = 0;
+ simdId = p->simdId;
+ wfSlotId = p->wf_slot_id;
+
+ status = S_STOPPED;
+ reservedVectorRegs = 0;
+ startVgprIndex = 0;
+ outstanding_reqs = 0;
+ mem_reqs_in_pipe = 0;
+ outstanding_reqs_wr_gm = 0;
+ outstanding_reqs_wr_lm = 0;
+ outstanding_reqs_rd_gm = 0;
+ outstanding_reqs_rd_lm = 0;
+ rd_lm_reqs_in_pipe = 0;
+ rd_gm_reqs_in_pipe = 0;
+ wr_lm_reqs_in_pipe = 0;
+ wr_gm_reqs_in_pipe = 0;
+
+ barrier_cnt = 0;
+ old_barrier_cnt = 0;
+ stalledAtBarrier = false;
+
+ mem_trace_busy = 0;
+ old_vgpr_tcnt = 0xffffffffffffffffll;
+ old_dgpr_tcnt = 0xffffffffffffffffll;
+
+ pendingFetch = false;
+ dropFetch = false;
+ condRegState = new ConditionRegisterState();
+ maxSpVgprs = 0;
+ maxDpVgprs = 0;
+}
+
+void
+Wavefront::regStats()
+{
+ srcRegOpDist
+ .init(0, 4, 2)
+ .name(name() + ".src_reg_operand_dist")
+ .desc("number of executed instructions with N source register operands")
+ ;
+
+ dstRegOpDist
+ .init(0, 3, 2)
+ .name(name() + ".dst_reg_operand_dist")
+ .desc("number of executed instructions with N destination register "
+ "operands")
+ ;
+
+ // FIXME: the name of the WF needs to be unique
+ numTimesBlockedDueWAXDependencies
+ .name(name() + ".timesBlockedDueWAXDependencies")
+ .desc("number of times the wf's instructions are blocked due to WAW "
+ "or WAR dependencies")
+ ;
+
+ // FIXME: the name of the WF needs to be unique
+ numTimesBlockedDueRAWDependencies
+ .name(name() + ".timesBlockedDueRAWDependencies")
+ .desc("number of times the wf's instructions are blocked due to RAW "
+ "dependencies")
+ ;
+
+ // FIXME: the name of the WF needs to be unique
+ numTimesBlockedDueVrfPortAvail
+ .name(name() + ".timesBlockedDueVrfPortAvail")
+ .desc("number of times instructions are blocked due to VRF port "
+ "availability")
+ ;
+}
+
+void
+Wavefront::init()
+{
+ reservedVectorRegs = 0;
+ startVgprIndex = 0;
+}
+
+void
+Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
+{
+ condRegState->init(num_cregs);
+ maxSpVgprs = num_sregs;
+ maxDpVgprs = num_dregs;
+}
+
+Wavefront::~Wavefront()
+{
+ if (callArgMem)
+ delete callArgMem;
+}
+
+void
+Wavefront::start(uint64_t _wfDynId, uint64_t _base_ptr)
+{
+ wfDynId = _wfDynId;
+ base_ptr = _base_ptr;
+ status = S_RUNNING;
+}
+
+bool
+Wavefront::isGmInstruction(GPUDynInstPtr ii)
+{
+ if (IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
+ IS_OT_ATOMIC_PM(ii->opType())) {
+ return true;
+ }
+
+ if (IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
+ IS_OT_ATOMIC_GM(ii->opType())) {
+
+ return true;
+ }
+
+ if (IS_OT_FLAT(ii->opType())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isLmInstruction(GPUDynInstPtr ii)
+{
+ if (IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
+ IS_OT_ATOMIC_LM(ii->opType())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstALU()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
+ ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
+ ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+ ii->opType() == Enums::OT_KERN_READ)) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstBarrier()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstGMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
+ IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstLMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
+ IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstPrivMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
+ IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+Wavefront::isOldestInstFlatMem()
+{
+ assert(!instructionBuffer.empty());
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
+
+ return true;
+ }
+
+ return false;
+}
+
+// Return true if the Wavefront's instruction
+// buffer contains a branch instruction.
+bool
+Wavefront::instructionBufferHasBranch()
+{
+ for (auto it : instructionBuffer) {
+ GPUDynInstPtr ii = it;
+
+ if (ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Remap an HSAIL register to a physical VGPR.
+// HSAIL register = virtual register assigned to an operand by the HLC compiler
+uint32_t
+Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
+{
+ assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
+ // add the offset from where the VGPRs of the wavefront have been assigned
+ uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
+ // HSAIL double precision (DP) register: calculate the physical VGPR index
+ // assuming that DP registers are placed after SP ones in the VRF. The DP
+ // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
+ // the DP VGPR index before mapping it to the physical VRF address space
+ if (mode == 1 && size > 4) {
+ physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
+ }
+
+ assert((startVgprIndex <= physicalVgprIndex) &&
+ (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
+
+ // calculate absolute physical VGPR index
+ return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
+}
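+
+// Worked example (hypothetical values): with startVgprIndex == 32,
+// maxSpVgprs == 8 and a 256-register VRF, SP register $s3 maps to
+// physical VGPR (32 + 3) % 256 == 35, while DP register $d3 (mode == 1,
+// size 8) maps to (32 + 8 + 2 * 3) % 256 == 46.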
+
+// Return true if this wavefront is ready
+// to execute an instruction of the specified type.
+int
+Wavefront::ready(itype_e type)
+{
+ // Check to make sure wave is running
+ if (status == S_STOPPED || status == S_RETURNING ||
+ instructionBuffer.empty()) {
+ return 0;
+ }
+
+ // Is the wave waiting at a barrier
+ if (stalledAtBarrier) {
+ if (!computeUnit->AllAtBarrier(barrier_id, barrier_cnt,
+ computeUnit->getRefCounter(dispatchid, wg_id))) {
+ // Are all threads at barrier?
+ return 0;
+ }
+ old_barrier_cnt = barrier_cnt;
+ stalledAtBarrier = false;
+ }
+
+ // Read instruction
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ bool ready_inst M5_VAR_USED = false;
+ bool glbMemBusRdy = false;
+ bool glbMemIssueRdy = false;
+ if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
+ for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
+ if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
+ glbMemBusRdy = true;
+ if (computeUnit->wfWait[j].prerdy())
+ glbMemIssueRdy = true;
+ }
+ }
+ bool locMemBusRdy = false;
+ bool locMemIssueRdy = false;
+ if (type == I_SHARED) {
+ for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
+ if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
+ locMemBusRdy = true;
+ if (computeUnit->wfWait[j].prerdy())
+ locMemIssueRdy = true;
+ }
+ }
+
+ // The following code is very error prone and the entire process for
+ // checking readiness will be fixed eventually. In the meantime, let's
+ // make sure that we do not silently let an instruction type slip
+ // through this logic and always return not ready.
+ if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
+ ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
+ ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG ||
+ IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
+ IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
+ IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
+ IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
+ IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
+ panic("next instruction: %s is of unknown type\n", ii->disassemble());
+ }
+
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
+ computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
+
+ if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
+ // Here for ALU instruction (barrier)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ // Are there in pipe or outstanding memory requests?
+ if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
+ return 0;
+ }
+
+ ready_inst = true;
+ } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
+ // Here for ALU instruction (nop)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ ready_inst = true;
+ } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
+ // Here for ALU instruction (return)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ // Are there in pipe or outstanding memory requests?
+ if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
+ return 0;
+ }
+
+ ready_inst = true;
+ } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
+ ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG)) {
+ // Here for ALU instruction (all others)
+ if (!computeUnit->wfWait[simdId].prerdy()) {
+ // Is alu slot free?
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
+ IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
+ // Here Global memory instruction
+ if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
+ // Are there in pipe or outstanding global memory write requests?
+ if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
+ IS_OT_HIST_GM(ii->opType())) {
+ // Are there in pipe or outstanding global memory read requests?
+ if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
+ return 0;
+ }
+
+ if (!glbMemIssueRdy) {
+ // Is WV issue slot free?
+ return 0;
+ }
+
+ if (!glbMemBusRdy) {
+ // Is there an available VRF->Global memory read bus?
+ return 0;
+ }
+
+ if (!computeUnit->globalMemoryPipe.
+ isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+ // Can we insert a new request to the Global Mem Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
+ IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
+ // Here for Shared memory instruction
+ if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
+ if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
+ IS_OT_HIST_LM(ii->opType())) {
+ if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (!locMemBusRdy) {
+ // Is there an available VRF->LDS read bus?
+ return 0;
+ }
+ if (!locMemIssueRdy) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ if (!computeUnit->localMemoryPipe.
+ isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
+ // Can we insert a new request to the LDS Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
+ IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
+ // Here for Private memory instruction ------------------------ //
+ if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
+ if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
+ IS_OT_HIST_PM(ii->opType())) {
+ if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
+ return 0;
+ }
+ }
+
+ if (!glbMemBusRdy) {
+ // Is there an available VRF->Global memory read bus?
+ return 0;
+ }
+
+ if (!glbMemIssueRdy) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ if (!computeUnit->globalMemoryPipe.
+ isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+ // Can we insert a new request to the Global Mem Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
+ if (!glbMemBusRdy) {
+ // Is there an available VRF->Global memory read bus?
+ return 0;
+ }
+
+ if (!locMemBusRdy) {
+ // Is there an available VRF->LDS read bus?
+ return 0;
+ }
+
+ if (!glbMemIssueRdy) {
+ // Is wave slot free?
+ return 0;
+ }
+
+ if (!locMemIssueRdy) {
+ return 0;
+ }
+ if (!computeUnit->globalMemoryPipe.
+ isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
+ // Can we insert a new request to the Global Mem Request FIFO?
+ return 0;
+ }
+
+ if (!computeUnit->localMemoryPipe.
+ isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
+ // Can we insert a new request to the LDS Request FIFO?
+ return 0;
+ }
+ // can we schedule source & destination operands on the VRF?
+ if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
+ VrfAccessType::RD_WR)) {
+ return 0;
+ }
+ // are all the operands ready? (RAW, WAW and WAR dependencies met?)
+ if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
+ return 0;
+ }
+ ready_inst = true;
+ } else {
+ return 0;
+ }
+
+ assert(ready_inst);
+
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
+ simdId, wfSlotId, ii->disassemble());
+
+ return 1;
+}
+
+void
+Wavefront::updateResources()
+{
+ // Get current instruction
+ GPUDynInstPtr ii = instructionBuffer.front();
+ assert(ii);
+ computeUnit->vrf[simdId]->updateResources(this, ii);
+ // Single precision ALU or Branch or Return or Special instruction
+ if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
+ ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+ // FIXME: Kernel argument loads are currently treated as ALU operations
+ // since we don't send memory packets at execution. If we fix that then
+ // we should map them to one of the memory pipelines
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG ||
+ ii->opType() == Enums::OT_RET) {
+ computeUnit->aluPipe[simdId].preset(computeUnit->shader->
+ ticks(computeUnit->spBypassLength()));
+ // this is to enforce a fixed number of cycles per issue slot per SIMD
+ computeUnit->wfWait[simdId].preset(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_BARRIER) {
+ computeUnit->wfWait[simdId].preset(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_FLAT_READ) {
+ assert(Enums::SC_NONE != ii->executedAs());
+ mem_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+ assert(Enums::SC_NONE != ii->executedAs());
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (IS_OT_READ_GM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_GM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_READ_LM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ rd_lm_reqs_in_pipe++;
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_LM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_lm_reqs_in_pipe++;
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_lm_reqs_in_pipe++;
+ rd_lm_reqs_in_pipe++;
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_READ_PM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_PM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_PM(ii->opType())) {
+ mem_reqs_in_pipe++;
+ wr_gm_reqs_in_pipe++;
+ rd_gm_reqs_in_pipe++;
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ preset(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+}
+
+void
+Wavefront::exec()
+{
+ // ---- Exit if wavefront is inactive ----------------------------- //
+
+ if (status == S_STOPPED || status == S_RETURNING ||
+ instructionBuffer.empty()) {
+ return;
+ }
+
+ // Get current instruction
+
+ GPUDynInstPtr ii = instructionBuffer.front();
+
+ const uint32_t old_pc = pc();
+ DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
+ "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+ ii->disassemble(), old_pc);
+ ii->execute();
+ // access the VRF
+ computeUnit->vrf[simdId]->exec(ii, this);
+ srcRegOpDist.sample(ii->numSrcRegOperands());
+ dstRegOpDist.sample(ii->numDstRegOperands());
+ computeUnit->numInstrExecuted++;
+ computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
+ computeUnit->lastExecCycle[simdId]);
+ computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
+ if (pc() == old_pc) {
+ uint32_t new_pc = old_pc + 1;
+ // PC not modified by instruction, proceed to next or pop frame
+ pc(new_pc);
+ if (new_pc == rpc()) {
+ popFromReconvergenceStack();
+ discardFetch();
+ } else {
+ instructionBuffer.pop_front();
+ }
+ }
+
+ if (computeUnit->shader->hsail_mode == Shader::SIMT) {
+ const int num_active_lanes = execMask().count();
+ computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
+ computeUnit->numVecOpsExecuted += num_active_lanes;
+ if (isGmInstruction(ii)) {
+ computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
+ } else if (isLmInstruction(ii)) {
+ computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
+ }
+ }
+
+ // ---- Update Vector ALU pipeline and other resources ------------------ //
+ // Single precision ALU or Branch or Return or Special instruction
+ if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
+ ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
+ // FIXME: Kernel argument loads are currently treated as ALU operations
+ // since we don't send memory packets at execution. If we fix that then
+ // we should map them to one of the memory pipelines
+ ii->opType() == Enums::OT_KERN_READ ||
+ ii->opType() == Enums::OT_ARG ||
+ ii->opType() == Enums::OT_RET) {
+ computeUnit->aluPipe[simdId].set(computeUnit->shader->
+ ticks(computeUnit->spBypassLength()));
+
+ // this is to enforce a fixed number of cycles per issue slot per SIMD
+ computeUnit->wfWait[simdId].set(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_BARRIER) {
+ computeUnit->wfWait[simdId].set(computeUnit->shader->
+ ticks(computeUnit->issuePeriod));
+ } else if (ii->opType() == Enums::OT_FLAT_READ) {
+ assert(Enums::SC_NONE != ii->executedAs());
+
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (ii->opType() == Enums::OT_FLAT_WRITE) {
+ assert(Enums::SC_NONE != ii->executedAs());
+ if (Enums::SC_SHARED == ii->executedAs()) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+ } else if (IS_OT_READ_GM(ii->opType())) {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_GM(ii->opType())) {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_GM(ii->opType())) {
+ computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->GlbMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_READ_LM(ii->opType())) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(4));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_WRITE_LM(ii->opType())) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ } else if (IS_OT_ATOMIC_LM(ii->opType())) {
+ computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
+ set(computeUnit->shader->ticks(8));
+ computeUnit->wfWait[computeUnit->ShrMemUnitId()].
+ set(computeUnit->shader->ticks(computeUnit->issuePeriod));
+ }
+}
+
+bool
+Wavefront::waitingAtBarrier(int lane)
+{
+ return bar_cnt[lane] < max_bar_cnt;
+}
+
+void
+Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+ const VectorMask& mask)
+{
+ assert(mask.count());
+ reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
+}
+
+void
+Wavefront::popFromReconvergenceStack()
+{
+ assert(!reconvergenceStack.empty());
+
+ DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
+ computeUnit->cu_id, simdId, wfSlotId, wfDynId,
+ execMask().to_string<char, std::string::traits_type,
+ std::string::allocator_type>().c_str(), pc());
+
+ reconvergenceStack.pop();
+
+ DPRINTF(WavefrontStack, "%3i %s\n", pc(),
+ execMask().to_string<char, std::string::traits_type,
+ std::string::allocator_type>().c_str());
+
+}
+
+void
+Wavefront::discardFetch()
+{
+ instructionBuffer.clear();
+ dropFetch |= pendingFetch;
+}
+
+uint32_t
+Wavefront::pc() const
+{
+ return reconvergenceStack.top()->pc;
+}
+
+uint32_t
+Wavefront::rpc() const
+{
+ return reconvergenceStack.top()->rpc;
+}
+
+VectorMask
+Wavefront::execMask() const
+{
+ return reconvergenceStack.top()->execMask;
+}
+
+bool
+Wavefront::execMask(int lane) const
+{
+ return reconvergenceStack.top()->execMask[lane];
+}
+
+
+void
+Wavefront::pc(uint32_t new_pc)
+{
+ reconvergenceStack.top()->pc = new_pc;
+}
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
new file mode 100644
index 000000000..0abab8e83
--- /dev/null
+++ b/src/gpu-compute/wavefront.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __WAVEFRONT_HH__
+#define __WAVEFRONT_HH__
+
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <stack>
+#include <vector>
+
+#include "base/misc.hh"
+#include "base/types.hh"
+#include "gpu-compute/condition_register_state.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/misc.hh"
+#include "params/Wavefront.hh"
+#include "sim/sim_object.hh"
+
+static const int MAX_NUM_INSTS_PER_WF = 12;
+
+/*
+ * Arguments for the hsail opcode call are user defined and variable length.
+ * The hardware/finalizer can support arguments in hardware or use memory to
+ * pass arguments. For now, let's assume that an unlimited number of arguments
+ * are supported in hardware (the compiler inlines functions whenever it can
+ * anyway, so unless someone is interested in the implications of linking/
+ * library functions, I think this is a reasonable assumption given the typical
+ * size of an OpenCL kernel).
+ *
+ * Note that call args are different than kernel arguments:
+ * * All work-items in a kernel refer the same set of kernel arguments
+ * * Each work-item has it's on set of call args. So a call argument at
+ * address 0x4 is different for work-item 0 and work-item 1.
+ *
+ * Ok, the table below shows an example of how we organize the call arguments in
+ * the CallArgMem class.
+ *
+ * int foo(int arg1, double arg2)
+ * ___________________________________________________
+ * | 0: return.0 | 4: return.1 | ... | 252: return.63 |
+ * |---------------------------------------------------|
+ * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63 |
+ * |---------------------------------------------------|
+ * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63 |
+ * ___________________________________________________
+ */
+class CallArgMem
+{
+ public:
+ // pointer to buffer for storing function arguments
+ uint8_t *mem;
+ // size of function args
+ int funcArgsSizePerItem;
+
+ template<typename CType>
+ int
+ getLaneOffset(int lane, int addr)
+ {
+ return addr * VSZ + sizeof(CType) * lane;
+ }
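+
+    // Worked example (assuming VSZ == 64, matching the table above): for
+    // "int foo(int arg1, double arg2)" the per-work-item offsets are
+    // return @ 0, arg1 @ 4, arg2 @ 8, so
+    //   getLaneOffset<int>(1, 4)    == 4 * 64 + sizeof(int) * 1    == 260 (arg1.1)
+    //   getLaneOffset<double>(0, 8) == 8 * 64 + sizeof(double) * 0 == 512 (arg2.0)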
+
+ CallArgMem(int func_args_size_per_item)
+ : funcArgsSizePerItem(func_args_size_per_item)
+ {
+ mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+ }
+
+ ~CallArgMem()
+ {
+ free(mem);
+ }
+
+ template<typename CType>
+ uint8_t*
+ getLaneAddr(int lane, int addr)
+ {
+ return mem + getLaneOffset<CType>(lane, addr);
+ }
+
+ template<typename CType>
+ void
+ setLaneAddr(int lane, int addr, CType val)
+ {
+ *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
+ }
+};
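+
+// A minimal usage sketch of CallArgMem (illustrative values only):
+//
+//     CallArgMem call_args(16);                  // 16 bytes of args per work-item
+//     call_args.setLaneAddr<int>(3, 4, 42);      // write the int arg of lane 3
+//     int v = *(int*)call_args.getLaneAddr<int>(3, 4);   // reads back 42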
+
+/**
+ * A reconvergence stack entry conveys the necessary state to implement
+ * control flow divergence.
+ */
+class ReconvergenceStackEntry {
+
+ public:
+ ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc,
+ VectorMask new_mask) : pc(new_pc), rpc(new_rpc),
+ execMask(new_mask) {
+ }
+
+ /**
+ * PC of current instruction.
+ */
+ uint32_t pc;
+ /**
+ * PC of the immediate post-dominator instruction, i.e., the value of
+ * @a pc for the first instruction that will be executed by the wavefront
+ * when a reconvergence point is reached.
+ */
+ uint32_t rpc;
+ /**
+ * Execution mask.
+ */
+ VectorMask execMask;
+};
+
+class Wavefront : public SimObject
+{
+ public:
+ enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
+ enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
+
+ // Base pointer for array of instruction pointers
+ uint64_t base_ptr;
+
+ uint32_t old_barrier_cnt;
+ uint32_t barrier_cnt;
+ uint32_t barrier_id;
+ uint32_t barrier_slots;
+ status_e status;
+    // HW slot id to which the WF is mapped inside a SIMD unit
+    int wfSlotId;
+    int kern_id;
+    // SIMD unit where the WF has been scheduled
+ int simdId;
+ // pointer to parent CU
+ ComputeUnit *computeUnit;
+
+ std::deque<GPUDynInstPtr> instructionBuffer;
+
+ bool pendingFetch;
+ bool dropFetch;
+
+ // Condition Register State (for HSAIL simulations only)
+ class ConditionRegisterState *condRegState;
+ // number of single precision VGPRs required by WF
+ uint32_t maxSpVgprs;
+ // number of double precision VGPRs required by WF
+ uint32_t maxDpVgprs;
+ // map virtual to physical vector register
+ uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
+ void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+ bool isGmInstruction(GPUDynInstPtr ii);
+ bool isLmInstruction(GPUDynInstPtr ii);
+ bool isOldestInstGMem();
+ bool isOldestInstLMem();
+ bool isOldestInstPrivMem();
+ bool isOldestInstFlatMem();
+ bool isOldestInstALU();
+ bool isOldestInstBarrier();
+    // used for passing the spill address to the GPUDynInst
+ uint64_t last_addr[VSZ];
+ uint32_t workitemid[3][VSZ];
+ uint32_t workitemFlatId[VSZ];
+ uint32_t workgroupid[3];
+ uint32_t workgroupsz[3];
+ uint32_t gridsz[3];
+ uint32_t wg_id;
+ uint32_t wg_sz;
+ uint32_t dynwaveid;
+ uint32_t maxdynwaveid;
+ uint32_t dispatchid;
+ // outstanding global+local memory requests
+ uint32_t outstanding_reqs;
+    // memory requests between the scoreboard and execute stages
+    // that have not yet executed
+ uint32_t mem_reqs_in_pipe;
+ // outstanding global memory write requests
+ uint32_t outstanding_reqs_wr_gm;
+ // outstanding local memory write requests
+ uint32_t outstanding_reqs_wr_lm;
+ // outstanding global memory read requests
+ uint32_t outstanding_reqs_rd_gm;
+ // outstanding local memory read requests
+ uint32_t outstanding_reqs_rd_lm;
+ uint32_t rd_lm_reqs_in_pipe;
+ uint32_t rd_gm_reqs_in_pipe;
+ uint32_t wr_lm_reqs_in_pipe;
+ uint32_t wr_gm_reqs_in_pipe;
+
+ int mem_trace_busy;
+ uint64_t last_trace;
+ // number of vector registers reserved by WF
+ int reservedVectorRegs;
+ // Index into the Vector Register File's namespace where the WF's registers
+ // will live while the WF is executed
+ uint32_t startVgprIndex;
+
+ // Old value of destination gpr (for trace)
+ uint32_t old_vgpr[VSZ];
+ // Id of destination gpr (for trace)
+ uint32_t old_vgpr_id;
+ // Tick count of last old_vgpr copy
+ uint64_t old_vgpr_tcnt;
+
+ // Old value of destination gpr (for trace)
+ uint64_t old_dgpr[VSZ];
+ // Id of destination gpr (for trace)
+ uint32_t old_dgpr_id;
+    // Tick count of last old_dgpr copy
+ uint64_t old_dgpr_tcnt;
+
+ // Execution mask at wavefront start
+ VectorMask init_mask;
+
+ // number of barriers this WF has joined
+ int bar_cnt[VSZ];
+ int max_bar_cnt;
+ // Flag to stall a wave on barrier
+ bool stalledAtBarrier;
+
+ // a pointer to the fraction of the LDS allocated
+ // to this workgroup (thus this wavefront)
+ LdsChunk *ldsChunk;
+
+ // A pointer to the spill area
+ Addr spillBase;
+ // The size of the spill area
+ uint32_t spillSizePerItem;
+ // The vector width of the spill area
+ uint32_t spillWidth;
+
+ // A pointer to the private memory area
+ Addr privBase;
+ // The size of the private memory area
+ uint32_t privSizePerItem;
+
+    // A pointer to the read-only memory area
+ Addr roBase;
+ // size of the read-only memory area
+ uint32_t roSize;
+
+ // pointer to buffer for storing kernel arguments
+ uint8_t *kernelArgs;
+ // unique WF id over all WFs executed across all CUs
+ uint64_t wfDynId;
+
+ // number of times instruction issue for this wavefront is blocked
+ // due to VRF port availability
+ Stats::Scalar numTimesBlockedDueVrfPortAvail;
+ // number of times an instruction of a WF is blocked from being issued
+ // due to WAR and WAW dependencies
+ Stats::Scalar numTimesBlockedDueWAXDependencies;
+ // number of times an instruction of a WF is blocked from being issued
+    // due to RAW dependencies
+ Stats::Scalar numTimesBlockedDueRAWDependencies;
+ // distribution of executed instructions based on their register
+ // operands; this is used to highlight the load on the VRF
+ Stats::Distribution srcRegOpDist;
+ Stats::Distribution dstRegOpDist;
+
+ // Functions to operate on call argument memory
+ // argument memory for hsail call instruction
+ CallArgMem *callArgMem;
+ void
+ initCallArgMem(int func_args_size_per_item)
+ {
+ callArgMem = new CallArgMem(func_args_size_per_item);
+ }
+
+ template<typename CType>
+ CType
+ readCallArgMem(int lane, int addr)
+ {
+ return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
+ }
+
+ template<typename CType>
+ void
+ writeCallArgMem(int lane, int addr, CType val)
+ {
+ callArgMem->setLaneAddr<CType>(lane, addr, val);
+ }
+
+ typedef WavefrontParams Params;
+ Wavefront(const Params *p);
+ ~Wavefront();
+ virtual void init();
+
+ void
+ setParent(ComputeUnit *cu)
+ {
+ computeUnit = cu;
+ }
+
+ void start(uint64_t _wfDynId, uint64_t _base_ptr);
+
+ void exec();
+ void updateResources();
+ int ready(itype_e type);
+ bool instructionBufferHasBranch();
+ void regStats();
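+    // lanes that are active now (execMask) and were active at wavefront start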
+ VectorMask get_pred() { return execMask() & init_mask; }
+
+ bool waitingAtBarrier(int lane);
+
+ void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+ const VectorMask& exec_mask);
+
+ void popFromReconvergenceStack();
+
+ uint32_t pc() const;
+
+ uint32_t rpc() const;
+
+ VectorMask execMask() const;
+
+ bool execMask(int lane) const;
+
+ void pc(uint32_t new_pc);
+
+ void discardFetch();
+
+ private:
+ /**
+ * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
+ * to be visited by the wavefront, and the associated execution masks. The
+ * reconvergence stack grows every time the wavefront reaches a divergence
+ * point (branch instruction), and shrinks every time the wavefront
+ * reaches a reconvergence point (immediate post-dominator instruction).
+ */
+ std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
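+
+    // Illustrative snapshot (a sketch, not normative): for a divergent branch
+    // whose immediate post-dominator is at PC R, the stack could hold, from
+    // top to bottom,
+    //
+    //   { pc = taken-path PC,   rpc = R, execMask = lanes taking the branch }
+    //   { pc = fall-through PC, rpc = R, execMask = the remaining lanes     }
+    //   { pc = R, rpc = <outer rpc>, execMask = full wavefront mask         }
+    //
+    // Entries are popped as pc() reaches rpc(), so execution reconverges at R
+    // with the full mask restored.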
+};
+
+#endif // __WAVEFRONT_HH__
diff --git a/src/mem/protocol/GPU_RfO-SQC.sm b/src/mem/protocol/GPU_RfO-SQC.sm
new file mode 100644
index 000000000..1e5f8df74
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-SQC.sm
@@ -0,0 +1,667 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
+ : Sequencer* sequencer;
+ CacheMemory * L1cache;
+ int TCC_select_num_bits;
+ Cycles issue_latency := 80; // time to send data down to TCC
+ Cycles l2_hit_latency := 18;
+
+ MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromSQC, network="To", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
+
+ MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+{
+ state_declaration(State, desc="SQC Cache States", default="SQC_State_I") {
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Read_Only, desc="Shared";
+
+ I_S, AccessPermission:Busy, desc="Invalid, issued RdBlkS, have not seen response yet";
+ S_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack";
+ I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCCdir for canceled WB";
+ }
+
+ enumeration(Event, desc="SQC Events") {
+ // Core initiated
+ Fetch, desc="Fetch";
+
+ //TCC initiated
+ TCC_AckS, desc="TCC Ack to Core Request";
+ TCC_AckWB, desc="TCC Ack for WB";
+ TCC_NackWB, desc="TCC Nack for WB";
+
+ // Mem sys initiated
+ Repl, desc="Replacing block from cache";
+
+ // Probe Events
+ PrbInvData, desc="probe, return M data";
+ PrbInv, desc="probe, no need for data";
+ PrbShrData, desc="probe downgrade, return data";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead,    desc="Read the tag array";
+    TagArrayWrite,   desc="Write the tag array";
+ }
+
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // Internal functions
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+ return cache_entry;
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return SQC_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return SQC_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(SQC_State_to_permission(state));
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // Out Ports
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromSQC);
+ out_port(responseNetwork_out, ResponseMsg, responseFromSQC);
+ out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+ // In Ports
+
+ in_port(probeNetwork_in, TDProbeRequestMsg, probeToSQC) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ assert(in_msg.ReturnData);
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(responseToSQC_in, ResponseMsg, responseToSQC) {
+ if (responseToSQC_in.isReady(clockEdge())) {
+ peek(responseToSQC_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+ if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("SQC should not receive TDSysResp other than CoherenceState:Shared");
+ }
+ } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
+ trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) {
+ trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+ assert(in_msg.Type == RubyRequestType:IFETCH);
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+
+ // Actions
+
+ action(ic_invCache, "ic", desc="invalidate cache") {
+ if(is_valid(cache_entry)) {
+ L1cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkS;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(vc_victim, "vc", desc="Victimize E/S Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(a_allocate, "a", desc="allocate block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L1cache.allocate(address, new Entry));
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ assert(is_valid(cache_entry));
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ tbe.Shared := false;
+ }
+
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToSQC_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(l_loadDone, "l", desc="local load done") {
+ assert(is_valid(cache_entry));
+ sequencer.readCallback(address, cache_entry.DataBlk,
+ false, MachineType:L1Cache);
+ APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
+ }
+
+ action(xl_loadDone, "xl", desc="remote load done") {
+ peek(responseToSQC_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ sequencer.readCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
+ }
+ }
+
+ action(w_writeCache, "w", desc="write data to cache") {
+ peek(responseToSQC_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ peek(responseToSQC_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ peek(responseToSQC_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Ntsl := true;
+ out_msg.Hit := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+      out_msg.Dirty := false;  // only true if sending back data, I think
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry) || is_valid(tbe));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := getDataBlock(address);
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else {
+ out_msg.Dirty := cache_entry.Dirty;
+ }
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry) || is_valid(tbe));
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := getDataBlock(address);
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else {
+ out_msg.Dirty := cache_entry.Dirty;
+ }
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+ mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ // Transitions
+
+ // transitions from base
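+  // Illustrative common-case flow, summarizing the transitions below: an
+  // IFETCH that misses allocates a block, issues RdBlkS to the TCC directory
+  // (I -> I_S), and on TCC_AckS writes the returned data, completes the load,
+  // unblocks the directory, and settles in S. A replacement from S allocates
+  // a TBE, victimizes the clean block (S -> S_I), and finishes on the
+  // directory's TCC_AckWB/TCC_NackWB.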
+ transition(I, Fetch, I_S) {TagArrayRead, TagArrayWrite} {
+ a_allocate;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // simple hit transitions
+ transition(S, Fetch) {TagArrayRead, DataArrayRead} {
+ l_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ // recycles from transients
+ transition({I_S, S_I, I_C}, {Fetch, Repl}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition(S, Repl, S_I) {TagArrayRead} {
+ t_allocateTBE;
+ vc_victim;
+ ic_invCache;
+ }
+
+ // TCC event
+ transition(I_S, TCC_AckS, S) {DataArrayRead, DataArrayWrite} {
+ w_writeCache;
+ xl_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S_I, TCC_NackWB, I){TagArrayWrite} {
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(S_I, TCC_AckWB, I) {TagArrayWrite} {
+ wb_data;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(I_C, TCC_AckWB, I){TagArrayWrite} {
+ ss_sendStaleNotification;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(I_C, TCC_NackWB, I) {TagArrayWrite} {
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ // Probe transitions
+ transition({S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInvData, I_C) {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition({S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition({S}, PrbShrData, S) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({I, I_C}, PrbShrData) {TagArrayRead} {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInv, I_C){
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_S, {PrbInv, PrbInvData}) {} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ a_allocate; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition(I_S, PrbShrData) {} {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(S_I, PrbInvData, I_C) {TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(S_I, PrbInv, I_C) {TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(S_I, PrbShrData) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ sf_setSharedFlip;
+ pp_popProbeQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_RfO-TCC.sm b/src/mem/protocol/GPU_RfO-TCC.sm
new file mode 100644
index 000000000..cfddb3f00
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-TCC.sm
@@ -0,0 +1,1199 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+ WireBuffer * w_reqToTCCDir;
+ WireBuffer * w_respToTCCDir;
+ WireBuffer * w_TCCUnblockToTCCDir;
+ WireBuffer * w_reqToTCC;
+ WireBuffer * w_probeToTCC;
+ WireBuffer * w_respToTCC;
+ int TCC_select_num_bits;
+ Cycles l2_request_latency := 1;
+ Cycles l2_response_latency := 20;
+
+ // To the general response network
+ MessageBuffer * responseFromTCC, network="To", virtual_network="3", vnet_type="response";
+
+ // From the general response network
+ MessageBuffer * responseToTCC, network="From", virtual_network="3", vnet_type="response";
+
+{
+ // EVENTS
+ enumeration(Event, desc="TCC Events") {
+ // Requests coming from the Cores
+ RdBlk, desc="CPU RdBlk event";
+ RdBlkM, desc="CPU RdBlkM event";
+ RdBlkS, desc="CPU RdBlkS event";
+ CtoD, desc="Change to Dirty request";
+ WrVicBlk, desc="L1 Victim (dirty)";
+ WrVicBlkShared, desc="L1 Victim (dirty)";
+ ClVicBlk, desc="L1 Victim (clean)";
+ ClVicBlkShared, desc="L1 Victim (clean)";
+
+ CPUData, desc="WB data from CPU";
+ CPUDataShared, desc="WB data from CPU, NBReqShared 1";
+ StaleWB, desc="Stale WB, No data";
+
+ L2_Repl, desc="L2 Replacement";
+
+ // Probes
+ PrbInvData, desc="Invalidating probe, return dirty data";
+ PrbInv, desc="Invalidating probe, no need to return data";
+ PrbShrData, desc="Downgrading probe, return data";
+
+ // Coming from Memory Controller
+ WBAck, desc="ack from memory";
+
+ CancelWB, desc="Cancel WB from L2";
+ }
+
+ // STATES
+ state_declaration(State, desc="TCC State", default="TCC_State_I") {
+ M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale
+ O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S
+ E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory)
+ S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. If no one in O, then == Memory
+ I, AccessPermission:Invalid, desc="Invalid";
+
+ I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+ I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+ I_E, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data";
+ I_S, AccessPermission:Busy, desc="Invalid, receive ClVicBlk, sent Ack, waiting for Data";
+ S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+ S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+ S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E";
+ S_S, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to S";
+    E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+    E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    E_E, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to E";
+    E_S, AccessPermission:Busy, desc="Exclusive, received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+ O_M, AccessPermission:Busy, desc="...";
+ O_O, AccessPermission:Busy, desc="...";
+ O_E, AccessPermission:Busy, desc="...";
+ M_M, AccessPermission:Busy, desc="...";
+ M_O, AccessPermission:Busy, desc="...";
+ M_E, AccessPermission:Busy, desc="...";
+ M_S, AccessPermission:Busy, desc="...";
+    D_I, AccessPermission:Invalid, desc="drop WB data on the floor when received";
+ MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem";
+ MO_I, AccessPermission:Busy, desc="M or O, received L2_Repl, waiting for WBAck from Mem";
+ ES_I, AccessPermission:Busy, desc="E or S, received L2_Repl, waiting for WBAck from Mem";
+ I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused";
+ }
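+
+  // Note on the transient-state naming above: a two-letter state X_Y marks a
+  // block that was in stable state X when a victim/writeback was accepted and
+  // that settles in stable state Y once the corresponding CPUData /
+  // CPUDataShared arrives; MO_I, ES_I, and MOD_I likewise record the
+  // pre-replacement state while waiting for the memory WBAck.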
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead,    desc="Read the tag array";
+    TagArrayWrite,   desc="Write the tag array";
+ }
+
+
+ // STRUCTURES
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff from memory?)";
+ DataBlock DataBlk, desc="Data for the block";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ bool Shared, desc="Victim hit by shared probe";
+ MachineID From, desc="Waiting for writeback from...";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+
+
+ // FUNCTION DEFINITIONS
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", L2cache.lookup(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(addr).DataBlk;
+ }
+
+ bool presentOrAvail(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCC_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCC_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCC_State_to_permission(state));
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+
+
+ // OUT PORTS
+ out_port(w_requestNetwork_out, CPURequestMsg, w_reqToTCCDir);
+ out_port(w_TCCResp_out, ResponseMsg, w_respToTCCDir);
+ out_port(responseNetwork_out, ResponseMsg, responseFromTCC);
+ out_port(w_unblockNetwork_out, UnblockMsg, w_TCCUnblockToTCCDir);
+
+ // IN PORTS
+ in_port(TDResponse_in, ResponseMsg, w_respToTCC) {
+ if (TDResponse_in.isReady(clockEdge())) {
+ peek(TDResponse_in, ResponseMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
+ trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+ }
+ else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Error on TDResponse Type");
+ }
+ }
+ }
+ }
+
+ // Response Network
+ in_port(responseNetwork_in, ResponseMsg, responseToTCC) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUData) {
+ if (in_msg.NbReqShared) {
+ trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:CPUData, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Error on TDResponse Type");
+ }
+ }
+ }
+ }
+
+ // probe network
+ in_port(probeNetwork_in, TDProbeRequestMsg, w_probeToTCC) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, TDProbeRequestMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Don't think I should get any of these");
+ }
+ }
+ }
+ }
+ }
+
+ // Request Network
+ in_port(requestNetwork_in, CPURequestMsg, w_reqToTCC) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ assert(in_msg.Destination.isElement(machineID));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Shared) {
+ trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Shared) {
+ trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+ }
+ }
+ }
+
+ // BEGIN ACTIONS
+
+ action(i_invL2, "i", desc="invalidate TCC cache block") {
+ if (is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(rm_sendResponseM, "rm", desc="send Modified response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := cache_entry.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(rs_sendResponseS, "rs", desc="send Shared response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := cache_entry.Dirty;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+
+ action(r_requestToTD, "r", desc="Miss in L2, pass on") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := in_msg.Requestor;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Shared := false; // unneeded for this request
+ out_msg.MessageSize := in_msg.MessageSize;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (is_valid(cache_entry)) {
+ tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ }
+ tbe.From := machineID;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(vc_vicClean, "vc", desc="Victimize Clean L2 data") {
+ enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.Requestor := machineID;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(vd_vicDirty, "vd", desc="Victimize dirty L2 data") {
+ enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.Requestor := machineID;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(ph_sendProbeResponseHit, "ph", desc="send probe ack, no data") {
+ enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Hit := true;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pm_sendProbeResponseMiss, "pm", desc="send probe ack, no data") {
+ enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ //assert(cache_entry.Dirty); Not needed in TCC where TCC can supply clean data
+ out_msg.Dirty := cache_entry.Dirty;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+ enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := tbe.DataBlk;
+ //assert(tbe.Dirty);
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.Hit := true;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.State := CoherenceState:NA;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") {
+ enqueue(w_requestNetwork_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:WrCancel;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(a_allocateBlock, "a", desc="allocate TCC block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ }
+ }
+
+ action(d_writeData, "d", desc="write data to TCC") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ cache_entry.DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+ action(rd_copyDataFromRequest, "rd", desc="write data to TCC") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := true;
+ }
+ }
+
+ action(f_setFrom, "f", desc="set who WB is expected to come from") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.From := in_msg.Requestor;
+ }
+ }
+
+ action(rf_resetFrom, "rf", desc="reset From") {
+ tbe.From := machineID;
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ enqueue(w_TCCResp_out, ResponseMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(uo_sendUnblockOwner, "uo", desc="state changed to E, M, or O, unblock") {
+ enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ out_msg.currentOwner := true;
+ out_msg.valid := true;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+  action(us_sendUnblockSharer, "us", desc="state changed to S, unblock") {
+ enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ out_msg.currentOwner := false;
+ out_msg.valid := true;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+  action(un_sendUnblockNotValid, "un", desc="state changed to I, unblock") {
+ enqueue(w_unblockNetwork_out, UnblockMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ out_msg.currentOwner := false;
+ out_msg.valid := false;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ L2cache.setMRU(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pn_popTDResponseQueue, "pn", desc="pop TD response queue") {
+ TDResponse_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(zz_recycleRequestQueue, "\z", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
+ // transitions from base
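+  // Illustrative dirty-victim flow, summarizing the transitions below: an L1
+  // WrVicBlk to an Invalid block allocates a block and a TBE, records the
+  // requestor, and acks the writeback (I -> I_M); when that core's CPUData
+  // arrives the data is written, the owner is unblocked, and the block lands
+  // in M (or in O for CPUDataShared).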
+
+ transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}){TagArrayRead} {
+ // TCCdir already knows that the block is not here. This is to allocate and get the block.
+ r_requestToTD;
+ p_popRequestQueue;
+ }
+
+// check
+ transition({M, O}, RdBlk, O){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chancing
+ p_popRequestQueue;
+ }
+
+//check
+ transition({E, S}, RdBlk, S){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chancing
+ p_popRequestQueue;
+ }
+
+// check
+ transition({M, O}, RdBlkS, O){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chance sharing
+ p_popRequestQueue;
+ }
+
+//check
+ transition({E, S}, RdBlkS, S){TagArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ // detect 2nd chance sharing
+ p_popRequestQueue;
+ }
+
+// check
+ transition(M, RdBlkM, I){TagArrayRead, TagArrayWrite} {
+ rm_sendResponseM;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
+ //check
+ transition(E, RdBlkM, I){TagArrayRead, TagArrayWrite} {
+ rm_sendResponseM;
+ i_invL2;
+ p_popRequestQueue;
+ }
+
+// check
+ transition({I}, WrVicBlk, I_M){TagArrayRead} {
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {
+ zz_recycleRequestQueue;
+ }
+
+//check
+ transition({I}, WrVicBlkShared, I_O) {TagArrayRead}{
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+// rd_copyDataFromRequest;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+//check
+ transition(S, WrVicBlkShared, S_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(S, WrVicBlk, S_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, WrVicBlk, E_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, WrVicBlkShared, E_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(O, WrVicBlk, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(O, WrVicBlkShared, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, WrVicBlk, M_M){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, WrVicBlkShared, M_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+//check
+ transition({I}, ClVicBlk, I_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ a_allocateBlock;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({I}, ClVicBlkShared, I_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ a_allocateBlock;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+//check
+ transition(S, ClVicBlkShared, S_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, ClVicBlk, E_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(E, ClVicBlkShared, E_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(O, ClVicBlk, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// check. The original L3 had it going from O to O_S; a block can go from O to S only on a writeback.
+ transition(O, ClVicBlkShared, O_O){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, ClVicBlk, M_E){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+// a stale writeback
+ transition(M, ClVicBlkShared, M_S){TagArrayRead} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+
+ transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+ r_requestToTD;
+ p_popRequestQueue;
+ }
+
+ transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(I_M, CPUData, M){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E, CPUData, E){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(S_M, CPUDataShared, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_E, CPUDataShared, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_S, {CPUData, CPUDataShared}, S){TagArrayWrite, DataArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(O_E, CPUDataShared, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(O_O, {CPUData, CPUDataShared}, O){TagArrayWrite, DataArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition({D_I}, {CPUData, CPUDataShared}, I){TagArrayWrite} {
+ un_sendUnblockNotValid;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {
+ un_sendUnblockNotValid;
+ rf_resetFrom;
+ pr_popResponseQueue;
+ }
+
+  transition({O, S, I}, CPUData) {
+ pr_popResponseQueue;
+ }
+
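+// L2 replacements: dirty blocks (M/O) victimize dirty data, clean blocks (E/S) victimize
+// clean data, and replacements that race with an in-flight transaction are recycled.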
+ transition({M, O}, L2_Repl, MO_I){TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ vd_vicDirty;
+ i_invL2;
+ }
+
+  transition({E, S}, L2_Repl, ES_I){TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ vc_vicClean;
+ i_invL2;
+ }
+
+ transition({I_M, I_O, S_M, S_O, E_M, E_O}, L2_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({O_M, O_O, O_E, M_M, M_O, M_E, M_S}, L2_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({I_E, I_S, S_E, S_S, E_E, E_S}, L2_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({M, O}, PrbInvData, I){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ i_invL2;
+ pp_popProbeQueue;
+ }
+
+ transition(I, PrbInvData){TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S}, PrbInvData, I){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ i_invL2;
+ pp_popProbeQueue;
+ }
+
+ transition({M, O, E, S, I}, PrbInv, I){TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ i_invL2; // nothing will happen in I
+ pp_popProbeQueue;
+ }
+
+ transition({M, O}, PrbShrData, O){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S}, PrbShrData, S){TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition(I, PrbShrData){TagArrayRead} {
+ pm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInvData, I_C) {
+ pdt_sendProbeResponseDataFromTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInvData, I_C) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({ES_I,MO_I}, PrbInv, I_C) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({ES_I, MO_I}, PrbShrData) {
+ pdt_sendProbeResponseDataFromTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, {PrbInvData, PrbInv}) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbShrData) {
+ pm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(MOD_I, WBAck, D_I) {
+ pn_popTDResponseQueue;
+ }
+
+ transition(MO_I, WBAck, I){TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popTDResponseQueue;
+ }
+
+ // this can only be a spurious CPUData from a shared block.
+ transition(MO_I, CPUData) {
+ pr_popResponseQueue;
+ }
+
+ transition(ES_I, WBAck, I){TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popTDResponseQueue;
+ }
+
+ transition(I_C, {WBAck}, I){TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popTDResponseQueue;
+ }
+
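+// Stale writebacks: the data has already moved on, so acknowledge with the matching
+// unblock flavor and clean up the TBE.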
+ transition({I_M, I_O, I_E, I_S}, StaleWB, I){TagArrayWrite} {
+ un_sendUnblockNotValid;
+ dt_deallocateTBE;
+ i_invL2;
+ pr_popResponseQueue;
+ }
+
+ transition({S_S, S_O, S_M, S_E}, StaleWB, S){TagArrayWrite} {
+ us_sendUnblockSharer;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({E_M, E_O, E_E, E_S}, StaleWB, E){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({O_M, O_O, O_E}, StaleWB, O){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({M_M, M_O, M_E, M_S}, StaleWB, M){TagArrayWrite} {
+ uo_sendUnblockOwner;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+  transition(D_I, StaleWB, I){TagArrayWrite} {
+ un_sendUnblockNotValid;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MOD_I, StaleWB, MO_I) {
+ un_sendUnblockNotValid;
+ rf_resetFrom;
+ pr_popResponseQueue;
+ }
+
+}
diff --git a/src/mem/protocol/GPU_RfO-TCCdir.sm b/src/mem/protocol/GPU_RfO-TCCdir.sm
new file mode 100644
index 000000000..8f58d6ebb
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-TCCdir.sm
@@ -0,0 +1,2672 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Mithuna Thottethodi
+ */
+
+machine(MachineType:TCCdir, "AMD read-for-ownership directory for TCC (aka GPU L2)")
+: CacheMemory * directory;
+ // Convention: wire buffers are prefixed with "w_" for clarity
+ WireBuffer * w_reqToTCCDir;
+ WireBuffer * w_respToTCCDir;
+ WireBuffer * w_TCCUnblockToTCCDir;
+ WireBuffer * w_reqToTCC;
+ WireBuffer * w_probeToTCC;
+ WireBuffer * w_respToTCC;
+ int TCC_select_num_bits;
+ Cycles response_latency := 5;
+ Cycles directory_latency := 6;
+ Cycles issue_latency := 120;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromTCP, network="From", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromTCP, network="From", virtual_network="5", vnet_type="unblock";
+
+ // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+ MessageBuffer * probeToCore, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response";
+
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue, random="false";
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="TCCdir_State_I") {
+ // Base states
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Invalid, desc="Shared";
+    E, AccessPermission:Invalid, desc="Exclusive";
+ O, AccessPermission:Invalid, desc="Owner";
+ M, AccessPermission:Invalid, desc="Modified";
+
+ CP_I, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to invalid";
+ B_I, AccessPermission:Invalid, desc="Blocked, need not send data after acks are in, going to invalid";
+ CP_O, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to owned";
+ CP_S, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to shared";
+ CP_OM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to O_M";
+ CP_SM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to S_M";
+ CP_ISM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M";
+ CP_IOM, AccessPermission:Invalid, desc="Blocked, must send data after acks are in, going to I_M";
+ CP_OSIW, AccessPermission:Invalid, desc="Blocked, must send data after acks+CancelWB are in, going to I_C";
+
+
+ // Transient states and busy states used for handling side (TCC-facing) interactions
+ BW_S, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+ BW_E, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+ BW_O, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+ BW_M, AccessPermission:Invalid, desc="Blocked, Awaiting TCC unblock";
+
+ // Transient states and busy states used for handling upward (TCP-facing) interactions
+ I_M, AccessPermission:Invalid, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_ES, AccessPermission:Invalid, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_S, AccessPermission:Invalid, desc="Invalid, issued RdBlkS, have not seen response yet";
+ BBS_S, AccessPermission:Invalid, desc="Blocked, going from S to S";
+ BBO_O, AccessPermission:Invalid, desc="Blocked, going from O to O";
+ BBM_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for data to forward";
+ BBM_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for data to forward";
+ BB_M, AccessPermission:Invalid, desc="Blocked, going from M to M, waiting for unblock";
+ BB_O, AccessPermission:Invalid, desc="Blocked, going from M to O, waiting for unblock";
+ BB_OO, AccessPermission:Invalid, desc="Blocked, going from O to O (adding sharers), waiting for unblock";
+ BB_S, AccessPermission:Invalid, desc="Blocked, going to S, waiting for (possible multiple) unblock(s)";
+ BBS_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M";
+ BBO_M, AccessPermission:Invalid, desc="Blocked, going from S or O to M";
+ BBS_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade";
+ BBO_UM, AccessPermission:Invalid, desc="Blocked, going from S or O to M via upgrade";
+ S_M, AccessPermission:Invalid, desc="Shared, issued CtoD, have not seen response yet";
+    O_M, AccessPermission:Invalid, desc="Owned, issued CtoD, have not seen response yet";
+
+ //
+ BBB_S, AccessPermission:Invalid, desc="Blocked, going to S after core unblock";
+ BBB_M, AccessPermission:Invalid, desc="Blocked, going to M after core unblock";
+ BBB_E, AccessPermission:Invalid, desc="Blocked, going to E after core unblock";
+
+ VES_I, AccessPermission:Invalid, desc="TCC replacement, waiting for clean WB ack";
+ VM_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack";
+ VO_I, AccessPermission:Invalid, desc="TCC replacement, waiting for dirty WB ack";
+ VO_S, AccessPermission:Invalid, desc="TCC owner replacement, waiting for dirty WB ack";
+
+ ES_I, AccessPermission:Invalid, desc="L1 replacement, waiting for clean WB ack";
+ MO_I, AccessPermission:Invalid, desc="L1 replacement, waiting for dirty WB ack";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB for canceled WB";
+ I_W, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from NB; canceled WB raced with directory invalidation";
+
+ // Recall States
+ BRWD_I, AccessPermission:Invalid, desc="Recalling, waiting for WBAck and Probe Data responses";
+ BRW_I, AccessPermission:Read_Write, desc="Recalling, waiting for WBAck";
+ BRD_I, AccessPermission:Invalid, desc="Recalling, waiting for Probe Data responses";
+
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+ }
+
+
+
+ // EVENTS
+ enumeration(Event, desc="TCC Directory Events") {
+    // Upward-facing events (TCCdir behaves toward the TCPs/SQCs and TCC the way NBdir behaves toward the TCPs/SQCs and L3)
+
+ // Directory Recall
+ Recall, desc="directory cache is full";
+ // CPU requests
+ CPUWrite, desc="Initial req from core, sent to TCC";
+ NoCPUWrite, desc="Initial req from core, but non-exclusive clean data; can be discarded";
+ CPUWriteCancel, desc="Initial req from core, sent to TCC";
+
+ // Requests from the TCPs
+ RdBlk, desc="RdBlk event";
+ RdBlkM, desc="RdBlkM event";
+ RdBlkS, desc="RdBlkS event";
+ CtoD, desc="Change to Dirty request";
+
+ // TCC writebacks
+ VicDirty, desc="...";
+ VicDirtyLast, desc="...";
+ VicClean, desc="...";
+ NoVic, desc="...";
+ StaleVic, desc="...";
+ CancelWB, desc="TCC got invalidating probe, canceled WB";
+
+ // Probe Responses from TCP/SQCs
+ CPUPrbResp, desc="Probe response from TCP/SQC";
+ TCCPrbResp, desc="Probe response from TCC";
+
+ ProbeAcksComplete, desc="All acks received";
+ ProbeAcksCompleteReissue, desc="All acks received, changing CtoD to reissue";
+
+ CoreUnblock, desc="unblock from TCP/SQC";
+ LastCoreUnblock, desc="Last unblock from TCP/SQC";
+ TCCUnblock, desc="unblock from TCC (current owner)";
+ TCCUnblock_Sharer, desc="unblock from TCC (a sharer, not owner)";
+    TCCUnblock_NotValid, desc="unblock from TCC (not valid; caused by stale writebacks)";
+
+ // Downward facing events
+
+ // NB initiated
+ NB_AckS, desc="NB Ack to TCC Request";
+ NB_AckE, desc="NB Ack to TCC Request";
+ NB_AckM, desc="NB Ack to TCC Request";
+ NB_AckCtoD, desc="NB Ack to TCC Request";
+ NB_AckWB, desc="NB Ack for clean WB";
+
+
+ // Incoming Probes from NB
+ PrbInvData, desc="Invalidating probe, return dirty data";
+ PrbInv, desc="Invalidating probe, no need to return data";
+ PrbShrData, desc="Downgrading probe, return data";
+ }
+
+
+ // TYPES
+
+ // Entry for directory
+ structure(Entry, desc="...", interface='AbstractCacheEntry') {
+ State CacheState, desc="Cache state (Cache of directory entries)";
+ DataBlock DataBlk, desc="data for the block";
+ NetDest Sharers, desc="Sharers for this block";
+ NetDest Owner, desc="Owner of this block";
+ NetDest MergedSharers, desc="Read sharers who are merged on a request";
+ int WaitingUnblocks, desc="Number of acks we're waiting for";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="DataBlk";
+ bool Dirty, desc="Is the data dirty?";
+ MachineID Requestor, desc="requestor";
+ int NumPendingAcks, desc="num acks expected";
+ MachineID OriginalRequestor, desc="Original Requestor";
+    MachineID UntransferredOwner, desc="Untransferred owner for an upgrade transaction";
+    bool UntransferredOwnerExists, desc="true if an untransferred owner exists for an upgrade transaction";
+ bool Cached, desc="data hit in Cache";
+ bool Shared, desc="victim hit by shared probe";
+ bool Upgrade, desc="An upgrade request in progress";
+ bool CtoD, desc="Saved sysack info";
+ CoherenceState CohState, desc="Saved sysack info";
+ MessageSizeType MessageSize, desc="Saved sysack info";
+ MachineID Sender, desc="sender";
+ }
+
+ structure(TBETable, external = "yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ // ** OBJECTS **
+ TBETable TBEs, template="<TCCdir_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+ NetDest TCC_dir_subtree;
+ NetDest temp;
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+
+
+ bool presentOrAvail(Addr addr) {
+ return directory.isTagPresent(addr) || directory.cacheAvail(addr);
+ }
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", directory.lookup(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
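+      // Not expected to be reached: a valid TBE should exist whenever data is requested here.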
+ assert(false);
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCCdir_State_to_permission(state));
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCCdir_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCCdir_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+
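+      // Sanity-check the sharer/owner invariants implied by each stable state.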
+ if (state == State:S) {
+ assert(cache_entry.Owner.count() == 0);
+ }
+
+ if (state == State:O) {
+ assert(cache_entry.Owner.count() == 1);
+ assert(cache_entry.Sharers.isSuperset(cache_entry.Owner) == false);
+ }
+
+ if (state == State:M) {
+ assert(cache_entry.Owner.count() == 1);
+ assert(cache_entry.Sharers.count() == 0);
+ }
+
+ if (state == State:E) {
+ assert(cache_entry.Owner.count() == 0);
+ assert(cache_entry.Sharers.count() == 1);
+ }
+ }
+ }
+
+
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ directory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ directory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ directory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ directory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return directory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return directory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return directory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return directory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // ** OUT_PORTS **
+
+ // Three classes of ports
+ // Class 1: downward facing network links to NB
+ out_port(requestToNB_out, CPURequestMsg, requestToNB);
+ out_port(responseToNB_out, ResponseMsg, responseToNB);
+ out_port(unblockToNB_out, UnblockMsg, unblockToNB);
+
+
+ // Class 2: upward facing ports to GPU cores
+ out_port(probeToCore_out, TDProbeRequestMsg, probeToCore);
+ out_port(responseToCore_out, ResponseMsg, responseToCore);
+
+ // Class 3: sideward facing ports (on "wirebuffer" links) to TCC
+ out_port(w_requestTCC_out, CPURequestMsg, w_reqToTCC);
+ out_port(w_probeTCC_out, NBProbeRequestMsg, w_probeToTCC);
+ out_port(w_respTCC_out, ResponseMsg, w_respToTCC);
+
+
+ // local trigger port
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+ //
+ // request queue going to NB
+ //
+
+ // ** IN_PORTS **
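+  // In_ports are declared from rank 8 (internal triggers) down to rank 0 (new core requests),
+  // ranking completion traffic (triggers, unblocks, responses) above new requests and probes.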
+
+ // Trigger Queue
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=8) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ assert(is_valid(tbe));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == false)) {
+ trigger(Event:ProbeAcksComplete, in_msg.addr, cache_entry, tbe);
+ } else if ((in_msg.Type == TriggerType:AcksComplete) && (tbe.Upgrade == true)) {
+ trigger(Event:ProbeAcksCompleteReissue, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ // Unblock Networks (TCCdir can receive unblocks from TCC, TCPs)
+ // Port on first (of three) wire buffers from TCC
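+  // The (currentOwner, valid) flags set by the TCC's unblock actions select among the
+  // owner, sharer, and not-valid unblock events below.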
+ in_port(w_TCCUnblock_in, UnblockMsg, w_TCCUnblockToTCCDir, rank=7) {
+ if (w_TCCUnblock_in.isReady(clockEdge())) {
+ peek(w_TCCUnblock_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.currentOwner) {
+ trigger(Event:TCCUnblock, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.valid) {
+ trigger(Event:TCCUnblock_Sharer, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:TCCUnblock_NotValid, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromTCP, rank=6) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if(cache_entry.WaitingUnblocks == 1) {
+ trigger(Event:LastCoreUnblock, in_msg.addr, cache_entry, tbe);
+        } else {
+ trigger(Event:CoreUnblock, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ //Responses from TCC, and Cores
+ // Port on second (of three) wire buffers from TCC
+ in_port(w_TCCResponse_in, ResponseMsg, w_respToTCCDir, rank=5) {
+ if (w_TCCResponse_in.isReady(clockEdge())) {
+ peek(w_TCCResponse_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:TCCPrbResp, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(responseNetwork_in, ResponseMsg, responseFromTCP, rank=4) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:CPUPrbResp, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ // Port on third (of three) wire buffers from TCC
+ in_port(w_TCCRequest_in, CPURequestMsg, w_reqToTCCDir, rank=3) {
+ if(w_TCCRequest_in.isReady(clockEdge())) {
+ peek(w_TCCRequest_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:WrCancel) {
+ trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) {
+ // if modified, or owner with no other sharers
+ if ((cache_entry.CacheState == State:M) || (cache_entry.Sharers.count() == 0)) {
+ assert(cache_entry.Owner.count()==1);
+ trigger(Event:VicDirtyLast, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:VicDirty, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (is_valid(cache_entry) && cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ if (cache_entry.Sharers.count() == 1) {
+ // Last copy, victimize to L3
+ trigger(Event:VicClean, in_msg.addr, cache_entry, tbe);
+ } else {
+              // Not the last copy; no need to victimize.
+              // Just remove this sharer from the sharer list.
+ assert(cache_entry.Sharers.count() > 1);
+ trigger(Event:NoVic, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ trigger(Event:StaleVic, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ in_port(responseFromNB_in, ResponseMsg, responseFromNB, rank=2) {
+ if (responseFromNB_in.isReady(clockEdge())) {
+ peek(responseFromNB_in, ResponseMsg, block_on="addr") {
+
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.State == CoherenceState:Modified) {
+ if (in_msg.CtoD) {
+ trigger(Event:NB_AckCtoD, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Exclusive) {
+ trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Finally handling incoming requests (from TCP) and probes (from NB).
+
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB, rank=1) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ DPRINTF(RubySlicc, "machineID: %s\n", machineID);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ assert(in_msg.ReturnData);
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (is_valid(cache_entry) && cache_entry.Owner.isElement(in_msg.Requestor)) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+          } else if (is_valid(cache_entry) && (cache_entry.Sharers.count() + cache_entry.Owner.count()) > 1) {
+ trigger(Event:NoCPUWrite, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WrCancel) {
+ trigger(Event:CPUWriteCancel, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ // All requests require a directory entry
+ Addr victim := directory.cacheProbe(in_msg.addr);
+ trigger(Event:Recall, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+
+
+
+
+ // Actions
+
+ //Downward facing actions
+
+ action(c_clearOwner, "c", desc="Clear the owner field") {
+ cache_entry.Owner.clear();
+ }
+
+ action(rS_removeRequesterFromSharers, "rS", desc="Remove unblocker from sharer list") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ cache_entry.Sharers.remove(in_msg.Sender);
+ }
+ }
+
+ action(rT_removeTCCFromSharers, "rT", desc="Remove TCC from sharer list") {
+ peek(w_TCCRequest_in, CPURequestMsg) {
+ cache_entry.Sharers.remove(in_msg.Requestor);
+ }
+ }
+
+ action(rO_removeOriginalRequestorFromSharers, "rO", desc="Remove replacing core from sharer list") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.Sharers.remove(in_msg.Requestor);
+ }
+ }
+
+ action(rC_removeCoreFromSharers, "rC", desc="Remove replacing core from sharer list") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.Sharers.remove(in_msg.Requestor);
+ }
+ }
+
+  action(rCo_removeCoreFromOwner, "rCo", desc="Remove replacing core from owner") {
+ // Note that under some cases this action will try to remove a stale owner
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.Owner.remove(in_msg.Requestor);
+ }
+ }
+
+ action(rR_removeResponderFromSharers, "rR", desc="Remove responder from sharer list") {
+ peek(responseNetwork_in, ResponseMsg) {
+ cache_entry.Sharers.remove(in_msg.Sender);
+ }
+ }
+
+ action(nC_sendNullWBAckToCore, "nC", desc = "send a null WB Ack to release core") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBNack;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.MessageSize := in_msg.MessageSize;
+ }
+ }
+ }
+
+ action(nT_sendNullWBAckToTCC, "nT", desc = "send a null WB Ack to release TCC") {
+ peek(w_TCCRequest_in, CPURequestMsg) {
+ enqueue(w_respTCC_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.MessageSize := in_msg.MessageSize;
+ }
+ }
+ }
+
+ action(eto_moveExSharerToOwner, "eto", desc="move the current exclusive sharer to owner") {
+ assert(cache_entry.Sharers.count() == 1);
+ assert(cache_entry.Owner.count() == 0);
+ cache_entry.Owner := cache_entry.Sharers;
+ cache_entry.Sharers.clear();
+ APPEND_TRANSITION_COMMENT(" new owner ");
+ APPEND_TRANSITION_COMMENT(cache_entry.Owner);
+ }
+
+ action(aT_addTCCToSharers, "aT", desc="Add TCC to sharer list") {
+ peek(w_TCCUnblock_in, UnblockMsg) {
+ cache_entry.Sharers.add(in_msg.Sender);
+ }
+ }
+
+ action(as_addToSharers, "as", desc="Add unblocker to sharer list") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ cache_entry.Sharers.add(in_msg.Sender);
+ }
+ }
+
+ action(c_moveOwnerToSharer, "cc", desc="Move owner to sharers") {
+ cache_entry.Sharers.addNetDest(cache_entry.Owner);
+ cache_entry.Owner.clear();
+ }
+
+ action(cc_clearSharers, "\c", desc="Clear the sharers field") {
+ cache_entry.Sharers.clear();
+ }
+
+ action(e_ownerIsUnblocker, "e", desc="The owner is now the unblocker") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ cache_entry.Owner.clear();
+ cache_entry.Owner.add(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(" tcp_ub owner ");
+ APPEND_TRANSITION_COMMENT(cache_entry.Owner);
+ }
+ }
+
+ action(eT_ownerIsUnblocker, "eT", desc="TCC (unblocker) is now owner") {
+ peek(w_TCCUnblock_in, UnblockMsg) {
+ cache_entry.Owner.clear();
+ cache_entry.Owner.add(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(" tcc_ub owner ");
+ APPEND_TRANSITION_COMMENT(cache_entry.Owner);
+ }
+ }
+
+ action(ctr_copyTCCResponseToTBE, "ctr", desc="Copy TCC probe response data to TBE") {
+ peek(w_TCCResponse_in, ResponseMsg) {
+ // Overwrite data if tbe does not hold dirty data. Stop once it is dirty.
+ if(tbe.Dirty == false) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ tbe.Sender := in_msg.Sender;
+ }
+ DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk));
+ }
+ }
+
+ action(ccr_copyCoreResponseToTBE, "ccr", desc="Copy core probe response data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ // Overwrite data if tbe does not hold dirty data. Stop once it is dirty.
+ if(tbe.Dirty == false) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+
+ if(tbe.Sender == machineID) {
+ tbe.Sender := in_msg.Sender;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", (tbe.DataBlk));
+ }
+ }
+
+ action(cd_clearDirtyBitTBE, "cd", desc="Clear Dirty bit in TBE") {
+ tbe.Dirty := false;
+ }
+
+ action(n_issueRdBlk, "n-", desc="Issue RdBlk") {
+ enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+ enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkS;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+ enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkM;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(rU_rememberUpgrade, "rU", desc="Remember that this was an upgrade") {
+ tbe.Upgrade := true;
+ }
+
+ action(ruo_rememberUntransferredOwner, "ruo", desc="Remember the untransferred owner") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if(in_msg.UntransferredOwner == true) {
+ tbe.UntransferredOwner := in_msg.Sender;
+ tbe.UntransferredOwnerExists := true;
+ }
+ DPRINTF(RubySlicc, "%s\n", (in_msg));
+ }
+ }
+
+ action(ruoT_rememberUntransferredOwnerTCC, "ruoT", desc="Remember the untransferred owner") {
+ peek(w_TCCResponse_in, ResponseMsg) {
+ if(in_msg.UntransferredOwner == true) {
+ tbe.UntransferredOwner := in_msg.Sender;
+ tbe.UntransferredOwnerExists := true;
+ }
+ DPRINTF(RubySlicc, "%s\n", (in_msg));
+ }
+ }
+
+ action(vd_victim, "vd", desc="Victimize M/O Data") {
+ enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ if (cache_entry.CacheState == State:O) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ out_msg.Dirty := true;
+ }
+ }
+
+ action(vc_victim, "vc", desc="Victimize E/S Data") {
+ enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ out_msg.Dirty := false;
+ }
+ }
+
+
+ action(sT_sendRequestToTCC, "sT", desc="send request to TCC") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(w_requestTCC_out, CPURequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := in_msg.Requestor;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ }
+ APPEND_TRANSITION_COMMENT(" requestor ");
+ APPEND_TRANSITION_COMMENT(in_msg.Requestor);
+
+ }
+ }
+
+
+ action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+
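+    // Probe the CPU-side sharers and the owner; the TCC itself is probed separately
+    // over the wire buffer (ls2/s2).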
+ temp := cache_entry.Sharers;
+ temp.addNetDest(cache_entry.Owner);
+ if (temp.isElement(tcc)) {
+ temp.remove(tcc);
+ }
+ if (temp.count() > 0) {
+ enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination := temp;
+ tbe.NumPendingAcks := temp.count();
+ if(cache_entry.CacheState == State:M) {
+ assert(tbe.NumPendingAcks == 1);
+ }
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ }
+ }
+ }
+
+ action(ls2_probeShrL2Data, "ls2", desc="local probe downgrade L2, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
+ enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(tcc);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+
+ }
+ }
+ }
+
+ action(s2_probeShrL2Data, "s2", desc="probe shared L2, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
+ enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(tcc);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+
+ }
+ }
+ }
+
+ action(ldc_probeInvCoreData, "ldc", desc="local probe to inv cores, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ NetDest dest:= cache_entry.Sharers;
+ dest.addNetDest(cache_entry.Owner);
+ if(dest.isElement(tcc)){
+ dest.remove(tcc);
+ }
+ dest.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := dest.count();
+ if (dest.count()>0){
+ enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+
+ out_msg.Destination.addNetDest(dest);
+ if(cache_entry.CacheState == State:M) {
+ assert(tbe.NumPendingAcks == 1);
+ }
+
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ }
+ }
+ }
+ }
+
+ action(ld2_probeInvL2Data, "ld2", desc="local probe inv L2, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
+ enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(tcc);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+
+ }
+ }
+ }
+
+ action(dc_probeInvCoreData, "dc", desc="probe inv cores + TCC, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
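+    // Probe and count every sharer and the owner, then exclude the TCC, which
+    // d2_probeInvL2Data probes separately.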
+ enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+
+ out_msg.Destination.addNetDest(cache_entry.Sharers);
+ out_msg.Destination.addNetDest(cache_entry.Owner);
+ tbe.NumPendingAcks := cache_entry.Sharers.count() + cache_entry.Owner.count();
+ if(cache_entry.CacheState == State:M) {
+ assert(tbe.NumPendingAcks == 1);
+ }
+ if (out_msg.Destination.isElement(tcc)) {
+ out_msg.Destination.remove(tcc);
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ }
+
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ }
+ }
+
+ action(d2_probeInvL2Data, "d2", desc="probe inv L2, return data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
+ enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(tcc);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+
+ }
+ }
+ }
+
+ action(lpc_probeInvCore, "lpc", desc="local probe inv cores, no data") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
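+      // Start from every TCP/SQC under this directory, then intersect with the recorded
+      // sharers/owner; the requestor itself is removed from the probe set below.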
+ TCC_dir_subtree.broadcast(MachineType:TCP);
+ TCC_dir_subtree.broadcast(MachineType:SQC);
+
+ temp := cache_entry.Sharers;
+ temp := temp.OR(cache_entry.Owner);
+ TCC_dir_subtree := TCC_dir_subtree.AND(temp);
+ tbe.NumPendingAcks := TCC_dir_subtree.count();
+ if(cache_entry.CacheState == State:M) {
+ assert(tbe.NumPendingAcks == 1);
+ }
+ if(TCC_dir_subtree.isElement(in_msg.Requestor)) {
+ TCC_dir_subtree.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ }
+
+ if(TCC_dir_subtree.count() > 0) {
+ enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.localCtoD := true;
+
+ out_msg.Destination.addNetDest(TCC_dir_subtree);
+
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ }
+ }
+ }
+ }
+
+ action(ipc_probeInvCore, "ipc", desc="probe inv cores, no data") {
+ TCC_dir_subtree.broadcast(MachineType:TCP);
+ TCC_dir_subtree.broadcast(MachineType:SQC);
+
+ temp := cache_entry.Sharers;
+ temp := temp.OR(cache_entry.Owner);
+ TCC_dir_subtree := TCC_dir_subtree.AND(temp);
+ tbe.NumPendingAcks := TCC_dir_subtree.count();
+ if(TCC_dir_subtree.count() > 0) {
+
+ enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+
+ out_msg.Destination.addNetDest(TCC_dir_subtree);
+ if(cache_entry.CacheState == State:M) {
+ assert(tbe.NumPendingAcks == 1);
+ }
+
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ }
+ }
+ }
+
+ action(i2_probeInvL2, "i2", desc="probe inv L2, no data") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ if ((cache_entry.Sharers.isElement(tcc)) || (cache_entry.Owner.isElement(tcc))) {
+ enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(tcc);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Dirty := false;
+ out_msg.Ntsl := true;
+ out_msg.Hit := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.Dirty := false; // only true when data is returned
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry) || is_valid(tbe));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := getDataBlock(address);
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ }
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+
+ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry) || is_valid(tbe));
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := getDataBlock(address);
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ }
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(mc_cancelWB, "mc", desc="send writeback cancel to NB directory") {
+ enqueue(requestToNB_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:WrCancel;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(sCS_sendCollectiveResponseS, "sCS", desc="send shared response to all merged TCP/SQC") {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := tbe.Sender;
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.CtoD := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.Destination.addNetDest(cache_entry.MergedSharers);
+ out_msg.Shared := tbe.Shared;
+ out_msg.Dirty := tbe.Dirty;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(sS_sendResponseS, "sS", desc="send shared response to TCP/SQC") {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := tbe.Sender;
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.CtoD := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.Shared := tbe.Shared;
+ out_msg.Dirty := tbe.Dirty;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(sM_sendResponseM, "sM", desc="send response to TCP/SQC") {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := tbe.Sender;
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.CtoD := false;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.Shared := tbe.Shared;
+ out_msg.Dirty := tbe.Dirty;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+
+
+ action(fw2_forwardWBAck, "fw2", desc="forward WBAck to TCC") {
+ peek(responseFromNB_in, ResponseMsg) {
+ if(tbe.OriginalRequestor != machineID) {
+ enqueue(w_respTCC_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Sender := machineID;
+ //out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.MessageSize := in_msg.MessageSize;
+ }
+ }
+ }
+ }
+
+ action(sa_saveSysAck, "sa", desc="Save SysAck ") {
+ peek(responseFromNB_in, ResponseMsg) {
+ tbe.Dirty := in_msg.Dirty;
+ if (tbe.Dirty == false) {
+ tbe.DataBlk := in_msg.DataBlk;
+ }
+      // if the response is dirty, keep the data already saved in the TBE
+ tbe.CtoD := in_msg.CtoD;
+ tbe.CohState := in_msg.State;
+ tbe.Shared := in_msg.Shared;
+ tbe.MessageSize := in_msg.MessageSize;
+ }
+ }
+
+ action(fsa_forwardSavedAck, "fsa", desc="forward saved SysAck to TCP or SQC") {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+      out_msg.DataBlk := tbe.DataBlk; // saved data is forwarded whether clean or dirty
+ out_msg.CtoD := tbe.CtoD;
+ out_msg.State := tbe.CohState;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.Shared := tbe.Shared;
+ out_msg.MessageSize := tbe.MessageSize;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.Sender := tbe.Sender;
+ }
+ }
+
+ action(fa_forwardSysAck, "fa", desc="forward SysAck to TCP or SQC") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ if (tbe.Dirty == false) {
+ out_msg.DataBlk := in_msg.DataBlk;
+ tbe.Sender := machineID;
+ }
+ else {
+ out_msg.DataBlk := tbe.DataBlk;
+ }
+ out_msg.CtoD := in_msg.CtoD;
+ out_msg.State := in_msg.State;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Sender := tbe.Sender;
+ DPRINTF(RubySlicc, "%s\n", (out_msg.DataBlk));
+ }
+ }
+ }
+
+ action(pso_probeSharedDataOwner, "pso", desc="probe shared data at owner") {
+ MachineID tcc := mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ if (cache_entry.Owner.isElement(tcc)) {
+ enqueue(w_probeTCC_out, TDProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(tcc);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ else { // i.e., owner is a core
+ enqueue(probeToCore_out, TDProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.addNetDest(cache_entry.Owner);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ tbe.NumPendingAcks := 1;
+ }
+
+ action(i_popIncomingRequestQueue, "i", desc="Pop incoming request queue") {
+ coreRequestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(j_popIncomingUnblockQueue, "j", desc="Pop incoming unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pk_popResponseQueue, "pk", desc="Pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="Pop incoming probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pR_popResponseFromNBQueue, "pR", desc="Pop incoming Response queue From NB") {
+ responseFromNB_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pl_popTCCRequestQueue, "pl", desc="pop TCC request queue") {
+ w_TCCRequest_in.dequeue(clockEdge());
+ }
+
+ action(plr_popTCCResponseQueue, "plr", desc="pop TCC response queue") {
+ w_TCCResponse_in.dequeue(clockEdge());
+ }
+
+ action(plu_popTCCUnblockQueue, "plu", desc="pop TCC unblock queue") {
+ w_TCCUnblock_in.dequeue(clockEdge());
+ }
+
+
+  action(m_addUnlockerToSharers, "m", desc="Add the unblocker to the sharer list") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ cache_entry.Sharers.add(in_msg.Sender);
+ cache_entry.MergedSharers.remove(in_msg.Sender);
+ assert(cache_entry.WaitingUnblocks >= 0);
+ cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks - 1;
+ }
+ }
+
+ action(q_addOutstandingMergedSharer, "q", desc="Increment outstanding requests") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.MergedSharers.add(in_msg.Requestor);
+ cache_entry.WaitingUnblocks := cache_entry.WaitingUnblocks + 1;
+ }
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockToNB_out, UnblockMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(zz_recycleRequest, "\z", desc="Recycle the request queue") {
+ coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+  action(yy_recycleTCCRequestQueue, "yy", desc="recycle TCC request queue") {
+ w_TCCRequest_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(xz_recycleResponseQueue, "xz", desc="recycle response queue") {
+ responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(xx_recycleTCCResponseQueue, "xx", desc="recycle TCC response queue") {
+ w_TCCResponse_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+  action(vv_recycleTCCUnblockQueue, "vv", desc="Recycle the TCC unblock queue") {
+ w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+  action(xy_recycleUnblockQueue, "xy", desc="Recycle the TCC unblock queue") {
+ w_TCCUnblock_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(ww_recycleProbeRequest, "ww", desc="Recycle the probe request queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(x_decrementAcks, "x", desc="decrement Acks pending") {
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ }
+
+ action(o_checkForAckCompletion, "o", desc="check for ack completion") {
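+    // Once no probe acks remain pending, enqueue a local trigger so the
+    // ProbeAcksComplete event fires for this address.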
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" tbe acks ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
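+  // Four TBE-allocation variants follow: tp_ for probes arriving from the NB
+  // (upward transactions), tv_ for TCC writeback/victim requests, t_ for core
+  // requests, and tr_ for self-initiated recalls.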
+ action(tp_allocateTBE, "tp", desc="allocate TBE Entry for upward transactions") {
+ check_allocate(TBEs);
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.Dirty := false;
+ tbe.NumPendingAcks := 0;
+ tbe.UntransferredOwnerExists := false;
+ }
+ }
+
+ action(tv_allocateTBE, "tv", desc="allocate TBE Entry for TCC transactions") {
+ check_allocate(TBEs);
+ peek(w_TCCRequest_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := in_msg.DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.UntransferredOwnerExists := false;
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);//check whether resources are full
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ tbe.Upgrade := false;
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.UntransferredOwnerExists := false;
+ tbe.Sender := machineID;
+ }
+ }
+
+ action(tr_allocateTBE, "tr", desc="allocate TBE Entry for recall") {
+ check_allocate(TBEs);//check whether resources are full
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ tbe.Upgrade := false;
+ tbe.OriginalRequestor := machineID; //Recall request, Self initiated
+ tbe.NumPendingAcks := 0;
+ tbe.UntransferredOwnerExists := false;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+
+ action(d_allocateDir, "d", desc="allocate Directory Cache") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(directory.allocate(address, new Entry));
+ }
+ }
+
+ action(dd_deallocateDir, "dd", desc="deallocate Directory Cache") {
+ if (is_valid(cache_entry)) {
+ directory.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ enqueue(responseToNB_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ action(y_writeDataToTBE, "y", desc="write Probe Data to TBE") {
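+    // Only overwrite the TBE copy if it is not already dirty or the incoming
+    // response carries dirty data; also note whether any responder hit.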
+ peek(responseNetwork_in, ResponseMsg) {
+ if (!tbe.Dirty || in_msg.Dirty) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+ action(ty_writeTCCDataToTBE, "ty", desc="write TCC Probe Data to TBE") {
+ peek(w_TCCResponse_in, ResponseMsg) {
+ if (!tbe.Dirty || in_msg.Dirty) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ directory.setMRU(address);
+ }
+
+ // TRANSITIONS
+
+  // Handling TCP/SQC requests (similar to how the NB directory handles TCC events, with some changes to account for the stateful directory).
+
+
+ // transitions from base
+ transition(I, RdBlk, I_ES){TagArrayRead} {
+ d_allocateDir;
+ t_allocateTBE;
+ n_issueRdBlk;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(I, RdBlkS, I_S){TagArrayRead} {
+ d_allocateDir;
+ t_allocateTBE;
+ nS_issueRdBlkS;
+ i_popIncomingRequestQueue;
+ }
+
+
+ transition(I_S, NB_AckS, BBB_S) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_ES, NB_AckS, BBB_S) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_ES, NB_AckE, BBB_E) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition({S_M, O_M}, {NB_AckCtoD,NB_AckM}, BBB_M) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_M, NB_AckM, BBB_M) {
+ fa_forwardSysAck;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(BBB_M, CoreUnblock, M){TagArrayWrite} {
+ c_clearOwner;
+ cc_clearSharers;
+ e_ownerIsUnblocker;
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BBB_S, CoreUnblock, S){TagArrayWrite} {
+ as_addToSharers;
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BBB_E, CoreUnblock, E){TagArrayWrite} {
+ as_addToSharers;
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+
+ transition(I, RdBlkM, I_M){TagArrayRead} {
+ d_allocateDir;
+ t_allocateTBE;
+ nM_issueRdBlkM;
+ i_popIncomingRequestQueue;
+ }
+
+ //
+ transition(S, {RdBlk, RdBlkS}, BBS_S){TagArrayRead} {
+ t_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+  // Merge additional read-sharing requests into the one already outstanding
+ transition(BBS_S, {RdBlk, RdBlkS}) {
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+ // Wait for probe acks to be complete
+ transition(BBS_S, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBS_S, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+  // The merging window closes with this transition;
+  // send responses to all outstanding requestors
+ transition(BBS_S, ProbeAcksComplete, BB_S) {
+ sCS_sendCollectiveResponseS;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_S, CoreUnblock, BB_S) {
+ m_addUnlockerToSharers;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BB_S, LastCoreUnblock, S) {
+ m_addUnlockerToSharers;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(O, {RdBlk, RdBlkS}, BBO_O){TagArrayRead} {
+ t_allocateTBE;
+ pso_probeSharedDataOwner;
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+  // Merge additional read-sharing requests into the one already outstanding
+ transition(BBO_O, {RdBlk, RdBlkS}) {
+ q_addOutstandingMergedSharer;
+ i_popIncomingRequestQueue;
+ }
+
+ // Wait for probe acks to be complete
+ transition(BBO_O, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBO_O, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+  // The merging window closes with this transition;
+  // send responses to all outstanding requestors
+ transition(BBO_O, ProbeAcksComplete, BB_OO) {
+ sCS_sendCollectiveResponseS;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_OO, CoreUnblock) {
+ m_addUnlockerToSharers;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(BB_OO, LastCoreUnblock, O){TagArrayWrite} {
+ m_addUnlockerToSharers;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(S, CPUWrite, BW_S){TagArrayRead} {
+ t_allocateTBE;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(E, CPUWrite, BW_E){TagArrayRead} {
+ t_allocateTBE;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(O, CPUWrite, BW_O){TagArrayRead} {
+ t_allocateTBE;
+ rCo_removeCoreFromOwner;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(M, CPUWrite, BW_M){TagArrayRead} {
+ t_allocateTBE;
+ rCo_removeCoreFromOwner;
+ rC_removeCoreFromSharers;
+ sT_sendRequestToTCC;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BW_S, TCCUnblock_Sharer, S){TagArrayWrite} {
+ aT_addTCCToSharers;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_S, TCCUnblock_NotValid, S){TagArrayWrite} {
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_E, TCCUnblock, E){TagArrayWrite} {
+ cc_clearSharers;
+ aT_addTCCToSharers;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_E, TCCUnblock_NotValid, E) {
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_M, TCCUnblock, M) {
+ c_clearOwner;
+ cc_clearSharers;
+ eT_ownerIsUnblocker;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_M, TCCUnblock_NotValid, M) {
+ // Note this transition should only be executed if we received a stale wb
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_O, TCCUnblock, O) {
+ c_clearOwner;
+ eT_ownerIsUnblocker;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition(BW_O, TCCUnblock_NotValid, O) {
+ // Note this transition should only be executed if we received a stale wb
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+  // We lost the owner, likely due to an invalidation racing with an 'O' writeback
+ transition(BW_O, TCCUnblock_Sharer, S) {
+ c_clearOwner;
+ aT_addTCCToSharers;
+ dt_deallocateTBE;
+ plu_popTCCUnblockQueue;
+ }
+
+ transition({BW_M, BW_S, BW_E, BW_O}, {PrbInv,PrbInvData,PrbShrData}) {
+ ww_recycleProbeRequest;
+ }
+
+ transition(BRWD_I, {PrbInvData, PrbInv, PrbShrData}) {
+ ww_recycleProbeRequest;
+ }
+
+ // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD
+ transition(S, CtoD, BBS_UM) {TagArrayRead} {
+ t_allocateTBE;
+ lpc_probeInvCore;
+ i2_probeInvL2;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBS_UM, CPUPrbResp, BBS_UM) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBS_UM, TCCPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBS_UM, ProbeAcksComplete, S_M) {
+ rU_rememberUpgrade;
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ // Three step process: locally invalidate others, issue CtoD, wait for NB_AckCtoD
+ transition(O, CtoD, BBO_UM){TagArrayRead} {
+ t_allocateTBE;
+ lpc_probeInvCore;
+ i2_probeInvL2;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBO_UM, CPUPrbResp, BBO_UM) {
+ ruo_rememberUntransferredOwner;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBO_UM, TCCPrbResp) {
+ ruoT_rememberUntransferredOwnerTCC;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBO_UM, ProbeAcksComplete, O_M) {
+ rU_rememberUpgrade;
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ transition({S,E}, RdBlkM, BBS_M){TagArrayWrite} {
+ t_allocateTBE;
+ ldc_probeInvCoreData;
+ ld2_probeInvL2Data;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBS_M, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ rR_removeResponderFromSharers;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBS_M, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBS_M, ProbeAcksComplete, S_M) {
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ transition(O, RdBlkM, BBO_M){TagArrayRead} {
+ t_allocateTBE;
+ ldc_probeInvCoreData;
+ ld2_probeInvL2Data;
+ o_checkForAckCompletion;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBO_M, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ rR_removeResponderFromSharers;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(BBO_M, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBO_M, ProbeAcksComplete, O_M) {
+ nM_issueRdBlkM;
+ pt_popTriggerQueue;
+ }
+
+ //
+ transition(M, RdBlkM, BBM_M){TagArrayRead} {
+ t_allocateTBE;
+ ldc_probeInvCoreData;
+ ld2_probeInvL2Data;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBM_M, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ // TCP recalled block before receiving probe
+ transition({BBM_M, BBS_M, BBO_M}, {CPUWrite,NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition(BBM_M, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BBM_M, ProbeAcksComplete, BB_M) {
+ sM_sendResponseM;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_M, CoreUnblock, M){TagArrayWrite} {
+ e_ownerIsUnblocker;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition(M, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} {
+ t_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(E, {RdBlkS, RdBlk}, BBM_O){TagArrayRead} {
+ t_allocateTBE;
+ eto_moveExSharerToOwner;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(BBM_O, CPUPrbResp) {
+ ccr_copyCoreResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ transition(BBM_O, TCCPrbResp) {
+ ctr_copyTCCResponseToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ transition(BBM_O, ProbeAcksComplete, BB_O) {
+ sS_sendResponseS;
+ pt_popTriggerQueue;
+ }
+
+ transition(BB_O, CoreUnblock, O){TagArrayWrite} {
+ as_addToSharers;
+ dt_deallocateTBE;
+ j_popIncomingUnblockQueue;
+ }
+
+ transition({BBO_O, BBM_M, BBS_S, BBM_O, BB_M, BB_O, BB_S, BBO_UM, BBS_UM, BBS_M, BBO_M, BB_OO}, {PrbInvData, PrbInv,PrbShrData}) {
+ ww_recycleProbeRequest;
+ }
+
+ transition({BBM_O, BBS_S, CP_S, CP_O, CP_SM, CP_OM, BBO_O}, {CPUWrite,NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ // stale CtoD raced with external invalidation
+ transition({I, CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, CtoD) {
+ i_popIncomingRequestQueue;
+ }
+
+ // stale CtoD raced with internal RdBlkM
+ transition({BBM_M, BBS_M, BBO_M, BBB_M, BBS_UM, BBO_UM}, CtoD) {
+ i_popIncomingRequestQueue;
+ }
+
+ transition({E, M}, CtoD) {
+ i_popIncomingRequestQueue;
+ }
+
+
+  // The TCC directory has sent out (and potentially received acks for) probes.
+  // TCP/SQC replacements (known to be stale at this point) are acked and popped off.
+ transition({BBO_UM, BBS_UM}, {CPUWrite,NoCPUWrite}) {
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ transition(S_M, {NoCPUWrite, CPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition(O_M, {NoCPUWrite, CPUWrite}) {
+ zz_recycleRequest;
+ }
+
+
+ transition({BBM_M, BBS_M, BBO_M, BBO_UM, BBS_UM}, {VicDirty, VicClean, VicDirtyLast, NoVic}) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({CP_S, CP_O, CP_OM, CP_SM}, {VicDirty, VicClean, VicDirtyLast, CancelWB, NoVic}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+  // However, once TCCdir has sent out a shared-data probe, these writebacks cannot be ignored.
+ transition({BBS_S, BBO_O, BBM_O, S_M, O_M, BBB_M, BBB_S, BBB_E}, {VicDirty, VicClean, VicDirtyLast,CancelWB}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ transition({BW_S,BW_E,BW_O, BW_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ transition({BW_S,BW_E,BW_O, BW_M}, CancelWB) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+
+  // Recycle while waiting for unblocks.
+ transition({BB_M,BB_O,BB_S,BB_OO}, {VicDirty, VicClean, VicDirtyLast,NoVic,CancelWB}) {
+ yy_recycleTCCRequestQueue;
+ }
+
+ transition({BBS_S, BBO_O}, NoVic) {
+ rT_removeTCCFromSharers;
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+  // Stale victim: pop the message and send a dummy ack.
+ transition({I_S, I_ES, I_M}, {VicDirty, VicClean, VicDirtyLast, NoVic}) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(M, VicDirtyLast, VM_I){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(E, VicDirty, VM_I){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(O, VicDirty, VO_S){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition(O, {VicDirtyLast, VicClean}, VO_I){TagArrayRead} {
+ tv_allocateTBE;
+ vd_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({E, S}, VicClean, VES_I){TagArrayRead} {
+ tv_allocateTBE;
+ vc_victim;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({O, S}, NoVic){TagArrayRead} {
+ rT_removeTCCFromSharers;
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({O,S}, NoCPUWrite){TagArrayRead} {
+ rC_removeCoreFromSharers;
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ transition({M,E}, NoCPUWrite){TagArrayRead} {
+ rC_removeCoreFromSharers;
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+  // This can only happen due to a race (TCCdir sent out the probes that caused this cancel in the first place).
+ transition({VM_I, VES_I, VO_I}, CancelWB) {
+ pl_popTCCRequestQueue;
+ }
+
+ transition({VM_I, VES_I, VO_I}, NB_AckWB, I){TagArrayWrite} {
+ c_clearOwner;
+ cc_clearSharers;
+ wb_data;
+ fw2_forwardWBAck;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(VO_S, NB_AckWB, S){TagArrayWrite} {
+ c_clearOwner;
+ wb_data;
+ fw2_forwardWBAck;
+ dt_deallocateTBE;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_C, NB_AckWB, I){TagArrayWrite} {
+ c_clearOwner;
+ cc_clearSharers;
+ ss_sendStaleNotification;
+ fw2_forwardWBAck;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(I_W, NB_AckWB, I) {
+ ss_sendStaleNotification;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+
+
+  // Do not handle replacements, reads of any kind, or writebacks from transient states; recycle
+ transition({I_M, I_ES, I_S, MO_I, ES_I, S_M, O_M, VES_I, VO_I, VO_S, VM_I, I_C, I_W}, {RdBlkS,RdBlkM,RdBlk,CtoD}) {
+ zz_recycleRequest;
+ }
+
+ transition( VO_S, NoCPUWrite) {
+ zz_recycleRequest;
+ }
+
+ transition({BW_M, BW_S, BW_O, BW_E}, {RdBlkS,RdBlkM,RdBlk,CtoD,NoCPUWrite, CPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition({BBB_M, BBB_S, BBB_E, BB_O, BB_M, BB_S, BB_OO}, { RdBlk, RdBlkS, RdBlkM, CPUWrite, NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+ transition({BBB_S, BBB_E, BB_O, BB_S, BB_OO}, { CtoD}) {
+ zz_recycleRequest;
+ }
+
+ transition({BBS_UM, BBO_UM, BBM_M, BBM_O, BBS_M, BBO_M}, { RdBlk, RdBlkS, RdBlkM}) {
+ zz_recycleRequest;
+ }
+
+ transition(BBM_O, CtoD) {
+ zz_recycleRequest;
+ }
+
+ transition({BBS_S, BBO_O}, {RdBlkM, CtoD}) {
+ zz_recycleRequest;
+ }
+
+ transition({B_I, CP_I, CP_S, CP_O, CP_OM, CP_SM, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {RdBlk, RdBlkS, RdBlkM}) {
+ zz_recycleRequest;
+ }
+
+ transition({CP_O, CP_S, CP_OM}, CtoD) {
+ zz_recycleRequest;
+ }
+
+  // Recycle replacement-related messages that arrive after a probe got in.
+ transition({CP_I, B_I, CP_IOM, CP_ISM, CP_OSIW, BRWD_I, BRW_I, BRD_I}, {CPUWrite, NoCPUWrite}) {
+ zz_recycleRequest;
+ }
+
+  // Replacement-related messages that arrive after probes have been processed are answered with a null WB ack.
+ transition({I, I_S, I_ES, I_M, I_C, I_W}, {CPUWrite,NoCPUWrite}) {
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+  // Cannot ignore a cancel; otherwise the TCP/SQC would be stuck in I_C.
+ transition({I, I_S, I_ES, I_M, I_C, I_W, S_M, M, O, E, S}, CPUWriteCancel){TagArrayRead} {
+ nC_sendNullWBAckToCore;
+ i_popIncomingRequestQueue;
+ }
+
+ transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, {NoVic, VicClean, VicDirty, VicDirtyLast}){
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+  // Handling probes from the NB. General process: (1) propagate the probe upward and move to a blocking state, (2) collect probe acks, (3) on the last ack, respond downward.
+
+ // step 1
+ transition({M, O, E, S}, PrbInvData, CP_I){TagArrayRead} {
+ tp_allocateTBE;
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2a
+ transition(CP_I, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_I, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_I, ProbeAcksComplete, I){TagArrayWrite} {
+ pd_sendProbeResponseData;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ transition({M, O, E, S}, PrbInv, B_I){TagArrayWrite} {
+ tp_allocateTBE;
+ ipc_probeInvCore;
+ i2_probeInvL2;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(B_I, CPUPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(B_I, TCCPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(B_I, ProbeAcksComplete, I){TagArrayWrite} {
+ // send response down to NB
+ pi_sendProbeResponseInv;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+
+ // step 1
+ transition({M, O}, PrbShrData, CP_O){TagArrayRead} {
+ tp_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+
+ transition(E, PrbShrData, CP_O){TagArrayRead} {
+ tp_allocateTBE;
+ eto_moveExSharerToOwner;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_O, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_O, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_O, ProbeAcksComplete, O){TagArrayWrite} {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+  // step 1
+ transition(S, PrbShrData, CP_S) {
+ tp_allocateTBE;
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_S, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_S, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_S, ProbeAcksComplete, S) {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ transition(O_M, PrbInvData, CP_IOM) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2a
+ transition(CP_IOM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_IOM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_IOM, ProbeAcksComplete, I_M) {
+ pdm_sendProbeResponseDataMs;
+ c_clearOwner;
+ cc_clearSharers;
+ cd_clearDirtyBitTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(CP_IOM, ProbeAcksCompleteReissue, I){TagArrayWrite} {
+ pdm_sendProbeResponseDataMs;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ transition(S_M, PrbInvData, CP_ISM) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ o_checkForAckCompletion;
+ pp_popProbeQueue;
+ }
+ // step 2a
+ transition(CP_ISM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_ISM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_ISM, ProbeAcksComplete, I_M) {
+ pdm_sendProbeResponseDataMs;
+ c_clearOwner;
+ cc_clearSharers;
+ cd_clearDirtyBitTBE;
+
+ //dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+ transition(CP_ISM, ProbeAcksCompleteReissue, I){TagArrayWrite} {
+ pim_sendProbeResponseInvMs;
+ c_clearOwner;
+ cc_clearSharers;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // step 1
+ transition({S_M, O_M}, {PrbInv}, CP_ISM) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+  // subsequent steps are handled by the CP_ISM transitions above
+
+ // Simpler cases
+
+ transition({I_C, I_W}, {PrbInvData, PrbInv, PrbShrData}) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+  // If the directory is certain that the block is not present, an acknowledgement can be sent right away;
+  // no need for the three-step process.
+ transition(I, {PrbInv,PrbShrData,PrbInvData}){TagArrayRead} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M, I_ES, I_S}, {PrbInv, PrbInvData}) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M, I_ES, I_S}, PrbShrData) {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+  // step 1
+ transition(S_M, PrbShrData, CP_SM) {
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ o_checkForAckCompletion;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_SM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_SM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_SM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, S_M){DataArrayRead} {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ pt_popTriggerQueue;
+ }
+
+  // step 1
+ transition(O_M, PrbShrData, CP_OM) {
+ sc_probeShrCoreData;
+ s2_probeShrL2Data;
+ pp_popProbeQueue;
+ }
+ // step 2
+ transition(CP_OM, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+ // step 2b
+ transition(CP_OM, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ // step 3
+ transition(CP_OM, {ProbeAcksComplete,ProbeAcksCompleteReissue}, O_M) {
+ // send response down to NB
+ pd_sendProbeResponseData;
+ pt_popTriggerQueue;
+ }
+
+ transition(BRW_I, PrbInvData, I_W) {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({VM_I,VO_I}, PrbInvData, I_C) {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition(VES_I, {PrbInvData,PrbInv}, I_C) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({VM_I, VO_I, BRW_I}, PrbInv, I_W) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({VM_I, VO_I, VO_S, VES_I, BRW_I}, PrbShrData) {
+ pd_sendProbeResponseData;
+ sf_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(VO_S, PrbInvData, CP_OSIW) {
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ pp_popProbeQueue;
+ }
+
+ transition(CP_OSIW, TCCPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+ transition(CP_OSIW, CPUPrbResp) {
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition(CP_OSIW, ProbeAcksComplete, I_C) {
+ pd_sendProbeResponseData;
+ cd_clearDirtyBitTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition({I, S, E, O, M, CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W}, StaleVic) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+ transition({CP_I, B_I, CP_IOM, CP_ISM, BRWD_I, BRW_I, BRD_I}, StaleVic) {
+ nT_sendNullWBAckToTCC;
+ pl_popTCCRequestQueue;
+ }
+
+  // Recall Transitions
+ // transient states still require the directory state
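+  // Recall flow: victimize the block toward the NB, probe-invalidate the
+  // local holders, wait for both the NB writeback ack and all probe acks,
+  // then write the data back and deallocate the TBE and directory entry.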
+ transition({M, O}, Recall, BRWD_I) {
+ tr_allocateTBE;
+ vd_victim;
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ }
+
+ transition({E, S}, Recall, BRWD_I) {
+ tr_allocateTBE;
+ vc_victim;
+ dc_probeInvCoreData;
+ d2_probeInvL2Data;
+ }
+
+ transition(I, Recall) {
+ dd_deallocateDir;
+ }
+
+ transition({BRWD_I, BRD_I}, CPUPrbResp) {
+ y_writeDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ pk_popResponseQueue;
+ }
+
+ transition({BRWD_I, BRD_I}, TCCPrbResp) {
+ ty_writeTCCDataToTBE;
+ x_decrementAcks;
+ o_checkForAckCompletion;
+ plr_popTCCResponseQueue;
+ }
+
+ transition(BRWD_I, NB_AckWB, BRD_I) {
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(BRWD_I, ProbeAcksComplete, BRW_I) {
+ pt_popTriggerQueue;
+ }
+
+ transition(BRW_I, NB_AckWB, I) {
+ wb_data;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pR_popResponseFromNBQueue;
+ }
+
+ transition(BRD_I, ProbeAcksComplete, I) {
+ wb_data;
+ dt_deallocateTBE;
+ dd_deallocateDir;
+ pt_popTriggerQueue;
+ }
+
+ // wait for stable state for Recall
+ transition({BRWD_I,BRD_I,BRW_I,CP_O, CP_S, CP_OM, CP_SM, CP_OSIW, BW_S, BW_E, BW_O, BW_M, I_M, I_ES, I_S, BBS_S, BBO_O, BBM_M, BBM_O, BB_M, BB_O, BB_OO, BB_S, BBS_M, BBO_M, BBO_UM, BBS_UM, S_M, O_M, BBB_S, BBB_M, BBB_E, VES_I, VM_I, VO_I, VO_S, ES_I, MO_I, I_C, I_W, CP_I}, Recall) {
+ zz_recycleRequest; // stall and wait would be for the wrong address
+ ut_updateTag; // try to find an easier recall
+ }
+
+}
diff --git a/src/mem/protocol/GPU_RfO-TCP.sm b/src/mem/protocol/GPU_RfO-TCP.sm
new file mode 100644
index 000000000..6cf9224a6
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO-TCP.sm
@@ -0,0 +1,1009 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
+ : GPUCoalescer* coalescer;
+ Sequencer* sequencer;
+ bool use_seq_not_coal;
+ CacheMemory * L1cache;
+ int TCC_select_num_bits;
+ Cycles issue_latency := 40; // time to send data down to TCC
+ Cycles l2_hit_latency := 18;
+
+ MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
+
+ MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+{
+ state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Read_Only, desc="Shared";
+ E, AccessPermission:Read_Write, desc="Exclusive";
+ O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+ M, AccessPermission:Read_Write, desc="Modified";
+
+ I_M, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_ES, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ S_M, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+    O_M, AccessPermission:Read_Only, desc="Owned, issued CtoD, have not seen response yet";
+
+ ES_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for clean WB ack";
+ MO_I, AccessPermission:Read_Only, desc="L1 replacement, waiting for dirty WB ack";
+
+ MO_PI, AccessPermission:Read_Only, desc="L1 downgrade, waiting for CtoD ack (or ProbeInvalidateData)";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, waiting for WBAck from TCC for canceled WB";
+ }
+
+ enumeration(Event, desc="TCP Events") {
+ // Core initiated
+ Load, desc="Load";
+ Store, desc="Store";
+
+ // TCC initiated
+    TCC_AckS, desc="TCC Ack (Shared) to core request";
+    TCC_AckE, desc="TCC Ack (Exclusive) to core request";
+    TCC_AckM, desc="TCC Ack (Modified) to core request";
+    TCC_AckCtoD, desc="TCC Ack (CtoD) to core request";
+ TCC_AckWB, desc="TCC Ack for clean WB";
+ TCC_NackWB, desc="TCC Nack for clean WB";
+
+ // Mem sys initiated
+ Repl, desc="Replacing block from cache";
+
+ // Probe Events
+ PrbInvData, desc="probe, return O or M data";
+ PrbInv, desc="probe, no need for data";
+ LocalPrbInv, desc="local probe, no need for data";
+ PrbShrData, desc="probe downgrade, return O or M data";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // Internal functions
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+ return cache_entry;
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCP_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCP_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ bool isValid(Addr addr) {
+ AccessPermission perm := getAccessPermission(addr);
+ if (perm == AccessPermission:NotPresent ||
+ perm == AccessPermission:Invalid ||
+ perm == AccessPermission:Busy) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCP_State_to_permission(state));
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ MachineType getCoherenceType(MachineID myMachID,
+ MachineID senderMachID) {
+ if(myMachID == senderMachID) {
+ return MachineType:TCP;
+ } else if(machineIDToMachineType(senderMachID) == MachineType:TCP) {
+ return MachineType:L1Cache_wCC;
+ } else if(machineIDToMachineType(senderMachID) == MachineType:TCC) {
+ return MachineType:TCC;
+ } else {
+ return MachineType:TCCdir;
+ }
+ }
+
+ // Out Ports
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromTCP);
+ out_port(responseNetwork_out, ResponseMsg, responseFromTCP);
+ out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+ // In Ports
+
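+  // Incoming probes are classified here: PrbInv with ReturnData becomes
+  // PrbInvData, a local CtoD-induced invalidation becomes LocalPrbInv, and
+  // PrbDowngrade (which always returns data) becomes PrbShrData.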
+ in_port(probeNetwork_in, TDProbeRequestMsg, probeToTCP) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, TDProbeRequestMsg, block_on="addr") {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ DPRINTF(RubySlicc, "machineID: %s\n", machineID);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ if(in_msg.localCtoD) {
+ trigger(Event:LocalPrbInv, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ assert(in_msg.ReturnData);
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(responseToTCP_in, ResponseMsg, responseToTCP) {
+ if (responseToTCP_in.isReady(clockEdge())) {
+ peek(responseToTCP_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+ if (in_msg.State == CoherenceState:Modified) {
+ if (in_msg.CtoD) {
+ trigger(Event:TCC_AckCtoD, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:TCC_AckM, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:TCC_AckS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Exclusive) {
+ trigger(Event:TCC_AckE, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck) {
+ trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:TDSysWBNack) {
+ trigger(Event:TCC_NackWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
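+  // Core loads and stores arrive on the mandatory queue; if the target line
+  // is neither present nor allocatable, the chosen victim triggers Repl first.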
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ if (in_msg.Type == RubyRequestType:LD) {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+ }
+
+ // Actions
+
+ action(ic_invCache, "ic", desc="invalidate cache") {
+ if(is_valid(cache_entry)) {
+ L1cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkM;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(vd_victim, "vd", desc="Victimize M/O Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:O) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ out_msg.Dirty := cache_entry.Dirty;
+ }
+ }
+
+ action(vc_victim, "vc", desc="Victimize E/S Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(a_allocate, "a", desc="allocate block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L1cache.allocate(address, new Entry));
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ assert(is_valid(cache_entry));
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ tbe.Shared := false;
+ }
+
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToTCP_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(l_loadDone, "l", desc="local load done") {
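+    // Complete the load locally, via the CPU Sequencer when use_seq_not_coal
+    // is set, otherwise via the GPU coalescer.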
+ assert(is_valid(cache_entry));
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, cache_entry.DataBlk,
+ false, MachineType:TCP);
+ } else {
+ coalescer.readCallback(address, MachineType:TCP, cache_entry.DataBlk);
+ }
+ }
+
+ action(xl_loadDone, "xl", desc="remote load done") {
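+    // Complete a load that needed a remote response, passing along the
+    // original request timestamps and the machine that supplied the data.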
+ peek(responseToTCP_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ if (use_seq_not_coal) {
+ coalescer.recordCPReadCallBack(machineID, in_msg.Sender);
+ sequencer.readCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ } else {
+ MachineType cc_mach_type := getCoherenceType(machineID,
+ in_msg.Sender);
+ coalescer.readCallback(address,
+ cc_mach_type,
+ cache_entry.DataBlk,
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+ }
+
+ action(s_storeDone, "s", desc="local store done") {
+ assert(is_valid(cache_entry));
+ if (use_seq_not_coal) {
+ coalescer.recordCPWriteCallBack(machineID, machineID);
+ sequencer.writeCallback(address, cache_entry.DataBlk,
+ false, MachineType:TCP);
+ } else {
+ coalescer.writeCallback(address, MachineType:TCP, cache_entry.DataBlk);
+ }
+ cache_entry.Dirty := true;
+ }
+
+ action(xs_storeDone, "xs", desc="remote store done") {
+ peek(responseToTCP_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ if (use_seq_not_coal) {
+ coalescer.recordCPWriteCallBack(machineID, in_msg.Sender);
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ } else {
+ MachineType cc_mach_type := getCoherenceType(machineID,
+ in_msg.Sender);
+ coalescer.writeCallback(address,
+ cc_mach_type,
+ cache_entry.DataBlk,
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ cache_entry.Dirty := true;
+ }
+ }
+
+ action(w_writeCache, "w", desc="write data to cache") {
+ peek(responseToTCP_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ peek(responseToTCP_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ peek(responseToTCP_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(piu_sendProbeResponseInvUntransferredOwnership, "piu", desc="send probe ack inv, no data, retain ownership") {
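+    // Ack the probe without data and flag UntransferredOwner so the TCC
+    // directory knows this core still holds ownership; used for local
+    // invalidations, after which the block sits in MO_PI until TCC_AckCtoD.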
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+ out_msg.Sender := machineID;
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.UntransferredOwner :=true;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false;
+ out_msg.Ntsl := true;
+ out_msg.Hit := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(prm_sendProbeResponseMiss, "prm", desc="send probe ack PrbShrData, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and TCC respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.Dirty := false; // only true if sending back data i think
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry) || is_valid(tbe));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := getDataBlock(address);
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else {
+ out_msg.Dirty := cache_entry.Dirty;
+ }
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.isValid := isValid(address);
+ APPEND_TRANSITION_COMMENT("Sending ack with dirty ");
+ APPEND_TRANSITION_COMMENT(out_msg.Dirty);
+ }
+ }
+
+ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+      // A valid cache entry is required for this response.
+      assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.DataBlk := getDataBlock(address);
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else {
+ out_msg.Dirty := cache_entry.Dirty;
+ }
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.isValid := isValid(address);
+ APPEND_TRANSITION_COMMENT("Sending ack with dirty ");
+ APPEND_TRANSITION_COMMENT(out_msg.Dirty);
+ DPRINTF(RubySlicc, "Data is %s\n", out_msg.DataBlk);
+ }
+ }
+
+ action(sf_setSharedFlip, "sf", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ action(mru_updateMRU, "mru", desc="Touch block for replacement policy") {
+ L1cache.setMRU(address);
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ out_msg.wasValid := isValid(address);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+ mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ // Transitions
+
+ // transitions from base
+ transition(I, Load, I_ES) {TagArrayRead} {
+ a_allocate;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Store, I_M) {TagArrayRead, TagArrayWrite} {
+ a_allocate;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Store, S_M) {TagArrayRead} {
+ mru_updateMRU;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(E, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, Store, O_M) {TagArrayRead, DataArrayWrite} {
+ mru_updateMRU;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(M, Store) {TagArrayRead, DataArrayWrite} {
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ // simple hit transitions
+ transition({S, E, O, M}, Load) {TagArrayRead, DataArrayRead} {
+ l_loadDone;
+ mru_updateMRU;
+ p_popMandatoryQueue;
+ }
+
+ // recycles from transients
+ transition({I_M, I_ES, ES_I, MO_I, S_M, O_M, MO_PI, I_C}, {Load, Store, Repl}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({S, E}, Repl, ES_I) {TagArrayRead} {
+ t_allocateTBE;
+ vc_victim;
+ ic_invCache;
+ }
+
+ transition({O, M}, Repl, MO_I) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ vd_victim;
+ ic_invCache;
+ }
+
+ // TD event transitions
+ transition(I_M, {TCC_AckM, TCC_AckCtoD}, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ w_writeCache;
+ xs_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, TCC_AckS, S) {TagArrayWrite, DataArrayWrite} {
+ w_writeCache;
+ xl_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, TCC_AckE, E) {TagArrayWrite, DataArrayWrite} {
+ w_writeCache;
+ xl_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition({S_M, O_M}, TCC_AckM, M) {TagArrayWrite, DataArrayWrite} {
+ xs_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition({MO_I, ES_I}, TCC_NackWB, I){TagArrayWrite} {
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({MO_I, ES_I}, TCC_AckWB, I) {TagArrayWrite, DataArrayRead} {
+ wb_data;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(I_C, TCC_AckWB, I) {TagArrayWrite} {
+ ss_sendStaleNotification;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(I_C, TCC_NackWB, I) {TagArrayWrite} {
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ // Probe transitions
+ transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(I, PrbInvData) {TagArrayRead, TagArrayWrite} {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInvData, I_C) {} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+  // Needed for TCC-based protocols. Must hold on to ownership until the transfer completes.
+ transition({M, O}, LocalPrbInv, MO_PI){TagArrayRead, TagArrayWrite} {
+ piu_sendProbeResponseInvUntransferredOwnership;
+ pp_popProbeQueue;
+ }
+
+ // If there is a race and we see a probe invalidate, handle normally.
+ transition(MO_PI, PrbInvData, I){TagArrayWrite} {
+ pd_sendProbeResponseData;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_PI, PrbInv, I){TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ // normal exit when ownership is successfully transferred
+ transition(MO_PI, TCC_AckCtoD, I) {TagArrayWrite} {
+ ic_invCache;
+ pr_popResponseQueue;
+ }
+
+ transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S, I}, LocalPrbInv, I){TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+
+ transition({M, E, O}, PrbShrData, O) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_PI, PrbShrData) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+
+ transition(S, PrbShrData, S) {TagArrayRead, DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({I, I_C}, PrbShrData) {TagArrayRead} {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M, I_ES}, {PrbInv, PrbInvData}){TagArrayRead} {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ a_allocate; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M, I_ES}, PrbShrData) {} {
+ prm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(S_M, PrbInvData, I_M) {TagArrayRead} {
+ pim_sendProbeResponseInvMs;
+ ic_invCache;
+ a_allocate;
+ pp_popProbeQueue;
+ }
+
+ transition(O_M, PrbInvData, I_M) {TagArrayRead,DataArrayRead} {
+ pdm_sendProbeResponseDataMs;
+ ic_invCache;
+ a_allocate;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M, O_M}, {PrbInv}, I_M) {TagArrayRead} {
+ pim_sendProbeResponseInvMs;
+ ic_invCache;
+ a_allocate;
+ pp_popProbeQueue;
+ }
+
+ transition(S_M, {LocalPrbInv}, I_M) {TagArrayRead} {
+ pim_sendProbeResponseInvMs;
+ ic_invCache;
+ a_allocate;
+ pp_popProbeQueue;
+ }
+
+ transition(O_M, LocalPrbInv, I_M) {TagArrayRead} {
+ piu_sendProbeResponseInvUntransferredOwnership;
+ ic_invCache;
+ a_allocate;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M, O_M}, PrbShrData) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInvData, I_C){
+ pd_sendProbeResponseData;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInvData, I_C) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInv, I_C) {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInv, I_C) {
+ pi_sendProbeResponseInv;
+ ic_invCache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbShrData, ES_I) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ sf_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbShrData, MO_I) {DataArrayRead} {
+ pd_sendProbeResponseData;
+ sf_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+}
diff --git a/src/mem/protocol/GPU_RfO.slicc b/src/mem/protocol/GPU_RfO.slicc
new file mode 100644
index 000000000..7773ce6e0
--- /dev/null
+++ b/src/mem/protocol/GPU_RfO.slicc
@@ -0,0 +1,11 @@
+protocol "GPU_AMD_Base";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-dir.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_RfO-TCP.sm";
+include "GPU_RfO-SQC.sm";
+include "GPU_RfO-TCC.sm";
+include "GPU_RfO-TCCdir.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-RegionBuffer.sm";
diff --git a/src/mem/protocol/GPU_VIPER-SQC.sm b/src/mem/protocol/GPU_VIPER-SQC.sm
new file mode 100644
index 000000000..8d5b5699a
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER-SQC.sm
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Blake Hechtman
+ */
+
+machine(MachineType:SQC, "GPU SQC (L1 I Cache)")
+ : Sequencer* sequencer;
+ CacheMemory * L1cache;
+ int TCC_select_num_bits;
+ Cycles issue_latency := 80; // time to send data down to TCC
+ Cycles l2_hit_latency := 18; // for 1MB L2, 20 for 2MB
+
+ MessageBuffer * requestFromSQC, network="To", virtual_network="1", vnet_type="request";
+
+ MessageBuffer * probeToSQC, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToSQC, network="From", virtual_network="3", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+{
+ state_declaration(State, desc="SQC Cache States", default="SQC_State_I") {
+ I, AccessPermission:Invalid, desc="Invalid";
+ V, AccessPermission:Read_Only, desc="Valid";
+ }
+
+ enumeration(Event, desc="SQC Events") {
+ // Core initiated
+ Fetch, desc="Fetch";
+ // Mem sys initiated
+ Repl, desc="Replacing block from cache";
+ Data, desc="Received Data";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<SQC_TBE>", constructor="m_number_of_TBEs";
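+  // TCC_select_low_bit/TCC_select_num_bits give the address bits that
+  // mapAddressToRange() uses to interleave requests across the TCC banks.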
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // Internal functions
+ Tick clockEdge();
+
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+ return cache_entry;
+ }
+
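+  // Functional accesses prefer the TBE copy (the in-flight data during
+  // transient states) and fall back to the cache entry otherwise.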
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return SQC_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return SQC_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(SQC_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // Out Ports
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromSQC);
+
+ // In Ports
+
+ in_port(responseToSQC_in, ResponseMsg, responseToSQC) {
+ if (responseToSQC_in.isReady(clockEdge())) {
+ peek(responseToSQC_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
+ trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.addr);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+ assert(in_msg.Type == RubyRequestType:IFETCH);
+ trigger(Event:Fetch, in_msg.LineAddress, cache_entry, tbe);
+ }
+ }
+ }
+
+ // Actions
+
+ action(ic_invCache, "ic", desc="invalidate cache") {
+ if(is_valid(cache_entry)) {
+ L1cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(a_allocate, "a", desc="allocate block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L1cache.allocate(address, new Entry));
+ }
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToSQC_in.dequeue(clockEdge());
+ }
+
+ action(l_loadDone, "l", desc="local load done") {
+ assert(is_valid(cache_entry));
+ sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ APPEND_TRANSITION_COMMENT(cache_entry.DataBlk);
+ }
+
+ action(w_writeCache, "w", desc="write data to cache") {
+ peek(responseToSQC_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := false;
+ }
+ }
+
+ // Transitions
+
+ // transitions from base
+ transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
+    ic_invCache;
+ }
+
+ transition(I, Data, V) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ a_allocate;
+    w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I, Fetch) {TagArrayRead, TagArrayWrite} {
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // simple hit transitions
+ transition(V, Fetch) {TagArrayRead, DataArrayRead} {
+ l_loadDone;
+ p_popMandatoryQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_VIPER-TCC.sm b/src/mem/protocol/GPU_VIPER-TCC.sm
new file mode 100644
index 000000000..f62df9f4f
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER-TCC.sm
@@ -0,0 +1,739 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Blake Hechtman
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+ bool WB; /*is this cache Writeback?*/
+ Cycles l2_request_latency := 50;
+ Cycles l2_response_latency := 20;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", vnet_type="request";
+ // To the Cores. TCC deals only with TCPs/SQCs.
+ MessageBuffer * responseToCore, network="To", virtual_network="3", vnet_type="response";
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue;
+
+{
+ // EVENTS
+ enumeration(Event, desc="TCC Events") {
+ // Requests coming from the Cores
+ RdBlk, desc="RdBlk event";
+ WrVicBlk, desc="L1 Write Through";
+    WrVicBlkBack, desc="L1 Write Through (dirty cache)";
+    Atomic, desc="Atomic Op";
+    AtomicDone, desc="AtomicOps Complete";
+    AtomicNotDone, desc="AtomicOps not Complete";
+    Data, desc="data message";
+ // Coming from this TCC
+ L2_Repl, desc="L2 Replacement";
+ // Probes
+ PrbInv, desc="Invalidating probe";
+ // Coming from Memory Controller
+ WBAck, desc="writethrough ack from memory";
+ }
+
+ // STATES
+ state_declaration(State, desc="TCC State", default="TCC_State_I") {
+    M, AccessPermission:Read_Write, desc="Modified (dirty cache only)";
+    W, AccessPermission:Read_Write, desc="Written (dirty cache only)";
+    V, AccessPermission:Read_Only, desc="Valid";
+    I, AccessPermission:Invalid, desc="Invalid";
+    IV, AccessPermission:Busy, desc="Waiting for Data";
+    WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
+    A, AccessPermission:Busy, desc="Invalid, waiting on atomic Data";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ // STRUCTURES
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff from memory?)";
+ DataBlock DataBlk, desc="Data for the block";
+ WriteMask writeMask, desc="Dirty byte mask";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ bool Shared, desc="Victim hit by shared probe";
+ MachineID From, desc="Waiting for writeback from...";
+ NetDest Destination, desc="Data destination";
+ int numAtomics, desc="number remaining atomics";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+
+
+ // FUNCTION DEFINITIONS
+ Tick clockEdge();
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", L2cache.lookup(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(addr).DataBlk;
+ }
+
+ bool presentOrAvail(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCC_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCC_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCC_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+
+ // ** OUT_PORTS **
+
+ // Three classes of ports
+ // Class 1: downward facing network links to NB
+ out_port(requestToNB_out, CPURequestMsg, requestToNB);
+ out_port(responseToNB_out, ResponseMsg, responseToNB);
+ out_port(unblockToNB_out, UnblockMsg, unblockToNB);
+
+ // Class 2: upward facing ports to GPU cores
+ out_port(responseToCore_out, ResponseMsg, responseToCore);
+
+  // Class 3: local trigger queue, used to detect atomic completion
+  out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+
+// ** IN_PORTS **
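+  // Internal trigger queue: dna_decrementNumAtomics enqueues a trigger once
+  // the last outstanding atomic for a line has returned; this port turns it
+  // into an AtomicDone or AtomicNotDone event.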
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (tbe.numAtomics == 0) {
+ trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+
+ in_port(responseFromNB_in, ResponseMsg, responseFromNB) {
+ if (responseFromNB_in.isReady(clockEdge())) {
+ peek(responseFromNB_in, ResponseMsg, block_on="addr") {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Finally handling incoming requests (from TCP) and probes (from NB).
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+
+ in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ if(WB) {
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+ // BEGIN ACTIONS
+
+ action(i_invL2, "i", desc="invalidate TCC cache block") {
+ if (is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(sd_sendData, "sd", desc="send Shared response") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+
+  action(sdr_sendDataResponse, "sdr", desc="send Shared response to all waiting requestors") {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination := tbe.Destination;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+
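+  // Only the first requestor (Destination.count() == 1) forwards the miss to
+  // the NB; later requestors for the same line are merged into
+  // tbe.Destination by t_allocateTBE and share the eventual fill.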
+ action(rd_requestData, "r", desc="Miss in L2, pass on") {
+ if(tbe.Destination.count()==1){
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Shared := false; // unneeded for this request
+ out_msg.MessageSize := in_msg.MessageSize;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(swb_sendWBAck, "swb", desc="send WB Ack") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.DataBlk := in_msg.DataBlk;
+ }
+ }
+ }
+
+ action(a_allocateBlock, "a", desc="allocate TCC block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ cache_entry.writeMask.clear();
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ if (is_invalid(tbe)) {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.Destination.clear();
+ tbe.numAtomics := 0;
+ }
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
+ tbe.Destination.add(in_msg.Requestor);
+ }
+ }
+ }
+ }
+
+ action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
+ tbe.Destination.clear();
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") {
+ peek(responseFromNB_in, ResponseMsg) {
+ cache_entry.DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+  action(wdb_writeDirtyBytes, "wdb", desc="write dirty bytes to TCC") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+ cache_entry.writeMask.orMask(in_msg.writeMask);
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+ action(wt_writeThrough, "wt", desc="write back data") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.Requestor;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.Dirty := true;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ }
+
+ action(wb_writeBack, "wb", desc="write back data") {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.Dirty := true;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.writeMask.orMask(cache_entry.writeMask);
+ }
+ }
+
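+  // Atomics are not performed in the TCC itself; they are forwarded to the
+  // directory and the result returns as NBSysResp data (see the A-state
+  // transitions below).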
+  action(at_atomicThrough, "at", desc="forward atomic request to directory") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.Requestor;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:Atomic;
+ out_msg.Dirty := true;
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseToNB_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+  action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ L2cache.setMRU(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ coreRequestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseFromNB_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(z_stall, "z", desc="stall") {
+ // built-in
+ }
+
+
+ action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
+ tbe.numAtomics := tbe.numAtomics + 1;
+ }
+
+
+  action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
+ tbe.numAtomics := tbe.numAtomics - 1;
+ if (tbe.numAtomics==0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AtomicDone;
+ }
+ }
+ }
+
+ action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+ // transitions from base
+ // Assumptions for ArrayRead/Write
+ // TBE checked before tags
+ // Data Read/Write requires Tag Read
+
+ // Stalling transitions do NOT check the tag array...and if they do,
+ // they can cause a resource stall deadlock!
+
+ transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} {
+ z_stall;
+ }
+ transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) { //TagArrayRead} {
+ z_stall;
+ }
+ transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) { //TagArrayRead} {
+ z_stall;
+ }
+ transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} {
+ sd_sendData;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+ transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ }
+
+ transition(I, RdBlk, IV) {TagArrayRead} {
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+ transition(IV, RdBlk) {
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+ transition({V, I},Atomic, A) {TagArrayRead} {
+ i_invL2;
+ t_allocateTBE;
+ at_atomicThrough;
+ ina_incrementNumAtomics;
+ p_popRequestQueue;
+ }
+
+ transition(A, Atomic) {
+ at_atomicThrough;
+ ina_incrementNumAtomics;
+ p_popRequestQueue;
+ }
+
+ transition({M, W}, Atomic, WI) {TagArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ }
+
+ transition(I, WrVicBlk) {TagArrayRead} {
+ wt_writeThrough;
+ p_popRequestQueue;
+ }
+
+ transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
+ ut_updateTag;
+ wdb_writeDirtyBytes;
+ wt_writeThrough;
+ p_popRequestQueue;
+ }
+
+ transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ }
+
+ transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
+ i_invL2;
+ }
+
+ transition({A, IV, WI}, L2_Repl) {
+ i_invL2;
+ }
+
+ transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(W, PrbInv) {TagArrayRead} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({A, IV, WI}, PrbInv) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ut_updateTag;
+ wcb_writeCacheBlock;
+ sdr_sendDataResponse;
+ pr_popResponseQueue;
+ dt_deallocateTBE;
+ }
+
+ transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ar_sendAtomicResponse;
+ dna_decrementNumAtomics;
+ pr_popResponseQueue;
+ }
+
+ transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
+ dt_deallocateTBE;
+ ptr_popTriggerQueue;
+ }
+
+ transition(A, AtomicNotDone) {TagArrayRead} {
+ ptr_popTriggerQueue;
+ }
+
+ //M,W should not see WBAck as the cache is in WB mode
+ //WBAcks do not need to check tags
+ transition({I, V, IV, A}, WBAck) {
+ w_sendResponseWBAck;
+ pr_popResponseQueue;
+ }
+
+ transition(WI, WBAck,I) {
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_VIPER-TCP.sm b/src/mem/protocol/GPU_VIPER-TCP.sm
new file mode 100644
index 000000000..d81196b17
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER-TCP.sm
@@ -0,0 +1,747 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Blake Hechtman
+ */
+
+machine(MachineType:TCP, "GPU TCP (L1 Data Cache)")
+ : VIPERCoalescer* coalescer;
+ Sequencer* sequencer;
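+    // use_seq_not_coal selects the CPU-style Sequencer instead of the
+    // VIPER coalescer for read/write completion callbacks.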
+ bool use_seq_not_coal;
+ CacheMemory * L1cache;
+ bool WB; /*is this cache Writeback?*/
+ bool disableL1; /* bypass L1 cache? */
+ int TCC_select_num_bits;
+ Cycles issue_latency := 40; // time to send data down to TCC
+ Cycles l2_hit_latency := 18;
+
+ MessageBuffer * requestFromTCP, network="To", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseFromTCP, network="To", virtual_network="3", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="5", vnet_type="unblock";
+
+ MessageBuffer * probeToTCP, network="From", virtual_network="1", vnet_type="request";
+ MessageBuffer * responseToTCP, network="From", virtual_network="3", vnet_type="response";
+ MessageBuffer * mandatoryQueue;
+
+{
+ state_declaration(State, desc="TCP Cache States", default="TCP_State_I") {
+ I, AccessPermission:Invalid, desc="Invalid";
+ V, AccessPermission:Read_Only, desc="Valid";
+ W, AccessPermission:Read_Write, desc="Written";
+ M, AccessPermission:Read_Write, desc="Written and Valid";
+    L, AccessPermission:Read_Write, desc="Local access is modifiable";
+ A, AccessPermission:Invalid, desc="Waiting on Atomic";
+ }
+
+ enumeration(Event, desc="TCP Events") {
+ // Core initiated
+ Load, desc="Load";
+ Store, desc="Store to L1 (L1 is dirty)";
+    StoreThrough, desc="Store directly to L2 (L1 is clean)";
+    StoreLocal, desc="Store to L1 but L1 is clean";
+    Atomic, desc="Atomic";
+    Flush, desc="Flush if dirty (wbL1 for Store Release)";
+    Evict, desc="Evict if clean (invL1 for Load Acquire)";
+ // Mem sys initiated
+ Repl, desc="Replacing block from cache";
+
+ // TCC initiated
+ TCC_Ack, desc="TCC Ack to Core Request";
+ TCC_AckWB, desc="TCC Ack for WB";
+ // Disable L1 cache
+ Bypass, desc="Bypass the entire L1 cache";
+ }
+
+ enumeration(RequestType,
+ desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+    TagArrayFlash, desc="Flash clear the tag array";
+ }
+
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ WriteMask writeMask, desc="written bytes masks";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs,desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCP_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
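+  // Flush bookkeeping: WTcnt counts write-throughs still waiting for an ack,
+  // while Fcnt and inFlush track an in-progress cache flush so wb_wbDone only
+  // signals the coalescer after the outstanding write-throughs have drained.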
+ int WTcnt, default="0";
+ int Fcnt, default="0";
+ bool inFlush, default="false";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // Internal functions
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry cache_entry := static_cast(Entry, "pointer", L1cache.lookup(address));
+ return cache_entry;
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCP_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCP_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ bool isValid(Addr addr) {
+ AccessPermission perm := getAccessPermission(addr);
+ if (perm == AccessPermission:NotPresent ||
+ perm == AccessPermission:Invalid ||
+ perm == AccessPermission:Busy) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCP_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayFlash) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayFlash) {
+ // FIXME should check once per cache, rather than once per cacheline
+ return L1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // Out Ports
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromTCP);
+
+ // In Ports
+
+ in_port(responseToTCP_in, ResponseMsg, responseToTCP) {
+ if (responseToTCP_in.isReady(clockEdge())) {
+ peek(responseToTCP_in, ResponseMsg, block_on="addr") {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:TDSysResp) {
+ // disable L1 cache
+ if (disableL1) {
+ trigger(Event:Bypass, in_msg.addr, cache_entry, tbe);
+ } else {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.addr)) {
+ trigger(Event:TCC_Ack, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.addr);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ } else if (in_msg.Type == CoherenceResponseType:TDSysWBAck ||
+ in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:TCC_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:Load, in_msg.LineAddress, cache_entry, tbe);
+ } else if (in_msg.Type == RubyRequestType:ATOMIC) {
+ trigger(Event:Atomic, in_msg.LineAddress, cache_entry, tbe);
+ } else if (in_msg.Type == RubyRequestType:ST) {
+ if(disableL1) {
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ if (in_msg.segment == HSASegment:SPILL) {
+ trigger(Event:StoreLocal, in_msg.LineAddress, cache_entry, tbe);
+ } else if (WB) {
+ trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } // end if (disableL1)
+ } else if (in_msg.Type == RubyRequestType:FLUSH) {
+ trigger(Event:Flush, in_msg.LineAddress, cache_entry, tbe);
+ } else if (in_msg.Type == RubyRequestType:REPLACEMENT){
+ trigger(Event:Evict, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ error("Unexpected Request Message from VIC");
+ if (is_valid(cache_entry) || L1cache.cacheAvail(in_msg.LineAddress)) {
+ if (WB) {
+ trigger(Event:Store, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:StoreThrough, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+ }
+
+ // Actions
+
+ action(ic_invCache, "ic", desc="invalidate cache") {
+ if(is_valid(cache_entry)) {
+ cache_entry.writeMask.clear();
+ L1cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
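+  // With disableL1 set, responses complete straight back to the
+  // sequencer/coalescer and the block is not installed in the L1.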
+ action(rb_bypassDone, "rb", desc="bypass L1 of read access") {
+ peek(responseToTCP_in, ResponseMsg) {
+ DataBlock tmp:= in_msg.DataBlk;
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, tmp, false, MachineType:L1Cache);
+ } else {
+ coalescer.readCallback(address, MachineType:L1Cache, tmp);
+ }
+ if(is_valid(cache_entry)) {
+ unset_cache_entry();
+ }
+ }
+ }
+
+ action(wab_bypassDone, "wab", desc="bypass L1 of write access") {
+ peek(responseToTCP_in, ResponseMsg) {
+ DataBlock tmp := in_msg.DataBlk;
+ if (use_seq_not_coal) {
+ sequencer.writeCallback(address, tmp, false, MachineType:L1Cache);
+ } else {
+ coalescer.writeCallback(address, MachineType:L1Cache, tmp);
+ }
+ }
+ }
+
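+  // If the write-mask comparison shows the locally written bytes cover the
+  // request, the load completes from the L1 copy; otherwise a RdBlk is
+  // issued to the TCC.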
+ action(norl_issueRdBlkOrloadDone, "norl", desc="local load done") {
+ peek(mandatoryQueue_in, RubyRequest){
+ if (cache_entry.writeMask.cmpMask(in_msg.writeMask)) {
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ } else {
+ coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+ } else {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+ }
+ }
+
+ action(wt_writeThrough, "wt", desc="Flush dirty data") {
+ WTcnt := WTcnt + 1;
+ APPEND_TRANSITION_COMMENT("write++ = ");
+ APPEND_TRANSITION_COMMENT(WTcnt);
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.writeMask.clear();
+ out_msg.writeMask.orMask(cache_entry.writeMask);
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.InitialRequestTime := curCycle();
+ out_msg.Shared := false;
+ }
+ }
+
+ action(at_atomicThrough, "at", desc="send Atomic") {
+ peek(mandatoryQueue_in, RubyRequest) {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.writeMask.clear();
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:Atomic;
+ out_msg.InitialRequestTime := curCycle();
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(a_allocate, "a", desc="allocate block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L1cache.allocate(address, new Entry));
+ }
+ cache_entry.writeMask.clear();
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ }
+
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(sf_setFlush, "sf", desc="set flush") {
+ inFlush := true;
+ APPEND_TRANSITION_COMMENT(" inFlush is true");
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToTCP_in.dequeue(clockEdge());
+ }
+
+ action(l_loadDone, "l", desc="local load done") {
+ assert(is_valid(cache_entry));
+ if (use_seq_not_coal) {
+ sequencer.readCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ } else {
+ coalescer.readCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+ }
+
+ action(s_storeDone, "s", desc="local store done") {
+ assert(is_valid(cache_entry));
+
+ if (use_seq_not_coal) {
+ sequencer.writeCallback(address, cache_entry.DataBlk, false, MachineType:L1Cache);
+ } else {
+ coalescer.writeCallback(address, MachineType:L1Cache, cache_entry.DataBlk);
+ }
+ cache_entry.Dirty := true;
+ }
+
+ action(inv_invDone, "inv", desc="local inv done") {
+ if (use_seq_not_coal) {
+ DPRINTF(RubySlicc, "Sequencer does not define invCallback!\n");
+ assert(false);
+ } else {
+ coalescer.invCallback(address);
+ }
+ }
+
+ action(wb_wbDone, "wb", desc="local wb done") {
+ if (inFlush == true) {
+ Fcnt := Fcnt + 1;
+ if (Fcnt > WTcnt) {
+ if (use_seq_not_coal) {
+ DPRINTF(RubySlicc, "Sequencer does not define wbCallback!\n");
+ assert(false);
+ } else {
+ coalescer.wbCallback(address);
+ }
+ Fcnt := Fcnt - 1;
+ }
+ if (WTcnt == 0 && Fcnt == 0) {
+ inFlush := false;
+ APPEND_TRANSITION_COMMENT(" inFlush is false");
+ }
+ }
+ }
+
+ action(wd_wtDone, "wd", desc="writethrough done") {
+ WTcnt := WTcnt - 1;
+ if (inFlush == true) {
+      Fcnt := Fcnt - 1;
+ }
+ assert(WTcnt >= 0);
+ APPEND_TRANSITION_COMMENT("write-- = ");
+ APPEND_TRANSITION_COMMENT(WTcnt);
+ }
+
+ action(dw_dirtyWrite, "dw", desc="update write mask"){
+ peek(mandatoryQueue_in, RubyRequest) {
+ cache_entry.DataBlk.copyPartial(in_msg.WTData,in_msg.writeMask);
+ cache_entry.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ action(w_writeCache, "w", desc="write data to cache") {
+ peek(responseToTCP_in, ResponseMsg) {
+ assert(is_valid(cache_entry));
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(cache_entry.DataBlk,cache_entry.writeMask);
+ cache_entry.DataBlk := tmp;
+ }
+ }
+
+ action(mru_updateMRU, "mru", desc="Touch block for replacement policy") {
+ L1cache.setMRU(address);
+ }
+
+// action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+// mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+// }
+
+ action(z_stall, "z", desc="stall; built-in") {
+    // built-in action
+ }
+
+ // Transitions
+ // ArrayRead/Write assumptions:
+ // All requests read Tag Array
+  //    TBE allocation writes the TagArray to I
+ // TBE only checked on misses
+ // Stores will also write dirty bits in the tag
+ // WriteThroughs still need to use cache entry as staging buffer for wavefront
+
+  // Stalling transitions do NOT check the tag array; if they did,
+  // they could cause a resource stall deadlock!
+
+ transition({A}, {Load, Store, Atomic, StoreThrough}) { //TagArrayRead} {
+ z_stall;
+ }
+
+ transition({M, V, L}, Load) {TagArrayRead, DataArrayRead} {
+ l_loadDone;
+ mru_updateMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Load) {TagArrayRead} {
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition({V, I}, Atomic, A) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ mru_updateMRU;
+ at_atomicThrough;
+ p_popMandatoryQueue;
+ }
+
+ transition({M, W}, Atomic, A) {TagArrayRead, TagArrayWrite} {
+ wt_writeThrough;
+ t_allocateTBE;
+ at_atomicThrough;
+ ic_invCache;
+ }
+
+ transition(W, Load, I) {TagArrayRead, DataArrayRead} {
+ wt_writeThrough;
+ norl_issueRdBlkOrloadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({I}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocate;
+ dw_dirtyWrite;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({L, V}, StoreLocal, L) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Store, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocate;
+ dw_dirtyWrite;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(V, Store, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({M, W}, Store) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ mru_updateMRU;
+ s_storeDone;
+ p_popMandatoryQueue;
+ }
+
+  // M, W should not see StoreThrough
+ transition(I, StoreThrough) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocate;
+ dw_dirtyWrite;
+ s_storeDone;
+ wt_writeThrough;
+ ic_invCache;
+ p_popMandatoryQueue;
+ }
+
+ transition({V,L}, StoreThrough, I) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ dw_dirtyWrite;
+ s_storeDone;
+ wt_writeThrough;
+ ic_invCache;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, TCC_Ack, V) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
+ a_allocate;
+ w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I, Bypass, I) {
+ rb_bypassDone;
+ pr_popResponseQueue;
+ }
+
+ transition(A, Bypass, I){
+ d_deallocateTBE;
+ wab_bypassDone;
+ pr_popResponseQueue;
+ }
+
+ transition(A, TCC_Ack, I) {TagArrayRead, DataArrayRead, DataArrayWrite} {
+ d_deallocateTBE;
+ a_allocate;
+ w_writeCache;
+ s_storeDone;
+ pr_popResponseQueue;
+ ic_invCache;
+ }
+
+ transition(V, TCC_Ack, V) {TagArrayRead, DataArrayRead, DataArrayWrite} {
+ w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition({W, M}, TCC_Ack, M) {TagArrayRead, TagArrayWrite, DataArrayRead, DataArrayWrite} {
+ w_writeCache;
+ l_loadDone;
+ pr_popResponseQueue;
+ }
+
+ transition({I, V}, Repl, I) {TagArrayRead, TagArrayWrite} {
+ ic_invCache;
+ }
+
+ transition({A}, Repl) {TagArrayRead, TagArrayWrite} {
+ ic_invCache;
+ }
+
+ transition({W, M}, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ wt_writeThrough;
+ ic_invCache;
+ }
+
+ transition(L, Repl, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ wt_writeThrough;
+ ic_invCache;
+ }
+
+ transition({W, M}, Flush, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ sf_setFlush;
+ wt_writeThrough;
+ ic_invCache;
+ p_popMandatoryQueue;
+ }
+
+ transition({V, I, A, L},Flush) {TagArrayFlash} {
+ sf_setFlush;
+ wb_wbDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({I, V}, Evict, I) {TagArrayFlash} {
+ inv_invDone;
+ p_popMandatoryQueue;
+ ic_invCache;
+ }
+
+ transition({W, M}, Evict, W) {TagArrayFlash} {
+ inv_invDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({A, L}, Evict) {TagArrayFlash} {
+ inv_invDone;
+ p_popMandatoryQueue;
+ }
+
+ // TCC_AckWB only snoops TBE
+ transition({V, I, A, M, W, L}, TCC_AckWB) {
+ wd_wtDone;
+ wb_wbDone;
+ pr_popResponseQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_VIPER.slicc b/src/mem/protocol/GPU_VIPER.slicc
new file mode 100644
index 000000000..45f7f3477
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER.slicc
@@ -0,0 +1,9 @@
+protocol "GPU_VIPER";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-dir.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "GPU_VIPER-TCC.sm";
+include "MOESI_AMD_Base-L3cache.sm";
diff --git a/src/mem/protocol/GPU_VIPER_Baseline.slicc b/src/mem/protocol/GPU_VIPER_Baseline.slicc
new file mode 100644
index 000000000..49bdce38c
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Baseline.slicc
@@ -0,0 +1,9 @@
+protocol "GPU_VIPER";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-probeFilter.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "GPU_VIPER-TCC.sm";
+include "MOESI_AMD_Base-L3cache.sm";
diff --git a/src/mem/protocol/GPU_VIPER_Region-TCC.sm b/src/mem/protocol/GPU_VIPER_Region-TCC.sm
new file mode 100644
index 000000000..c3aef15a3
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Region-TCC.sm
@@ -0,0 +1,773 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor, Blake Hechtman
+ */
+
+/*
+ * This file is inherited from GPU_VIPER-TCC.sm and retains its structure.
+ * There are very few modifications in this file from the original VIPER TCC
+ */
+
+machine(MachineType:TCC, "TCC Cache")
+ : CacheMemory * L2cache;
+ bool WB; /*is this cache Writeback?*/
+ int regionBufferNum;
+ Cycles l2_request_latency := 50;
+ Cycles l2_response_latency := 20;
+
+ // From the TCPs or SQCs
+ MessageBuffer * requestFromTCP, network="From", virtual_network="1", ordered="true", vnet_type="request";
+ // To the Cores. TCC deals only with TCPs/SQCs. CP cores do not communicate directly with TCC.
+ MessageBuffer * responseToCore, network="To", virtual_network="3", ordered="true", vnet_type="response";
+ // From the NB
+ MessageBuffer * probeFromNB, network="From", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseFromNB, network="From", virtual_network="2", ordered="false", vnet_type="response";
+ // To the NB
+ MessageBuffer * requestToNB, network="To", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseToNB, network="To", virtual_network="2", ordered="false", vnet_type="response";
+ MessageBuffer * unblockToNB, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue, ordered="true", random="false";
+{
+ // EVENTS
+ enumeration(Event, desc="TCC Events") {
+ // Requests coming from the Cores
+ RdBlk, desc="RdBlk event";
+ WrVicBlk, desc="L1 Write Through";
+    WrVicBlkBack, desc="L1 Write Back (dirty cache)";
+ Atomic, desc="Atomic Op";
+ AtomicDone, desc="AtomicOps Complete";
+ AtomicNotDone, desc="AtomicOps not Complete";
+    Data, desc="data message";
+ // Coming from this TCC
+ L2_Repl, desc="L2 Replacement";
+ // Probes
+ PrbInv, desc="Invalidating probe";
+ // Coming from Memory Controller
+ WBAck, desc="writethrough ack from memory";
+ }
+
+ // STATES
+ state_declaration(State, desc="TCC State", default="TCC_State_I") {
+    M, AccessPermission:Read_Write, desc="Modified (dirty cache only)";
+    W, AccessPermission:Read_Write, desc="Written (dirty cache only)";
+ V, AccessPermission:Read_Only, desc="Valid";
+ I, AccessPermission:Invalid, desc="Invalid";
+ IV, AccessPermission:Busy, desc="Waiting for Data";
+ WI, AccessPermission:Busy, desc="Waiting on Writethrough Ack";
+ A, AccessPermission:Busy, desc="Invalid waiting on atomic Data";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ // STRUCTURES
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (diff from memory)?";
+ DataBlock DataBlk, desc="Data for the block";
+ WriteMask writeMask, desc="Dirty byte mask";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ bool Shared, desc="Victim hit by shared probe";
+ MachineID From, desc="Waiting for writeback from...";
+ NetDest Destination, desc="Data destination";
+ int numAtomics, desc="number remaining atomics";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<TCC_TBE>", constructor="m_number_of_TBEs";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+
+
+ // FUNCTION DEFINITIONS
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
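+  // Added comment: in the Region protocol, demand requests and done-acks
+  // from this TCC are routed to its RegionBuffer (selected by
+  // regionBufferNum) rather than straight to the directory; the mach
+  // argument is unused here.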
+ MachineID getPeer(MachineID mach) {
+ return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum));
+ }
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", L2cache.lookup(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(addr).DataBlk;
+ }
+
+ bool presentOrAvail(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return TCC_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return TCC_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(TCC_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead,addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite,addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead,addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite,addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+
+ // ** OUT_PORTS **
+
+ // Three classes of ports
+ // Class 1: downward facing network links to NB
+ out_port(requestToNB_out, CPURequestMsg, requestToNB);
+ out_port(responseToNB_out, ResponseMsg, responseToNB);
+ out_port(unblockToNB_out, UnblockMsg, unblockToNB);
+
+ // Class 2: upward facing ports to GPU cores
+ out_port(responseToCore_out, ResponseMsg, responseToCore);
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ //
+ // request queue going to NB
+ //
+
+
+// ** IN_PORTS **
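+  // Added comment: the trigger queue tracks atomic completion.
+  // dna_decrementNumAtomics posts an AtomicDone trigger when
+  // tbe.numAtomics reaches zero, and this port converts it into
+  // AtomicDone / AtomicNotDone events.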
+  in_port(triggerQueue_in, TriggerMsg, triggerQueue) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (tbe.numAtomics == 0) {
+ trigger(Event:AtomicDone, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:AtomicNotDone, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+
+ in_port(responseFromNB_in, ResponseMsg, responseFromNB) {
+ if (responseFromNB_in.isReady(clockEdge())) {
+ peek(responseFromNB_in, ResponseMsg, block_on="addr") {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:Data, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Finally handling incoming requests (from TCP) and probes (from NB).
+
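+  // Added comment: every probe arriving from the NB is handled as an
+  // invalidating probe here; the port triggers Event:PrbInv regardless
+  // of the probe's type.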
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromNB) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ DPRINTF(RubySlicc, "machineID: %s\n", machineID);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+
+
+ in_port(coreRequestNetwork_in, CPURequestMsg, requestFromTCP, rank=0) {
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ if(WB) {
+ if(presentOrAvail(in_msg.addr)) {
+ trigger(Event:WrVicBlkBack, in_msg.addr, cache_entry, tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.addr);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+ // BEGIN ACTIONS
+
+ action(i_invL2, "i", desc="invalidate TCC cache block") {
+ if (is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ // Data available at TCC. Send the DATA to TCP
+ action(sd_sendData, "sd", desc="send Shared response") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+
+ // Data was not available at TCC. So, TCC forwarded the request to
+ // directory and directory responded back with data. Now, forward the
+ // DATA to TCP and send the unblock ack back to directory.
+ action(sdr_sendDataResponse, "sdr", desc="send Shared response") {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination := tbe.Destination;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+
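+  // Added comment: the miss is forwarded to the region buffer only for
+  // the first requestor recorded in the TBE (Destination.count() == 1);
+  // later requestors for the same line are merged into tbe.Destination
+  // by t_allocateTBE and are serviced by the single data response.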
+ action(rd_requestData, "r", desc="Miss in L2, pass on") {
+ if(tbe.Destination.count()==1){
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.Shared := false; // unneeded for this request
+ out_msg.MessageSize := in_msg.MessageSize;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(swb_sendWBAck, "swb", desc="send WB Ack") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysWBAck;
+ out_msg.Destination.clear();
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(ar_sendAtomicResponse, "ar", desc="send Atomic Ack") {
+ peek(responseFromNB_in, ResponseMsg) {
+ enqueue(responseToCore_out, ResponseMsg, l2_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:TDSysResp;
+ out_msg.Destination.add(in_msg.WTRequestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.DataBlk := in_msg.DataBlk;
+ }
+ }
+ }
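+
+  // Added comment: completion of an NB response is acknowledged back to
+  // the region buffer with a DoneAck unblock, letting it retire its
+  // outstanding request for this address (Dirty is reported from the TBE
+  // if one exists).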
+ action(sd2rb_sendDone2RegionBuffer, "sd2rb", desc="Request finished, send done ack") {
+ enqueue(unblockToNB_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.DoneAck := true;
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else {
+ out_msg.Dirty := false;
+ }
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(a_allocateBlock, "a", desc="allocate TCC block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ cache_entry.writeMask.clear();
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ if (is_invalid(tbe)) {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.Destination.clear();
+ tbe.numAtomics := 0;
+ }
+ if (coreRequestNetwork_in.isReady(clockEdge())) {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ if(in_msg.Type == CoherenceRequestType:RdBlk || in_msg.Type == CoherenceRequestType:Atomic){
+ tbe.Destination.add(in_msg.Requestor);
+ }
+ }
+ }
+ }
+
+ action(dt_deallocateTBE, "dt", desc="Deallocate TBE entry") {
+ tbe.Destination.clear();
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(wcb_writeCacheBlock, "wcb", desc="write data to TCC") {
+ peek(responseFromNB_in, ResponseMsg) {
+ cache_entry.DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+ action(wdb_writeDirtyBytes, "wdb", desc="write data to TCC") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ cache_entry.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+ cache_entry.writeMask.orMask(in_msg.writeMask);
+ DPRINTF(RubySlicc, "Writing to TCC: %s\n", in_msg);
+ }
+ }
+
+ action(wt_writeThrough, "wt", desc="write through data") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.Requestor;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.Dirty := true;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ }
+
+ action(wb_writeBack, "wb", desc="write back data") {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:WriteThrough;
+ out_msg.Dirty := true;
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.writeMask.orMask(cache_entry.writeMask);
+ }
+ }
+
+  action(at_atomicThrough, "at", desc="send atomic request through") {
+ peek(coreRequestNetwork_in, CPURequestMsg) {
+ enqueue(requestToNB_out, CPURequestMsg, l2_request_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.Requestor;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Data;
+ out_msg.Type := CoherenceRequestType:Atomic;
+ out_msg.Dirty := true;
+ out_msg.writeMask.orMask(in_msg.writeMask);
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseToNB_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // TCC, L3 respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ L2cache.setMRU(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ coreRequestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseFromNB_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+ action(zz_recycleRequestQueue, "z", desc="stall"){
+ coreRequestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+
+ action(ina_incrementNumAtomics, "ina", desc="inc num atomics") {
+ tbe.numAtomics := tbe.numAtomics + 1;
+ }
+
+
+ action(dna_decrementNumAtomics, "dna", desc="dec num atomics") {
+ tbe.numAtomics := tbe.numAtomics - 1;
+ if (tbe.numAtomics==0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AtomicDone;
+ }
+ }
+ }
+
+ action(ptr_popTriggerQueue, "ptr", desc="pop Trigger") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+ // transitions from base
+ // Assumptions for ArrayRead/Write
+ // TBE checked before tags
+ // Data Read/Write requires Tag Read
+
+ transition(WI, {RdBlk, WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} {
+ zz_recycleRequestQueue;
+ }
+ transition(A, {RdBlk, WrVicBlk, WrVicBlkBack}) {TagArrayRead} {
+ zz_recycleRequestQueue;
+ }
+ transition(IV, {WrVicBlk, Atomic, WrVicBlkBack}) {TagArrayRead} {
+ zz_recycleRequestQueue;
+ }
+ transition({M, V}, RdBlk) {TagArrayRead, DataArrayRead} {
+ sd_sendData;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+ transition(W, RdBlk, WI) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ }
+
+ transition(I, RdBlk, IV) {TagArrayRead} {
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+ transition(IV, RdBlk) {
+ t_allocateTBE;
+ rd_requestData;
+ p_popRequestQueue;
+ }
+
+ transition({V, I},Atomic, A) {TagArrayRead} {
+ i_invL2;
+ t_allocateTBE;
+ at_atomicThrough;
+ ina_incrementNumAtomics;
+ p_popRequestQueue;
+ }
+
+ transition(A, Atomic) {
+ at_atomicThrough;
+ ina_incrementNumAtomics;
+ p_popRequestQueue;
+ }
+
+ transition({M, W}, Atomic, WI) {TagArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ }
+
+  // Cache block stays in I state, which implies
+  // this TCC is a write-no-allocate cache
+ transition(I, WrVicBlk) {TagArrayRead} {
+ wt_writeThrough;
+ p_popRequestQueue;
+ }
+
+ transition(V, WrVicBlk) {TagArrayRead, DataArrayWrite} {
+ ut_updateTag;
+ wdb_writeDirtyBytes;
+ wt_writeThrough;
+ p_popRequestQueue;
+ }
+
+ transition({V, M}, WrVicBlkBack, M) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition(W, WrVicBlkBack) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition(I, WrVicBlkBack, W) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ut_updateTag;
+ swb_sendWBAck;
+ wdb_writeDirtyBytes;
+ p_popRequestQueue;
+ }
+
+ transition({W, M}, L2_Repl, WI) {TagArrayRead, DataArrayRead} {
+ t_allocateTBE;
+ wb_writeBack;
+ i_invL2;
+ }
+
+ transition({I, V}, L2_Repl, I) {TagArrayRead, TagArrayWrite} {
+ i_invL2;
+ }
+
+ transition({A, IV, WI}, L2_Repl) {
+ i_invL2;
+ }
+
+ transition({I, V}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(M, PrbInv, W) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(W, PrbInv) {TagArrayRead} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition({A, IV, WI}, PrbInv) {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(IV, Data, V) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ut_updateTag;
+ wcb_writeCacheBlock;
+ sdr_sendDataResponse;
+ sd2rb_sendDone2RegionBuffer;
+ pr_popResponseQueue;
+ dt_deallocateTBE;
+ }
+
+ transition(A, Data) {TagArrayRead, TagArrayWrite, DataArrayWrite} {
+ a_allocateBlock;
+ ar_sendAtomicResponse;
+ sd2rb_sendDone2RegionBuffer;
+ dna_decrementNumAtomics;
+ pr_popResponseQueue;
+ }
+
+ transition(A, AtomicDone, I) {TagArrayRead, TagArrayWrite} {
+ dt_deallocateTBE;
+ ptr_popTriggerQueue;
+ }
+
+ transition(A, AtomicNotDone) {TagArrayRead} {
+ ptr_popTriggerQueue;
+ }
+
+  // M, W should not see WBAck as the cache is in WB mode
+  // WBAcks do not need to check tags
+ transition({I, V, IV, A}, WBAck) {
+ w_sendResponseWBAck;
+ sd2rb_sendDone2RegionBuffer;
+ pr_popResponseQueue;
+ }
+
+ transition(WI, WBAck,I) {
+ sd2rb_sendDone2RegionBuffer;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+}
diff --git a/src/mem/protocol/GPU_VIPER_Region.slicc b/src/mem/protocol/GPU_VIPER_Region.slicc
new file mode 100644
index 000000000..cbfef9de3
--- /dev/null
+++ b/src/mem/protocol/GPU_VIPER_Region.slicc
@@ -0,0 +1,11 @@
+protocol "GPU_VIPER_Region";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-Region-CorePair.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-Region-dir.sm";
+include "GPU_VIPER_Region-TCC.sm";
+include "GPU_VIPER-TCP.sm";
+include "GPU_VIPER-SQC.sm";
+include "MOESI_AMD_Base-RegionDir.sm";
+include "MOESI_AMD_Base-RegionBuffer.sm";
diff --git a/src/mem/protocol/MOESI_AMD_Base-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm
new file mode 100644
index 000000000..76fe77230
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-CorePair.sm
@@ -0,0 +1,2904 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:CorePair, "CP-like Core Coherence")
+ : Sequencer * sequencer;
+ Sequencer * sequencer1;
+ CacheMemory * L1Icache;
+ CacheMemory * L1D0cache;
+ CacheMemory * L1D1cache;
+ CacheMemory * L2cache; // func mem logic looks in this CacheMemory
+ bool send_evictions := "False";
+ Cycles issue_latency := 5; // time to send data down to NB
+ Cycles l2_hit_latency := 18;
+
+ // BEGIN Core Buffers
+
+ // To the Network
+ MessageBuffer * requestFromCore, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCore, network="To", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="4", vnet_type="unblock";
+
+ // From the Network
+ MessageBuffer * probeToCore, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToCore, network="From", virtual_network="2", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue;
+
+ MessageBuffer * triggerQueue, ordered="true";
+
+ // END Core Buffers
+
+{
+ // BEGIN STATES
+ state_declaration(State, desc="Cache states", default="CorePair_State_I") {
+
+ // Base States
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Read_Only, desc="Shared";
+ E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership";
+ E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership";
+ Es, AccessPermission:Read_Write, desc="Exclusive in core";
+ O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+ Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line";
+ M0, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+ M1, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+
+ // Transient States
+ I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well";
+ I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well";
+ I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well";
+ I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well";
+ I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters";
+
+ IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive";
+ IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive";
+ IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills";
+ IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received";
+ F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received";
+
+ ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack";
+ MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack";
+ MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ S_F0, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F1, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F, AccessPermission:Read_Only, desc="Shared, filling L1";
+ O_F0, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F1, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F, AccessPermission:Read_Only, desc="Owned, filling L1";
+ Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache";
+ Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache";
+ S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response";
+ S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response";
+
+ Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling";
+ E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling";
+ E1_F, AccessPermission:Read_Write, desc="...";
+ E0_Es, AccessPermission:Read_Write, desc="...";
+ E1_Es, AccessPermission:Read_Write, desc="...";
+ Ms_F0, AccessPermission:Read_Write, desc="...";
+ Ms_F1, AccessPermission:Read_Write, desc="...";
+ Ms_F, AccessPermission:Read_Write, desc="...";
+ M0_F, AccessPermission:Read_Write, desc="...";
+ M0_Ms, AccessPermission:Read_Write, desc="...";
+ M1_F, AccessPermission:Read_Write, desc="...";
+ M1_Ms, AccessPermission:Read_Write, desc="...";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback";
+    S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck from NB for canceled WB";
+    S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck from NB for canceled WB";
+ S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck";
+
+ } // END STATES
+
+ // BEGIN EVENTS
+ enumeration(Event, desc="CP Events") {
+ // CP Initiated events
+ C0_Load_L1miss, desc="Cluster 0 load, L1 missed";
+ C0_Load_L1hit, desc="Cluster 0 load, L1 hit";
+ C1_Load_L1miss, desc="Cluster 1 load L1 missed";
+ C1_Load_L1hit, desc="Cluster 1 load L1 hit";
+ Ifetch0_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch1_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch0_L1miss, desc="Instruction fetch, missed in the L1";
+ Ifetch1_L1miss, desc="Instruction fetch, missed in the L1";
+ C0_Store_L1miss, desc="Cluster 0 store missed in L1";
+ C0_Store_L1hit, desc="Cluster 0 store hit in L1";
+ C1_Store_L1miss, desc="Cluster 1 store missed in L1";
+ C1_Store_L1hit, desc="Cluster 1 store hit in L1";
+ // NB Initiated events
+ NB_AckS, desc="NB Ack to Core Request";
+ NB_AckM, desc="NB Ack to Core Request";
+ NB_AckE, desc="NB Ack to Core Request";
+
+ NB_AckWB, desc="NB Ack for writeback";
+
+ // Memory System initiatied events
+ L1I_Repl, desc="Replace address from L1I"; // Presumed clean
+ L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean
+ L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean
+ L2_Repl, desc="Replace address from L2";
+
+ L2_to_L1D0, desc="L1 fill from L2";
+ L2_to_L1D1, desc="L1 fill from L2";
+ L2_to_L1I, desc="L1 fill from L2";
+
+ // Probe Events
+ PrbInvData, desc="probe, return O or M data";
+ PrbInv, desc="probe, no need for data";
+ PrbShrData, desc="probe downgrade, return O or M data";
+
+ } // END EVENTS
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+    L1D0DataArrayRead, desc="Read the data array";
+    L1D0DataArrayWrite, desc="Write the data array";
+    L1D0TagArrayRead, desc="Read the tag array";
+    L1D0TagArrayWrite, desc="Write the tag array";
+    L1D1DataArrayRead, desc="Read the data array";
+    L1D1DataArrayWrite, desc="Write the data array";
+    L1D1TagArrayRead, desc="Read the tag array";
+    L1D1TagArrayWrite, desc="Write the tag array";
+    L1IDataArrayRead, desc="Read the data array";
+    L1IDataArrayWrite, desc="Write the data array";
+    L1ITagArrayRead, desc="Read the tag array";
+    L1ITagArrayWrite, desc="Write the tag array";
+    L2DataArrayRead, desc="Read the data array";
+    L2DataArrayWrite, desc="Write the data array";
+    L2TagArrayRead, desc="Read the tag array";
+    L2TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ // BEGIN STRUCTURE DEFINITIONS
+
+
+ // Cache Entry
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+    bool Dirty, desc="Is the data dirty (different from memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // END STRUCTURE DEFINITIONS
+
+ // BEGIN INTERNAL FUNCTIONS
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ bool addressInCore(Addr addr) {
+ return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr));
+ }
+
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
+ return L2cache_entry;
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" {
+ if (cluster == 0) {
+ Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr));
+ return L1D0_entry;
+ } else {
+ Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr));
+ return L1D1_entry;
+ }
+ }
+
+ Entry getICacheEntry(Addr addr), return_by_pointer="yes" {
+ Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr));
+ return c_entry;
+ }
+
+ bool presentOrAvail2(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailI(Addr addr) {
+ return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailD0(Addr addr) {
+ return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailD1(Addr addr) {
+ return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr);
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return CorePair_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return CorePair_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(CorePair_State_to_permission(state));
+ }
+ }
+
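+  // Added comment: reports whether a local hit was serviced out of the
+  // L2 (the line was just filled from L2; FromL2 is cleared here) or out
+  // of the L1.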
+ MachineType testAndClearLocalHit(Entry cache_entry) {
+ assert(is_valid(cache_entry));
+ if (cache_entry.FromL2) {
+ cache_entry.FromL2 := false;
+ return MachineType:L2Cache;
+ } else {
+ return MachineType:L1Cache;
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L1D0DataArrayRead) {
+ L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1D0DataArrayWrite) {
+ L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1D0TagArrayRead) {
+ L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1D0TagArrayWrite) {
+ L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L1D1DataArrayRead) {
+ L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1D1DataArrayWrite) {
+ L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1D1TagArrayRead) {
+ L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1D1TagArrayWrite) {
+ L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L1IDataArrayRead) {
+ L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1IDataArrayWrite) {
+ L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1ITagArrayRead) {
+ L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1ITagArrayWrite) {
+ L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L2DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L2DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L2TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L2TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L2DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L2DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L2TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L2TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D0DataArrayRead) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D0DataArrayWrite) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D0TagArrayRead) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D0TagArrayWrite) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D1DataArrayRead) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D1DataArrayWrite) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D1TagArrayRead) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D1TagArrayWrite) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1IDataArrayRead) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1IDataArrayWrite) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1ITagArrayRead) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1ITagArrayWrite) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+
+ } else {
+ return true;
+ }
+ }
+
+ // END INTERNAL FUNCTIONS
+
+ // ** OUT_PORTS **
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromCore);
+ out_port(responseNetwork_out, ResponseMsg, responseFromCore);
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+ // ** IN_PORTS **
+
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == TriggerType:L2_to_L1) {
+ if (in_msg.Dest == CacheId:L1I) {
+ trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Dest == CacheId:L1D0) {
+ trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Dest == CacheId:L1D1) {
+ trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("unexpected trigger dest");
+ }
+ }
+ }
+ }
+ }
+
+
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ assert(in_msg.ReturnData);
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+
+ // ResponseNetwork
+ in_port(responseToCore_in, ResponseMsg, responseToCore) {
+ if (responseToCore_in.isReady(clockEdge())) {
+ peek(responseToCore_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.State == CoherenceState:Modified) {
+ trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Exclusive) {
+ trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Nothing from the Unblock Network
+
+ // Mandatory Queue
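+  // Added comment: core requests are steered by contextId parity (even
+  // contexts use cluster 0, odd contexts use cluster 1). Because stores
+  // write through to the L2, even an L1 store hit requires L2 space;
+  // misses also require space in the target L1. When no space is
+  // available, an L1/L2 replacement event is triggered for the chosen
+  // victim instead of the demand event.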
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+ if (in_msg.Type == RubyRequestType:IFETCH) {
+ // FETCH ACCESS
+
+ if (L1Icache.isTagPresent(in_msg.LineAddress)) {
+ if (mod(in_msg.contextId, 2) == 0) {
+ trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailI(in_msg.LineAddress)) {
+ if (mod(in_msg.contextId, 2) == 0) {
+ trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry,
+ tbe);
+ }
+ } else {
+ Addr victim := L1Icache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1I_Repl, victim,
+ getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else { // Not present or avail in L2
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ // DATA ACCESS
+ if (mod(in_msg.contextId, 2) == 1) {
+ if (L1D1cache.isTagPresent(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ // Stores must write through, make sure L2 avail.
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailD1(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C1_Load_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ } else {
+ trigger(Event:C1_Store_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1D1_Repl, victim,
+ getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else { // not present or avail in L2
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ } else {
+ Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0);
+ if (is_valid(L1D0cache_entry)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailD0(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C0_Load_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ } else {
+ trigger(Event:C0_Store_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1D0_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ // ACTIONS
+ action(ii_invIcache, "ii", desc="invalidate iCache") {
+ if (L1Icache.isTagPresent(address)) {
+ L1Icache.deallocate(address);
+ }
+ }
+
+ action(i0_invCluster, "i0", desc="invalidate cluster 0") {
+ if (L1D0cache.isTagPresent(address)) {
+ L1D0cache.deallocate(address);
+ }
+ }
+
+ action(i1_invCluster, "i1", desc="invalidate cluster 1") {
+ if (L1D1cache.isTagPresent(address)) {
+ L1D1cache.deallocate(address);
+ }
+ }
+
+ action(ib_invBothClusters, "ib", desc="invalidate both clusters") {
+ if (L1D0cache.isTagPresent(address)) {
+ L1D0cache.deallocate(address);
+ }
+ if (L1D1cache.isTagPresent(address)) {
+ L1D1cache.deallocate(address);
+ }
+ }
+
+ action(i2_invL2, "i2", desc="invalidate L2") {
+ if(is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(mru_setMRU, "mru", desc="Update LRU state") {
+ L2cache.setMRU(address);
+ }
+
+ action(mruD1_setD1cacheMRU, "mruD1", desc="Update LRU state") {
+ L1D1cache.setMRU(address);
+ }
+
+ action(mruD0_setD0cacheMRU, "mruD0", desc="Update LRU state") {
+ L1D0cache.setMRU(address);
+ }
+
+ action(mruI_setIcacheMRU, "mruI", desc="Update LRU state") {
+ L1Icache.setMRU(address);
+ }
+
+ action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ DPRINTF(RubySlicc,"%s\n",out_msg.Destination);
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkM;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkS;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(vd_victim, "vd", desc="Victimize M/O L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:O) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(vc_victim, "vc", desc="Victimize E/S L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") {
+ if (L1D0cache.isTagPresent(address) == false) {
+ L1D0cache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") {
+ if (L1D1cache.isTagPresent(address) == false) {
+ L1D1cache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(ai_allocateL1I, "ai", desc="Allocate L1I Block") {
+ if (L1Icache.isTagPresent(address) == false) {
+ L1Icache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(a2_allocateL2, "a2", desc="Allocate L2 Block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ assert(is_valid(cache_entry));
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ tbe.Shared := false;
+ }
+
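+ // Note: the {DataBlk, Dirty, Shared} snapshot taken here is what wb_data
+ // and pdt_sendProbeResponseDataFromTBE consume later; once i2_invL2 drops
+ // the L2 entry, the TBE holds the only copy of the line until the
+ // directory acks the writeback.
+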
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToCore_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(il0_loadDone, "il0", desc="Cluster 0 i load done") {
+ Entry entry := getICacheEntry(address);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(il1_loadDone, "il1", desc="Cluster 1 i load done") {
+ Entry entry := getICacheEntry(address);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(l0_loadDone, "l0", desc="Cluster 0 load done") {
+ Entry entry := getL1CacheEntry(address, 0);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(l1_loadDone, "l1", desc="Cluster 1 load done") {
+ Entry entry := getL1CacheEntry(address, 1);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(xl0_loadDone, "xl0", desc="Cluster 0 load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n", address, l2entry.DataBlk);
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xl1_loadDone, "xl1", desc="Cluster 1 load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(s0_storeDone, "s0", desc="Cluster 0 store done") {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ cache_entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.Dirty := true;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+
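+ // The store-done actions model an immediate write-through: the sequencer's
+ // write lands in the L2 copy (cache_entry), the L1 copy is refreshed from
+ // it, and both levels are marked dirty with identical data. Roughly:
+ //
+ //   sequencer.writeCallback(address, L2.DataBlk, ...);  // data written at L2
+ //   L1.DataBlk := L2.DataBlk;
+ //   L1.Dirty := true;  L2.Dirty := true;
+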
+ action(s1_storeDone, "s1", desc="Cluster 1 store done") {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ sequencer1.writeCallback(address,
+ cache_entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+
+ action(xs0_storeDone, "xs0", desc="Cluster 0 store done") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+ }
+
+ action(xs1_storeDone, "xs1", desc="Cluster 1 store done") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ sequencer1.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+ }
+
+ action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") {
+ if (send_evictions) {
+ DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+ sequencer.evictionCallback(address);
+ }
+ }
+
+ action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") {
+ if (send_evictions) {
+ DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+ sequencer1.evictionCallback(address);
+ }
+ }
+
+ action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") {
+ Entry entry := getICacheEntry(address);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1I;
+ }
+ }
+
+ action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1D0;
+ }
+ }
+
+ action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1D1;
+ }
+ }
+
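+ // The fi/f0/f1 fill actions above do not copy data directly; they enqueue
+ // a TriggerMsg with l2_hit_latency, and the resulting L2_to_L1* event then
+ // runs the matching ci/c0/c1 copy action. This is how the L2-to-L1
+ // transfer latency is modeled.
+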
+ action(wi_writeIcache, "wi", desc="write data to icache (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getICacheEntry(address);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ DPRINTF(ProtocolTrace, "CP writeD0: address %s, data: %s\n", address, in_msg.DataBlk);
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ peek(responseToCore_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ peek(responseToCore_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Ntsl := true;
+ out_msg.Hit := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ assert(addressInCore(address) || is_valid(tbe));
+ out_msg.Dirty := false; // Dirty is only set when data is sent back
+ out_msg.Hit := true;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ if (addressInCore(address)) {
+ out_msg.Hit := true;
+ } else {
+ out_msg.Hit := false;
+ }
+ out_msg.Dirty := false; // not sending back data, so def. not dirty
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(tbe));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ assert(tbe.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
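+ // Probe-response cheat sheet for the actions above (encoding only, no new
+ // behavior): pi/pim reply without data, Hit=false, Ntsl=true; ph replies
+ // Hit without data; pb replies without data and sets Hit from
+ // addressInCore(); pd/pdm return the dirty L2 block; pdt returns the dirty
+ // block held in the TBE after the L2 entry has been victimized.
+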
+ action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(l2m_profileMiss, "l2m", desc="l2m miss profile") {
+ ++L2cache.demand_misses;
+ }
+
+ action(l10m_profileMiss, "l10m", desc="l10m miss profile") {
+ ++L1D0cache.demand_misses;
+ }
+
+ action(l11m_profileMiss, "l11m", desc="l11m miss profile") {
+ ++L1D1cache.demand_misses;
+ }
+
+ action(l1im_profileMiss, "l1im", desc="l1im miss profile") {
+ ++L1Icache.demand_misses;
+ }
+
+ action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(xx_recycleResponseQueue, "xx", desc="recycle response queue") {
+ responseToCore_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+ mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
+ // transitions from base
+ transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // since in I state, L2 miss as well
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ a2_allocateL2;
+ i1_invCluster;
+ ii_invIcache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
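+ // The remaining I-state transitions below follow the same recipe with the
+ // roles swapped: profile the L2 and L1 misses, allocate the L1 and L2
+ // entries, invalidate wherever else the line could live (the other
+ // cluster's D-cache and/or the I-cache), issue the appropriate
+ // RdBlk/RdBlkS/RdBlkM, and pop the mandatory queue.
+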
+ transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // since in I state, L2 miss as well
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ a2_allocateL2;
+ i0_invCluster;
+ ii_invIcache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead,L2TagArrayRead} {
+ // track misses, if implemented
+ // L2 miss as well
+ l2m_profileMiss;
+ l1im_profileMiss;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // L2 miss as well
+ l2m_profileMiss;
+ l1im_profileMiss;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ a2_allocateL2;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C1_Store_L1miss, I_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ a2_allocateL2;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l1im_profileMiss;
+ ai_allocateL1I;
+ fi_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l1im_profileMiss;
+ ai_allocateL1I;
+ fi_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mruD0_setD0cacheMRU;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mruD1_setD1cacheMRU;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ a0_allocateL1D;
+ l10m_profileMiss;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L1ITagArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // THESE SHOULD NOT BE INSTANTANEOUS, BUT THEY ARE FOR NOW
+ transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Load_L1miss, E0_Es) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Store_L1miss, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead,L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkM
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mruD0_setD0cacheMRU;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkM
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mruD1_setD1cacheMRU;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2DataArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead } {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead,L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} {
+ a0_allocateL1D;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ a1_allocateL1D;
+ f1_L2ToL1;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ mruD0_setD0cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ mruD1_setD1cacheMRU;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ // end transitions from base
+
+ // Begin simple hit transitions
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es,
+ Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} {
+ // track hits, if implemented
+ l0_loadDone;
+ mruD0_setD0cacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es,
+ Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} {
+ // track hits, if implemented
+ l1_loadDone;
+ mruD1_setD1cacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il0_loadDone;
+ mruI_setIcacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il1_loadDone;
+ mruI_setIcacheMRU;
+ p_popMandatoryQueue;
+ }
+
+ // end simple hit transitions
+
+ // Transitions from transient states
+
+ // recycles
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1,
+ O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0,
+ O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES,
+ IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F,
+ O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F,
+ E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C,
+ S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1,
+ Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms,
+ M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0,
+ Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F,
+ M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0, Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_Ms}, {C0_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1,
+ O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F,
+ M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS,
+ PrbInvData, PrbInv, PrbShrData}) {} {
+ yy_recycleProbeQueue; // These resolve shortly on their own; handling them immediately would require extra states and doesn't seem worth the complexity.
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} {
+ xx_recycleResponseQueue; // These resolve shortly on their own; handling them immediately would require extra states and doesn't seem worth the complexity.
+ }
+
+ transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Load_L1miss, I_M0Ms) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Load_L1miss, I_M1Ms) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Store_L1miss, I_M0M1) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Store_L1miss, I_M1M0) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ mru_setMRU;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_E0S, C1_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_E1S, C0_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F0, C1_Load_L1miss, S_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) { L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} {
+ i0_invCluster;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} {
+ i1_invCluster;
+ }
+
+ transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} {
+ ii_invIcache;
+ }
+
+ transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vc_victim;
+ ib_invBothClusters;
+ i2_invL2;
+ ii_invIcache;
+ }
+
+ transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead, L1D1TagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vd_victim;
+ i2_invL2;
+ ib_invBothClusters; // nothing will happen for D0 on M1, vice versa
+ }
+
+ transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ wi_writeIcache;
+ xi1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckS, S_C) {L1D0DataArrayWrite,L2DataArrayWrite} {
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckS, S_C) {L1D1DataArrayWrite, L2DataArrayWrite} {
+ wi_writeIcache;
+ xi1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ // These M0->M1 handoffs should not be instantaneous, but they are for now.
+ transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ uu_sendUnblock;
+ i0_invCluster;
+ s1_storeDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ uu_sendUnblock;
+ i1_invCluster;
+ s0_storeDone;
+ pr_popResponseQueue;
+ }
+
+ // The transitions above should look more like the ones below, which add some latency for the transfer to L1.
+ transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ uu_sendUnblock;
+ f1_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite, L2DataArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ uu_sendUnblock;
+ f0_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ mru_setMRU;
+ il0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ mru_setMRU;
+ il1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F, L2_to_L1D1, Ms_F0) {L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ mru_setMRU;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ mru_setMRU;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E0S, L2_to_L1D0, I_E0S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E1S, L2_to_L1D1, I_E1S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D0, IF1_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D1, IF0_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF0_ES, L2_to_L1D0, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF1_ES, L2_to_L1D1, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S0, L2_to_L1I, S0) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S1, L2_to_L1I, S1) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ mru_setMRU;
+ xs0_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ mru_setMRU;
+ xs1_storeDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE; // FOO
+ nS_issueRdBlkS;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE; // FOO
+ nS_issueRdBlkS;
+ pr_popResponseQueue;
+ }
+
+ // Writeback cancel "ack"
+ transition(I_C, NB_AckWB, I) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ pr_popResponseQueue;
+ }
+
+ transition(S_C, NB_AckWB, S) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ pr_popResponseQueue;
+ }
+
+ // Begin Probe Transitions
+
+ transition({Ms, M0, M1, O}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ i2_invL2;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S, I}, PrbInvData, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache; // only relevant for S
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbInvData, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInvData, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2; // nothing will happen in I
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbInv, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O}, PrbShrData, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S}, PrbShrData, S) {L2TagArrayRead, L2TagArrayWrite} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbShrData) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({I, I_C}, PrbShrData) {L2TagArrayRead} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S}, {PrbInv, PrbInvData}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M0)
+ a0_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M1, I_E1S}, {PrbInv, PrbInvData}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M1)
+ a1_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbShrData}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S, I_M1, I_E1S}, PrbShrData) {} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInvData, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInvData, I_C) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbShrData, ES_I) {} {
+ ph_sendProbeResponseHit;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbShrData, MO_I) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S0, PrbInvData, S0_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdt_sendProbeResponseDataFromTBE;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S1, PrbInvData, S1_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdt_sendProbeResponseDataFromTBE;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ a2_allocateL2;
+ d_deallocateTBE;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition({MO_S0, MO_S1}, PrbShrData) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInv}, IF_E0S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ // invalidate everything you've got
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ // but make sure you have room for what you need from the fill
+ a0_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInv}, IF_E1S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ // invalidate everything you've got
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ // but make sure you have room for what you need from the fill
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F, Es_F}, {PrbInvData, PrbInv}, IF_ES) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ // invalidate everything you've got
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ // but make sure you have room for what you need from the fill
+ a0_allocateL1D;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition(Si_F0, {PrbInvData, PrbInv}, F_S0) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition(Si_F1, {PrbInvData, PrbInv}, F_S1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition({Es_F0, E0_F, E1_Es}, PrbShrData, S_F0) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({Es_F1, E1_F, E0_Es}, PrbShrData, S_F1) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(Es_F, PrbShrData, S_F) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, PrbShrData) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(S_M0, PrbInvData, I_M0) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition(O_M0, PrbInvData, I_M0) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdm_sendProbeResponseDataMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M0, O_M0}, {PrbInv}, I_M0) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition(S_M1, PrbInvData, I_M1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+  transition(O_M1, PrbInvData, I_M1) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdm_sendProbeResponseDataMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M1, O_M1}, {PrbInv}, I_M1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S0, S0_C}, {PrbInvData, PrbInv}) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S1, S1_C}, {PrbInvData, PrbInv}) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M0, S_M1}, PrbShrData) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({O_M0, O_M1}, PrbShrData) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({S0, S1, S0_C, S1_C}, PrbShrData) {} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+  transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInvData, IF_E0S) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInvData, IF_E1S) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ ib_invBothClusters;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F, O_F}, PrbInvData, IF_ES) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F, O_F}, PrbInv, IF_ES) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F0, M0_F, M1_Ms}, PrbShrData, O_F0) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+  transition({Ms_F1, M1_F, M0_Ms}, PrbShrData, O_F1) {L2DataArrayRead} {
+    pd_sendProbeResponseData;
+    pp_popProbeQueue;
+  }
+
+ transition({Ms_F}, PrbShrData, O_F) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({O_F0, O_F1, O_F}, PrbShrData) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ // END TRANSITIONS
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-L3cache.sm b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm
new file mode 100644
index 000000000..479cf4e78
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-L3cache.sm
@@ -0,0 +1,1130 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:L3Cache, "L3")
+ : CacheMemory * L3cache;
+ WireBuffer * reqToDir;
+ WireBuffer * respToDir;
+ WireBuffer * l3UnblockToDir;
+ WireBuffer * reqToL3;
+ WireBuffer * probeToL3;
+ WireBuffer * respToL3;
+ Cycles l3_request_latency := 1;
+ Cycles l3_response_latency := 35;
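+  // l3_request_latency paces messages the L3 sends toward the directory
+  // (forwarded requests, probe responses, unblocks); l3_response_latency
+  // paces data and ack responses returned to the requesting cores.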
+
+ // To the general response network
+ MessageBuffer * responseFromL3, network="To", virtual_network="2", ordered="false", vnet_type="response";
+
+ // From the general response network
+ MessageBuffer * responseToL3, network="From", virtual_network="2", ordered="false", vnet_type="response";
+
+{
+ // EVENTS
+ enumeration(Event, desc="L3 Events") {
+ // Requests coming from the Cores
+ RdBlk, desc="CPU RdBlk event";
+ RdBlkM, desc="CPU RdBlkM event";
+ RdBlkS, desc="CPU RdBlkS event";
+ CtoD, desc="Change to Dirty request";
+ WrVicBlk, desc="L2 Victim (dirty)";
+    WrVicBlkShared, desc="L2 Victim (dirty), shared";
+    ClVicBlk, desc="L2 Victim (clean)";
+    ClVicBlkShared, desc="L2 Victim (clean), shared";
+
+ CPUData, desc="WB data from CPU";
+ CPUDataShared, desc="WB data from CPU, NBReqShared 1";
+ StaleWB, desc="WB stale; no data";
+
+ L3_Repl, desc="L3 Replacement";
+
+ // Probes
+ PrbInvData, desc="Invalidating probe, return dirty data";
+ PrbInv, desc="Invalidating probe, no need to return data";
+ PrbShrData, desc="Downgrading probe, return data";
+
+ // Coming from Memory Controller
+ WBAck, desc="ack from memory";
+
+ CancelWB, desc="Cancel WB from L2";
+ }
+
+ // STATES
+ // Base States:
+ state_declaration(State, desc="L3 State", default="L3Cache_State_I") {
+ M, AccessPermission:Read_Write, desc="Modified"; // No other cache has copy, memory stale
+ O, AccessPermission:Read_Only, desc="Owned"; // Correct most recent copy, others may exist in S
+ E, AccessPermission:Read_Write, desc="Exclusive"; // Correct, most recent, and only copy (and == Memory)
+ S, AccessPermission:Read_Only, desc="Shared"; // Correct, most recent. If no one in O, then == Memory
+ I, AccessPermission:Invalid, desc="Invalid";
+
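+    // Transient states are named <came-from>_<heading-to>; the block waits
+    // here (usually for CPU writeback data or a memory WBAck) before
+    // settling. The final state can still differ if the writeback data
+    // arrives with the shared bit set.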
+ I_M, AccessPermission:Busy, desc="Invalid, received WrVicBlk, sent Ack, waiting for Data";
+    I_O, AccessPermission:Busy, desc="Invalid, received WrVicBlkShared, sent Ack, waiting for Data";
+    I_E, AccessPermission:Busy, desc="Invalid, received ClVicBlk, sent Ack, waiting for Data";
+    I_S, AccessPermission:Busy, desc="Invalid, received ClVicBlkShared, sent Ack, waiting for Data";
+ S_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+ S_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+ S_E, AccessPermission:Busy, desc="Shared, received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    S_S, AccessPermission:Busy, desc="Shared, received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    E_M, AccessPermission:Busy, desc="received WrVicBlk, sent Ack, waiting for Data, then go to M";
+ E_O, AccessPermission:Busy, desc="received WrVicBlkShared, sent Ack, waiting for Data, then go to O";
+    E_E, AccessPermission:Busy, desc="received ClVicBlk, sent Ack, waiting for Data, then go to E";
+    E_S, AccessPermission:Busy, desc="received ClVicBlkShared, sent Ack, waiting for Data, then go to S";
+    O_M, AccessPermission:Busy, desc="Owned, received WrVicBlk, sent Ack, waiting for Data";
+    O_O, AccessPermission:Busy, desc="Owned, received WrVicBlkShared, sent Ack, waiting for Data";
+    O_E, AccessPermission:Busy, desc="Owned, received ClVicBlk, sent Ack, waiting for Data";
+    O_S, AccessPermission:Busy, desc="Owned, received ClVicBlkShared, sent Ack, waiting for Data";
+    M_M, AccessPermission:Busy, desc="Modified, received WrVicBlk, sent Ack, waiting for Data";
+    M_O, AccessPermission:Busy, desc="Modified, received WrVicBlkShared, sent Ack, waiting for Data";
+    M_E, AccessPermission:Busy, desc="Modified, received ClVicBlk, sent Ack, waiting for Data";
+    M_S, AccessPermission:Busy, desc="Modified, received ClVicBlkShared, sent Ack, waiting for Data";
+    D_I, AccessPermission:Invalid, desc="drop WB data on the floor when received";
+ MOD_I, AccessPermission:Busy, desc="drop WB data on the floor, waiting for WBAck from Mem";
+ MO_I, AccessPermission:Busy, desc="M or O, received L3_Repl, waiting for WBAck from Mem";
+ I_I, AccessPermission:Busy, desc="I_MO received L3_Repl";
+ I_CD, AccessPermission:Busy, desc="I_I received WBAck, now just waiting for CPUData";
+ I_C, AccessPermission:Invalid, desc="sent cancel, just waiting to receive mem wb ack so nothing gets confused";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+    TagArrayRead, desc="Read the tag array";
+    TagArrayWrite, desc="Write the tag array";
+ }
+
+ // STRUCTURES
+
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff from memory?)";
+ DataBlock DataBlk, desc="Data for the block";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ bool Shared, desc="Victim hit by shared probe";
+ MachineID From, desc="Waiting for writeback from...";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<L3Cache_TBE>", constructor="m_number_of_TBEs";
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+
+
+ // FUNCTION DEFINITIONS
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", L3cache.lookup(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(addr).DataBlk;
+ }
+
+ bool presentOrAvail(Addr addr) {
+ return L3cache.isTagPresent(addr) || L3cache.cacheAvail(addr);
+ }
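+  // Used by the request port below: a victim block is only accepted if its
+  // tag is present or a way is free; otherwise an L3_Repl is triggered for
+  // the victim chosen by L3cache.cacheProbe().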
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return L3Cache_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return L3Cache_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(L3Cache_State_to_permission(state));
+ }
+ }
+
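+  // The L3 does not record per-array access stats; recordRequestType is a no-op.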
+ void recordRequestType(RequestType request_type, Addr addr) {
+
+ }
+
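+  // Tag/data array contention is not modeled at the L3, so resources are
+  // always reported as available.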
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ return true;
+ }
+
+
+ // OUT PORTS
+ out_port(requestNetwork_out, CPURequestMsg, reqToDir);
+ out_port(L3Resp_out, ResponseMsg, respToDir);
+ out_port(responseNetwork_out, ResponseMsg, responseFromL3);
+ out_port(unblockNetwork_out, UnblockMsg, l3UnblockToDir);
+
+ // IN PORTS
+ in_port(NBResponse_in, ResponseMsg, respToL3) {
+ if (NBResponse_in.isReady(clockEdge())) {
+ peek(NBResponse_in, ResponseMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:WBAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ error("Error on NBResponse Type");
+ }
+ }
+ }
+ }
+
+ // Response Network
+ in_port(responseNetwork_in, ResponseMsg, responseToL3) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUData) {
+ if (in_msg.NbReqShared) {
+ trigger(Event:CPUDataShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:CPUData, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+          error("Error on CPUResponse Type");
+ }
+ }
+ }
+ }
+
+ // probe network
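+  // PrbInv maps to PrbInvData or PrbInv depending on ReturnData; PrbDowngrade
+  // with ReturnData maps to PrbShrData (a downgrade without data return is
+  // not expected here).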
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeToL3) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ if (in_msg.ReturnData) {
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Don't think I should get any of these");
+ }
+ }
+ }
+ }
+ }
+
+ // Request Network
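+  // Reads are triggered directly; victim writebacks (VicClean/VicDirty) first
+  // check presentOrAvail and otherwise trigger L3_Repl on the chosen victim;
+  // WrCancel is only honored if it comes from the writer recorded in the TBE.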
+ in_port(requestNetwork_in, CPURequestMsg, reqToL3) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ assert(in_msg.Destination.isElement(machineID));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Shared) {
+ trigger(Event:ClVicBlkShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:ClVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L3cache.cacheProbe(in_msg.addr);
+ trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Shared) {
+ trigger(Event:WrVicBlkShared, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:WrVicBlk, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L3cache.cacheProbe(in_msg.addr);
+ trigger(Event:L3_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else if (in_msg.Type == CoherenceRequestType:WrCancel) {
+ if (is_valid(tbe) && tbe.From == in_msg.Requestor) {
+ trigger(Event:CancelWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+ }
+ }
+ }
+ }
+
+ // BEGIN ACTIONS
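+  // The quoted shorthand in each action declaration (e.g. "i", "rm") is the
+  // abbreviation used for the action in SLICC's generated transition tables.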
+
+ action(i_invL3, "i", desc="invalidate L3 cache block") {
+ if (is_valid(cache_entry)) {
+ L3cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(rm_sendResponseM, "rm", desc="send Modified response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := cache_entry.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(rs_sendResponseS, "rs", desc="send Shared response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := cache_entry.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := cache_entry.Dirty;
+ out_msg.State := CoherenceState:Shared;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+
+ action(r_requestToMem, "r", desc="Miss in L3, pass on") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := in_msg.Requestor;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Shared := false; // unneeded for this request
+ out_msg.MessageSize := in_msg.MessageSize;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (is_valid(cache_entry)) {
+ tbe.DataBlk := cache_entry.DataBlk; // Data only for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ }
+ tbe.From := machineID;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(vd_vicDirty, "vd", desc="Victimize dirty L3 data") {
+ enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, l3_response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+  action(ph_sendProbeResponseHit, "ph", desc="send probe ack hit, no data") {
+ enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Hit := true;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+  action(pm_sendProbeResponseMiss, "pm", desc="send probe ack miss, no data") {
+ enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ }
+ }
+
+ action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+ enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ assert(tbe.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.State := CoherenceState:NA;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(mc_cancelMemWriteback, "mc", desc="send writeback cancel to memory") {
+ enqueue(requestNetwork_out, CPURequestMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:WrCancel;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
+ action(a_allocateBlock, "a", desc="allocate L3 block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L3cache.allocate(address, new Entry));
+ }
+ }
+
+ action(d_writeData, "d", desc="write data to L3") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ cache_entry.DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing to L3: %s\n", in_msg);
+ }
+ }
+
+  action(rd_copyDataFromRequest, "rd", desc="copy WB data from request to L3") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := true;
+ }
+ }
+
+ action(f_setFrom, "f", desc="set who WB is expected to come from") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.From := in_msg.Requestor;
+ }
+ }
+
+ action(rf_resetFrom, "rf", desc="reset From") {
+ tbe.From := machineID;
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ enqueue(L3Resp_out, ResponseMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(wt_writeDataToTBE, "wt", desc="write WB data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockNetwork_out, UnblockMsg, l3_request_latency) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(ut_updateTag, "ut", desc="update Tag (i.e. set MRU)") {
+ L3cache.setMRU(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pn_popNBResponseQueue, "pn", desc="pop NB response queue") {
+ NBResponse_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(zz_recycleRequestQueue, "\z", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
+ // transitions from base
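+  // Read requests hit in M/O/E/S (M downgrades to O and E to S once another
+  // reader appears); RdBlkM is satisfied from M or E and invalidates the L3
+  // copy, while O/S/I forward RdBlkM and CtoD on to the directory.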
+
+ transition({I, I_C}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {TagArrayRead} {
+ r_requestToMem;
+ p_popRequestQueue;
+ }
+
+  transition(O, RdBlk) {TagArrayRead, DataArrayRead} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+  }
+
+  transition(M, RdBlk, O) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition(S, RdBlk) {TagArrayRead, DataArrayRead} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+  }
+
+  transition(E, RdBlk, S) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition({M, O}, RdBlkS, O) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition({E, S}, RdBlkS, S) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ rs_sendResponseS;
+ ut_updateTag;
+ p_popRequestQueue;
+ }
+
+ transition(M, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ rm_sendResponseM;
+ i_invL3;
+ p_popRequestQueue;
+ }
+
+ transition({O, S}, {RdBlkM, CtoD}) {TagArrayRead} {
+ r_requestToMem; // can't handle this, just forward
+ p_popRequestQueue;
+ }
+
+ transition(E, RdBlkM, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ rm_sendResponseM;
+ i_invL3;
+ p_popRequestQueue;
+ }
+
+ transition({I}, WrVicBlk, I_M) {TagArrayRead, TagArrayWrite} {
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+// rd_copyDataFromRequest;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(I_C, {WrVicBlk, WrVicBlkShared, ClVicBlk, ClVicBlkShared}) {} {
+ zz_recycleRequestQueue;
+ }
+
+ transition({I}, WrVicBlkShared, I_O) {TagArrayRead, TagArrayWrite} {
+ a_allocateBlock;
+ t_allocateTBE;
+ f_setFrom;
+// rd_copyDataFromRequest;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(S, WrVicBlkShared, S_O) {TagArrayRead, TagArrayWrite} {
+// rd_copyDataFromRequest;
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(S, WrVicBlk, S_M) {TagArrayRead, TagArrayWrite} { // should be technically not possible, but assume the data comes back with shared bit flipped
+// rd_copyDataFromRequest;
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(E, WrVicBlk, E_M) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(E, WrVicBlkShared, E_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(O, WrVicBlk, O_M) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(O, WrVicBlkShared, O_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(M, WrVicBlk, M_M) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(M, WrVicBlkShared, M_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({I}, ClVicBlk, I_E) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ a_allocateBlock;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({I}, ClVicBlkShared, I_S) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ a_allocateBlock;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(S, ClVicBlk, S_E) {TagArrayRead, TagArrayWrite} { // technically impossible, assume data comes back with shared bit flipped
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(S, ClVicBlkShared, S_S) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(E, ClVicBlk, E_E) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(E, ClVicBlkShared, E_S) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(O, ClVicBlk, O_E) {TagArrayRead, TagArrayWrite} { // technically impossible, but assume data comes back with shared bit flipped
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(O, ClVicBlkShared, O_S) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(M, ClVicBlk, M_E) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(M, ClVicBlkShared, M_S) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({MO_I}, {RdBlk, RdBlkS, RdBlkM, CtoD}) {} {
+ r_requestToMem;
+ p_popRequestQueue;
+ }
+
+ transition(MO_I, {WrVicBlkShared, WrVicBlk, ClVicBlk, ClVicBlkShared}, MOD_I) {TagArrayWrite} {
+ f_setFrom;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(I_M, CPUData, M) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E, CPUData, E) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(I_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ pr_popResponseQueue;
+ }
+
+ transition(S_M, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_O, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_E, CPUDataShared, S) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(S_S, {CPUData, CPUDataShared}, S) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(O_E, CPUDataShared, O) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition(O_S, {CPUData, CPUDataShared}, O) {DataArrayWrite, TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ d_writeData;
+ ut_updateTag; // update tag on writeback hits.
+ pr_popResponseQueue;
+ }
+
+ transition({D_I}, {CPUData, CPUDataShared}, I) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MOD_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite} {
+ uu_sendUnblock;
+ rf_resetFrom;
+ pr_popResponseQueue;
+ }
+
+ transition(I_I, {CPUData, CPUDataShared}, MO_I) {TagArrayWrite, DataArrayRead} {
+ uu_sendUnblock;
+ wt_writeDataToTBE;
+ rf_resetFrom;
+ pr_popResponseQueue;
+ }
+
+ transition(I_CD, {CPUData, CPUDataShared}, I) {DataArrayRead, TagArrayWrite} {
+ uu_sendUnblock;
+ wt_writeDataToTBE;
+ wb_data;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition({M, O}, L3_Repl, MO_I) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ vd_vicDirty;
+ i_invL3;
+ }
+
+  transition({E, S}, L3_Repl, I) {TagArrayRead, TagArrayWrite} {
+ i_invL3;
+ }
+
+ transition({I_M, I_O, S_M, S_O, E_M, E_O}, L3_Repl) {} {
+ zz_recycleRequestQueue;
+ }
+
+ transition({O_M, O_O, O_E, O_S, M_M, M_O, M_E, M_S}, L3_Repl) {} {
+ zz_recycleRequestQueue;
+ }
+
+ transition({I_E, I_S, S_E, S_S, E_E, E_S}, L3_Repl) {} {
+ zz_recycleRequestQueue;
+ }
+
+ transition({M, O}, PrbInvData, I) {TagArrayRead, TagArrayWrite, DataArrayRead} {
+ pd_sendProbeResponseData;
+ i_invL3;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S, I}, PrbInvData, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ i_invL3; // nothing will happen in I
+ pp_popProbeQueue;
+ }
+
+ transition({M, O, E, S, I}, PrbInv, I) {TagArrayRead, TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ i_invL3; // nothing will happen in I
+ pp_popProbeQueue;
+ }
+
+ transition({M, O}, PrbShrData, O) {TagArrayRead, DataArrayRead, TagArrayWrite} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({E, S}, PrbShrData, S) {TagArrayRead, TagArrayWrite} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(I, PrbShrData) {TagArrayRead} {
+ pm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInvData, I_C) {TagArrayWrite, DataArrayRead} {
+ pdt_sendProbeResponseDataFromTBE;
+ mc_cancelMemWriteback;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInv, I_C) {TagArrayWrite} {
+ pi_sendProbeResponseInv;
+ mc_cancelMemWriteback;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbShrData) {DataArrayRead} {
+ pdt_sendProbeResponseDataFromTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, {PrbInvData, PrbInv}) {} {
+ pi_sendProbeResponseInv;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbShrData) {} {
+ pm_sendProbeResponseMiss;
+ pp_popProbeQueue;
+ }
+
+ transition(I_I, {WBAck}, I_CD) {TagArrayWrite} {
+ pn_popNBResponseQueue;
+ }
+
+ transition(MOD_I, WBAck, D_I) {DataArrayRead} {
+ wb_data;
+ pn_popNBResponseQueue;
+ }
+
+ transition(MO_I, WBAck, I) {DataArrayRead, TagArrayWrite} {
+ wb_data;
+ dt_deallocateTBE;
+ pn_popNBResponseQueue;
+ }
+
+ transition(I_C, {WBAck}, I) {TagArrayWrite} {
+ dt_deallocateTBE;
+ pn_popNBResponseQueue;
+ }
+
+ transition({I_M, I_O, I_E, I_S}, CancelWB, I) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ i_invL3;
+ p_popRequestQueue;
+ }
+
+ transition({S_S, S_O, S_M, S_E}, CancelWB, S) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({E_M, E_O, E_E, E_S}, CancelWB, E) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({O_M, O_O, O_E, O_S}, CancelWB, O) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({M_M, M_O, M_E, M_S}, CancelWB, M) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(D_I, CancelWB, I) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(MOD_I, CancelWB, MO_I) {TagArrayWrite} {
+ uu_sendUnblock;
+ rf_resetFrom;
+ p_popRequestQueue;
+ }
+
+ transition(I_I, CancelWB, I_C) {TagArrayWrite} {
+ uu_sendUnblock;
+ rf_resetFrom;
+ mc_cancelMemWriteback;
+ p_popRequestQueue;
+ }
+
+ transition(I_CD, CancelWB, I) {TagArrayWrite} {
+ uu_sendUnblock;
+ dt_deallocateTBE;
+ mc_cancelMemWriteback;
+ p_popRequestQueue;
+ }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm
new file mode 100644
index 000000000..fd84447a2
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-CorePair.sm
@@ -0,0 +1,3009 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:CorePair, "CP-like Core Coherence")
+ : Sequencer * sequencer;
+ Sequencer * sequencer1;
+ CacheMemory * L1Icache;
+ CacheMemory * L1D0cache;
+ CacheMemory * L1D1cache;
+ CacheMemory * L2cache;
+ int regionBufferNum;
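+    // regionBufferNum selects the RegionBuffer instance used as this core
+    // pair's peer for region requests (see getPeer below).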
+ bool send_evictions := "False";
+ Cycles issue_latency := 5;
+ Cycles l2_hit_latency := 18;
+
+ // BEGIN Core Buffers
+
+ // To the Network
+ MessageBuffer * requestFromCore, network="To", virtual_network="0", ordered="true", vnet_type="request";
+ MessageBuffer * responseFromCore, network="To", virtual_network="2", ordered="false", vnet_type="response";
+ MessageBuffer * unblockFromCore, network="To", virtual_network="4", ordered="false", vnet_type="unblock";
+
+ // From the Network
+ MessageBuffer * probeToCore, network="From", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseToCore, network="From", virtual_network="2", ordered="false", vnet_type="response";
+
+ MessageBuffer * mandatoryQueue, ordered="false";
+ MessageBuffer * triggerQueue, ordered="true";
+
+ // END Core Buffers
+
+{
+ // BEGIN STATES
+ state_declaration(State, desc="Cache states", default="CorePair_State_I") {
+
+ I, AccessPermission:Invalid, desc="Invalid";
+ S, AccessPermission:Read_Only, desc="Shared";
+ E0, AccessPermission:Read_Write, desc="Exclusive with Cluster 0 ownership";
+ E1, AccessPermission:Read_Write, desc="Exclusive with Cluster 1 ownership";
+ Es, AccessPermission:Read_Write, desc="Exclusive in core";
+ O, AccessPermission:Read_Only, desc="Owner state in core, both clusters and other cores may be sharing line";
+ Ms, AccessPermission:Read_Write, desc="Modified in core, both clusters may be sharing line";
+ M0, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+ M1, AccessPermission:Read_Write, desc="Modified with cluster ownership";
+
+ // Transient States
+ I_M0, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M1, AccessPermission:Busy, desc="Invalid, issued RdBlkM, have not seen response yet";
+ I_M0M1, AccessPermission:Busy, desc="Was in I_M0, got a store request from other cluster as well";
+ I_M1M0, AccessPermission:Busy, desc="Was in I_M1, got a store request from other cluster as well";
+ I_M0Ms, AccessPermission:Busy, desc="Was in I_M0, got a load request from other cluster as well";
+ I_M1Ms, AccessPermission:Busy, desc="Was in I_M1, got a load request from other cluster as well";
+ I_E0S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_E1S, AccessPermission:Busy, desc="Invalid, issued RdBlk, have not seen response yet";
+ I_ES, AccessPermission:Busy, desc="S_F got hit by invalidating probe, RdBlk response needs to go to both clusters";
+
+ IF_E0S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E0S but expecting a L2_to_L1D0 trigger, just drop when receive";
+ IF_E1S, AccessPermission:Busy, desc="something got hit with Probe Invalidate, now just I_E1S but expecting a L2_to_L1D1 trigger, just drop when receive";
+ IF_ES, AccessPermission:Busy, desc="same, but waiting for two fills";
+ IF0_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ IF1_ES, AccessPermission:Busy, desc="same, but waiting for two fills, got one";
+ F_S0, AccessPermission:Busy, desc="same, but going to S0 when trigger received";
+ F_S1, AccessPermission:Busy, desc="same, but going to S1 when trigger received";
+
+ ES_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for clean writeback ack";
+ MO_I, AccessPermission:Read_Only, desc="L2 replacement, waiting for dirty writeback ack";
+ MO_S0, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
+ MO_S1, AccessPermission:Read_Only, desc="M/O got Ifetch Miss, must write back first, then send RdBlkS";
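+    // Fill states: the 0/1 suffix names the cluster whose L1D (or L1I for the
+    // Si_* states) is being filled from the L2; the unsuffixed *_F variants
+    // fill both clusters.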
+ S_F0, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F1, AccessPermission:Read_Only, desc="Shared, filling L1";
+ S_F, AccessPermission:Read_Only, desc="Shared, filling L1";
+ O_F0, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F1, AccessPermission:Read_Only, desc="Owned, filling L1";
+ O_F, AccessPermission:Read_Only, desc="Owned, filling L1";
+ Si_F0, AccessPermission:Read_Only, desc="Shared, filling icache";
+ Si_F1, AccessPermission:Read_Only, desc="Shared, filling icache";
+ S_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M0, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ O_M1, AccessPermission:Read_Only, desc="Shared, issued CtoD, have not seen response yet";
+ S0, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 0, waiting for response";
+ S1, AccessPermission:Busy, desc="RdBlkS on behalf of cluster 1, waiting for response";
+
+ Es_F0, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F1, AccessPermission:Read_Write, desc="Es, Cluster read, filling";
+ Es_F, AccessPermission:Read_Write, desc="Es, other cluster read, filling";
+ E0_F, AccessPermission:Read_Write, desc="E0, cluster read, filling";
+    E1_F, AccessPermission:Read_Write, desc="E1, cluster read, filling";
+    E0_Es, AccessPermission:Read_Write, desc="E0, other cluster read, filling";
+    E1_Es, AccessPermission:Read_Write, desc="E1, other cluster read, filling";
+    Ms_F0, AccessPermission:Read_Write, desc="Ms, cluster read, filling";
+    Ms_F1, AccessPermission:Read_Write, desc="Ms, cluster read, filling";
+    Ms_F, AccessPermission:Read_Write, desc="Ms, other cluster read, filling";
+    M0_F, AccessPermission:Read_Write, desc="M0, cluster read, filling";
+    M0_Ms, AccessPermission:Read_Write, desc="M0, other cluster read, filling";
+    M1_F, AccessPermission:Read_Write, desc="M1, cluster read, filling";
+    M1_Ms, AccessPermission:Read_Write, desc="M1, other cluster read, filling";
+
+ I_C, AccessPermission:Invalid, desc="Invalid, but waiting for WBAck from NB from canceled writeback";
+    S0_C, AccessPermission:Busy, desc="MO_S0 hit by invalidating probe, waiting for WBAck from NB for canceled WB";
+    S1_C, AccessPermission:Busy, desc="MO_S1 hit by invalidating probe, waiting for WBAck from NB for canceled WB";
+ S_C, AccessPermission:Busy, desc="S*_C got NB_AckS, still waiting for WBAck";
+
+ } // END STATES
+
+ // BEGIN EVENTS
+ enumeration(Event, desc="CP Events") {
+ // CP Initiated events
+ C0_Load_L1miss, desc="Cluster 0 load, L1 missed";
+ C0_Load_L1hit, desc="Cluster 0 load, L1 hit";
+    C1_Load_L1miss, desc="Cluster 1 load, L1 missed";
+    C1_Load_L1hit, desc="Cluster 1 load, L1 hit";
+ Ifetch0_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch1_L1hit, desc="Instruction fetch, hit in the L1";
+ Ifetch0_L1miss, desc="Instruction fetch, missed in the L1";
+ Ifetch1_L1miss, desc="Instruction fetch, missed in the L1";
+ C0_Store_L1miss, desc="Cluster 0 store missed in L1";
+ C0_Store_L1hit, desc="Cluster 0 store hit in L1";
+ C1_Store_L1miss, desc="Cluster 1 store missed in L1";
+ C1_Store_L1hit, desc="Cluster 1 store hit in L1";
+ // NB Initiated events
+    NB_AckS, desc="NB Ack (Shared) to Core Request";
+    NB_AckM, desc="NB Ack (Modified) to Core Request";
+    NB_AckE, desc="NB Ack (Exclusive) to Core Request";
+
+ NB_AckWB, desc="NB Ack for writeback";
+
+ // Memory System initiatied events
+ L1I_Repl, desc="Replace address from L1I"; // Presumed clean
+ L1D0_Repl, desc="Replace address from L1D0"; // Presumed clean
+ L1D1_Repl, desc="Replace address from L1D1"; // Presumed clean
+ L2_Repl, desc="Replace address from L2";
+
+ L2_to_L1D0, desc="L1 fill from L2";
+ L2_to_L1D1, desc="L1 fill from L2";
+ L2_to_L1I, desc="L1 fill from L2";
+
+ // Probe Events
+ PrbInvData, desc="probe, return O or M data";
+ PrbInvDataDemand, desc="probe, return O or M data. Demand request";
+ PrbInv, desc="probe, no need for data";
+ PrbShrData, desc="probe downgrade, return O or M data";
+ PrbShrDataDemand, desc="probe downgrade, return O or M data. Demand request";
+ ForceRepl, desc="probe from r-buf. Act as though a repl";
+    ForceDowngrade, desc="probe from r-buf. Act as though a downgrade";
+
+ } // END EVENTS
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ L1D0DataArrayRead, desc="Read the data array";
+ L1D0DataArrayWrite, desc="Write the data array";
+    L1D0TagArrayRead, desc="Read the tag array";
+    L1D0TagArrayWrite, desc="Write the tag array";
+ L1D1DataArrayRead, desc="Read the data array";
+ L1D1DataArrayWrite, desc="Write the data array";
+    L1D1TagArrayRead, desc="Read the tag array";
+    L1D1TagArrayWrite, desc="Write the tag array";
+ L1IDataArrayRead, desc="Read the data array";
+ L1IDataArrayWrite, desc="Write the data array";
+    L1ITagArrayRead, desc="Read the tag array";
+    L1ITagArrayWrite, desc="Write the tag array";
+ L2DataArrayRead, desc="Read the data array";
+ L2DataArrayWrite, desc="Write the data array";
+    L2TagArrayRead, desc="Read the tag array";
+    L2TagArrayWrite, desc="Write the tag array";
+ }
+
+
+ // BEGIN STRUCTURE DEFINITIONS
+
+
+ // Cache Entry
+ structure(Entry, desc="...", interface="AbstractCacheEntry") {
+ State CacheState, desc="cache state";
+ bool Dirty, desc="Is the data dirty (diff than memory)?";
+ DataBlock DataBlk, desc="data for the block";
+ bool FromL2, default="false", desc="block just moved from L2";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block, required for concurrent writebacks";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ int NumPendingMsgs, desc="Number of acks/data messages that this processor is waiting for";
+ bool Shared, desc="Victim hit by shared probe";
+ bool AckNeeded, desc="True if need to ack r-dir";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<CorePair_TBE>", constructor="m_number_of_TBEs";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ // END STRUCTURE DEFINITIONS
+
+ // BEGIN INTERNAL FUNCTIONS
+
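+  // The peer is always this core pair's RegionBuffer (selected by
+  // regionBufferNum); the mach argument is ignored.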
+ MachineID getPeer(MachineID mach) {
+ return createMachineID(MachineType:RegionBuffer, intToID(regionBufferNum));
+ }
+
+ bool addressInCore(Addr addr) {
+ return (L2cache.isTagPresent(addr) || L1Icache.isTagPresent(addr) || L1D0cache.isTagPresent(addr) || L1D1cache.isTagPresent(addr));
+ }
+
+ Entry getCacheEntry(Addr address), return_by_pointer="yes" {
+ Entry L2cache_entry := static_cast(Entry, "pointer", L2cache.lookup(address));
+ return L2cache_entry;
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return tbe.DataBlk;
+ } else {
+ return getCacheEntry(addr).DataBlk;
+ }
+ }
+
+ Entry getL1CacheEntry(Addr addr, int cluster), return_by_pointer="yes" {
+ if (cluster == 0) {
+ Entry L1D0_entry := static_cast(Entry, "pointer", L1D0cache.lookup(addr));
+ return L1D0_entry;
+ } else {
+ Entry L1D1_entry := static_cast(Entry, "pointer", L1D1cache.lookup(addr));
+ return L1D1_entry;
+ }
+ }
+
+ Entry getICacheEntry(Addr addr), return_by_pointer="yes" {
+ Entry c_entry := static_cast(Entry, "pointer", L1Icache.lookup(addr));
+ return c_entry;
+ }
+
+ bool presentOrAvail2(Addr addr) {
+ return L2cache.isTagPresent(addr) || L2cache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailI(Addr addr) {
+ return L1Icache.isTagPresent(addr) || L1Icache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailD0(Addr addr) {
+ return L1D0cache.isTagPresent(addr) || L1D0cache.cacheAvail(addr);
+ }
+
+ bool presentOrAvailD1(Addr addr) {
+ return L1D1cache.isTagPresent(addr) || L1D1cache.cacheAvail(addr);
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if(is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.CacheState;
+ }
+ return State:I;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+
+ if (is_valid(cache_entry)) {
+ cache_entry.CacheState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ return CorePair_State_to_permission(tbe.TBEState);
+ }
+
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return CorePair_State_to_permission(cache_entry.CacheState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ bool isValid(Addr addr) {
+ AccessPermission perm := getAccessPermission(addr);
+ if (perm == AccessPermission:NotPresent ||
+ perm == AccessPermission:Invalid ||
+ perm == AccessPermission:Busy) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(CorePair_State_to_permission(state));
+ }
+ }
+
+ MachineType testAndClearLocalHit(Entry cache_entry) {
+ assert(is_valid(cache_entry));
+ if (cache_entry.FromL2) {
+ cache_entry.FromL2 := false;
+ return MachineType:L2Cache;
+ } else {
+ return MachineType:L1Cache;
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L1D0DataArrayRead) {
+ L1D0cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1D0DataArrayWrite) {
+ L1D0cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1D0TagArrayRead) {
+ L1D0cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1D0TagArrayWrite) {
+ L1D0cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L1D1DataArrayRead) {
+ L1D1cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1D1DataArrayWrite) {
+ L1D1cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1D1TagArrayRead) {
+ L1D1cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1D1TagArrayWrite) {
+ L1D1cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L1IDataArrayRead) {
+ L1Icache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L1IDataArrayWrite) {
+ L1Icache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L1ITagArrayRead) {
+ L1Icache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L1ITagArrayWrite) {
+ L1Icache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:L2DataArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L2DataArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L2TagArrayRead) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L2TagArrayWrite) {
+ L2cache.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
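+  // The RequestType annotations attached to each transition are checked here
+  // against the corresponding cache's tag/data array availability before the
+  // transition is allowed to fire.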
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L2DataArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L2DataArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L2TagArrayRead) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L2TagArrayWrite) {
+ return L2cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D0DataArrayRead) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D0DataArrayWrite) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D0TagArrayRead) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D0TagArrayWrite) {
+ return L1D0cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D1DataArrayRead) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D1DataArrayWrite) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1D1TagArrayRead) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1D1TagArrayWrite) {
+ return L1D1cache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1IDataArrayRead) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1IDataArrayWrite) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L1ITagArrayRead) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L1ITagArrayWrite) {
+ return L1Icache.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ return true;
+ }
+ }
+
+ // END INTERNAL FUNCTIONS
+
+ // ** OUT_PORTS **
+
+ out_port(requestNetwork_out, CPURequestMsg, requestFromCore);
+ out_port(responseNetwork_out, ResponseMsg, responseFromCore);
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(unblockNetwork_out, UnblockMsg, unblockFromCore);
+
+ // ** IN_PORTS **
+
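+ // The trigger queue carries internally generated L2-to-L1 fill events
+ // (enqueued by the fi/f0/f1 actions with l2_hit_latency); in_msg.Dest
+ // selects which L1 (I, D0, or D1) receives the fill.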
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, block_on="addr") {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == TriggerType:L2_to_L1) {
+ if (in_msg.Dest == CacheId:L1I) {
+ trigger(Event:L2_to_L1I, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Dest == CacheId:L1D0) {
+ trigger(Event:L2_to_L1D0, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Dest == CacheId:L1D1) {
+ trigger(Event:L2_to_L1D1, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("unexpected trigger dest");
+ }
+ }
+ }
+ }
+ }
+
+
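+ // Probes from the directory are mapped to events here: PrbInv flavors
+ // invalidate (with or without returning data), PrbDowngrade flavors ask
+ // for shared data, and PrbRepl / PrbRegDowngrade trigger ForceRepl /
+ // ForceDowngrade.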
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeToCore) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg, block_on="addr") {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ if (in_msg.DemandRequest) {
+ trigger(Event:PrbInvDataDemand, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.ReturnData) {
+ trigger(Event:PrbInvData, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:PrbInv, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ if (in_msg.DemandRequest) {
+ trigger(Event:PrbShrDataDemand, in_msg.addr, cache_entry, tbe);
+ } else {
+ assert(in_msg.ReturnData);
+ trigger(Event:PrbShrData, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == ProbeRequestType:PrbRepl) {
+ trigger(Event:ForceRepl, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == ProbeRequestType:PrbRegDowngrade) {
+ trigger(Event:ForceDowngrade, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown probe request");
+ }
+ }
+ }
+ }
+
+
+ // ResponseNetwork
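+ // NBSysResp carries the granted coherence state (M/E/S), which selects the
+ // matching NB_Ack* event; NBSysWBAck acknowledges a writeback.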
+ in_port(responseToCore_in, ResponseMsg, responseToCore) {
+ if (responseToCore_in.isReady(clockEdge())) {
+ peek(responseToCore_in, ResponseMsg, block_on="addr") {
+
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+
+ if (in_msg.Type == CoherenceResponseType:NBSysResp) {
+ if (in_msg.State == CoherenceState:Modified) {
+ trigger(Event:NB_AckM, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Shared) {
+ trigger(Event:NB_AckS, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.State == CoherenceState:Exclusive) {
+ trigger(Event:NB_AckE, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:NBSysWBAck) {
+ trigger(Event:NB_AckWB, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unexpected Response Message to Core");
+ }
+ }
+ }
+ }
+
+ // Nothing from the Unblock Network
+
+ // Mandatory Queue
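+ // Core requests are steered by contextId parity: even contexts use the
+ // cluster-0 sequencer/L1D0, odd contexts use cluster 1. Stores write
+ // through, so they also need the line present or allocatable in L2
+ // (presentOrAvail2) before they can proceed.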
+ in_port(mandatoryQueue_in, RubyRequest, mandatoryQueue, desc="...") {
+ if (mandatoryQueue_in.isReady(clockEdge())) {
+ peek(mandatoryQueue_in, RubyRequest, block_on="LineAddress") {
+
+ Entry cache_entry := getCacheEntry(in_msg.LineAddress);
+ TBE tbe := TBEs.lookup(in_msg.LineAddress);
+
+ if (in_msg.Type == RubyRequestType:IFETCH) {
+ // FETCH ACCESS
+
+ if (L1Icache.isTagPresent(in_msg.LineAddress)) {
+ if (mod(in_msg.contextId, 2) == 0) {
+ trigger(Event:Ifetch0_L1hit, in_msg.LineAddress, cache_entry, tbe);
+ } else {
+ trigger(Event:Ifetch1_L1hit, in_msg.LineAddress, cache_entry, tbe);
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailI(in_msg.LineAddress)) {
+ if (mod(in_msg.contextId, 2) == 0) {
+ trigger(Event:Ifetch0_L1miss, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ trigger(Event:Ifetch1_L1miss, in_msg.LineAddress, cache_entry,
+ tbe);
+ }
+ } else {
+ Addr victim := L1Icache.cacheProbe(in_msg.LineAddress);
+ trigger(Event:L1I_Repl, victim,
+ getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else { // Not present or avail in L2
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L2_Repl(0) is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ // DATA ACCESS
+ if (mod(in_msg.contextId, 2) == 1) {
+ if (L1D1cache.isTagPresent(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C1_Load_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ // Stores must write through, make sure L2 avail.
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ trigger(Event:C1_Store_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L2_Repl(1) is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailD1(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C1_Load_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ } else {
+ trigger(Event:C1_Store_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1D1cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L1D1_Repl is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L1D1_Repl, victim,
+ getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ } else { // not present or avail in L2
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L2_Repl(2) is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim), TBEs.lookup(victim));
+ }
+ }
+ } else {
+ Entry L1D0cache_entry := getL1CacheEntry(in_msg.LineAddress, 0);
+ if (is_valid(L1D0cache_entry)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C0_Load_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ trigger(Event:C0_Store_L1hit, in_msg.LineAddress, cache_entry,
+ tbe);
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L2_Repl(3) is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ } else {
+ if (presentOrAvail2(in_msg.LineAddress)) {
+ if (presentOrAvailD0(in_msg.LineAddress)) {
+ if (in_msg.Type == RubyRequestType:LD) {
+ trigger(Event:C0_Load_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ } else {
+ trigger(Event:C0_Store_L1miss, in_msg.LineAddress,
+ cache_entry, tbe);
+ }
+ } else {
+ Addr victim := L1D0cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L1D0_Repl is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L1D0_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ } else {
+ Addr victim := L2cache.cacheProbe(in_msg.LineAddress);
+ DPRINTF(RubySlicc, "Victim for %s L2_Repl(4) is %s\n", in_msg.LineAddress, victim);
+ trigger(Event:L2_Repl, victim, getCacheEntry(victim),
+ TBEs.lookup(victim));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
+ // ACTIONS
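+ // Actions run with address, cache_entry, and tbe bound by the trigger()
+ // call that fired the transition; the short quoted codes appear in SLICC's
+ // generated transition tables.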
+ action(ii_invIcache, "ii", desc="invalidate iCache") {
+ if (L1Icache.isTagPresent(address)) {
+ L1Icache.deallocate(address);
+ }
+ }
+
+ action(i0_invCluster, "i0", desc="invalidate cluster 0") {
+ if (L1D0cache.isTagPresent(address)) {
+ L1D0cache.deallocate(address);
+ }
+ }
+
+ action(i1_invCluster, "i1", desc="invalidate cluster 1") {
+ if (L1D1cache.isTagPresent(address)) {
+ L1D1cache.deallocate(address);
+ }
+ }
+
+ action(ib_invBothClusters, "ib", desc="invalidate both clusters") {
+ if (L1D0cache.isTagPresent(address)) {
+ L1D0cache.deallocate(address);
+ }
+ if (L1D1cache.isTagPresent(address)) {
+ L1D1cache.deallocate(address);
+ }
+ }
+
+ action(i2_invL2, "i2", desc="invalidate L2") {
+ if(is_valid(cache_entry)) {
+ L2cache.deallocate(address);
+ }
+ unset_cache_entry();
+ }
+
+ action(n_issueRdBlk, "n", desc="Issue RdBlk") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlk;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nM_issueRdBlkM, "nM", desc="Issue RdBlkM") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkM;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nMs_issueRdBlkMSinked, "nMs", desc="Issue RdBlkM with CtoDSinked") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkM;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.CtoDSinked := true;
+ }
+ }
+
+ action(nS_issueRdBlkS, "nS", desc="Issue RdBlkS") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkS;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := curCycle();
+ }
+ }
+
+ action(nSs_issueRdBlkSSinked, "nSs", desc="Issue RdBlkS with CtoDSinked") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:RdBlkS;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.CtoDSinked := true;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ }
+
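+ // Victimization: vd_victim writes back dirty (M/O) data as VicDirty and
+ // vc_victim evicts clean (E/S) data as VicClean; Shared is set when the
+ // line was in O (respectively S), indicating other sharers are possible.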
+ action(vd_victim, "vd", desc="Victimize M/O L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:O) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ action(vc_victim, "vc", desc="Victimize E/S L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ }
+ }
+
+ // Could send these two directly to dir if we made a new out network on channel 0
+ action(vdf_victimForce, "vdf", desc="Victimize M/O L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ assert(is_valid(cache_entry));
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicDirty;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:O) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ out_msg.Private := true;
+ }
+ }
+
+ action(vcf_victimForce, "vcf", desc="Victimize E/S L2 Data") {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Type := CoherenceRequestType:VicClean;
+ out_msg.InitialRequestTime := curCycle();
+ if (cache_entry.CacheState == State:S) {
+ out_msg.Shared := true;
+ } else {
+ out_msg.Shared := false;
+ }
+ out_msg.Private := true;
+ }
+ }
+
+ action(a0_allocateL1D, "a0", desc="Allocate L1D0 Block") {
+ if (L1D0cache.isTagPresent(address) == false) {
+ L1D0cache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(a1_allocateL1D, "a1", desc="Allocate L1D1 Block") {
+ if (L1D1cache.isTagPresent(address) == false) {
+ L1D1cache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(ai_allocateL1I, "ai", desc="Allocate L1I Block") {
+ if (L1Icache.isTagPresent(address) == false) {
+ L1Icache.allocateVoid(address, new Entry);
+ }
+ }
+
+ action(a2_allocateL2, "a2", desc="Allocate L2 Block") {
+ if (is_invalid(cache_entry)) {
+ set_cache_entry(L2cache.allocate(address, new Entry));
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ assert(is_valid(cache_entry));
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.DataBlk := cache_entry.DataBlk; // Data only used for WBs
+ tbe.Dirty := cache_entry.Dirty;
+ tbe.Shared := false;
+ }
+
+ action(d_deallocateTBE, "d", desc="Deallocate TBE") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(p_popMandatoryQueue, "pm", desc="Pop Mandatory Queue") {
+ mandatoryQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop Response Queue") {
+ responseToCore_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop Trigger Queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="pop probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(il0_loadDone, "il0", desc="Cluster 0 i load done") {
+ Entry entry := getICacheEntry(address);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(il1_loadDone, "il1", desc="Cluster 1 i load done") {
+ Entry entry := getICacheEntry(address);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(l0_loadDone, "l0", desc="Cluster 0 load done") {
+ Entry entry := getL1CacheEntry(address, 0);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(l1_loadDone, "l1", desc="Cluster 1 load done") {
+ Entry entry := getL1CacheEntry(address, 1);
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ assert(is_valid(entry));
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ }
+
+ action(xl0_loadDone, "xl0", desc="Cluster 0 load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ DPRINTF(ProtocolTrace, "CP Load Done 0 -- address %s, data: %s\n",
+ address, l2entry.DataBlk);
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ assert(is_valid(l2entry));
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xl1_loadDone, "xl1", desc="Cluster 1 load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ assert(is_valid(l2entry));
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xi0_loadDone, "xi0", desc="Cluster 0 i-load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ assert(is_valid(l2entry));
+ sequencer.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(xi1_loadDone, "xi1", desc="Cluster 1 i-load done") {
+ peek(responseToCore_in, ResponseMsg) {
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ Entry l2entry := getCacheEntry(address); // Used for functional accesses
+ // L2 supplies data (functional accesses only look in L2, ok because L1
+ // writes through to L2)
+ assert(is_valid(l2entry));
+ sequencer1.readCallback(address,
+ l2entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ }
+ }
+
+ action(s0_storeDone, "s0", desc="Cluster 0 store done") {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ cache_entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.Dirty := true;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+
+ action(s1_storeDone, "s1", desc="Cluster 1 store done") {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ sequencer1.writeCallback(address,
+ cache_entry.DataBlk,
+ true,
+ testAndClearLocalHit(entry));
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+
+ action(xs0_storeDone, "xs0", desc="Cluster 0 store done") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ sequencer.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+ }
+
+ action(xs1_storeDone, "xs1", desc="Cluster 1 store done") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ assert((machineIDToMachineType(in_msg.Sender) == MachineType:Directory) ||
+ (machineIDToMachineType(in_msg.Sender) == MachineType:L3Cache));
+ sequencer1.writeCallback(address,
+ cache_entry.DataBlk,
+ false,
+ machineIDToMachineType(in_msg.Sender),
+ in_msg.InitialRequestTime,
+ in_msg.ForwardRequestTime,
+ in_msg.ProbeRequestStartTime);
+ cache_entry.Dirty := true;
+ entry.Dirty := true;
+ entry.DataBlk := cache_entry.DataBlk;
+ DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+ }
+ }
+
+ action(forward_eviction_to_cpu0, "fec0", desc="sends eviction information to processor0") {
+ if (send_evictions) {
+ DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+ sequencer.evictionCallback(address);
+ }
+ }
+
+ action(forward_eviction_to_cpu1, "fec1", desc="sends eviction information to processor1") {
+ if (send_evictions) {
+ DPRINTF(RubySlicc, "Sending invalidation for %s to the CPU\n", address);
+ sequencer1.evictionCallback(address);
+ }
+ }
+
+ action(ci_copyL2ToL1, "ci", desc="copy L2 data to L1") {
+ Entry entry := getICacheEntry(address);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(c0_copyL2ToL1, "c0", desc="copy L2 data to L1") {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(ss_sendStaleNotification, "ss", desc="stale data; nothing to writeback") {
+ peek(responseToCore_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:StaleNotif;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(c1_copyL2ToL1, "c1", desc="copy L2 data to L1") {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.Dirty := cache_entry.Dirty;
+ entry.DataBlk := cache_entry.DataBlk;
+ entry.FromL2 := true;
+ }
+
+ action(fi_L2ToL1, "fi", desc="L2 to L1 inst fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1I;
+ }
+ }
+
+ action(f0_L2ToL1, "f0", desc="L2 to L1 data fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1D0;
+ }
+ }
+
+ action(f1_L2ToL1, "f1", desc="L2 to L1 data fill") {
+ enqueue(triggerQueue_out, TriggerMsg, l2_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L2_to_L1;
+ out_msg.Dest := CacheId:L1D1;
+ }
+ }
+
+ action(wi_writeIcache, "wi", desc="write data to icache (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getICacheEntry(address);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(w0_writeDcache, "w0", desc="write data to dcache 0 (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 0);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(w1_writeDcache, "w1", desc="write data to dcache 1 (and l2)") {
+ peek(responseToCore_in, ResponseMsg) {
+ Entry entry := getL1CacheEntry(address, 1);
+ assert(is_valid(entry));
+ assert(is_valid(cache_entry));
+ entry.DataBlk := in_msg.DataBlk;
+ entry.Dirty := in_msg.Dirty;
+ cache_entry.DataBlk := in_msg.DataBlk;
+ cache_entry.Dirty := in_msg.Dirty;
+ }
+ }
+
+ action(wb_data, "wb", desc="write back data") {
+ peek(responseToCore_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUData;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Shared) {
+ out_msg.NbReqShared := true;
+ } else {
+ out_msg.NbReqShared := false;
+ }
+ out_msg.State := CoherenceState:Shared; // faux info
+ out_msg.MessageSize := MessageSizeType:Writeback_Data;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(pi_sendProbeResponseInv, "pi", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Hit := false;
+ out_msg.Ntsl := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pim_sendProbeResponseInvMs, "pim", desc="send probe ack inv, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false;
+ out_msg.Ntsl := true;
+ out_msg.Hit := false;
+ APPEND_TRANSITION_COMMENT("Setting Ms");
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(ph_sendProbeResponseHit, "ph", desc="send probe ack PrbShrData, no data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ assert(addressInCore(address) || is_valid(tbe));
+ out_msg.Dirty := false; // only true if sending back data, I think
+ out_msg.Hit := true;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pb_sendProbeResponseBackprobe, "pb", desc="send probe ack PrbShrData, no data, check for L1 residence") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ if (addressInCore(address)) {
+ out_msg.Hit := true;
+ } else {
+ out_msg.Hit := false;
+ }
+ out_msg.Dirty := false; // not sending back data, so def. not dirty
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pd_sendProbeResponseData, "pd", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pdm_sendProbeResponseDataMs, "pdm", desc="send probe ack, with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(cache_entry));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.DataBlk := cache_entry.DataBlk;
+ assert(cache_entry.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ APPEND_TRANSITION_COMMENT("Setting Ms");
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(pdt_sendProbeResponseDataFromTBE, "pdt", desc="send probe ack with data") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ assert(is_valid(tbe));
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.DataBlk := tbe.DataBlk;
+ assert(tbe.Dirty);
+ out_msg.Dirty := true;
+ out_msg.Hit := true;
+ out_msg.State := CoherenceState:NA;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.isValid := isValid(address);
+ }
+ }
+
+ action(ra_sendReplAck, "ra", desc="Send ack to r-buf that line is replaced if needed") {
+ if (is_invalid(tbe) || tbe.AckNeeded) {
+ enqueue(requestNetwork_out, CPURequestMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceRequestType:InvAck;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ APPEND_TRANSITION_COMMENT(" Sending ack to r-buf ");
+ } else {
+ APPEND_TRANSITION_COMMENT(" NOT Sending ack to r-buf ");
+ }
+ }
+
+ action(m_markAckNeeded, "m", desc="Mark TBE to send ack when deallocated") {
+ assert(is_valid(tbe));
+ tbe.AckNeeded := true;
+ }
+
+ action(mc_cancelWB, "mc", desc="send writeback cancel to L3") {
+ enqueue(responseNetwork_out, ResponseMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:CPUCancelWB;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(s_setSharedFlip, "s", desc="hit by shared probe, status may be different") {
+ assert(is_valid(tbe));
+ tbe.Shared := true;
+ }
+
+ action(uu_sendUnblock, "uu", desc="state changed, unblock") {
+ enqueue(unblockNetwork_out, UnblockMsg, issue_latency) {
+ out_msg.addr := address;
+ out_msg.Destination.add(map_Address_to_Directory(address));
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ out_msg.wasValid := isValid(address);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(sdv_sendDoneValid, "sdv", desc="Request finished, send done ack") {
+ enqueue(unblockNetwork_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.DoneAck := true;
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else if (is_valid(cache_entry)) {
+ out_msg.Dirty := cache_entry.Dirty;
+ } else {
+ out_msg.Dirty := false;
+ }
+ out_msg.validToInvalid := false;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(sdi_sendDoneInvalid, "sdi", desc="Request finished, send done ack") {
+ enqueue(unblockNetwork_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(getPeer(machineID));
+ out_msg.DoneAck := true;
+ out_msg.MessageSize := MessageSizeType:Unblock_Control;
+ if (is_valid(tbe)) {
+ out_msg.Dirty := tbe.Dirty;
+ } else if (is_valid(cache_entry)) {
+ out_msg.Dirty := cache_entry.Dirty;
+ } else {
+ out_msg.Dirty := false;
+ }
+ out_msg.validToInvalid := true;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(l10m_profileMiss, "l10m", desc="l10m miss profile") {
+ ++L1D0cache.demand_misses;
+ }
+
+ action(l11m_profileMiss, "l11m", desc="l11m miss profile") {
+ ++L1D1cache.demand_misses;
+ }
+
+ action(l1im_profileMiss, "l1im", desc="l1im miss profile") {
+ ++L1Icache.demand_misses;
+ }
+
+ action(l2m_profileMiss, "l2m", desc="l2m miss profile") {
+ ++L2cache.demand_misses;
+ }
+
+ action(yy_recycleProbeQueue, "yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zz_recycleMandatoryQueue, "\z", desc="recycle mandatory queue") {
+ mandatoryQueue_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+ // END ACTIONS
+
+ // BEGIN TRANSITIONS
+
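+ // Each transition lists (state set, event[, next state]), then the tag/data
+ // array accesses it needs (checked via checkResourceAvailable and recorded
+ // via recordRequestType above), then the actions to perform in order.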
+ // transitions from base
+ transition(I, C0_Load_L1miss, I_E0S) {L1D0TagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // since in I state, L2 miss as well
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ a2_allocateL2;
+ i1_invCluster;
+ ii_invIcache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C1_Load_L1miss, I_E1S) {L1D1TagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // since in I state, L2 miss as well
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ a2_allocateL2;
+ i0_invCluster;
+ ii_invIcache;
+ n_issueRdBlk;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} {
+ // track misses, if implemented
+ // L2 miss as well
+ l10m_profileMiss;
+ l2m_profileMiss;
+ l1im_profileMiss;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+ l11m_profileMiss;
+ // track misses, if implemented
+ // L2 miss as well
+ l2m_profileMiss;
+ l1im_profileMiss;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C0_Store_L1miss, I_M0) {L1D0TagArrayRead,L2TagArrayRead} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ a2_allocateL2;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(I, C1_Store_L1miss, I_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ a2_allocateL2;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, C0_Load_L1miss, S_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, C1_Load_L1miss, S_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Ifetch0_L1miss, Si_F0) {L1ITagArrayRead,L2TagArrayRead, L2DataArrayRead} {
+ l1im_profileMiss;
+ ai_allocateL1I;
+ fi_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S, Ifetch1_L1miss, Si_F1) {L1ITagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l1im_profileMiss;
+ ai_allocateL1I;
+ fi_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C0_Store_L1hit, C0_Store_L1miss}, S_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({S}, {C1_Store_L1hit, C1_Store_L1miss}, S_M1) {L1D1TagArrayRead,L2TagArrayRead} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+ transition(Es, C0_Load_L1miss, Es_F0) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, C1_Load_L1miss, Es_F1) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} { // can this be folded with S_F?
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch0_L1miss, S0) {L1ITagArrayRead, L2TagArrayRead} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, Ifetch1_L1miss, S1) {L1ITagArrayRead, L2TagArrayRead} {
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ ib_invBothClusters;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ // THESE SHOULD NOT BE INSTANTANEOUS, BUT OH WELL FOR NOW
+ transition(Es, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayWrite,L1D0TagArrayRead, L2TagArrayRead, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone; // instantaneous L1/L2 dirty - no writethrough delay
+ p_popMandatoryQueue;
+ }
+
+ transition(Es, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C0_Load_L1miss, E0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Load_L1miss, E0_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch0_L1miss, S0) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead } {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i0_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E0, C1_Store_L1miss, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ a1_allocateL1D;
+ l11m_profileMiss;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C1_Load_L1miss, E1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ a1_allocateL1D;
+ l11m_profileMiss;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Load_L1miss, E1_Es) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ a0_allocateL1D;
+ l10m_profileMiss;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch1_L1miss, S1) {L2TagArrayRead, L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, Ifetch0_L1miss, S0) {L2TagArrayRead,L1ITagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkS
+ l1im_profileMiss;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ i1_invCluster;
+ nS_issueRdBlkS;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(E1, C0_Store_L1miss, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite, L2DataArrayWrite} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C0_Store_L1hit, C0_Store_L1miss}, O_M0) {L1D0TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue CtoD
+ l10m_profileMiss;
+ a0_allocateL1D;
+ i1_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition({O}, {C1_Store_L1hit, C1_Store_L1miss}, O_M1) {L1D1TagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss, still issue RdBlkM
+ l11m_profileMiss;
+ a1_allocateL1D;
+ i0_invCluster;
+ ii_invIcache;
+ nM_issueRdBlkM;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C0_Load_L1miss, O_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O, C1_Load_L1miss, O_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C0_Load_L1miss, Ms_F0) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, C1_Load_L1miss, Ms_F1) {L2TagArrayRead, L2DataArrayRead, L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch0_L1miss, MO_S0) {L1ITagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms, M0, M1, O}, Ifetch1_L1miss, MO_S1) {L1ITagArrayRead, L2TagArrayRead} {
+ l2m_profileMiss; // permissions miss
+ l1im_profileMiss;
+ ai_allocateL1I;
+ t_allocateTBE;
+ ib_invBothClusters;
+ vd_victim;
+// i2_invL2;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(Ms, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D1TagArrayRead, L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C0_Load_L1miss, M0_F) {L1D0TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, C1_Load_L1miss, M0_Ms) {L2TagArrayRead, L2DataArrayRead,L1D1TagArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C0_Store_L1hit, C0_Store_L1miss}) {L1D0TagArrayRead, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead} {
+ a0_allocateL1D;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M0, {C1_Store_L1hit, C1_Store_L1miss}, M1) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ a1_allocateL1D;
+ i0_invCluster;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C0_Load_L1miss, M1_Ms) {L2TagArrayRead, L2DataArrayRead, L1D0TagArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, C1_Load_L1miss, M1_F) {L1D1TagArrayRead, L2TagArrayRead, L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C0_Store_L1hit, C0_Store_L1miss}, M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayRead, L2DataArrayWrite, L2TagArrayWrite} {
+ a0_allocateL1D;
+ i1_invCluster;
+ s0_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ transition(M1, {C1_Store_L1hit, C1_Store_L1miss}) {L1D1TagArrayRead, L1D1DataArrayWrite, L2TagArrayRead, L2DataArrayWrite} {
+ a1_allocateL1D;
+ s1_storeDone;
+ p_popMandatoryQueue;
+ }
+
+ // end transitions from base
+
+ // Begin simple hit transitions
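+ // L1 load hits still read the data block out of the L2 entry in the
+ // *_loadDone actions; that is safe because L1 writes through, so L2 always
+ // holds the current data.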
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es,
+ Ms_F1, M0_Ms}, C0_Load_L1hit) {L1D0TagArrayRead, L1D0DataArrayRead} {
+ // track hits, if implemented
+ l0_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es,
+ Ms_F0, M1_Ms}, C1_Load_L1hit) {L1D1TagArrayRead, L1D1DataArrayRead} {
+ // track hits, if implemented
+ l1_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch0_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il0_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, S_C, S_F0, S_F1, S_F}, Ifetch1_L1hit) {L1ITagArrayRead, L1IDataArrayRead} {
+ // track hits, if implemented
+ il1_loadDone;
+ p_popMandatoryQueue;
+ }
+
+ // end simple hit transitions
+
+ // Transitions from transient states
+
+ // recycles
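+ // Requests and replacements that conflict with an in-flight transaction are
+ // not NACKed; zz_recycleMandatoryQueue re-enqueues the head of the mandatory
+ // queue (after recycle_latency) until the transient state resolves.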
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, C0_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M1,
+ O_M1, S0, S1, I_C, S0_C, S1_C, S_C}, C0_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, C1_Load_L1hit) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, Si_F0, Si_F1, S_M0,
+ O_M0, S0, S1, I_C, S0_C, S1_C, S_C}, C1_Load_L1miss) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, {Ifetch0_L1hit, Ifetch1_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES,
+ IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, ES_I, MO_I, S_F0, S_F1, S_F,
+ O_F0, O_F1, O_F, S_M0, S_M1, O_M0, O_M1, Es_F0, Es_F1, Es_F, E0_F,
+ E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, I_C,
+ S_C}, {Ifetch0_L1miss, Ifetch1_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_E1S, IF_E1S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F1, O_F1,
+ Si_F0, Si_F1, S_M1, O_M1, S0, S1, Es_F1, E1_F, E0_Es, Ms_F1, M0_Ms,
+ M1_F, I_C, S0_C, S1_C, S_C}, {C0_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_E0S, IF_E0S, F_S0, F_S1, ES_I, MO_I, MO_S0, MO_S1, S_F0, O_F0,
+ Si_F0, Si_F1, S_M0, O_M0, S0, S1, Es_F0, E0_F, E1_Es, Ms_F0, M0_F,
+ M1_Ms, I_C, S0_C, S1_C, S_C}, {C1_Store_L1miss}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M0, O_M0,
+ Es_F0, Es_F1, Es_F, E0_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms,
+ M1_Ms}, {C0_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F1, S_F, O_F0, O_F1, O_F, Si_F0, Si_F1, S_M1,
+ O_M1, Es_F0, Es_F1, Es_F, E1_F, E0_Es, E1_Es, Ms_F0, Ms_F1, Ms_F,
+ M0_Ms, M1_F, M1_Ms}, {C1_Store_L1hit}) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M0, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_ES, IF_E0S, IF_ES,
+ IF0_ES, IF1_ES, S_F0, S_F, O_F0, O_F, S_M0, O_M0, Es_F0, Es_F, E0_F,
+ E1_Es, Ms_F0, Ms_F, M0_F, M1_Ms}, L1D0_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({I_M1, I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_E1S, I_ES, IF_E1S, IF_ES,
+ IF0_ES, IF1_ES, S_F1, S_F, O_F1, O_F, S_M1, O_M1, Es_F1, Es_F, E1_F,
+ E0_Es, Ms_F1, Ms_F, M0_Ms, M1_F}, L1D1_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({F_S0, F_S1, MO_S0, MO_S1, Si_F0, Si_F1, S0, S1, S0_C, S1_C}, L1I_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({S_C, S0_C, S1_C, S0, S1, Si_F0, Si_F1, I_M0, I_M1, I_M0M1,
+ I_M1M0, I_M0Ms, I_M1Ms, I_E0S, I_E1S, I_ES, S_F0, S_F1, S_F, O_F0, O_F1,
+ O_F, S_M0, O_M0, S_M1, O_M1, Es_F0, Es_F1, Es_F, E0_F, E1_F, E0_Es, E1_Es,
+ Ms_F0, Ms_F1, Ms_F, M0_F, M0_Ms, M1_F, M1_Ms, MO_S0, MO_S1, IF_E0S, IF_E1S,
+ IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, L2_Repl) {} {
+ zz_recycleMandatoryQueue;
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES, F_S0, F_S1}, {NB_AckS,
+ PrbInvData, PrbInvDataDemand, PrbInv, PrbShrData, PrbShrDataDemand}) {} {
+ zz_recycleMandatoryQueue; // These should resolve soon on their own; handling them (and the probes) immediately would require more transient states, which does not seem necessary.
+ }
+
+ transition({IF_E0S, IF_E1S, IF_ES, IF0_ES, IF1_ES}, NB_AckE) {} {
+ zz_recycleMandatoryQueue; // same as above
+ }
+
+ transition({E0_Es, E1_F, Es_F1}, C0_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F1, C0_Load_L1miss, S_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F1, C0_Load_L1miss, O_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F1, M0_Ms, M1_F}, C0_Load_L1miss, Ms_F) {L2DataArrayRead} {
+ l10m_profileMiss;
+ a0_allocateL1D;
+ f0_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Load_L1miss, I_M0Ms) {
+ l11m_profileMiss;
+ l2m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Load_L1miss, I_M1Ms) {
+ l10m_profileMiss;
+ l2m_profileMiss;
+ a0_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M0, C1_Store_L1miss, I_M0M1) {
+ l11m_profileMiss;
+ l2m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_M1, C0_Store_L1miss, I_M1M0) {L1D0TagArrayRead, L1D0TagArrayWrite, L2TagArrayRead, L2TagArrayWrite} {
+ l2m_profileMiss;
+ a0_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_E0S, C1_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l11m_profileMiss;
+ a1_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition(I_E1S, C0_Load_L1miss, I_ES) {} {
+ l2m_profileMiss;
+ l10m_profileMiss;
+ a0_allocateL1D;
+ p_popMandatoryQueue;
+ }
+
+ transition({E1_Es, E0_F, Es_F0}, C1_Load_L1miss, Es_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(S_F0, C1_Load_L1miss, S_F) { L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition(O_F0, C1_Load_L1miss, O_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({Ms_F0, M1_Ms, M0_F}, C1_Load_L1miss, Ms_F) {L2DataArrayRead} {
+ l11m_profileMiss;
+ a1_allocateL1D;
+ f1_L2ToL1;
+ p_popMandatoryQueue;
+ }
+
+ transition({S, Es, E0, O, Ms, M0, O_F1, S_F1, Si_F0, Si_F1, Es_F1, E0_Es, Ms_F1, M0_Ms}, L1D0_Repl) {L1D0TagArrayRead} {
+ i0_invCluster;
+ }
+
+ transition({S, Es, E1, O, Ms, M1, O_F0, S_F0, Si_F0, Si_F1, Es_F0, E1_Es, Ms_F0, M1_Ms}, L1D1_Repl) {L1D1TagArrayRead} {
+ i1_invCluster;
+ }
+
+ transition({S, S_C, S_F0, S_F1}, L1I_Repl) {L1ITagArrayRead} {
+ ii_invIcache;
+ }
+
+ transition({S, E0, E1, Es}, L2_Repl, ES_I) {L2TagArrayRead,L1D0TagArrayRead, L1D1TagArrayRead, L1ITagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vc_victim;
+ ib_invBothClusters;
+ i2_invL2;
+ ii_invIcache;
+ }
+
+ transition({Ms, M0, M1, O}, L2_Repl, MO_I) {L2TagArrayRead, L2TagArrayWrite, L1D0TagArrayRead, L1D1TagArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ t_allocateTBE;
+ vd_victim;
+ i2_invL2;
+ ib_invBothClusters; // nothing will happen for D0 on M1, vice versa
+ }
+
+ transition(S0, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ transition(S1, NB_AckS, S) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ wi_writeIcache;
+ xi1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckS, S_C) { L1IDataArrayWrite,L2DataArrayWrite} {
+ // no done ack needed since the RdBlkS was sinked
+ wi_writeIcache;
+ xi0_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckS, S_C) { L1D1DataArrayWrite,L2DataArrayWrite} {
+ wi_writeIcache;
+ xi1_loadDone;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M0, NB_AckM, M0) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ // These M0->M1 (and M1->M0) handoffs should not be instantaneous, but oh well for now.
+ transition(I_M0M1, NB_AckM, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ i0_invCluster;
+ s1_storeDone;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1M0, NB_AckM, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite,L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ i1_invCluster;
+ s0_storeDone;
+ pr_popResponseQueue;
+ }
+
+ // The above should be more like this, which adds some latency for the transfer to L1
+ transition(I_M0Ms, NB_AckM, M0_Ms) {L1D0DataArrayWrite,L2DataArrayWrite} {
+ w0_writeDcache;
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ f1_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_M1Ms, NB_AckM, M1_Ms) {L1D1DataArrayWrite,L2DataArrayWrite} {
+ w1_writeDcache;
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ f0_L2ToL1;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckE, E0) {L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckE, E1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckE, Es) {L1D1DataArrayWrite, L1D1TagArrayWrite, L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite } {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E0S, NB_AckS, S) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_E1S, NB_AckS, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(I_ES, NB_AckS, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayWrite} {
+ w0_writeDcache;
+ xl0_loadDone;
+ w1_writeDcache;
+ xl1_loadDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(S_F0, L2_to_L1D0, S) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F1, L2_to_L1D1, S) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F0, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ il0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Si_F1, L2_to_L1I, S) {L1ITagArrayWrite, L1IDataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ ci_copyL2ToL1;
+ il1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D0, S_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_F, L2_to_L1D1, S_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F0, L2_to_L1D0, O) { L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F1, L2_to_L1D1, O) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D0, O_F1) { L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(O_F, L2_to_L1D1, O_F0) { L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_F, L2_to_L1D1, M1) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_F, L2_to_L1D0, M0) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F0, L2_to_L1D0, Ms) {L1D0DataArrayWrite, L1D0TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F1, L2_to_L1D1, Ms) {L1D1DataArrayWrite, L1D1TagArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F, L2_to_L1D0, Ms_F1) {L1D0DataArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Ms_F, L2_to_L1D1, Ms_F0) {L1D1DataArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M1_Ms, L2_to_L1D0, Ms) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(M0_Ms, L2_to_L1D1, Ms) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F0, L2_to_L1D0, Es) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F1, L2_to_L1D1, Es) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2TagArrayWrite, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D0, Es_F1) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(Es_F, L2_to_L1D1, Es_F0) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_F, L2_to_L1D0, E0) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_F, L2_to_L1D1, E1) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E1_Es, L2_to_L1D0, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c0_copyL2ToL1;
+ l0_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(E0_Es, L2_to_L1D1, Es) {L2TagArrayRead, L2DataArrayRead} {
+ c1_copyL2ToL1;
+ l1_loadDone;
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E0S, L2_to_L1D0, I_E0S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_E1S, L2_to_L1D1, I_E1S) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D0, IF1_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF_ES, L2_to_L1D1, IF0_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF0_ES, L2_to_L1D0, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(IF1_ES, L2_to_L1D1, I_ES) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S0, L2_to_L1I, S0) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(F_S1, L2_to_L1I, S1) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition({S_M0, O_M0}, NB_AckM, M0) {L1D0TagArrayWrite, L1D0DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ xs0_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition({S_M1, O_M1}, NB_AckM, M1) {L1D1TagArrayWrite, L1D1DataArrayWrite, L2DataArrayWrite, L2TagArrayWrite} {
+ xs1_storeDone;
+ sdv_sendDoneValid;
+ uu_sendUnblock;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ ra_sendReplAck;
+ sdi_sendDoneInvalid;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(ES_I, NB_AckWB, I) {L2TagArrayWrite} {
+ wb_data;
+ ra_sendReplAck;
+ sdi_sendDoneInvalid;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S0, NB_AckWB, S0) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ sdv_sendDoneValid;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(MO_S1, NB_AckWB, S1) {L2TagArrayWrite} {
+ wb_data;
+ i2_invL2;
+ a2_allocateL2;
+ sdv_sendDoneValid;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ // Writeback cancel "ack"
+ transition(I_C, NB_AckWB, I) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdi_sendDoneInvalid;
+ d_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(S0_C, NB_AckWB, S0) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ transition(S1_C, NB_AckWB, S1) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ transition(S_C, NB_AckWB, S) {L2TagArrayWrite} {
+ ss_sendStaleNotification;
+ sdv_sendDoneValid;
+ pr_popResponseQueue;
+ }
+
+ // Begin Probe Transitions
+
+ transition({Ms, M0, M1, O}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ i2_invL2;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S, I}, {PrbInvData, PrbInvDataDemand}, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache; // only relevant for S
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, {PrbInvData, PrbInvDataDemand}, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, {PrbInvData, PrbInvDataDemand}, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O, Es, E0, E1, S, I}, PrbInv, I) {L2TagArrayRead, L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2; // nothing will happen in I
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, PrbInv, I_C) {L2TagArrayWrite} {
+ t_allocateTBE;
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(I_C, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms, M0, M1, O}, {PrbShrData, PrbShrDataDemand}, O) {L2TagArrayRead, L2TagArrayWrite, L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({Es, E0, E1, S}, {PrbShrData, PrbShrDataDemand}, S) {L2TagArrayRead, L2TagArrayWrite} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(S_C, {PrbShrData, PrbShrDataDemand}) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({I, I_C}, {PrbShrData, PrbShrDataDemand}) {L2TagArrayRead} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M0)
+ a0_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M1, I_E1S}, {PrbInv, PrbInvData, PrbInvDataDemand}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters; // must invalidate current data (only relevant for I_M1)
+ a1_allocateL1D; // but make sure there is room for incoming data when it arrives
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0M1, I_M1M0, I_M0Ms, I_M1Ms, I_ES}, {PrbInv, PrbInvData, PrbInvDataDemand, PrbShrData, PrbShrDataDemand}) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ pp_popProbeQueue;
+ }
+
+ transition({I_M0, I_E0S, I_M1, I_E1S}, {PrbShrData, PrbShrDataDemand}) {} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, {PrbInvData, PrbInvDataDemand}, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, {PrbInvData, PrbInvDataDemand}, I_C) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, PrbInv, I_C) {} {
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ pp_popProbeQueue;
+ }
+
+ transition(ES_I, {PrbShrData, PrbShrDataDemand}, ES_I) {} {
+ ph_sendProbeResponseHit;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_I, {PrbShrData, PrbShrDataDemand}, MO_I) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S0, {PrbInvData, PrbInvDataDemand}, S0_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdt_sendProbeResponseDataFromTBE;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S1, {PrbInvData, PrbInvDataDemand}, S1_C) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdt_sendProbeResponseDataFromTBE;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S0, PrbInv, S0_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(MO_S1, PrbInv, S1_C) {L2TagArrayWrite} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ i2_invL2;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ d_deallocateTBE;
+ pp_popProbeQueue;
+ }
+
+ transition({MO_S0, MO_S1}, {PrbShrData, PrbShrDataDemand}) {} {
+ pdt_sendProbeResponseDataFromTBE;
+ s_setSharedFlip;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F0, Es_F0, E0_F, E1_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E0S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ // invalidate everything you've got
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ // but make sure you have room for what you need from the fill
+ a0_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F1, Es_F1, E1_F, E0_Es}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_E1S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ // invalidate everything you've got
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ // but make sure you have room for what you need from the fill
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F, Es_F}, {PrbInvData, PrbInvDataDemand, PrbInv}, IF_ES) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ // invalidate everything you've got
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ // but make sure you have room for what you need from the fill
+ a0_allocateL1D;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition(Si_F0, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S0) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition(Si_F1, {PrbInvData, PrbInvDataDemand, PrbInv}, F_S1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ nS_issueRdBlkS;
+ pp_popProbeQueue;
+ }
+
+ transition({Es_F0, E0_F, E1_Es}, {PrbShrData, PrbShrDataDemand}, S_F0) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({Es_F1, E1_F, E0_Es}, {PrbShrData, PrbShrDataDemand}, S_F1) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(Es_F, {PrbShrData, PrbShrDataDemand}, S_F) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({S_F0, S_F1, S_F, Si_F0, Si_F1}, {PrbShrData, PrbShrDataDemand}) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition(S_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition(O_M0, {PrbInvData, PrbInvDataDemand}, I_M0) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdm_sendProbeResponseDataMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M0, O_M0}, {PrbInv}, I_M0) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition(S_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition(O_M1, {PrbInvData, PrbInvDataDemand}, I_M1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pdm_sendProbeResponseDataMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M1, O_M1}, {PrbInv}, I_M1) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pim_sendProbeResponseInvMs;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S0, S0_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S1, S1_C}, {PrbInvData, PrbInvDataDemand, PrbInv}) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ ii_invIcache;
+ i2_invL2;
+ ai_allocateL1I;
+ a2_allocateL2;
+ pp_popProbeQueue;
+ }
+
+ transition({S_M0, S_M1}, {PrbShrData, PrbShrDataDemand}) {} {
+ ph_sendProbeResponseHit;
+ pp_popProbeQueue;
+ }
+
+ transition({O_M0, O_M1}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({S0, S1, S0_C, S1_C}, {PrbShrData, PrbShrDataDemand}) {} {
+ pb_sendProbeResponseBackprobe;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F0, M0_F, M1_Ms, O_F0}, {PrbInvData, PrbInvDataDemand}, IF_E0S) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F1, M1_F, M0_Ms, O_F1}, {PrbInvData, PrbInvDataDemand}, IF_E1S) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ ib_invBothClusters;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F, O_F}, {PrbInvData, PrbInvDataDemand}, IF_ES) {L2DataArrayRead} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pd_sendProbeResponseData;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F0, M0_F, M1_Ms, O_F0}, PrbInv, IF_E0S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F1, M1_F, M0_Ms, O_F1}, PrbInv, IF_E1S) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ i2_invL2;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F, O_F}, PrbInv, IF_ES) {} {
+ forward_eviction_to_cpu0;
+ forward_eviction_to_cpu1;
+ pi_sendProbeResponseInv;
+ ib_invBothClusters;
+ i2_invL2;
+ a0_allocateL1D;
+ a1_allocateL1D;
+ a2_allocateL2;
+ n_issueRdBlk;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F0, M0_F, M1_Ms}, {PrbShrData, PrbShrDataDemand}, O_F0) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F1, M1_F, M0_Ms}, {PrbShrData, PrbShrDataDemand}, O_F1) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({Ms_F}, {PrbShrData, PrbShrDataDemand}, O_F) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ transition({O_F0, O_F1, O_F}, {PrbShrData, PrbShrDataDemand}) {L2DataArrayRead} {
+ pd_sendProbeResponseData;
+ pp_popProbeQueue;
+ }
+
+ // END TRANSITIONS
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm
new file mode 100644
index 000000000..52d87fb8b
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-dir.sm
@@ -0,0 +1,2038 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:Directory, "AMD_Base-like protocol")
+: DirectoryMemory * directory;
+ CacheMemory * L3CacheMemory;
+ Cycles response_latency := 5;
+ Cycles response_latency_regionDir := 1;
+ Cycles l3_hit_latency := 30;
+ bool useL3OnWT := "False";
+ Cycles to_memory_controller_latency := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock";
+
+ // To the Cores
+ MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response";
+
+ // From region buffer
+ MessageBuffer * reqFromRegBuf, network="From", virtual_network="7", vnet_type="request";
+
+ // To Region directory
+ MessageBuffer * reqToRegDir, network="To", virtual_network="5", vnet_type="request";
+ MessageBuffer * reqFromRegDir, network="From", virtual_network="5", vnet_type="request";
+ MessageBuffer * unblockToRegDir, network="To", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * triggerQueue;
+ MessageBuffer * L3triggerQueue;
+ MessageBuffer * responseFromMemory;
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="Directory_State_U") {
+ U, AccessPermission:Backing_Store, desc="unblocked";
+ BR, AccessPermission:Backing_Store, desc="got CPU read request, blocked while sent to L3";
+ BW, AccessPermission:Backing_Store, desc="got CPU write request, blocked while sent to L3";
+ BL, AccessPermission:Busy, desc="got L3 WB request";
+ // BL is Busy because the only up-to-date copy of the data may be in flight
+ // in the writeback: the L3 may have already sent it and moved on, possibly
+ // to the I state. (A rough timeline is sketched after this state declaration.)
+ BI, AccessPermission:Backing_Store, desc="Blocked waiting for inv ack from core";
+ BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
+
+ // These are needed when a private request was issued before an inv was received
+ // for writebacks
+ BS_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BP_BL, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ // for reads
+ BS_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BP_B, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ }
+
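+ // Rough timeline for the BL race noted above: the L3 victimizes a dirty
+ // block and sends the writeback data toward this directory, then may drop
+ // to I. Until CPUData arrives here, the only up-to-date copy can be the
+ // message in flight, so BL uses AccessPermission:Busy rather than
+ // Backing_Store, presumably so functional accesses do not trust the stale
+ // memory copy in the meantime.
+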
+ // Events
+ enumeration(Event, desc="Directory events") {
+ // CPU requests
+ RdBlkS, desc="...";
+ RdBlkM, desc="...";
+ RdBlk, desc="...";
+ WriteThrough, desc="WriteThrough Message";
+ Atomic, desc="Atomic Message";
+
+ RdBlkSP, desc="...";
+ RdBlkMP, desc="...";
+ RdBlkP, desc="...";
+ VicDirtyP, desc="...";
+ VicCleanP, desc="...";
+ WriteThroughP, desc="WriteThrough Message";
+ AtomicP, desc="Atomic Message";
+
+ // writebacks
+ VicDirty, desc="...";
+ VicClean, desc="...";
+ CPUData, desc="WB data from CPU";
+ StaleWB, desc="WB response for a no longer valid request";
+
+ // probe responses
+ CPUPrbResp, desc="Probe Response Msg";
+ LastCPUPrbResp, desc="Last Probe Response Msg";
+
+ ProbeAcksComplete, desc="Probe Acks Complete";
+
+ L3Hit, desc="Hit in L3 return data to core";
+
+ // Memory Controller
+ MemData, desc="Fetched data from memory arrives";
+ WBAck, desc="Writeback Ack from memory arrives";
+
+ CoreUnblock, desc="Core received data, unblock";
+ UnblockWriteThrough, desc="unblock, self triggered";
+
+ StaleVicDirty, desc="Core invalidated before VicDirty processed";
+ StaleVicDirtyP, desc="Core invalidated before VicDirty processed";
+
+ // For region protocol
+ CPUReq, desc="Generic CPU request";
+ Inv, desc="Region dir needs a block invalidated";
+ Downgrade, desc="Region dir needs a block downgraded";
+
+ // For private accesses (bypassed reg-dir)
+ CPUReadP, desc="Initial req from core, sent to L3";
+ CPUWriteP, desc="Initial req from core, sent to L3";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ L3DataArrayRead, desc="Read the data array";
+ L3DataArrayWrite, desc="Write the data array";
+ L3TagArrayRead, desc="Read the tag array";
+ L3TagArrayWrite, desc="Write the tag array";
+ }
+
+ // TYPES
+
+ // DirectoryEntry
+ structure(Entry, desc="...", interface="AbstractEntry") {
+ State DirectoryState, desc="Directory state";
+ DataBlock DataBlk, desc="data for the block";
+ NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore";
+ }
+
+ structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+ DataBlock DataBlk, desc="data for the block";
+ MachineID LastSender, desc="Mach which this block came from";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ DataBlock DataBlkAux, desc="Auxiliary data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ int NumPendingAcks, desc="num acks expected";
+ MachineID OriginalRequestor, desc="Original Requestor";
+ MachineID WTRequestor, desc="WT Requestor";
+ bool Cached, desc="data hit in Cache";
+ bool MemData, desc="Got MemData?",default="false";
+ bool wtData, desc="Got write through data?",default="false";
+ bool atomicData, desc="Got Atomic op?",default="false";
+ Cycles InitialRequestTime, desc="...";
+ Cycles ForwardRequestTime, desc="...";
+ Cycles ProbeRequestStartTime, desc="...";
+ bool DemandRequest, desc="for profiling";
+ MachineID LastSender, desc="Mach which this block came from";
+ bool L3Hit, default="false", desc="Was this an L3 hit?";
+ bool TriggeredAcksComplete, default="false", desc="True if already triggered acks complete";
+ WriteMask writeMask, desc="outstanding write through mask";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_tbe(TBE a);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
+ Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr));
+
+ if (is_valid(dir_entry)) {
+ //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk);
+ return dir_entry;
+ }
+
+ dir_entry := static_cast(Entry, "pointer",
+ directory.allocate(addr, new Entry));
+ return dir_entry;
+ }
+
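+ // getDirectoryEntry allocates a directory entry on first touch, so callers
+ // never see an invalid pointer; a miss simply creates a fresh Entry in the
+ // declared default state U.
+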
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if (is_valid(tbe) && tbe.MemData) {
+ DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe);
+ return tbe.DataBlk;
+ }
+ DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr));
+ return getDirectoryEntry(addr).DataBlk;
+ }
+
+ State getState(TBE tbe, CacheEntry entry, Addr addr) {
+ return getDirectoryEntry(addr).DirectoryState;
+ }
+
+ State getStateFromAddr(Addr addr) {
+ return getDirectoryEntry(addr).DirectoryState;
+ }
+
+ void setState(TBE tbe, CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).DirectoryState := state;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ // For this Directory, all permissions are tracked in the Directory itself:
+ // since a block cannot be in the TBE without also being in the Dir, state
+ // is kept in one place.
+ if(directory.isPresent(addr)) {
+ return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
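+ // Functional accesses prefer in-flight data: reads return the TBE copy when
+ // a transaction is outstanding and otherwise fall through to memory, while
+ // writes patch the TBE copy (if any) and always also update memory,
+ // presumably because the TBE copy is discarded once the transaction
+ // completes.
+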
+ void setAccessPermission(CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // ** OUT_PORTS **
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);
+ out_port(responseNetwork_out, ResponseMsg, responseToCore);
+
+ out_port(requestNetworkReg_out, CPURequestMsg, reqToRegDir);
+ out_port(regAckNetwork_out, UnblockMsg, unblockToRegDir);
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue);
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=7) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == TriggerType:UnblockWriteThrough) {
+ trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=6) {
+ if (L3TriggerQueue_in.isReady(clockEdge())) {
+ peek(L3TriggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:L3Hit) {
+ trigger(Event:L3Hit, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ // Unblock Network
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=5) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ trigger(Event:CoreUnblock, in_msg.addr, entry, tbe);
+ }
+ }
+ }
+
+ // Core response network
+ in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=4) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ DPRINTF(RubySlicc, "core responses %s\n", in_msg);
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ if (is_valid(tbe) && tbe.NumPendingAcks == 1
+ && tbe.TriggeredAcksComplete == false) {
+ trigger(Event:LastCPUPrbResp, in_msg.addr, entry, tbe);
+ } else {
+ trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+ trigger(Event:CPUData, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+ } else {
+ error("Unexpected response type");
+ }
+ }
+ }
+ }
+
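+ // Probe responses are split into two events: the common CPUPrbResp, and
+ // LastCPUPrbResp when exactly one ack is still pending and ProbeAcksComplete
+ // has not already been triggered, presumably so the final response and the
+ // acks-complete handling can be folded into one transition rather than going
+ // through a separate AcksComplete trigger.
+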
+ // off-chip memory request/response is done
+ in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=3) {
+ if (memQueue_in.isReady(clockEdge())) {
+ peek(memQueue_in, MemoryMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
+ trigger(Event:MemData, in_msg.addr, entry, tbe);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
+ trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them.
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg.Type);
+ error("Invalid message");
+ }
+ }
+ }
+ }
+
+ in_port(regBuf_in, CPURequestMsg, reqFromRegBuf, rank=2) {
+ if (regBuf_in.isReady(clockEdge())) {
+ peek(regBuf_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceRequestType:ForceInv) {
+ trigger(Event:Inv, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:ForceDowngrade) {
+ trigger(Event:Downgrade, in_msg.addr, entry, tbe);
+ } else {
+ error("Bad request from region buffer");
+ }
+ }
+ }
+ }
+
+ in_port(regDir_in, CPURequestMsg, reqFromRegDir, rank=1) {
+ if (regDir_in.isReady(clockEdge())) {
+ peek(regDir_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ trigger(Event:WriteThrough, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ trigger(Event:VicDirty, in_msg.addr, entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ trigger(Event:VicClean, in_msg.addr, entry, tbe);
+ }
+ } else {
+ error("Bad message type fwded from Region Dir");
+ }
+ }
+ }
+ }
+
+ in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Private) {
+ // Bypass the region dir
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlkP, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkSP, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkMP, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:AtomicP, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ trigger(Event:WriteThroughP, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicDirtyP for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicDirtyP, in_msg.addr, entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicCleanP for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirtyP, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicCleanP, in_msg.addr, entry, tbe);
+ }
+ } else {
+ error("Bad message type for private access");
+ }
+ } else {
+ trigger(Event:CPUReq, in_msg.addr, entry, tbe);
+ }
+ }
+ }
+ }
+
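+ // Requests marked Private bypass the region machinery entirely and map onto
+ // the *P events (RdBlkP, WriteThroughP, VicDirtyP, ...); everything else is
+ // funneled into the generic CPUReq event and, presumably, forwarded to the
+ // region directory before coming back to be serviced here via reqFromRegDir.
+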
+ // Actions
+ action(s_sendResponseS, "s", desc="send Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := tbe.DemandRequest;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Cached) {
+ out_msg.State := CoherenceState:Shared;
+ } else {
+ out_msg.State := CoherenceState:Exclusive;
+ }
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := tbe.DemandRequest;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(m_sendResponseM, "m", desc="send Modified response") {
+ if (tbe.wtData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ } else {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := false;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := tbe.DemandRequest;
+ out_msg.L3Hit := tbe.L3Hit;
+ if (tbe.atomicData) {
+ out_msg.WTRequestor := tbe.WTRequestor;
+ }
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ if (tbe.atomicData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ }
+ }
+ }
+
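+ // For write-throughs the data response is suppressed here and replaced by a
+ // self-addressed UnblockWriteThrough trigger; atomics get both the Modified
+ // data response (carrying WTRequestor) and the same self-trigger. The
+ // trigger presumably stands in for the CoreUnblock that a normal data
+ // response would eventually receive.
+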
+ action(sb_sendResponseSBypass, "sb", desc="send Shared response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := false;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(esb_sendResponseESBypass, "esb", desc="send Exclusive or Shared response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Cached || in_msg.ForceShared) {
+ out_msg.State := CoherenceState:Shared;
+ } else {
+ out_msg.State := CoherenceState:Exclusive;
+ }
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := false;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(mbwt_sendResponseWriteThroughBypass, "mbwt", desc="send write through response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.DemandRequest := false;
+ }
+ } else {
+ assert(in_msg.Type == CoherenceRequestType:Atomic);
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := getDirectoryEntry(address).DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := false;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := false;
+ out_msg.L3Hit := tbe.L3Hit;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ }
+ }
+
+ action(mb_sendResponseMBypass, "mb", desc="send Modified response") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := false;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.DemandRequest := false;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := true;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.DemandRequest := tbe.DemandRequest;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(cp_sendResponseCtoDP, "cp", desc="send CtoD Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := true;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.DemandRequest := false;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(regDir_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := in_msg.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.DemandRequest := false;
+ }
+ }
+ }
+
+ action(wp_sendResponseWBAckP, "wp", desc="send WB Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ out_msg.DemandRequest := false;
+ }
+ }
+ }
+
+ action(wc_sendResponseWBAck, "wc", desc="send WB Ack for cancel") {
+ peek(responseNetwork_in, ResponseMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Sender);
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+
+ action(ra_ackRegionDir, "ra", desc="Ack region dir") {
+ peek(regDir_in, CPURequestMsg) {
+ if (in_msg.NoAckNeeded == false) {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency_regionDir) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:DirReadyAck;
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ }
+ }
+ }
+ }
+
+ action(l_queueMemRdReq, "lr", desc="Read data from memory") {
+ peek(regDir_in, CPURequestMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ tbe.DataBlk := entry.DataBlk;
+ tbe.LastSender := entry.LastSender;
+ tbe.L3Hit := true;
+ tbe.MemData := true;
+ DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk);
+ L3CacheMemory.deallocate(address);
+ } else {
+ queueMemoryRead(machineID, address, to_memory_controller_latency);
+ }
+ }
+ }
+
+ action(lrp_queueMemRdReqP, "lrp", desc="Read data from memory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ tbe.DataBlk := entry.DataBlk;
+ tbe.LastSender := entry.LastSender;
+ tbe.L3Hit := true;
+ tbe.MemData := true;
+ DPRINTF(RubySlicc, "L3 data is %s\n", entry.DataBlk);
+ L3CacheMemory.deallocate(address);
+ } else {
+ queueMemoryRead(machineID, address, to_memory_controller_latency);
+ }
+ }
+ }
+
+ action(dcr_probeInvCoreData, "dcr", desc="probe inv cores, return data") {
+ peek(regBuf_in, CPURequestMsg) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination := in_msg.Sharers;
+ tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count();
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ APPEND_TRANSITION_COMMENT(" dcr: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(ddr_probeDownCoreData, "ddr", desc="probe inv cores, return data") {
+ peek(regBuf_in, CPURequestMsg) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination := in_msg.Sharers;
+ tbe.NumPendingAcks := tbe.NumPendingAcks + in_msg.Sharers.count();
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ APPEND_TRANSITION_COMMENT(" dcr: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:CorePair) - 1;
+ out_msg.Destination.broadcast(MachineType:TCP);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP);
+ out_msg.Destination.broadcast(MachineType:SQC);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC);
+ out_msg.Destination.remove(in_msg.Requestor);
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:CorePair) - 1;
+ out_msg.Destination.broadcast(MachineType:TCP);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:TCP);
+ out_msg.Destination.broadcast(MachineType:SQC);
+ tbe.NumPendingAcks := tbe.NumPendingAcks + machineCount(MachineType:SQC);
+ out_msg.Destination.remove(in_msg.Requestor);
+ APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
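+ // Ack accounting for the two broadcast probes above: NumPendingAcks is
+ // charged for every CorePair minus one (assuming the requestor is itself a
+ // CorePair, which is then removed from the destination set) plus every TCP
+ // and SQC. As the comments note, this single-socket broadcast will not be
+ // realistic for multisocket systems.
+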
+ action(d_writeDataToMemory, "d", desc="Write data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk,
+ in_msg.addr);
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ peek(regDir_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.wtData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ if (in_msg.Type == CoherenceRequestType:Atomic) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.atomicData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+ tbe.Dirty := false;
+ }
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.Cached := in_msg.ForceShared;
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ tbe.ForwardRequestTime := curCycle();
+ tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ tbe.DemandRequest := in_msg.DemandRequest;
+ }
+ }
+
+ action(tp_allocateTBEP, "tp", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.wtData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ if (in_msg.Type == CoherenceRequestType:Atomic) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.atomicData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+ tbe.Dirty := false;
+ }
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.Cached := in_msg.ForceShared;
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ tbe.ForwardRequestTime := curCycle();
+ tbe.ProbeRequestStartTime := in_msg.ProbeRequestStartTime;
+ tbe.DemandRequest := false;
+ }
+ }
+
+ action(sa_setAcks, "sa", desc="setAcks") {
+ peek(regDir_in, CPURequestMsg) {
+ tbe.NumPendingAcks := in_msg.Acks;
+ APPEND_TRANSITION_COMMENT(" waiting for acks ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+ }
+
+ action(tr_allocateTBE, "tr", desc="allocate TBE Entry for Region inv") {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.NumPendingAcks := 0;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(wdp_writeBackDataPrivate, "wdp", desc="Write back data if needed") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlkAux := getDirectoryEntry(address).DataBlk;
+ tbe.DataBlkAux.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlkAux;
+ } else {
+ assert(in_msg.Type == CoherenceRequestType:Atomic);
+ tbe.DataBlkAux.atomicPartial(getDirectoryEntry(address).DataBlk,in_msg.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlkAux;
+ }
+ }
+ }
+
+ action(wd_writeBackData, "wd", desc="Write back data if needed") {
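+ // Three cases: merge write-through data under the write mask, apply the
+ // atomic against the current directory copy, or write back the full dirty
+ // block unchanged.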
+ if (tbe.wtData) {
+ DataBlock tmp := getDirectoryEntry(address).DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.atomicData) {
+ tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.Dirty == true) {
+ APPEND_TRANSITION_COMMENT(" Wrote data back ");
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ }
+
+ action(wdi_writeBackDataInv, "wdi", desc="Write back inv data if needed") {
+ // Unlike wd_writeBackData, only write the block back to the directory when it is dirty.
+ if (tbe.Dirty == true) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ APPEND_TRANSITION_COMMENT("Writing dirty data to dir");
+ DPRINTF(RubySlicc, "Data %s: %s\n", address, tbe.DataBlk);
+ } else {
+ APPEND_TRANSITION_COMMENT("NOT!!! Writing dirty data to dir");
+ }
+ }
+
+ action(wdt_writeBackDataInvNoTBE, "wdt", desc="Write back inv data if needed no TBE") {
+ // Same as wdi_writeBackDataInv, but the dirty data comes from the incoming response since there is no TBE.
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty == true) {
+ getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+ APPEND_TRANSITION_COMMENT("Writing dirty data to dir");
+ DPRINTF(RubySlicc, "Data %s: %s\n", address, in_msg.DataBlk);
+ } else {
+ APPEND_TRANSITION_COMMENT("NOT!!! Writing dirty data to dir");
+ }
+ }
+ }
+
+ action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
+ peek(memQueue_in, MemoryMsg) {
+ if (tbe.Dirty == false) {
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk;
+ }
+ tbe.MemData := true;
+ }
+ }
+
+ action(ml_writeL3DataToTBE, "ml", desc="write L3 data to TBE") {
+ assert(tbe.Dirty == false);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ tbe.DataBlk := entry.DataBlk;
+ tbe.LastSender := entry.LastSender;
+ tbe.L3Hit := true;
+ tbe.MemData := true;
+ }
+
+ action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+ DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+ if (tbe.wtData) {
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ } else if (tbe.Dirty) {
+ if(tbe.atomicData == false && tbe.wtData == false) {
+ DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+ assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data
+ }
+ } else {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ tbe.LastSender := in_msg.Sender;
+ }
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+ action(yc_writeCPUDataToTBE, "yc", desc="write CPU Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+ DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+ if (tbe.Dirty) {
+ DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+ assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data
+ }
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := false;
+ tbe.LastSender := in_msg.Sender;
+ }
+ }
+ }
+
+ action(x_decrementAcks, "x", desc="decrement Acks pending") {
+ if (tbe.NumPendingAcks > 0) {
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ } else {
+ APPEND_TRANSITION_COMMENT(" Double ack! ");
+ }
+ assert(tbe.NumPendingAcks >= 0);
+ APPEND_TRANSITION_COMMENT(" Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ action(o_checkForCompletion, "o", desc="check for ack completion") {
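+ // Fire the AcksComplete trigger at most once; TriggeredAcksComplete guards
+ // against duplicate triggers when this action runs on several responses.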
+ if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ tbe.TriggeredAcksComplete := true;
+ }
+ APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ action(ont_checkForCompletionNoTrigger, "ont", desc="check for ack completion, no trigger") {
+ if (tbe.NumPendingAcks == 0 && tbe.TriggeredAcksComplete == false) {
+ tbe.TriggeredAcksComplete := true;
+ }
+ APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ action(rvp_removeVicDirtyIgnore, "rvp", desc="Remove ignored core") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+ }
+ }
+
+ action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") {
+ peek(regDir_in, CPURequestMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+ }
+ }
+
+ action(r_sendRequestToRegionDir, "r", desc="send request to Region Directory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetworkReg_out, CPURequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := in_msg.Type;
+ out_msg.Requestor := in_msg.Requestor;
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ DPRINTF(RubySlicc, "out dest: %s\n", map_Address_to_RegionDir(address));
+ }
+ }
+ }
+
+ action(ai_ackInvalidate, "ai", desc="Ack to let the reg-dir know that the inv is ordered") {
+ peek(regBuf_in, CPURequestMsg) {
+ enqueue(regAckNetwork_out, UnblockMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg);
+ }
+ }
+ }
+
+ action(aic_ackInvalidate, "aic", desc="Ack to let the reg-dir know that the inv is ordered") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.NoAckNeeded == false) {
+ enqueue(regAckNetwork_out, UnblockMsg, 1) {
+ out_msg.addr := address;
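+ // Assumes the two-region-buffer configuration used here: buffer 0
+ // shadows the CPU (CorePair) side and buffer 1 the GPU side.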
+ if (machineIDToMachineType(in_msg.Sender) == MachineType:CorePair) {
+ out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(0)));
+ } else {
+ out_msg.Destination.add(createMachineID(MachineType:RegionBuffer, intToID(1)));
+ }
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "ai out_msg: %s\n", out_msg);
+ out_msg.wasValid := in_msg.isValid;
+ }
+ }
+ }
+ }
+
+ action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ } else {
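+ // Miss: if the set is full, write the victim back to memory and free
+ // its way before allocating an entry for this block.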
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ }
+ }
+ }
+
+ action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
+ if ((tbe.wtData || tbe.atomicData) && useL3OnWT) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ }
+ }
+ }
+
+ action(ali_allocateL3Block, "ali", desc="allocate the L3 block on ForceInv") {
+ if (tbe.Dirty == true) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ }
+ }
+ }
+
+ action(ali_allocateL3BlockNoTBE, "alt", desc="allocate the L3 block on ForceInv no TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" ali wrote data to L3 (hit) ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" ali wrote data to L3 ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ }
+ }
+ }
+ }
+
+ action(dl_deallocateL3, "dl", desc="deallocate the L3 block") {
+ L3CacheMemory.deallocate(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(prd_popRegionQueue, "prd", desc="pop region dir request queue") {
+ regDir_in.dequeue(clockEdge());
+ }
+
+ action(prb_popRegionBufQueue, "prb", desc="pop region buffer request queue") {
+ regBuf_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pm_popMemQueue, "pm", desc="pop mem queue") {
+ memQueue_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") {
+ L3TriggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pu_popUnblockQueue, "pu", desc="pop unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(yy_recycleResponseQueue, "yy", desc="recycle response queue") {
+ responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(ww_stallAndWaitRegRequestQueue, "ww", desc="stall and wait on region dir request queue") {
+ stall_and_wait(regDir_in, address);
+ }
+
+ action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
+ stall_and_wait(requestNetwork_in, address);
+ }
+
+ action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") {
+ wakeUpBuffers(address);
+ }
+
+ action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+ action(z_stall, "z", desc="...") {
+ }
+
+ // TRANSITIONS
+
+ // transitions from U
+
+ transition({BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Inv, Downgrade}) {
+ ww_stallAndWaitRegRequestQueue;
+ }
+
+ transition(U, Inv, BI){L3TagArrayRead} {
+ tr_allocateTBE;
+ dcr_probeInvCoreData; // only need to invalidate sharers
+ ai_ackInvalidate;
+ prb_popRegionBufQueue;
+ }
+
+ transition(U, Downgrade, BI){L3TagArrayRead} {
+ tr_allocateTBE;
+ ddr_probeDownCoreData; // only need to invalidate sharers
+ ai_ackInvalidate;
+ prb_popRegionBufQueue;
+ }
+
+ // The next two transitions are needed in the event that an invalidation
+ // is waiting for its ack from the core, but a new request makes it through
+ // the region directory before the acks arrive. This wouldn't be needed if
+ // we waited to ack the region dir until the directory got all the acks
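+ // For example, if a core's invalidation ack is still in flight when the
+ // region dir forwards a new RdBlkM for the same block, the RdBlkM is
+ // stalled here until the ack arrives and the blocking state is resolved.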
+ transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, WriteThrough, Atomic}) {
+ ww_stallAndWaitRegRequestQueue;
+ }
+
+ transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {RdBlkSP, RdBlkMP, RdBlkP}) {
+ st_stallAndWaitRequest;
+ }
+
+ transition({BR, BW, BI, BL, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {WriteThroughP,AtomicP}) {
+ st_stallAndWaitRequest;
+ }
+
+ transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sa_setAcks;
+ o_checkForCompletion;
+ ra_ackRegionDir;
+ prd_popRegionQueue;
+ }
+
+ transition(U, WriteThrough, BM_PM){L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ l_queueMemRdReq;
+ sa_setAcks;
+ o_checkForCompletion;
+ ra_ackRegionDir;
+ prd_popRegionQueue;
+ }
+
+ transition(U, {RdBlkM,Atomic}, BM_PM){L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sa_setAcks;
+ o_checkForCompletion;
+ ra_ackRegionDir;
+ prd_popRegionQueue;
+ }
+
+ transition(U, RdBlk, B_PM){L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sa_setAcks;
+ o_checkForCompletion;
+ ra_ackRegionDir;
+ prd_popRegionQueue;
+ }
+
+ transition(U, {RdBlkSP}, BS_M) {L3TagArrayRead} {
+ tp_allocateTBEP;
+ lrp_queueMemRdReqP;
+ p_popRequestQueue;
+ }
+
+ transition(U, WriteThroughP, BM_M) {L3TagArrayRead} {
+ tp_allocateTBEP;
+ wp_sendResponseWBAckP;
+ lrp_queueMemRdReqP;
+ p_popRequestQueue;
+ }
+
+ transition(U, {RdBlkMP,AtomicP}, BM_M) {L3TagArrayRead} {
+ tp_allocateTBEP;
+ lrp_queueMemRdReqP;
+ p_popRequestQueue;
+ }
+
+ transition(U, RdBlkP, B_M) {L3TagArrayRead} {
+ tp_allocateTBEP;
+ lrp_queueMemRdReqP;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicDirtyP, BL) {L3TagArrayRead} {
+ tp_allocateTBEP;
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicCleanP, BL) {L3TagArrayRead} {
+ tp_allocateTBEP;
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm, RdBlkSP, BM_Pm_B) {L3DataArrayWrite} {
+ sb_sendResponseSBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BS_Pm, RdBlkSP, BS_Pm_B) {L3DataArrayWrite} {
+ sb_sendResponseSBypass;
+ p_popRequestQueue;
+ }
+
+ transition(B_Pm, RdBlkSP, B_Pm_B) {L3DataArrayWrite} {
+ sb_sendResponseSBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BP, RdBlkSP, BP_B) {L3DataArrayWrite} {
+ sb_sendResponseSBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm, RdBlkMP, BM_Pm_B) {L3DataArrayWrite} {
+ mb_sendResponseMBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BS_Pm, RdBlkMP, BS_Pm_B) {L3DataArrayWrite} {
+ mb_sendResponseMBypass;
+ p_popRequestQueue;
+ }
+
+ transition(B_Pm, RdBlkMP, B_Pm_B) {L3DataArrayWrite} {
+ mb_sendResponseMBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BP, RdBlkMP, BP_B) {L3DataArrayWrite} {
+ mb_sendResponseMBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm, {WriteThroughP,AtomicP}, BM_Pm_B) {L3DataArrayWrite} {
+ wdp_writeBackDataPrivate;
+ mbwt_sendResponseWriteThroughBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BS_Pm, {WriteThroughP,AtomicP}, BS_Pm_B) {L3DataArrayWrite} {
+ wdp_writeBackDataPrivate;
+ mbwt_sendResponseWriteThroughBypass;
+ p_popRequestQueue;
+ }
+
+ transition(B_Pm, {WriteThroughP,AtomicP}, B_Pm_B) {L3DataArrayWrite} {
+ wdp_writeBackDataPrivate;
+ mbwt_sendResponseWriteThroughBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BP, {WriteThroughP,AtomicP}, BP_B) {L3DataArrayWrite} {
+ wdp_writeBackDataPrivate;
+ mbwt_sendResponseWriteThroughBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm, RdBlkP, BM_Pm_B) {L3DataArrayWrite} {
+ esb_sendResponseESBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BS_Pm, RdBlkP, BS_Pm_B) {L3DataArrayWrite} {
+ esb_sendResponseESBypass;
+ p_popRequestQueue;
+ }
+
+ transition(B_Pm, RdBlkP, B_Pm_B) {L3DataArrayWrite}{
+ esb_sendResponseESBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BP, RdBlkP, BP_B) {L3DataArrayWrite}{
+ esb_sendResponseESBypass;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm_B, CoreUnblock, BM_Pm) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(BS_Pm_B, CoreUnblock, BS_Pm) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(B_Pm_B, CoreUnblock, B_Pm) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(BP_B, CoreUnblock, BP) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(BM_Pm_B, UnblockWriteThrough, BM_Pm) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_Pm_B, UnblockWriteThrough, BS_Pm) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_Pm_B, UnblockWriteThrough, B_Pm) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(BP_B, UnblockWriteThrough, BP) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_Pm, VicDirtyP, BM_Pm_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BS_Pm, VicDirtyP, BS_Pm_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(B_Pm, VicDirtyP, B_Pm_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BP, VicDirtyP, BP_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm, VicCleanP, BM_Pm_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BS_Pm, VicCleanP, BS_Pm_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(B_Pm, VicCleanP, B_Pm_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BP, VicCleanP, BP_BL) {
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition(BM_Pm_BL, CPUData, BM_Pm) {
+ yc_writeCPUDataToTBE;
+ d_writeDataToMemory;
+ wa_wakeUpDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(BS_Pm_BL, CPUData, BS_Pm) {
+ yc_writeCPUDataToTBE;
+ d_writeDataToMemory;
+ wa_wakeUpDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(B_Pm_BL, CPUData, B_Pm) {
+ yc_writeCPUDataToTBE;
+ d_writeDataToMemory;
+ wa_wakeUpDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(BP_BL, CPUData, BP) {
+ yc_writeCPUDataToTBE;
+ d_writeDataToMemory;
+ wa_wakeUpDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({BR, BW, BL}, {VicDirtyP, VicCleanP}) {
+ st_stallAndWaitRequest;
+ }
+
+ transition({BR, BW, BL}, {VicDirty, VicClean}) {
+ ww_stallAndWaitRegRequestQueue;
+ }
+
+ transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
+ dt_deallocateTBE;
+ d_writeDataToMemory;
+ al_allocateL3Block;
+ wa_wakeUpDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(BL, StaleWB, U) {L3TagArrayWrite} {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({BI, B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirty, VicClean}) {
+ ww_stallAndWaitRegRequestQueue;
+ }
+
+ transition({BI, B, BS_M, BM_M, B_M, BS_PM, BM_PM, B_PM, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {VicDirtyP, VicCleanP}) {
+ st_stallAndWaitRequest;
+ }
+
+ transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, WBAck) {
+ pm_popMemQueue;
+ }
+
+ transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirtyP) {
+ rvp_removeVicDirtyIgnore;
+ wp_sendResponseWBAckP;
+ p_popRequestQueue;
+ }
+
+ transition({U, BR, BW, BL, BI, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B, BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, StaleVicDirty) {
+ rv_removeVicDirtyIgnore;
+ w_sendResponseWBAck;
+ prd_popRegionQueue;
+ }
+
+ transition(U, VicDirty, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ ra_ackRegionDir;
+ w_sendResponseWBAck;
+ prd_popRegionQueue;
+ }
+
+ transition(U, VicClean, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ ra_ackRegionDir;
+ w_sendResponseWBAck;
+ prd_popRegionQueue;
+ }
+
+ transition({B, BR}, CoreUnblock, U) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition({B, BR}, UnblockWriteThrough, U) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_M, MemData, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_PM, MemData, BS_Pm) {} {
+ mt_writeMemDataToTBE;
+ wa_wakeUpDependents;
+ pm_popMemQueue;
+ }
+
+ transition(BM_PM, MemData, BM_Pm){} {
+ mt_writeMemDataToTBE;
+ wa_wakeUpDependents;
+ pm_popMemQueue;
+ }
+
+ transition(B_PM, MemData, B_Pm){} {
+ mt_writeMemDataToTBE;
+ wa_wakeUpDependents;
+ pm_popMemQueue;
+ }
+
+ transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BS_PM, L3Hit, BS_Pm) {
+ wa_wakeUpDependents;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_PM, L3Hit, BM_Pm) {
+ wa_wakeUpDependents;
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_PM, L3Hit, B_Pm) {
+ wa_wakeUpDependents;
+ ptl_popTriggerQueue;
+ }
+
+ transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP, BI}, CPUPrbResp) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ pr_popResponseQueue;
+ }
+
+ transition({B, B_M, BS_M, BM_M}, {CPUPrbResp, LastCPUPrbResp}) {
+ z_stall;
+ }
+
+ transition({BS_Pm_BL, BM_Pm_BL, B_Pm_BL, BP_BL, BS_Pm_B, BM_Pm_B, B_Pm_B, BP_B}, {CPUPrbResp, LastCPUPrbResp}) {
+ // recycling because PrbResponse and data come on the same network
+ yy_recycleResponseQueue;
+ }
+
+ transition(U, {CPUPrbResp, LastCPUPrbResp}) {L3TagArrayRead, L3DataArrayWrite} {
+ aic_ackInvalidate;
+ wdt_writeBackDataInvNoTBE;
+ ali_allocateL3BlockNoTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(BL, {CPUPrbResp, LastCPUPrbResp}) {} {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ wdi_writeBackDataInv;
+ ali_allocateL3Block;
+ pr_popResponseQueue;
+ }
+
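+ // The LastCPUPrbResp transitions fold the final probe response into the
+ // completion step, while the ProbeAcksComplete transitions perform the
+ // same completion when it is signalled via the trigger queue.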
+ transition(BS_PM, LastCPUPrbResp, BS_M) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ pr_popResponseQueue;
+ }
+
+ transition(BS_PM, ProbeAcksComplete, BS_M) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_PM, LastCPUPrbResp, BM_M) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ pr_popResponseQueue;
+ }
+
+ transition(BM_PM, ProbeAcksComplete, BM_M) {} {
+ pt_popTriggerQueue;
+ }
+
+ transition(B_PM, LastCPUPrbResp, B_M) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ pr_popResponseQueue;
+ }
+
+ transition(B_PM, ProbeAcksComplete, B_M){} {
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_Pm, LastCPUPrbResp, B) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_Pm, LastCPUPrbResp, B) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_Pm, LastCPUPrbResp, B) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BP, LastCPUPrbResp, B) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ c_sendResponseCtoD;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3DataArrayWrite} {
+ c_sendResponseCtoD;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BI, LastCPUPrbResp, B) {
+ aic_ackInvalidate;
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ ont_checkForCompletionNoTrigger;
+ wa_wakeUpDependents;
+ wdi_writeBackDataInv;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(BI, ProbeAcksComplete, U) {L3TagArrayWrite, L3DataArrayWrite}{
+ wa_wakeUpDependents;
+ wdi_writeBackDataInv;
+ ali_allocateL3Block;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm
new file mode 100644
index 000000000..823933e57
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-Region-msg.sm
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+enumeration(CoherenceRequestType, desc="Coherence Request Types") {
+ // CPU Request Types ONLY
+ RdBlk, desc="Read Blk";
+ RdBlkM, desc="Read Blk Modified";
+ RdBlkS, desc="Read Blk Shared";
+ VicClean, desc="L2 clean eviction";
+ VicDirty, desc="L2 dirty eviction";
+
+ WrCancel, desc="want to cancel WB to Memory"; // should this be here?
+
+ WBApproval, desc="WB Approval";
+
+ // Messages between Dir and R-Dir
+ ForceInv, desc="Send invalidate to the block";
+ ForceDowngrade, desc="Send downgrade to the block";
+ Unblock, desc="Used to let the dir know a message has been sunk";
+
+ // Messages between R-Dir and R-Buffer
+ PrivateNotify, desc="Let region buffer know it has private access";
+ SharedNotify, desc="Let region buffer know it has shared access";
+ WbNotify, desc="Let region buffer know it saw its wb request";
+ Downgrade, desc="Force the region buffer to downgrade to shared";
+ // Response to R-Dir (probably should be on a different network, but
+ // I need it to be ordered with respect to requests)
+ InvAck, desc="Let the R-Dir know when the inv has occurred";
+
+ PrivateRequest, desc="R-buf wants the region in private";
+ UpgradeRequest, desc="R-buf wants to upgrade the region from shared to private";
+ SharedRequest, desc="R-buf wants the region in shared (could respond with private)";
+ CleanWbRequest, desc="R-buf wants to deallocate clean region";
+
+ NA, desc="So we don't get segfaults";
+}
+
+enumeration(ProbeRequestType, desc="Probe Request Types") {
+ PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+ PrbInv, desc="Probe to Invalidate";
+
+ // For regions
+ PrbRepl, desc="Force the cache to do a replacement";
+ PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+}
+
+
+enumeration(CoherenceResponseType, desc="Coherence Response Types") {
+ NBSysResp, desc="Northbridge response to CPU Rd request";
+ NBSysWBAck, desc="Northbridge response ok to WB";
+ TDSysResp, desc="TCCdirectory response to CPU Rd request";
+ TDSysWBAck, desc="TCCdirectory response ok to WB";
+ TDSysWBNack, desc="TCCdirectory response ok to drop";
+ CPUPrbResp, desc="CPU Probe Response";
+ CPUData, desc="CPU Data";
+ StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
+ CPUCancelWB, desc="want to cancel WB to Memory";
+ MemData, desc="Data from Memory";
+
+ // for regions
+ PrivateAck, desc="Ack that r-buf received private notify";
+ RegionWbAck, desc="Writeback Ack that r-buf completed deallocation";
+ DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake";
+}
+
+enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") {
+ Modified, desc="Modified";
+ Owned, desc="Owned state";
+ Exclusive, desc="Exclusive";
+ Shared, desc="Shared";
+ NA, desc="NA";
+}
+
+structure(CPURequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ Addr DemandAddress, desc="Physical block address for this request";
+ CoherenceRequestType Type, desc="Type of request";
+ DataBlock DataBlk, desc="data for the cache line"; // only for WB
+ bool Dirty, desc="whether WB data is dirty"; // only for WB
+ MachineID Requestor, desc="Node who initiated the request";
+ NetDest Destination, desc="Multicast destination mask";
+ bool Shared, desc="For CPU_WrVicBlk, vic is O not M. For CPU_ClVicBlk, vic is S";
+ MessageSizeType MessageSize, desc="size category of the message";
+ Cycles InitialRequestTime, default="0", desc="time the initial request was sent from the L1Cache";
+ Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request";
+ Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request";
+ bool DemandRequest, default="false", desc="For profiling purposes";
+
+ NetDest Sharers, desc="Caches that may have a valid copy of the data";
+ bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E";
+ bool Private, default="false", desc="Requestor already has private permissions, no need for dir check";
+ bool CtoDSinked, default="false", desc="This is true if the CtoD previously sent must have been sunk";
+
+ bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack";
+ int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive";
+ CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
+
+ bool functionalRead(Packet *pkt) {
+ // Only dirty writeback (VicDirty) requests carry the data block
+ if (Type == CoherenceRequestType:VicDirty) {
+ return testAndRead(addr, DataBlk, pkt);
+ }
+
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return testAndWrite(addr, DataBlk, pkt);
+ }
+}
+
+structure(NBProbeRequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ ProbeRequestType Type, desc="probe signal";
+ bool ReturnData, desc="Indicates CPU should return data";
+ NetDest Destination, desc="Node to whom the data is sent";
+ MessageSizeType MessageSize, desc="size category of the message";
+ bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer";
+ Addr DemandAddress, desc="Demand block address for a region request";
+ MachineID Requestor, desc="Requestor id for 3-hop requests";
+ bool NoAckNeeded, default="false", desc="For short-circuiting acks";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+
+}
+
+structure(TDProbeRequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ ProbeRequestType Type, desc="TD_PrbNxtState signal";
+ bool ReturnData, desc="Indicates CPU should return data";
+ bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)";
+ NetDest Destination, desc="Node to whom the data is sent";
+ MessageSizeType MessageSize, desc="size category of the message";
+ MachineID Sender, desc="Node who sent the data";
+ bool currentOwner, default="false", desc="Is the sender the current owner";
+ bool DoneAck, default="false", desc="Is this a done ack?";
+ bool Dirty, default="false", desc="Was block dirty when evicted";
+ bool wasValid, default="false", desc="Was block valid when evicted";
+ bool valid, default="false", desc="Is block valid";
+ bool validToInvalid, default="false", desc="Did the block go from valid to invalid when evicted";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+}
+
+// Response Messages seemed to be easily munged into one type
+structure(ResponseMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe";
+ MachineID Sender, desc="Node who sent the data";
+ NetDest Destination, desc="Node to whom the data is sent";
+ // Begin Used Only By CPU Response
+ DataBlock DataBlk, desc="data for the cache line";
+ bool Hit, desc="probe hit valid line";
+ bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ bool Ntsl, desc="indicates the probed line will be invalid after the probe";
+ bool UntransferredOwner, desc="pending confirmation of ownership change";
+ // End Used Only By CPU Response
+
+ // Begin NB Response Only
+ CoherenceState State, default=CoherenceState_NA, desc="What returned data from NB should be in";
+ bool CtoD, desc="was the originator a CtoD?";
+ // End NB Response Only
+
+ bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe";
+
+ MessageSizeType MessageSize, desc="size category of the message";
+ Cycles InitialRequestTime, default="0", desc="time the initial request was sent from the L1Cache";
+ Cycles ForwardRequestTime, default="0", desc="time the dir forwarded the request";
+ Cycles ProbeRequestStartTime, default="0", desc="the time the dir started the probe request";
+ bool DemandRequest, default="false", desc="For profiling purposes";
+
+ bool L3Hit, default="false", desc="Did memory or L3 supply the data?";
+ MachineID OriginalResponder, desc="Mach which wrote the data to the L3";
+
+ bool NotCached, default="false", desc="True when the Region buffer has already evicted the line";
+
+ bool NoAckNeeded, default="false", desc="For short-circuiting acks";
+ bool isValid, default="false", desc="Is acked block valid";
+
+ bool functionalRead(Packet *pkt) {
+ // Only data-carrying responses (CPUData, MemData) hold the data block
+ if (Type == CoherenceResponseType:CPUData ||
+ Type == CoherenceResponseType:MemData) {
+ return testAndRead(addr, DataBlk, pkt);
+ }
+
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return testAndWrite(addr, DataBlk, pkt);
+ }
+}
+
+structure(UnblockMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ NetDest Destination, desc="Destination (always directory)";
+ MessageSizeType MessageSize, desc="size category of the message";
+}
+
+enumeration(TriggerType, desc="Trigger Type") {
+ L2_to_L1, desc="L2 to L1 fill";
+ AcksComplete, desc="NB received all needed Acks";
+
+ // For regions
+ InvNext, desc="Invalidate the next block";
+ PrivateAck, desc="Loopback ack for machines with no Region Buffer";
+ AllOutstanding, desc="All outstanding requests have finished";
+ L3Hit, desc="L3 hit in dir";
+
+ // For region directory once the directory is blocked
+ InvRegion, desc="Invalidate region";
+ DowngradeRegion, desc="downgrade region";
+}
+
+enumeration(CacheId, desc="Which Cache in the Core") {
+ L1I, desc="L1 I-cache";
+ L1D0, desc="L1 D-cache cluster 0";
+ L1D1, desc="L1 D-cache cluster 1";
+ NA, desc="Default";
+}
+
+structure(TriggerMsg, desc="...", interface="Message") {
+ Addr addr, desc="Address";
+ TriggerType Type, desc="Type of trigger";
+ CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm
new file mode 100644
index 000000000..89f7d6fcb
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-RegionBuffer.sm
@@ -0,0 +1,1368 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Jason Power
+ */
+
+machine(MachineType:RegionBuffer, "Region Buffer for AMD_Base-like protocol")
+: CacheMemory *cacheMemory; // stores only region addresses. Must set block size same as below
+ bool isOnCPU;
+ int blocksPerRegion := 64; // 64 blocks x 64 B = 4 KiB regions (assuming the default Ruby block size)
+ Cycles toDirLatency := 5; // Latency to fwd requests to directory
+ Cycles toRegionDirLatency := 5; // Latency for requests and acks to directory
+ Cycles nextEvictLatency := 1; // latency added between each block while evicting region
+ bool noTCCdir := "False";
+ int TCC_select_num_bits := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCore, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCore, network="From", virtual_network="2", vnet_type="response";
+
+ // Requests to the cores or directory
+ MessageBuffer * requestToNetwork, network="To", virtual_network="0", vnet_type="request";
+
+ // From Region-Dir
+ MessageBuffer * notifyFromRegionDir, network="From", virtual_network="7", vnet_type="request";
+ MessageBuffer * probeFromRegionDir, network="From", virtual_network="8", vnet_type="request";
+
+ // From the directory
+ MessageBuffer * unblockFromDir, network="From", virtual_network="4", vnet_type="unblock";
+
+ // To the region-Dir
+ MessageBuffer * responseToRegDir, network="To", virtual_network="2", vnet_type="response";
+
+ MessageBuffer * triggerQueue;
+{
+
+ // States
+ state_declaration(State, desc="Region states", default="RegionBuffer_State_NP") {
+ NP, AccessPermission:Invalid, desc="Not present in region directory";
+ P, AccessPermission:Invalid, desc="Region is private to the cache";
+ S, AccessPermission:Invalid, desc="Region is possibly shared with others";
+
+ NP_PS, AccessPermission:Invalid, desc="Intermediate state waiting for notify from r-dir";
+ S_P, AccessPermission:Invalid, desc="Intermediate state while upgrading region";
+
+ P_NP, AccessPermission:Invalid, desc="Intermediate state while evicting all lines in region";
+ P_S, AccessPermission:Invalid, desc="Intermediate state while downgrading all lines in region";
+
+ S_NP_PS, AccessPermission:Invalid, desc="Got an inv in S_P; waiting for all inv acks, then going to NP_PS since the write is already out there";
+ P_NP_NP, AccessPermission:Invalid, desc="Evicting region on repl, then got an inv. Need to re-evict";
+
+ P_NP_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests";
+ P_S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests";
+ S_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests";
+ S_NP_PS_O, AccessPermission:Invalid, desc="Waiting for all outstanding requests";
+
+ SS_P, AccessPermission:Invalid, desc="Waiting for CPU write that we know is there";
+
+ P_NP_W, AccessPermission:Invalid, desc="Waiting for writeback ack";
+
+ NP_W, AccessPermission:Invalid, desc="Got a done ack before request, waiting for that victim";
+ }
+
+ enumeration(Event, desc="Region directory events") {
+ CPURead, desc="Access from CPU core";
+ CPUWrite, desc="Access from CPU core";
+ CPUWriteback, desc="Writeback request from CPU core";
+
+ ReplRegion, desc="Start a replace on a region";
+
+ PrivateNotify, desc="Update entry to private state";
+ SharedNotify, desc="Update entry to shared state";
+ WbNotify, desc="Writeback notification received";
+ InvRegion, desc="Start invalidating a region";
+ DowngradeRegion, desc="Start downgrading a region";
+
+ InvAck, desc="Ack from core";
+
+ DoneAck, desc="Ack from core that request has finished";
+ AllOutstanding, desc="All outstanding requests have now finished";
+
+ Evict, desc="Loopback to evict each block";
+ LastAck_PrbResp, desc="Done evicting all the blocks, got the last ack from core, now respond to region dir";
+ LastAck_CleanWb, desc="Done evicting all the blocks, got the last ack from core, now start clean writeback (note the dir has already been updated)";
+
+ StallAccess, desc="Wait for the done ack on the address before proceeding";
+ StallDoneAck, desc="Wait for the access on the address before proceeding";
+
+ StaleRequest, desc="Got a stale victim from the cache, fwd it without incrementing outstanding";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ TagArrayRead, desc="Read the tag array";
+ TagArrayWrite, desc="Write the tag array";
+ }
+
+ structure(BoolVec, external="yes") {
+ bool at(int);
+ void resize(int);
+ void clear();
+ int size();
+ }
+
+ structure(Entry, desc="Region entry", interface="AbstractCacheEntry") {
+ Addr addr, desc="Base address of this region";
+ State RegionState, desc="Region state";
+ DataBlock DataBlk, desc="Data for the block (always empty in region buffer)";
+ BoolVec ValidBlocks, desc="A vector to keep track of valid blocks";
+ int NumValidBlocks, desc="Number of trues in ValidBlocks to avoid iterating";
+ BoolVec UsedBlocks, desc="A vector to keep track of blocks ever valid";
+ bool dirty, desc="Dirty as best known by the region buffer";
+ // This is needed so we don't ack an invalidate until all requests are ordered
+ int NumOutstandingReqs, desc="Total outstanding private/shared requests";
+ BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests";
+ bool MustDowngrade, desc="Set when we got a downgrade before the shd or pvt permissions";
+ Cycles ProbeRequestTime, default="Cycles(0)", desc="Time region dir started the probe";
+ Cycles InitialRequestTime, default="Cycles(0)", desc="Time message was sent to region dir";
+ bool MsgSentToDir, desc="True if the current request required a message to the dir";
+ bool clearOnDone, default="false", desc="clear valid bit when request completes";
+ Addr clearOnDoneAddr, desc="clear valid bit when request completes";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ //int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec";
+ BoolVec ValidBlocks, desc="A vector to keep track of valid blocks";
+ bool AllAcksReceived, desc="Got all necessary acks from dir";
+ bool DoneEvicting, desc="Done iterating through blocks checking for valids";
+ BoolVec AcksReceived, desc="Received acks for these blocks";
+ bool SendAck, desc="If true, send an ack to the r-dir at end of inv";
+ ProbeRequestType MsgType, desc="Type of message to send while 'evicting' ";
+ int NumOutstandingReqs, desc="Total outstanding private/shared requests";
+ BoolVec OutstandingReqs, desc="Blocks that have outstanding private/shared requests";
+ MachineID Requestor, desc="Requestor for three hop transactions";
+ bool DemandRequest, default="false", desc="Associated with a demand request";
+ Addr DemandAddress, desc="Address for the demand request";
+ bool DoneAckReceived, default="false", desc="True if the done ack arrived before the message";
+ Addr DoneAckAddr, desc="Address of the done ack received early";
+ int OutstandingThreshold, desc="Number of outstanding requests to trigger AllOutstanding on";
+
+ ProbeRequestType NewMsgType, desc="Type of message to send while 'evicting' ";
+ MachineID NewRequestor, desc="Requestor for three hop transactions";
+ bool NewDemandRequest, default="false", desc="Associated with a demand request";
+ Addr NewDemandAddress, desc="Address for the demand request";
+ bool dirty, desc="dirty";
+ bool AllOutstandingTriggered, default="false", desc="Ensures the AllOutstanding trigger fires only once";
+ int OutstandingAcks, default="0", desc="number of acks to wait for";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ // Stores only region addresses
+ TBETable TBEs, template="<RegionBuffer_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ int blockBits, default="RubySystem::getBlockSizeBits()";
+ int blockBytes, default="RubySystem::getBlockSizeBytes()";
+ int regionBits, default="log2(m_blocksPerRegion)";
+
+ // Functions
+
+ int getRegionOffset(Addr addr) {
+ if (blocksPerRegion > 1) {
+ Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1);
+ int ret := addressToInt(offset);
+ assert(ret < blocksPerRegion);
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ Addr getRegionBase(Addr addr) {
+ return maskLowOrderBits(addr, blockBits+regionBits);
+ }
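+ // Worked example (assuming the default 64 B block size and
+ // blocksPerRegion = 64, i.e. blockBits = 6 and regionBits = 6):
+ // for addr 0x12345, getRegionBase returns 0x12000 and getRegionOffset
+ // returns bits [11:6], i.e. block 13 within the region.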
+
+ Addr getNextBlock(Addr addr) {
+ Addr a := addr;
+ return makeNextStrideAddress(a, 1);
+ }
+
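+ // The peer is the cache this region buffer shadows: the single CorePair
+ // on the CPU side, or on the GPU side either the address-interleaved TCC
+ // bank (when there is no TCC directory) or the lone TCCdir.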
+ MachineID getPeer(MachineID mach, Addr address) {
+ if (isOnCPU) {
+ return createMachineID(MachineType:CorePair, intToID(0));
+ } else if (noTCCdir) {
+ return mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ } else {
+ return createMachineID(MachineType:TCCdir, intToID(0));
+ }
+ }
+
+ bool isOutstanding(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe) && tbe.OutstandingReqs.size() > 0) {
+ DPRINTF(RubySlicc, " outstanding tbe reqs %s %s %d %d\n",
+ tbe.OutstandingReqs, addr, getRegionOffset(addr),
+ tbe.OutstandingReqs.at(getRegionOffset(addr)));
+ return tbe.OutstandingReqs.at(getRegionOffset(addr));
+ } else if (is_valid(cache_entry)) {
+ DPRINTF(RubySlicc, " outstanding cache reqs %s %s %d %d\n",
+ cache_entry.OutstandingReqs, addr, getRegionOffset(addr),
+ cache_entry.OutstandingReqs.at(getRegionOffset(addr)));
+ return cache_entry.OutstandingReqs.at(getRegionOffset(addr));
+ } else {
+ return false;
+ }
+ }
+
+ bool isOnGPU() {
+ if (isOnCPU) {
+ return false;
+ }
+ return true;
+ }
+
+ bool isRead(CoherenceRequestType type) {
+ return (type == CoherenceRequestType:RdBlk || type == CoherenceRequestType:RdBlkS ||
+ type == CoherenceRequestType:VicClean);
+ }
+
+ bool presentOrAvail(Addr addr) {
+ return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr));
+ }
+
+ // Returns a region entry!
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr)));
+ }
+
+ TBE getTBE(Addr addr), return_by_pointer="yes" {
+ return TBEs.lookup(getRegionBase(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(getRegionBase(addr)).DataBlk;
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.RegionState;
+ }
+ return State:NP;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+ if (is_valid(cache_entry)) {
+ cache_entry.RegionState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := getTBE(addr);
+ if(is_valid(tbe)) {
+ return RegionBuffer_State_to_permission(tbe.TBEState);
+ }
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return RegionBuffer_State_to_permission(cache_entry.RegionState);
+ }
+ return AccessPermission:NotPresent;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ functionalMemoryRead(pkt);
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ if (functionalMemoryWrite(pkt)) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(RegionBuffer_State_to_permission(state));
+ }
+ }
+
+ void recordRequestType(RequestType stat, Addr addr) {
+ if (stat == RequestType:TagArrayRead) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (stat == RequestType:TagArrayWrite) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:TagArrayRead) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+ // Overloaded outgoing request network for both probes to cores and requests
+ // to the directory.
+ // Fix Me: These forwarded requests need to be on a separate virtual channel
+ // to avoid deadlock!
+ out_port(requestNetwork_out, CPURequestMsg, requestToNetwork);
+ out_port(probeNetwork_out, NBProbeRequestMsg, requestToNetwork);
+
+ out_port(responseNetwork_out, ResponseMsg, responseToRegDir);
+
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=4) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := getTBE(in_msg.addr);
+ DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr));
+ assert(is_valid(tbe));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ if (tbe.SendAck) {
+ trigger(Event:LastAck_PrbResp, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:LastAck_CleanWb, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == TriggerType:AllOutstanding) {
+ trigger(Event:AllOutstanding, in_msg.addr, cache_entry, tbe);
+ } else {
+ assert(in_msg.Type == TriggerType:InvNext);
+ trigger(Event:Evict, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromDir, rank=3) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.DoneAck) {
+ if (isOutstanding(tbe, cache_entry, in_msg.addr)) {
+ trigger(Event:DoneAck, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:StallDoneAck, in_msg.addr, cache_entry, tbe);
+ }
+ } else {
+ assert(is_valid(tbe));
+ trigger(Event:InvAck, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ }
+ }
+
+ in_port(probeNetwork_in, NBProbeRequestMsg, probeFromRegionDir, rank=2) {
+ if (probeNetwork_in.isReady(clockEdge())) {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ assert(getRegionBase(in_msg.addr) == in_msg.addr);
+ if (in_msg.Type == ProbeRequestType:PrbInv) {
+ trigger(Event:InvRegion, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == ProbeRequestType:PrbDowngrade) {
+ trigger(Event:DowngradeRegion, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown probe message\n");
+ }
+ }
+ }
+ }
+
+ in_port(notifyNetwork_in, CPURequestMsg, notifyFromRegionDir, rank=1) {
+ if (notifyNetwork_in.isReady(clockEdge())) {
+ peek(notifyNetwork_in, CPURequestMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ //Fix Me...add back in: assert(is_valid(cache_entry));
+ if (in_msg.Type == CoherenceRequestType:WbNotify) {
+ trigger(Event:WbNotify, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:SharedNotify) {
+ trigger(Event:SharedNotify, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:PrivateNotify) {
+ trigger(Event:PrivateNotify, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown notify message\n");
+ }
+ }
+ }
+ }
+
+ // In from cores
+ // NOTE: We get the cache / TBE entry based on the region address,
+ // but pass the block address to the actions
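+ // E.g. a read of block B and a write of block B+64 in the same region
+ // (assuming 64 B blocks and 64 blocks per region, i.e. 4 KiB regions) hit
+ // the same region entry but set different bits of its ValidBlocks /
+ // OutstandingReqs vectors via getRegionOffset().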
+ in_port(requestNetwork_in, CPURequestMsg, requestFromCore, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (is_valid(tbe) && tbe.DoneAckReceived && tbe.DoneAckAddr == in_msg.addr) {
+ DPRINTF(RubySlicc, "Stale/Stall request %s\n", in_msg.Type);
+ if (in_msg.Type == CoherenceRequestType:VicDirty || in_msg.Type == CoherenceRequestType:VicClean )
+ {
+ trigger(Event:StaleRequest, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (isOutstanding(tbe, cache_entry, in_msg.addr)) {
+ DPRINTF(RubySlicc, "Stall outstanding request %s\n", in_msg.Type);
+ trigger(Event:StallAccess, in_msg.addr, cache_entry, tbe);
+ } else {
+ if (presentOrAvail(in_msg.addr)) {
+ if (in_msg.Type == CoherenceRequestType:RdBlkM ) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough ) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic ) {
+ trigger(Event:CPUWrite, in_msg.addr, cache_entry, tbe);
+ } else {
+ if (in_msg.Type == CoherenceRequestType:VicDirty ||
+ in_msg.Type == CoherenceRequestType:VicClean) {
+ trigger(Event:CPUWriteback, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:CPURead, in_msg.addr, cache_entry, tbe);
+ }
+ }
+ } else {
+ Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr));
+ TBE victim_tbe := getTBE(victim);
+ Entry victim_entry := getCacheEntry(victim);
+ DPRINTF(RubySlicc, "Replacing region %s for %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr));
+ trigger(Event:ReplRegion, victim, victim_entry, victim_tbe);
+ }
+ }
+ }
+ }
+ }
+
+ // Actions
+ action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr;
+ out_msg.Type := in_msg.Type;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := in_msg.Requestor;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := true;
+ out_msg.InitialRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := curCycle();
+ if (getState(tbe, cache_entry, address) == State:S) {
+ out_msg.ForceShared := true;
+ }
+ DPRINTF(RubySlicc, "Fwd: %s\n", out_msg);
+ //assert(getState(tbe, cache_entry, address) == State:P || getState(tbe, cache_entry, address) == State:S);
+ if (getState(tbe, cache_entry, address) == State:NP_W) {
+ APPEND_TRANSITION_COMMENT(" fwding stale request: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Type);
+ }
+ }
+ }
+ }
+
+ action(u_updateRegionEntry, "u", desc="Update the entry for profiling") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (is_valid(cache_entry)) {
+ if (in_msg.CtoDSinked == false) {
+ APPEND_TRANSITION_COMMENT(" incr outstanding ");
+ cache_entry.NumOutstandingReqs := 1 + cache_entry.NumOutstandingReqs;
+ assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)) == false);
+ cache_entry.OutstandingReqs.at(getRegionOffset(address)) := true;
+ assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs));
+ } else {
+ APPEND_TRANSITION_COMMENT(" NOT incr outstanding ");
+ assert(in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:RdBlkS);
+ }
+ APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs);
+ if (in_msg.Type == CoherenceRequestType:RdBlkM || in_msg.Type == CoherenceRequestType:Atomic ||
+ in_msg.Type == CoherenceRequestType:WriteThrough )
+ {
+ cache_entry.dirty := true;
+ }
+ if (in_msg.Type == CoherenceRequestType:VicDirty ||
+ in_msg.Type == CoherenceRequestType:VicClean) {
+ DPRINTF(RubySlicc, "Got %s for addr %s\n", in_msg.Type, address);
+ //assert(cache_entry.ValidBlocks.at(getRegionOffset(address)));
+ // can in fact be inv if core got an inv after a vicclean before it got here
+ if (cache_entry.ValidBlocks.at(getRegionOffset(address))) {
+ cache_entry.clearOnDone := true;
+ cache_entry.clearOnDoneAddr := address;
+ //cache_entry.ValidBlocks.at(getRegionOffset(address)) := false;
+ //cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1;
+ }
+ } else {
+ if (cache_entry.ValidBlocks.at(getRegionOffset(address)) == false) {
+ cache_entry.NumValidBlocks := cache_entry.NumValidBlocks + 1;
+ }
+ DPRINTF(RubySlicc, "before valid addr %s bits %s\n",
+ in_msg.Type, address, cache_entry.ValidBlocks);
+ cache_entry.ValidBlocks.at(getRegionOffset(address)) := true;
+ DPRINTF(RubySlicc, "after valid addr %s bits %s\n",
+ in_msg.Type, address, cache_entry.ValidBlocks);
+ cache_entry.UsedBlocks.at(getRegionOffset(address)) := true;
+ }
+ assert(cache_entry.NumValidBlocks <= blocksPerRegion);
+ assert(cache_entry.NumValidBlocks >= 0);
+ APPEND_TRANSITION_COMMENT(" valid blocks ");
+ APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks);
+ } else {
+ error("This shouldn't happen anymore I think");
+ //tbe.ValidBlocks.at(getRegionOffest(address)) := true;
+ assert(getState(tbe, cache_entry, address) == State:P_NP);
+ }
+ }
+ }
+
+ action(uw_updatePossibleWriteback, "uw", desc="writeback request complete") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ if (is_valid(cache_entry) && in_msg.validToInvalid &&
+ cache_entry.clearOnDone && cache_entry.clearOnDoneAddr == address) {
+ DPRINTF(RubySlicc, "I have no idea what is going on here\n");
+ cache_entry.ValidBlocks.at(getRegionOffset(address)) := false;
+ cache_entry.NumValidBlocks := cache_entry.NumValidBlocks - 1;
+ cache_entry.clearOnDone := false;
+ }
+ }
+ }
+
+
+ action(rp_requestPrivate, "rp", desc="Send private request r-dir") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ // No need to send acks on replacements
+ assert(is_invalid(tbe));
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := address; // use the actual address so the demand request can be fulfilled
+ out_msg.DemandAddress := address;
+ out_msg.Type := CoherenceRequestType:PrivateRequest;
+ out_msg.OriginalType := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.InitialRequestTime := curCycle();
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ DPRINTF(RubySlicc, "Private request %s\n", out_msg);
+ }
+ cache_entry.ProbeRequestTime := curCycle();
+ cache_entry.MsgSentToDir := true;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ action(ru_requestUpgrade, "ru", desc="Send upgrade request r-dir") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ // No need to send acks on replacements
+ assert(is_invalid(tbe));
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := address; // use the actual address so the demand request can be fulfilled
+ out_msg.Type := CoherenceRequestType:UpgradeRequest;
+ out_msg.OriginalType := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.InitialRequestTime := curCycle();
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ cache_entry.ProbeRequestTime := curCycle();
+ cache_entry.MsgSentToDir := true;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ action(rw_requestWriteback, "rq", desc="Send writeback request") {
+ // No need to send acks on replacements
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address); // clean writebacks always operate on the region base address
+ out_msg.Type := CoherenceRequestType:CleanWbRequest;
+ out_msg.Requestor := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.Dirty := tbe.dirty;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ action(rs_requestShared, "rs", desc="Send shared request r-dir") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ // No need to send acks on replacements
+ assert(is_invalid(tbe));
+ enqueue(requestNetwork_out, CPURequestMsg, toRegionDirLatency) {
+ out_msg.addr := address; // use the actual address so the demand request can be fulfilled
+ out_msg.Type := CoherenceRequestType:SharedRequest;
+ out_msg.OriginalType := in_msg.Type;
+ out_msg.Requestor := machineID;
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.InitialRequestTime := curCycle();
+ // will this always be ok? probably not for multisocket
+ out_msg.Destination.add(map_Address_to_RegionDir(address));
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ }
+ cache_entry.ProbeRequestTime := curCycle();
+ cache_entry.MsgSentToDir := true;
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ }
+ }
+
+ action(ai_ackRegionInv, "ai", desc="Send ack to r-dir on region inv if tbe says so") {
+ // No need to send acks on replacements
+ assert(is_valid(tbe));
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(ad_ackDirectory, "ad", desc="send probe response to directory") {
+ if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) { //VIPER TCC doesn't understand PrbShrData
+ assert(tbe.DemandRequest); //so let the RegionBuffer take care of sending back the ack
+ enqueue(responseNetwork_out, ResponseMsg, toDirLatency) {
+ out_msg.addr := tbe.DemandAddress;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := getPeer(machineID,address);
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false; // only true when data is sent back
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.NoAckNeeded := true;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+
+ action(aie_ackRegionExclusiveInv, "aie", desc="Send ack to r-dir on region inv if tbe says so") {
+ // No need to send acks on replacements
+ assert(is_valid(tbe));
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.NotCached := true;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := tbe.dirty;
+ }
+ }
+
+ action(ain_ackRegionInvNow, "ain", desc="Send ack to r-dir on region inv") {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(aine_ackRegionInvExclusiveNow, "aine", desc="Send ack to r-dir on region inv with exclusive permission") {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:CPUPrbResp;
+ out_msg.Sender := machineID;
+ out_msg.NotCached := true;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(ap_ackPrivateNotify, "ap", desc="Send ack to r-dir on private notify") {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:PrivateAck;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+
+ action(aw_ackWbNotify, "aw", desc="Send ack to r-dir on writeback notify") {
+ peek(notifyNetwork_in, CPURequestMsg) {
+ if (in_msg.NoAckNeeded == false) {
+ enqueue(responseNetwork_out, ResponseMsg, toRegionDirLatency) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceResponseType:RegionWbAck;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(map_Address_to_RegionDir(address)); // will this always be ok? probably not for multisocket
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ }
+ }
+ }
+ }
+
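+ // Region eviction below proceeds block by block: ed_evictDemand probes the
+ // demand block (if any) first, ef_enqueueFirstEvict/en_enqueueNextEvict walk
+ // the region via InvNext triggers, e_evictCurrent probes each remaining
+ // valid block, and ra_receiveAck counts responses until AcksComplete fires
+ // (LastAck_PrbResp or LastAck_CleanWb, depending on tbe.SendAck).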
+ action(e_evictCurrent, "e", desc="Evict this block in the region") {
+ // send an invalidating probe to the core-side cache for this block;
+ // must invalidate all blocks since the region buffer could have privatized them
+ if (tbe.ValidBlocks.at(getRegionOffset(address)) &&
+ (tbe.DemandRequest == false || tbe.DemandAddress != address)) {
+ DPRINTF(RubySlicc, "trying to evict address %s (base: %s, offset: %d)\n", address, getRegionBase(address), getRegionOffset(address));
+ DPRINTF(RubySlicc, "tbe valid blocks %s\n", tbe.ValidBlocks);
+
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := tbe.MsgType;
+ out_msg.ReturnData := true;
+ if (address == tbe.DemandAddress) {
+ out_msg.DemandRequest := true;
+ }
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(getPeer(machineID,address));
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ APPEND_TRANSITION_COMMENT(" current ");
+ APPEND_TRANSITION_COMMENT(tbe.ValidBlocks.at(getRegionOffset(address)));
+ tbe.AllAcksReceived := false;
+ } else {
+ DPRINTF(RubySlicc, "Not evicting demand %s\n", address);
+ }
+ }
+
+ action(ed_evictDemand, "ed", desc="Evict the demand request if it's valid") {
+ if (noTCCdir && tbe.MsgType == ProbeRequestType:PrbDowngrade && isOnGPU()) {
+ tbe.OutstandingAcks := 0;
+ tbe.AllAcksReceived := true;
+ tbe.DoneEvicting := true;
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := getRegionBase(address);
+ }
+ } else if (tbe.DemandRequest) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := tbe.DemandAddress;
+ out_msg.Type := tbe.MsgType;
+ out_msg.ReturnData := true;
+ out_msg.DemandRequest := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.add(getPeer(machineID,address));
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.AllAcksReceived := false;
+ }
+ if (tbe.ValidBlocks.at(getRegionOffset(tbe.DemandAddress)) == false) {
+ tbe.OutstandingAcks := tbe.OutstandingAcks + 1;
+ }
+ APPEND_TRANSITION_COMMENT("Evicting demand ");
+ APPEND_TRANSITION_COMMENT(tbe.DemandAddress);
+ }
+ APPEND_TRANSITION_COMMENT("waiting acks ");
+ APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks);
+ }
+
+ action(adp_AckDemandProbe, "fp", desc="forward demand probe even if we know that the core is invalid") {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ if (in_msg.DemandRequest) {
+ enqueue(responseNetwork_out, ResponseMsg, toDirLatency) {
+ out_msg.addr := in_msg.DemandAddress;
+ out_msg.Type := CoherenceResponseType:CPUPrbResp; // L3 and CPUs respond in same way to probes
+ out_msg.Sender := getPeer(machineID,address);
+ out_msg.Destination.add(map_Address_to_Directory(address)); // will this always be ok? probably not for multisocket
+ out_msg.Dirty := false; // only true when data is sent back
+ out_msg.Hit := false;
+ out_msg.Ntsl := false;
+ out_msg.State := CoherenceState:NA;
+ out_msg.NoAckNeeded := true;
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+ }
+ }
+
+ action(en_enqueueNextEvict, "en", desc="Queue evict the next block in the region") {
+ // increment in_msg.addr by blockSize bytes and enqueue on triggerPort
+ // Only enqueue if the next address doesn't overrun the region bound
+ if (getRegionBase(getNextBlock(address)) == getRegionBase(address)) {
+ enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) {
+ out_msg.Type := TriggerType:InvNext;
+ out_msg.addr := getNextBlock(address);
+ }
+ } else {
+ tbe.DoneEvicting := true;
+ DPRINTF(RubySlicc, "Done evicing region %s\n", getRegionBase(address));
+ DPRINTF(RubySlicc, "Waiting for %s acks\n", tbe.OutstandingAcks);
+ if (tbe.AllAcksReceived == true) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := getRegionBase(address);
+ }
+ }
+ }
+ }
+
+ action(ef_enqueueFirstEvict, "ef", desc="Queue the first block in the region to be evicted") {
+ if (tbe.DoneEvicting == false) {
+ enqueue(triggerQueue_out, TriggerMsg, nextEvictLatency) {
+ out_msg.Type := TriggerType:InvNext;
+ out_msg.addr := getRegionBase(address);
+ }
+ }
+ }
+
+ action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") {
+ DPRINTF(RubySlicc, "received ack for %s reg: %s vec: %s pos: %d\n",
+ address, getRegionBase(address), tbe.ValidBlocks, getRegionOffset(address));
+ peek(unblockNetwork_in, UnblockMsg) {
+ //
+ // Note the tbe ValidBlock vec will be a conservative list of the
+ // valid blocks since the cache entry ValidBlock vec is set on the
+ // request
+ //
+ if (in_msg.wasValid) {
+ assert(tbe.ValidBlocks.at(getRegionOffset(address)));
+ }
+ }
+ tbe.OutstandingAcks := tbe.OutstandingAcks - 1;
+ tbe.AcksReceived.at(getRegionOffset(address)) := true;
+ assert(tbe.OutstandingAcks >= 0);
+ if (tbe.OutstandingAcks == 0) {
+ tbe.AllAcksReceived := true;
+ if (tbe.DoneEvicting) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := getRegionBase(address);
+ }
+ }
+ }
+
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ APPEND_TRANSITION_COMMENT(" Acks left receive ");
+ APPEND_TRANSITION_COMMENT(tbe.OutstandingAcks);
+ }
+
+ action(do_decrementOutstanding, "do", desc="Decrement outstanding requests") {
+ APPEND_TRANSITION_COMMENT(" decr outstanding ");
+ if (is_valid(cache_entry)) {
+ cache_entry.NumOutstandingReqs := cache_entry.NumOutstandingReqs - 1;
+ assert(cache_entry.OutstandingReqs.at(getRegionOffset(address)));
+ cache_entry.OutstandingReqs.at(getRegionOffset(address)) := false;
+ assert(cache_entry.NumOutstandingReqs >= 0);
+ assert(cache_entry.NumOutstandingReqs == countBoolVec(cache_entry.OutstandingReqs));
+ APPEND_TRANSITION_COMMENT(cache_entry.NumOutstandingReqs);
+ }
+ if (is_valid(tbe)) {
+ tbe.NumOutstandingReqs := tbe.NumOutstandingReqs - 1;
+ assert(tbe.OutstandingReqs.at(getRegionOffset(address)));
+ tbe.OutstandingReqs.at(getRegionOffset(address)) := false;
+ assert(tbe.NumOutstandingReqs >= 0);
+ assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs));
+ APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs);
+ }
+ }
+
+ action(co_checkOutstanding, "co", desc="check if there are no more outstanding requests") {
+ assert(is_valid(tbe));
+ if ((tbe.NumOutstandingReqs <= tbe.OutstandingThreshold) &&
+ (tbe.AllOutstandingTriggered == false)) {
+ APPEND_TRANSITION_COMMENT(" no more outstanding: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumOutstandingReqs);
+ APPEND_TRANSITION_COMMENT(tbe.OutstandingThreshold);
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AllOutstanding;
+ if (tbe.DemandRequest) {
+ out_msg.addr := tbe.DemandAddress;
+ } else {
+ out_msg.addr := getRegionBase(address);
+ }
+ DPRINTF(RubySlicc, "co enqueuing %s\n", out_msg);
+ tbe.AllOutstandingTriggered := true;
+ }
+ } else {
+ APPEND_TRANSITION_COMMENT(" still more outstanding ");
+ }
+ }
+
+ action(ro_resetAllOutstanding, "ro", desc="Reset all outstanding") {
+ tbe.AllOutstandingTriggered := false;
+ }
+
+ action(so_setOutstandingCheckOne, "so", desc="Check outstanding is waiting for 1, not 0") {
+ // Need this for S_P because one request is outstanding between here and r-dir
+ tbe.OutstandingThreshold := 1;
+ }
+
+ action(a_allocateRegionEntry, "a", desc="Allocate a new entry") {
+ set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry));
+ cache_entry.ValidBlocks.clear();
+ cache_entry.ValidBlocks.resize(blocksPerRegion);
+ cache_entry.UsedBlocks.clear();
+ cache_entry.UsedBlocks.resize(blocksPerRegion);
+ cache_entry.dirty := false;
+ cache_entry.NumOutstandingReqs := 0;
+ cache_entry.OutstandingReqs.clear();
+ cache_entry.OutstandingReqs.resize(blocksPerRegion);
+ }
+
+ action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") {
+ cacheMemory.deallocate(getRegionBase(address));
+ unset_cache_entry();
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(getRegionBase(address));
+ set_tbe(getTBE(address));
+ tbe.OutstandingAcks := 0;
+ tbe.AllAcksReceived := true; // starts true since the region could be empty
+ tbe.DoneEvicting := false;
+ tbe.AcksReceived.clear();
+ tbe.AcksReceived.resize(blocksPerRegion);
+ tbe.SendAck := false;
+ tbe.OutstandingThreshold := 0;
+ if (is_valid(cache_entry)) {
+ tbe.NumOutstandingReqs := cache_entry.NumOutstandingReqs;
+ tbe.OutstandingReqs := cache_entry.OutstandingReqs;
+ assert(tbe.NumOutstandingReqs == countBoolVec(tbe.OutstandingReqs));
+ tbe.dirty := cache_entry.dirty;
+ tbe.ValidBlocks := cache_entry.ValidBlocks;
+ tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks);
+ APPEND_TRANSITION_COMMENT(" tbe valid blocks ");
+ APPEND_TRANSITION_COMMENT(tbe.ValidBlocks);
+ APPEND_TRANSITION_COMMENT(" cache valid blocks ");
+ APPEND_TRANSITION_COMMENT(cache_entry.ValidBlocks);
+ } else {
+ tbe.dirty := false;
+ }
+ }
+
+ action(m_markSendAck, "m", desc="Mark TBE that we need to ack at end") {
+ assert(is_valid(tbe));
+ tbe.SendAck := true;
+ }
+
+ action(db_markDirtyBit, "db", desc="Mark TBE dirty bit") {
+ peek(unblockNetwork_in, UnblockMsg) {
+ if (is_valid(tbe)) {
+ tbe.dirty := tbe.dirty || in_msg.Dirty;
+ }
+ }
+ }
+
+ action(dr_markDoneAckReceived, "dr", desc="Mark TBE that a done ack has been received") {
+ assert(is_valid(tbe));
+ tbe.DoneAckReceived := true;
+ tbe.DoneAckAddr := address;
+ APPEND_TRANSITION_COMMENT(" marking done ack on TBE ");
+ }
+
+ action(se_setTBE, "se", desc="Set msg type to evict") {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ tbe.MsgType := in_msg.Type;
+ tbe.Requestor := in_msg.Requestor;
+ tbe.DemandAddress := in_msg.DemandAddress;
+ tbe.DemandRequest := in_msg.DemandRequest;
+ }
+ }
+
+ action(sne_setNewTBE, "sne", desc="Save incoming probe info in the TBE's New* fields") {
+ peek(probeNetwork_in, NBProbeRequestMsg) {
+ tbe.NewMsgType := in_msg.Type;
+ tbe.NewRequestor := in_msg.Requestor;
+ tbe.NewDemandAddress := in_msg.DemandAddress;
+ tbe.NewDemandRequest := in_msg.DemandRequest;
+ }
+ }
+
+ action(soe_setOldTBE, "soe", desc="Copy the saved New* probe info into the active TBE fields") {
+ tbe.MsgType := tbe.NewMsgType;
+ tbe.Requestor := tbe.NewRequestor;
+ tbe.DemandAddress := tbe.NewDemandAddress;
+ tbe.DemandRequest := tbe.NewDemandRequest;
+ tbe.OutstandingAcks := countBoolVec(tbe.ValidBlocks);
+ tbe.AllAcksReceived := true; // starts true since the region could be empty
+ tbe.DoneEvicting := false;
+ tbe.AcksReceived.clear();
+ tbe.AcksReceived.resize(blocksPerRegion);
+ tbe.SendAck := false;
+ }
+
+ action(ser_setTBE, "ser", desc="Set msg type to evict repl") {
+ tbe.MsgType := ProbeRequestType:PrbInv;
+ }
+
+ action(md_setMustDowngrade, "md", desc="When permissions finally get here, must be shared") {
+ assert(is_valid(cache_entry));
+ cache_entry.MustDowngrade := true;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(getRegionBase(address));
+ unset_tbe();
+ }
+
+ action(p_popRequestQueue, "p", desc="Pop the request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pl_popUnblockQueue, "pl", desc="Pop the unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pn_popNotifyQueue, "pn", desc="Pop the notify queue") {
+ notifyNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pp_popProbeQueue, "pp", desc="Pop the probe queue") {
+ probeNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") {
+ DPRINTF(RubySlicc, "Trigger Before Contents: %s\n", triggerQueue_in);
+ triggerQueue_in.dequeue(clockEdge());
+ DPRINTF(RubySlicc, "Trigger After Contents: %s\n", triggerQueue_in);
+ }
+
+ // Must always use wake all, since non-region addresses wait on region addresses
+ action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
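+ // Note on the stall/recycle actions below: stall_and_wait parks the message
+ // (keyed on the region base) until a wakeUp* action releases it, while the
+ // recycle actions simply retry the head message after recycle_latency.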
+ action(zz_stallAndWaitRequestQueue, "\z", desc="recycle request queue") {
+ Addr regAddr := getRegionBase(address);
+ DPRINTF(RubySlicc, "Stalling address %s\n", regAddr);
+ stall_and_wait(requestNetwork_in, regAddr);
+ }
+
+ action(yy_stallAndWaitProbeQueue, "\y", desc="stall probe queue") {
+ Addr regAddr := getRegionBase(address);
+ stall_and_wait(probeNetwork_in, regAddr);
+ }
+
+ action(yyy_recycleProbeQueue, "\yy", desc="recycle probe queue") {
+ probeNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(zzz_recycleRequestQueue, "\zz", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(www_recycleUnblockNetwork, "\ww", desc="recycle unblock queue") {
+ unblockNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(z_stall, "z", desc="stall request queue") {
+ // fake state
+ }
+
+ action(mru_setMRU, "mru", desc="set MRU") {
+ cacheMemory.setMRU(address, cache_entry.NumValidBlocks);
+ }
+
+ // Transitions
+
+ transition({NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, P_NP_W, P_NP_NP, NP_W}, {CPURead, CPUWriteback, CPUWrite}) {} {
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition(SS_P, {CPURead, CPUWriteback}) {
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition({NP, S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, NP_W, P_NP_NP}, StallAccess) {} {
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition({S, P, NP_PS, S_P, S_NP_PS, P_NP, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P, P_NP_W, P_NP_NP, NP_W}, StallDoneAck) {
+ www_recycleUnblockNetwork;
+ }
+
+ transition(NP, StallDoneAck, NP_W) {
+ t_allocateTBE;
+ db_markDirtyBit;
+ dr_markDoneAckReceived;
+ pl_popUnblockQueue;
+ }
+
+ transition(NP_W, StaleRequest, NP) {
+ f_fwdReqToDir;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ p_popRequestQueue;
+ }
+
+ transition(P_NP_O, DowngradeRegion) {} {
+ z_stall; // should stall and wait
+ }
+
+ transition({NP_PS, S_NP_PS, S_P, P_S, P_NP_O, S_NP_PS_O, P_S_O, S_O, SS_P}, ReplRegion) {} {
+ zz_stallAndWaitRequestQueue; // can't let things get out of order!
+ }
+
+ transition({P_NP_O, S_O, SS_P}, InvRegion) {} {
+ yyy_recycleProbeQueue; // can't be z_stall: a RdBlkM with the sinked flag may be in the requestQueue, blocking the inv
+ }
+
+ transition(P_NP, {InvRegion, DowngradeRegion}, P_NP_NP) {} {
+ sne_setNewTBE;
+ pp_popProbeQueue;
+ }
+
+ transition(S_P, DowngradeRegion) {} {
+ adp_AckDemandProbe;
+ ain_ackRegionInvNow;
+ pp_popProbeQueue;
+ }
+
+ transition(P_NP_W, InvRegion) {
+ adp_AckDemandProbe;
+ ain_ackRegionInvNow;
+ pp_popProbeQueue;
+ }
+
+ transition(P_NP_W, DowngradeRegion) {
+ adp_AckDemandProbe;
+ aine_ackRegionInvExclusiveNow;
+ pp_popProbeQueue;
+ }
+
+ transition({P, S}, {CPURead, CPUWriteback}) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ f_fwdReqToDir;
+ u_updateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ transition(P, CPUWrite) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ f_fwdReqToDir;
+ u_updateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ transition(S, CPUWrite, S_O) {TagArrayRead} {
+ mru_setMRU;
+ t_allocateTBE;
+ co_checkOutstanding;
+ zz_stallAndWaitRequestQueue;
+ }
+
+ transition(S_O, AllOutstanding, SS_P) {
+ wa_wakeUpAllDependents;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition(SS_P, CPUWrite, S_P) {
+ mru_setMRU;
+ dt_deallocateTBE;
+ ru_requestUpgrade;
+ u_updateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ transition(NP, {CPURead, CPUWriteback}, NP_PS) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ rs_requestShared;
+ u_updateRegionEntry;
+ p_popRequestQueue;//zz_stallAndWaitRequestQueue;
+ }
+
+ transition(NP, CPUWrite, NP_PS) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ rp_requestPrivate;
+ u_updateRegionEntry;
+ p_popRequestQueue;//zz_stallAndWaitRequestQueue;
+ }
+
+ transition(NP_PS, PrivateNotify, P) {} {
+ ap_ackPrivateNotify;
+ wa_wakeUpAllDependents;
+ pn_popNotifyQueue;
+ }
+
+ transition(S_P, PrivateNotify, P) {} {
+ ap_ackPrivateNotify;
+ wa_wakeUpAllDependents;
+ pn_popNotifyQueue;
+ }
+
+ transition(NP_PS, SharedNotify, S) {} {
+ ap_ackPrivateNotify;
+ wa_wakeUpAllDependents;
+ pn_popNotifyQueue;
+ }
+
+ transition(P_NP_W, WbNotify, NP) {} {
+ aw_ackWbNotify;
+ wa_wakeUpAllDependents;
+ dt_deallocateTBE;
+ pn_popNotifyQueue;
+ }
+
+ transition({P, S}, ReplRegion, P_NP_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ ser_setTBE;
+ d_deallocateRegionEntry;
+ co_checkOutstanding;
+ }
+
+ transition({P, S}, InvRegion, P_NP_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ se_setTBE;
+ m_markSendAck;
+ d_deallocateRegionEntry;
+ co_checkOutstanding;
+ pp_popProbeQueue;
+ }
+
+ transition(P_NP_O, AllOutstanding, P_NP) {} {
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_P, InvRegion, S_NP_PS_O) {TagArrayRead} {
+ t_allocateTBE;
+ se_setTBE;
+ m_markSendAck;
+ so_setOutstandingCheckOne;
+ co_checkOutstanding;
+ pp_popProbeQueue;
+ }
+
+ transition(S_NP_PS_O, AllOutstanding, S_NP_PS) {
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition(P, DowngradeRegion, P_S_O) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ se_setTBE;
+ m_markSendAck;
+ co_checkOutstanding;
+ pp_popProbeQueue;
+ }
+
+ transition(P_S_O, AllOutstanding, P_S) {} {
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ ro_resetAllOutstanding;
+ pt_popTriggerQueue;
+ }
+
+ transition({P, S}, DoneAck) {TagArrayWrite} {
+ do_decrementOutstanding;
+ wa_wakeUpAllDependents;
+ db_markDirtyBit;
+ uw_updatePossibleWriteback;
+ pl_popUnblockQueue;
+ }
+
+ transition({S_P, NP_PS, S_NP_PS}, DoneAck) {TagArrayWrite} {
+ www_recycleUnblockNetwork;
+ }
+
+ transition({P_NP_O, S_NP_PS_O, P_S_O, S_O}, DoneAck) {} {
+ do_decrementOutstanding;
+ co_checkOutstanding;
+ db_markDirtyBit;
+ uw_updatePossibleWriteback;
+ pl_popUnblockQueue;
+ }
+
+ transition({P_NP, P_S, S_NP_PS, P_NP_NP}, Evict) {} {
+ e_evictCurrent;
+ en_enqueueNextEvict;
+ pt_popTriggerQueue;
+ }
+
+ transition({P_NP, P_S, S_NP_PS, P_NP_NP}, InvAck) {} {
+ ra_receiveAck;
+ db_markDirtyBit;
+ pl_popUnblockQueue;
+ }
+
+ transition(P_NP, LastAck_CleanWb, P_NP_W) {} {
+ rw_requestWriteback;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_NP_NP, LastAck_CleanWb, P_NP) {} {
+ soe_setOldTBE;
+ m_markSendAck;
+ ed_evictDemand;
+ ef_enqueueFirstEvict;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_NP, LastAck_PrbResp, NP) {} {
+ aie_ackRegionExclusiveInv;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_NP_PS, LastAck_PrbResp, NP_PS) {} {
+ aie_ackRegionExclusiveInv;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_S, LastAck_PrbResp, S) {} {
+ ai_ackRegionInv;
+ ad_ackDirectory;
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+}
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm
new file mode 100644
index 000000000..b392311c5
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-RegionDir.sm
@@ -0,0 +1,1187 @@
+/*
+ * Copyright (c) 2012-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Jason Power
+ */
+
+machine(MachineType:RegionDir, "Region Directory for AMD_Base-like protocol")
+: CacheMemory *cacheMemory; // stores only region addresses; its block size must be set to the region size (see blocksPerRegion below)
+ NodeID cpuRegionBufferNum;
+ NodeID gpuRegionBufferNum;
+ int blocksPerRegion := 64; // 4k regions
+ Cycles toDirLatency := 10; // Latency to fwd requests and send invs to directory
+ bool always_migrate := "False";
+ bool sym_migrate := "False";
+ bool asym_migrate := "False";
+ bool noTCCdir := "False";
+ int TCC_select_num_bits := 1;
+
+ // To the directory
+ MessageBuffer * requestToDir, network="To", virtual_network="5", vnet_type="request";
+
+ // To the region buffers
+ MessageBuffer * notifyToRBuffer, network="To", virtual_network="7", vnet_type="request";
+ MessageBuffer * probeToRBuffer, network="To", virtual_network="8", vnet_type="request";
+
+ // From the region buffers
+ MessageBuffer * responseFromRBuffer, network="From", virtual_network="2", vnet_type="response";
+ MessageBuffer * requestFromRegBuf, network="From", virtual_network="0", vnet_type="request";
+
+ MessageBuffer * triggerQueue;
+{
+
+ // States
+ state_declaration(State, desc="Region states", default="RegionDir_State_NP") {
+ NP, AccessPermission:Invalid, desc="Not present in region directory";
+ P, AccessPermission:Invalid, desc="Region is private to owner";
+ S, AccessPermission:Invalid, desc="Region is shared between CPU and GPU";
+
+ P_NP, AccessPermission:Invalid, desc="Evicting the region";
+ NP_P, AccessPermission:Invalid, desc="Must wait for ack from R-buf";
+ NP_S, AccessPermission:Invalid, desc="Must wait for ack from R-buf";
+ P_P, AccessPermission:Invalid, desc="Waiting for ack from R-buf";
+ S_S, AccessPermission:Invalid, desc="Waiting for ack from R-buf";
+ P_S, AccessPermission:Invalid, desc="Downgrading the region";
+ S_P, AccessPermission:Invalid, desc="Upgrading the region";
+ P_AS, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks";
+ S_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks";
+ P_AP, AccessPermission:Invalid, desc="Sent invalidates, waiting for acks";
+
+ SP_NP_W, AccessPermission:Invalid, desc="Last sharer writing back, waiting for ack";
+ S_W, AccessPermission:Invalid, desc="Sharer writing back, waiting for ack";
+
+ P_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack";
+ P_AS_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack";
+ S_AP_W, AccessPermission:Invalid, desc="Fwded request to dir, waiting for ack";
+ }
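+
+ // Transient state names read roughly as <current>_<next>: e.g. P_NP is a
+ // private region being evicted to not-present, and the *_W states are
+ // waiting on a directory or writeback ack before settling.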
+
+ enumeration(Event, desc="Region directory events") {
+ SendInv, desc="Send inv message to any machine that has a region buffer";
+ SendUpgrade, desc="Send upgrade message to any machine that has a region buffer";
+ SendDowngrade, desc="Send downgrade message to any machine that has a region buffer";
+
+ Evict, desc="Evict this region";
+
+ UpgradeRequest, desc="Request from r-buf for an upgrade";
+ SharedRequest, desc="Request from r-buf for read";
+ PrivateRequest, desc="Request from r-buf for write";
+
+ InvAckCore, desc="Ack from region buffer to order the invalidate";
+ InvAckCoreNoShare, desc="Ack from region buffer to order the invalidate, and it does not have the region";
+ CPUPrivateAck, desc="Ack from region buffer to order private notification";
+
+ LastAck, desc="Done evicting all the blocks";
+
+ StaleCleanWbRequest, desc="stale clean writeback request";
+ StaleCleanWbRequestNoShare, desc="stale clean wb req from a cache which should be removed from sharers";
+ CleanWbRequest, desc="clean writeback request, multiple sharers";
+ CleanWbRequest_LastSharer, desc="clean writeback request, last sharer";
+ WritebackAck, desc="Writeback Ack from region buffer";
+ DirReadyAck, desc="Directory is ready, waiting Ack from region buffer";
+
+ TriggerInv, desc="trigger invalidate message";
+ TriggerDowngrade, desc="trigger downgrade message";
+ }
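+
+ // A request from a region buffer is either granted directly (PrivateRequest,
+ // SharedRequest, UpgradeRequest) or, when another region buffer holds the
+ // region, its blocks are first invalidated or downgraded (SendInv,
+ // SendDowngrade, SendUpgrade) before the grant.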
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ DataArrayRead, desc="Read the data array";
+ DataArrayWrite, desc="Write the data array";
+ TagArrayRead, desc="Read the tag array";
+ TagArrayWrite, desc="Write the tag array";
+ }
+
+ structure(BoolVec, external="yes") {
+ bool at(int);
+ void resize(int);
+ void clear();
+ }
+
+ structure(Entry, desc="Region entry", interface="AbstractCacheEntry") {
+ Addr addr, desc="Base address of this region";
+ NetDest Sharers, desc="Set of machines that are sharing, but not owners";
+ State RegionState, desc="Region state";
+ DataBlock DataBlk, desc="Data for the block (always empty in region dir)";
+ MachineID Owner, desc="Machine which owns all blocks in this region";
+ Cycles ProbeStart, desc="Time when the first probe request was issued";
+ bool LastWriten, default="false", desc="The most recent access to this region was a write";
+ bool LastWritenByCpu, default="false", desc="The CPU's most recent access to this region was a write";
+ bool LastWritenByGpu, default="false", desc="The GPU's most recent access to this region was a write";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ MachineID Owner, desc="Machine which owns all blocks in this region";
+ NetDest Sharers, desc="Set of machines to send evicts";
+ int NumValidBlocks, desc="Number of blocks valid so we don't have to count a BoolVec";
+ bool AllAcksReceived, desc="Got all necessary acks from dir";
+ CoherenceRequestType MsgType, desc="Msg type for the evicts could be inv or dwngrd";
+ Cycles ProbeRequestTime, default="Cycles(0)", desc="Start of probe request";
+ Cycles InitialRequestTime, default="Cycles(0)", desc="To forward back on out msg";
+ Addr DemandAddress, desc="Demand address from original request";
+ uint64_t probe_id, desc="probe id for lifetime profiling";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ // Stores only region addresses
+ TBETable TBEs, template="<RegionDir_TBE>", constructor="m_number_of_TBEs";
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_cache_entry(AbstractCacheEntry b);
+ void unset_cache_entry();
+ void set_tbe(TBE b);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ int blockBits, default="RubySystem::getBlockSizeBits()";
+ int blockBytes, default="RubySystem::getBlockSizeBytes()";
+ int regionBits, default="log2(m_blocksPerRegion)";
+
+ // Functions
+
+ MachineID getCoreMachine(MachineID rBuf, Addr address) {
+ if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) {
+ return createMachineID(MachineType:CorePair, intToID(0));
+ } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) {
+ if (noTCCdir) {
+ return mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits);
+ } else {
+ return createMachineID(MachineType:TCCdir, intToID(0));
+ }
+ } else {
+ error("Unexpected region buffer number");
+ }
+ }
+
+ bool isCpuMachine(MachineID rBuf) {
+ if (machineIDToNodeID(rBuf) == cpuRegionBufferNum) {
+ return true;
+ } else if (machineIDToNodeID(rBuf) == gpuRegionBufferNum) {
+ return false;
+ } else {
+ error("Unexpected region buffer number");
+ }
+ }
+
+ bool symMigrate(Entry cache_entry) {
+ return cache_entry.LastWriten;
+ }
+
+ bool asymMigrate(Entry cache_entry, MachineID requestor) {
+ if (isCpuMachine(requestor)) {
+ return cache_entry.LastWritenByCpu;
+ } else {
+ return cache_entry.LastWritenByGpu;
+ }
+ }
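+
+ // Migration policy (used by the request port below): on a SharedRequest,
+ // ownership migrates to the requestor when always_migrate is set, when
+ // sym_migrate is set and the region's last access was a write, or when
+ // asym_migrate is set and the requesting side (CPU or GPU) last wrote it;
+ // otherwise the region is serviced shared, downgrading the owner if needed.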
+
+ int getRegionOffset(Addr addr) {
+ if (blocksPerRegion > 1) {
+ Addr offset := bitSelect(addr, blockBits, regionBits+blockBits-1);
+ int ret := addressToInt(offset);
+ assert(ret < blocksPerRegion);
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ Addr getRegionBase(Addr addr) {
+ return maskLowOrderBits(addr, blockBits+regionBits);
+ }
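+
+ // With the default blocksPerRegion of 64 and (assumed) 64 B Ruby blocks,
+ // blockBits = regionBits = 6, so getRegionBase() masks the low 12 bits and
+ // getRegionOffset() selects bits [11:6]; e.g. 0x1A7C0 -> base 0x1A000,
+ // offset 31.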
+
+ Addr getNextBlock(Addr addr) {
+ Addr a := addr;
+ makeNextStrideAddress(a, 1);
+ return a;
+ }
+
+ bool presentOrAvail(Addr addr) {
+ DPRINTF(RubySlicc, "Present? %s, avail? %s\n", cacheMemory.isTagPresent(getRegionBase(addr)), cacheMemory.cacheAvail(getRegionBase(addr)));
+ return cacheMemory.isTagPresent(getRegionBase(addr)) || cacheMemory.cacheAvail(getRegionBase(addr));
+ }
+
+ // Returns a region entry!
+ Entry getCacheEntry(Addr addr), return_by_pointer="yes" {
+ return static_cast(Entry, "pointer", cacheMemory.lookup(getRegionBase(addr)));
+ }
+
+ TBE getTBE(Addr addr), return_by_pointer="yes" {
+ return TBEs.lookup(getRegionBase(addr));
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ return getCacheEntry(getRegionBase(addr)).DataBlk;
+ }
+
+ State getState(TBE tbe, Entry cache_entry, Addr addr) {
+ if (is_valid(tbe)) {
+ return tbe.TBEState;
+ } else if (is_valid(cache_entry)) {
+ return cache_entry.RegionState;
+ }
+ return State:NP;
+ }
+
+ void setState(TBE tbe, Entry cache_entry, Addr addr, State state) {
+ if (is_valid(tbe)) {
+ tbe.TBEState := state;
+ }
+ if (is_valid(cache_entry)) {
+ cache_entry.RegionState := state;
+ }
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ TBE tbe := getTBE(addr);
+ if(is_valid(tbe)) {
+ return RegionDir_State_to_permission(tbe.TBEState);
+ }
+ Entry cache_entry := getCacheEntry(addr);
+ if(is_valid(cache_entry)) {
+ return RegionDir_State_to_permission(cache_entry.RegionState);
+ }
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(Entry cache_entry, Addr addr, State state) {
+ if (is_valid(cache_entry)) {
+ cache_entry.changePermission(RegionDir_State_to_permission(state));
+ }
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ functionalMemoryRead(pkt);
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ if (functionalMemoryWrite(pkt)) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ cacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ cacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ cacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:DataArrayRead) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:DataArrayWrite) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:TagArrayRead) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:TagArrayWrite) {
+ return cacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+
+ out_port(requestNetwork_out, CPURequestMsg, requestToDir);
+ out_port(notifyNetwork_out, CPURequestMsg, notifyToRBuffer);
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToRBuffer);
+
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=2) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ TBE tbe := getTBE(in_msg.addr);
+ DPRINTF(RubySlicc, "trigger msg: %s (%s)\n", in_msg, getRegionBase(in_msg.addr));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ assert(is_valid(tbe));
+ trigger(Event:LastAck, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == TriggerType:InvRegion) {
+ assert(is_valid(tbe));
+ trigger(Event:TriggerInv, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == TriggerType:DowngradeRegion) {
+ assert(is_valid(tbe));
+ trigger(Event:TriggerDowngrade, in_msg.addr, cache_entry, tbe);
+ } else {
+ error("Unknown trigger message");
+ }
+ }
+ }
+ }
+
+ in_port(responseNetwork_in, ResponseMsg, responseFromRBuffer, rank=1) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := getTBE(in_msg.addr);
+ Entry cache_entry := getCacheEntry(in_msg.addr);
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ assert(is_valid(tbe));
+ if (in_msg.NotCached) {
+ trigger(Event:InvAckCoreNoShare, in_msg.addr, cache_entry, tbe);
+ } else {
+ trigger(Event:InvAckCore, in_msg.addr, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceResponseType:PrivateAck) {
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ assert(is_valid(cache_entry));
+ //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender));
+ trigger(Event:CPUPrivateAck, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:RegionWbAck) {
+ //Fix Me...add back in: assert(cache_entry.Sharers.isElement(in_msg.Sender) == false);
+ assert(in_msg.addr == getRegionBase(in_msg.addr));
+ trigger(Event:WritebackAck, in_msg.addr, cache_entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:DirReadyAck) {
+ assert(is_valid(tbe));
+ trigger(Event:DirReadyAck, getRegionBase(in_msg.addr), cache_entry, tbe);
+ } else {
+ error("Invalid response type");
+ }
+ }
+ }
+ }
+
+ // In from cores
+ // NOTE: We get the cache / TBE entry based on the region address,
+ // but pass the block address to the actions
+ in_port(requestNetwork_in, CPURequestMsg, requestFromRegBuf, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ //assert(in_msg.addr == getRegionBase(in_msg.addr));
+ Addr address := getRegionBase(in_msg.addr);
+ DPRINTF(RubySlicc, "Got %s, base %s\n", in_msg.addr, address);
+ if (presentOrAvail(address)) {
+ TBE tbe := getTBE(address);
+ Entry cache_entry := getCacheEntry(address);
+ if (in_msg.Type == CoherenceRequestType:PrivateRequest) {
+ if (is_valid(cache_entry) && (cache_entry.Owner != in_msg.Requestor ||
+ getState(tbe, cache_entry, address) == State:S)) {
+ trigger(Event:SendInv, address, cache_entry, tbe);
+ } else {
+ trigger(Event:PrivateRequest, address, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:SharedRequest) {
+ if (is_invalid(cache_entry)) {
+ // If no one has ever requested this region, give private permissions
+ trigger(Event:PrivateRequest, address, cache_entry, tbe);
+ } else {
+ if (always_migrate ||
+ (sym_migrate && symMigrate(cache_entry)) ||
+ (asym_migrate && asymMigrate(cache_entry, in_msg.Requestor))) {
+ if (cache_entry.Sharers.count() == 1 &&
+ cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ trigger(Event:UpgradeRequest, address, cache_entry, tbe);
+ } else {
+ trigger(Event:SendInv, address, cache_entry, tbe);
+ }
+ } else { // don't migrate
+ if(cache_entry.Sharers.isElement(in_msg.Requestor) ||
+ getState(tbe, cache_entry, address) == State:S) {
+ trigger(Event:SharedRequest, address, cache_entry, tbe);
+ } else {
+ trigger(Event:SendDowngrade, address, cache_entry, tbe);
+ }
+ }
+ }
+ } else if (in_msg.Type == CoherenceRequestType:UpgradeRequest) {
+ if (is_invalid(cache_entry)) {
+ trigger(Event:PrivateRequest, address, cache_entry, tbe);
+ } else if (cache_entry.Sharers.count() == 1 && cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ trigger(Event:UpgradeRequest, address, cache_entry, tbe);
+ } else {
+ trigger(Event:SendUpgrade, address, cache_entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:CleanWbRequest) {
+ if (is_invalid(cache_entry) || cache_entry.Sharers.isElement(in_msg.Requestor) == false) {
+ trigger(Event:StaleCleanWbRequest, address, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "wb address %s(%s) owner %s sharers %s requestor %s %d %d\n", in_msg.addr, getRegionBase(in_msg.addr), cache_entry.Owner, cache_entry.Sharers, in_msg.Requestor, cache_entry.Sharers.isElement(in_msg.Requestor), cache_entry.Sharers.count());
+ if (cache_entry.Sharers.isElement(in_msg.Requestor) && cache_entry.Sharers.count() == 1) {
+ DPRINTF(RubySlicc, "last wb\n");
+ trigger(Event:CleanWbRequest_LastSharer, address, cache_entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "clean wb\n");
+ trigger(Event:CleanWbRequest, address, cache_entry, tbe);
+ }
+ }
+ } else {
+ error("unknown region dir request type");
+ }
+ } else {
+ Addr victim := cacheMemory.cacheProbe(getRegionBase(in_msg.addr));
+ TBE victim_tbe := getTBE(victim);
+ Entry victim_entry := getCacheEntry(victim);
+ DPRINTF(RubySlicc, "Evicting address %s for new region at address %s(%s)\n", victim, in_msg.addr, getRegionBase(in_msg.addr));
+ assert(is_valid(victim_entry));
+ trigger(Event:Evict, victim, victim_entry, victim_tbe);
+ }
+ }
+ }
+ }
+
+ // Actions
+
+ action(f_fwdReqToDir, "f", desc="Forward CPU request to directory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := true;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(f_fwdReqToDirShared, "fs", desc="Forward CPU request to directory (shared)") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := true;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ out_msg.ForceShared := true;
+ if (is_valid(cache_entry) && getState(tbe, cache_entry, address) != State:S) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(f_fwdReqToDirWithAck, "fa", desc="Forward CPU request to directory with ack request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := false;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ if (is_valid(cache_entry)) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ // Don't need an ack from the requestor!
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ out_msg.Acks := out_msg.Acks - 1;
+ }
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(f_fwdReqToDirWithAckShared, "fas", desc="Forward CPU request to directory with ack request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(requestNetwork_out, CPURequestMsg, toDirLatency) {
+ out_msg.addr := in_msg.addr; // This is the block address. "address" is the region address
+ out_msg.Type := in_msg.OriginalType;
+ out_msg.DataBlk := in_msg.DataBlk;
+ out_msg.Dirty := in_msg.Dirty;
+ out_msg.Requestor := getCoreMachine(in_msg.Requestor,address);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Destination.add(map_Address_to_Directory(in_msg.addr));
+ out_msg.Shared := in_msg.Shared;
+ out_msg.MessageSize := in_msg.MessageSize;
+ out_msg.Private := in_msg.Private;
+ out_msg.NoAckNeeded := false;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ProbeRequestStartTime := curCycle();
+ out_msg.DemandRequest := true;
+ out_msg.ForceShared := true;
+ if (is_valid(cache_entry)) {
+ out_msg.Acks := cache_entry.Sharers.count();
+ // Don't need an ack from the requestor!
+ if (cache_entry.Sharers.isElement(in_msg.Requestor)) {
+ out_msg.Acks := out_msg.Acks - 1;
+ }
+ } else {
+ out_msg.Acks := 0;
+ }
+ }
+ }
+ }
+
+ action(a_allocateRegionEntry, "a", desc="Allocate a new entry") {
+ set_cache_entry(cacheMemory.allocate(getRegionBase(address), new Entry));
+ peek(requestNetwork_in, CPURequestMsg) {
+ APPEND_TRANSITION_COMMENT(in_msg.Requestor);
+ }
+ }
+
+ action(d_deallocateRegionEntry, "d", desc="Deallocate region entry") {
+ cacheMemory.deallocate(getRegionBase(address));
+ unset_cache_entry();
+ }
+
+ action(ra_receiveAck, "ra", desc="Mark TBE entry as received this ack") {
+ //assert(tbe.ValidBlocks.at(getRegionOffset(address)));
+ DPRINTF(RubySlicc, "received ack for %s reg: %s\n", address, getRegionBase(address));
+ tbe.NumValidBlocks := tbe.NumValidBlocks - 1;
+ assert(tbe.NumValidBlocks >= 0);
+ if (tbe.NumValidBlocks == 0) {
+ tbe.AllAcksReceived := true;
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := address;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ APPEND_TRANSITION_COMMENT(" Acks left receive ");
+ APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks);
+ }
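+
+  // NumValidBlocks doubles as the outstanding-ack counter here: each ack
+  // decrements it, and reaching zero enqueues an AcksComplete trigger.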
+
+ action(ca_checkAcks, "ca", desc="Check to see if we need more acks") {
+ if (tbe.NumValidBlocks == 0) {
+ tbe.AllAcksReceived := true;
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:AcksComplete;
+ out_msg.addr := address;
+ }
+ }
+ }
+
+  action(ti_triggerInv, "ti", desc="Trigger region invalidate") {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:InvRegion;
+ out_msg.addr := address;
+ }
+ }
+
+  action(td_triggerDowngrade, "td", desc="Trigger region downgrade") {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.Type := TriggerType:DowngradeRegion;
+ out_msg.addr := address;
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(getRegionBase(address));
+ set_tbe(getTBE(address));
+ if (is_valid(cache_entry)) {
+ tbe.Owner := cache_entry.Owner;
+ tbe.Sharers := cache_entry.Sharers;
+ tbe.AllAcksReceived := true; // assume no acks are required
+ }
+ tbe.ProbeRequestTime := curCycle();
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ tbe.DemandAddress := in_msg.addr;
+ }
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ APPEND_TRANSITION_COMMENT(" Acks left ");
+ APPEND_TRANSITION_COMMENT(tbe.NumValidBlocks);
+ APPEND_TRANSITION_COMMENT(" Owner, ");
+ APPEND_TRANSITION_COMMENT(tbe.Owner);
+ APPEND_TRANSITION_COMMENT(" sharers, ");
+ APPEND_TRANSITION_COMMENT(tbe.Sharers);
+ }
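+
+  // The TBE is keyed by the region base address and snapshots the entry's
+  // Owner and Sharers, so invalidation probes (e.g. on Evict) can still use
+  // the sharer list after the region entry has been deallocated.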
+
+ action(ss_setSharers, "ss", desc="Add requestor to sharers") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.Sharers.add(in_msg.Requestor);
+ APPEND_TRANSITION_COMMENT(cache_entry.Sharers);
+ }
+ }
+
+  action(rs_removeSharer, "rs", desc="Remove requestor from sharers") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.Sharers.remove(in_msg.Requestor);
+ APPEND_TRANSITION_COMMENT(" removing ");
+ APPEND_TRANSITION_COMMENT(in_msg.Requestor);
+ APPEND_TRANSITION_COMMENT(" sharers ");
+ APPEND_TRANSITION_COMMENT(cache_entry.Sharers);
+ }
+ }
+
+  action(rsr_removeSharerResponse, "rsr", desc="Remove responding core from sharers") {
+ peek(responseNetwork_in, ResponseMsg) {
+ cache_entry.Sharers.remove(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(cache_entry.Sharers);
+ }
+ }
+
+  action(cs_clearSharers, "cs", desc="Clear the sharers list") {
+ cache_entry.Sharers.clear();
+ }
+
+ action(so_setOwner, "so", desc="Set the owner to the requestor") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ cache_entry.Owner := in_msg.Requestor;
+ APPEND_TRANSITION_COMMENT(" Owner now: ");
+ APPEND_TRANSITION_COMMENT(cache_entry.Owner);
+ }
+ }
+
+ action(rr_removeRequestorFromTBE, "rr", desc="Remove requestor from TBE sharers") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.Sharers.remove(in_msg.Requestor);
+ }
+ }
+
+ action(ur_updateDirtyStatusOnRequest, "ur", desc="Update dirty status on demand request") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (is_valid(cache_entry)) {
+ if ((in_msg.Type == CoherenceRequestType:SharedRequest) &&
+ (cache_entry.Sharers.isElement(in_msg.Requestor) == false)) {
+ cache_entry.LastWriten := false;
+ if (isCpuMachine(in_msg.Requestor)) {
+ cache_entry.LastWritenByCpu := false;
+ } else {
+ cache_entry.LastWritenByGpu := false;
+ }
+ } else if ((in_msg.Type == CoherenceRequestType:PrivateRequest) ||
+ (in_msg.Type == CoherenceRequestType:UpgradeRequest)) {
+ cache_entry.LastWriten := true;
+ if (isCpuMachine(in_msg.Requestor)) {
+ cache_entry.LastWritenByCpu := true;
+ } else {
+ cache_entry.LastWritenByGpu := true;
+ }
+ }
+ }
+ }
+ }
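+
+  // LastWriten / LastWritenByCpu / LastWritenByGpu track which side wrote
+  // the region most recently: a new sharer clears the flags, while a
+  // private or upgrade request sets them for the requesting side.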
+
+ action(ud_updateDirtyStatusWithWb, "ud", desc="Update dirty status on writeback") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (is_valid(cache_entry) && in_msg.Dirty) {
+ cache_entry.LastWriten := true;
+ if (isCpuMachine(in_msg.Requestor)) {
+ cache_entry.LastWritenByCpu := true;
+ } else {
+ cache_entry.LastWritenByGpu := true;
+ }
+ }
+ }
+ }
+
+ action(sns_setNumAcksSharers, "sns", desc="Set number of acks to one per shared region buffer") {
+ assert(is_valid(tbe));
+ assert(is_valid(cache_entry));
+ tbe.NumValidBlocks := tbe.Sharers.count();
+ }
+
+  action(sno_setNumAcksOne, "sno", desc="Set number of acks to one") {
+ assert(is_valid(tbe));
+ assert(is_valid(cache_entry));
+ tbe.NumValidBlocks := 1;
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ TBEs.deallocate(getRegionBase(address));
+ APPEND_TRANSITION_COMMENT(" reg: ");
+ APPEND_TRANSITION_COMMENT(getRegionBase(address));
+ unset_tbe();
+ }
+
+ action(wb_sendWbNotice, "wb", desc="Send notice to cache that writeback is acknowledged") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:WbNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+ }
+
+ action(wbn_sendWbNoticeNoAck, "wbn", desc="Send notice to cache that writeback is acknowledged (no ack needed)") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:WbNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.NoAckNeeded := true;
+ }
+ }
+ }
+
+ action(b_sendPrivateNotice, "b", desc="Send notice to private cache that it has private access") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:PrivateNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+ }
+
+  action(bs_sendSharedNotice, "bs", desc="Send notice to private cache that it has shared access") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:SharedNotify;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+ }
+
+ action(c_sendSharedNoticeToOrigReq, "c", desc="Send notice to private cache that it has shared access") {
+ assert(is_valid(tbe));
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:SharedNotify;
+ out_msg.Destination.add(tbe.Owner);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(sp_sendPrivateNoticeToOrigReq, "sp", desc="Send notice to private cache that it has private access") {
+ assert(is_valid(tbe));
+ enqueue(notifyNetwork_out, CPURequestMsg, 1) {
+ out_msg.addr := getRegionBase(address);
+ out_msg.Type := CoherenceRequestType:PrivateNotify;
+ out_msg.Destination.add(tbe.Owner);
+ out_msg.Requestor := machineID;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestTime;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(i_RegionInvNotify, "i", desc="Send notice to private cache that it no longer has private access") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.DemandAddress := tbe.DemandAddress;
+ //out_msg.Requestor := tbe.Requestor;
+ out_msg.Requestor := machineID;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ //Fix me: assert(tbe.Sharers.count() > 0);
+ out_msg.DemandRequest := true;
+ out_msg.Destination := tbe.Sharers;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(i0_RegionInvNotifyDemand0, "i0", desc="Send notice to private cache that it no longer has private access") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ // Demand address should default to 0 -> out_msg.DemandAddress := 0;
+ out_msg.Requestor := machineID;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.Destination := tbe.Sharers;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(rd_RegionDowngrade, "rd", desc="Send notice to private cache that it only has shared access") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, 1) {
+ out_msg.addr := address;
+ out_msg.DemandAddress := tbe.DemandAddress;
+ out_msg.Requestor := machineID;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.DemandRequest := true;
+ out_msg.Destination := tbe.Sharers;
+ out_msg.MessageSize := MessageSizeType:Request_Control;
+ APPEND_TRANSITION_COMMENT("dest: ");
+ APPEND_TRANSITION_COMMENT(out_msg.Destination);
+ }
+ }
+
+ action(p_popRequestQueue, "p", desc="Pop the request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="Pop the trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="Pop the response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(s_stallAndWaitRequest, "s", desc="Stall and wait on the region address") {
+ Addr regAddr := getRegionBase(address);
+ stall_and_wait(requestNetwork_in, regAddr);
+ }
+
+ action(w_wakeUpRegionDependents, "w", desc="Wake up any requests waiting for this region") {
+ wakeUpBuffers(getRegionBase(address));
+ }
+
+ action(wa_wakeUpAllDependents, "wa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+  action(zz_recycleRequestQueue, "\z", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(z_stall, "z", desc="stall request queue") {
+ // fake state
+ }
+
+ action(mru_setMRU, "mru", desc="set MRU") {
+ cacheMemory.setMRU(address);
+ }
+
+  // Transitions
+
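+  // Transient state naming appears to follow a from_to pattern (e.g. NP_P is
+  // the move from NP to P).  The *_W states wait for the directory's
+  // DirReadyAck before triggering probes, and the ack-collecting states
+  // (S_AP, P_AP, P_AS, P_NP) wait for LastAck before completing.
+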
+ transition({NP_P, P_P, NP_S, S_S, S_P, P_S, P_NP, S_AP, P_AS, P_AP, SP_NP_W, S_W, P_AP_W, P_AS_W, S_AP_W}, {PrivateRequest, SharedRequest, UpgradeRequest, SendInv, SendUpgrade, SendDowngrade, CleanWbRequest, CleanWbRequest_LastSharer, StaleCleanWbRequest}) {
+ s_stallAndWaitRequest
+ }
+
+ transition({NP_P, P_P, NP_S, S_S, S_P, S_W, P_S, P_NP, S_AP, P_AS, P_AP, P_AP_W, P_AS_W, S_AP_W}, Evict) {
+ zz_recycleRequestQueue;
+ }
+
+ transition(NP, {PrivateRequest, SendUpgrade}, NP_P) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDir;
+ b_sendPrivateNotice;
+ so_setOwner;
+ ss_setSharers;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(P, {PrivateRequest, UpgradeRequest}, P_P) {TagArrayRead} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDir;
+ b_sendPrivateNotice;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({NP_P, P_P}, CPUPrivateAck, P) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({NP, P, S}, StaleCleanWbRequest) {TagArrayRead, TagArrayWrite} {
+ wbn_sendWbNoticeNoAck;
+ ud_updateDirtyStatusWithWb;
+ p_popRequestQueue;
+ }
+
+ transition(NP, SharedRequest, NP_S) {TagArrayRead, TagArrayWrite} {
+ a_allocateRegionEntry;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirShared;
+ bs_sendSharedNotice;
+ so_setOwner;
+ ss_setSharers;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ // Could probably do this in parallel with other shared requests
+ transition(S, SharedRequest, S_S) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirShared;
+ bs_sendSharedNotice;
+ ss_setSharers;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition({P, S}, CleanWbRequest_LastSharer, SP_NP_W) {TagArrayRead, TagArrayWrite} {
+ ud_updateDirtyStatusWithWb;
+ wb_sendWbNotice;
+ rs_removeSharer;
+ t_allocateTBE;
+ d_deallocateRegionEntry;
+ p_popRequestQueue;
+ }
+
+ transition(S, CleanWbRequest, S_W) {TagArrayRead, TagArrayWrite} {
+ ud_updateDirtyStatusWithWb;
+ wb_sendWbNotice;
+ rs_removeSharer;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(SP_NP_W, WritebackAck, NP) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(S_W, WritebackAck, S) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({NP_S, S_S}, CPUPrivateAck, S) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(S, UpgradeRequest, S_P) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDir;
+ b_sendPrivateNotice;
+ so_setOwner;
+ t_allocateTBE;
+ p_popRequestQueue;
+ }
+
+ transition(S_P, CPUPrivateAck, P) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition(P, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ //i_RegionInvNotify;
+ p_popRequestQueue;
+ }
+
+ transition({P_AP_W, S_AP_W}, DirReadyAck) {
+ ti_triggerInv;
+ pr_popResponseQueue;
+ }
+
+ transition(P_AS_W, DirReadyAck) {
+ td_triggerDowngrade;
+ pr_popResponseQueue;
+ }
+
+ transition(P_AS_W, TriggerDowngrade, P_AS) {
+ rd_RegionDowngrade;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_AP_W, TriggerInv, P_AP) {
+ i_RegionInvNotify;
+ pt_popTriggerQueue;
+ }
+
+ transition(S_AP_W, TriggerInv, S_AP) {
+ i_RegionInvNotify;
+ pt_popTriggerQueue;
+ }
+
+ transition(P, SendUpgrade, P_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ p_popRequestQueue;
+ }
+
+ transition(P, Evict, P_NP) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ sns_setNumAcksSharers;
+ i0_RegionInvNotifyDemand0;
+ d_deallocateRegionEntry;
+ }
+
+ transition(S, SendInv, P_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ p_popRequestQueue;
+ }
+
+ transition(S, Evict, P_NP) {TagArrayRead, TagArrayWrite} {
+ t_allocateTBE;
+ sns_setNumAcksSharers;
+ i0_RegionInvNotifyDemand0;
+ d_deallocateRegionEntry;
+ }
+
+ transition(P_NP, LastAck, NP) {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(S, SendUpgrade, S_AP_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAck;
+ so_setOwner;
+ t_allocateTBE;
+ rr_removeRequestorFromTBE;
+ sns_setNumAcksSharers;
+ cs_clearSharers;
+ ss_setSharers;
+ p_popRequestQueue;
+ }
+
+ transition(S_AP, LastAck, S_P) {
+ sp_sendPrivateNoticeToOrigReq;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_AP, LastAck, P_P) {
+ sp_sendPrivateNoticeToOrigReq;
+ pt_popTriggerQueue;
+ }
+
+ transition(P, SendDowngrade, P_AS_W) {TagArrayRead, TagArrayWrite} {
+ mru_setMRU;
+ ur_updateDirtyStatusOnRequest;
+ f_fwdReqToDirWithAckShared;
+ so_setOwner;
+ t_allocateTBE;
+ sns_setNumAcksSharers;
+ ss_setSharers; //why do we set the sharers before sending the downgrade? Are we sending a downgrade to the requestor?
+ p_popRequestQueue;
+ }
+
+ transition(P_AS, LastAck, P_S) {
+ c_sendSharedNoticeToOrigReq;
+ pt_popTriggerQueue;
+ }
+
+ transition(P_S, CPUPrivateAck, S) {
+ dt_deallocateTBE;
+ w_wakeUpRegionDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({P_NP, P_AS, S_AP, P_AP}, InvAckCore) {} {
+ ra_receiveAck;
+ pr_popResponseQueue;
+ }
+
+ transition({P_NP, S_AP, P_AP}, InvAckCoreNoShare) {} {
+ ra_receiveAck;
+ pr_popResponseQueue;
+ }
+
+ transition(P_AS, InvAckCoreNoShare) {} {
+ ra_receiveAck;
+ rsr_removeSharerResponse;
+ pr_popResponseQueue;
+ }
+
+}
+
+
diff --git a/src/mem/protocol/MOESI_AMD_Base-dir.sm b/src/mem/protocol/MOESI_AMD_Base-dir.sm
new file mode 100644
index 000000000..52cefda66
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-dir.sm
@@ -0,0 +1,1137 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+machine(MachineType:Directory, "AMD Baseline protocol")
+: DirectoryMemory * directory;
+ CacheMemory * L3CacheMemory;
+ Cycles response_latency := 5;
+ Cycles l3_hit_latency := 50;
+ bool noTCCdir := "False";
+ bool CPUonly := "False";
+ int TCC_select_num_bits;
+ bool useL3OnWT := "False";
+ Cycles to_memory_controller_latency := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCores, network="From", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseFromCores, network="From", virtual_network="2", vnet_type="response";
+ MessageBuffer * unblockFromCores, network="From", virtual_network="4", vnet_type="unblock";
+
+ MessageBuffer * probeToCore, network="To", virtual_network="0", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="2", vnet_type="response";
+
+ MessageBuffer * triggerQueue;
+ MessageBuffer * L3triggerQueue;
+ MessageBuffer * responseFromMemory;
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="Directory_State_U") {
+ U, AccessPermission:Backing_Store, desc="unblocked";
+ BL, AccessPermission:Busy, desc="got L3 WB request";
+    // BL is Busy because the only copy of the data may be in flight in the
+    // WB message: the L3 may have sent it and moved on, possibly to the I
+    // state.
+ BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B, AccessPermission:Backing_Store, desc="sent response, Blocked til ack";
+ }
+
+ // Events
+ enumeration(Event, desc="Directory events") {
+ // CPU requests
+    RdBlkS, desc="Read block shared request from core";
+    RdBlkM, desc="Read block modified request from core";
+    RdBlk, desc="Read block request from core";
+    CtoD, desc="Change-to-dirty request from core";
+ WriteThrough, desc="WriteThrough Message";
+ Atomic, desc="Atomic Message";
+
+ // writebacks
+    VicDirty, desc="L2 dirty eviction";
+    VicClean, desc="L2 clean eviction";
+ CPUData, desc="WB data from CPU";
+    StaleWB, desc="Notification that WB has been superseded by a probe";
+
+ // probe responses
+ CPUPrbResp, desc="Probe Response Msg";
+
+ ProbeAcksComplete, desc="Probe Acks Complete";
+
+ L3Hit, desc="Hit in L3 return data to core";
+
+ // Memory Controller
+ MemData, desc="Fetched data from memory arrives";
+ WBAck, desc="Writeback Ack from memory arrives";
+
+ CoreUnblock, desc="Core received data, unblock";
+ UnblockWriteThrough, desc="Unblock because of writethrough request finishing";
+
+ StaleVicDirty, desc="Core invalidated before VicDirty processed";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ L3DataArrayRead, desc="Read the data array";
+ L3DataArrayWrite, desc="Write the data array";
+    L3TagArrayRead, desc="Read the tag array";
+    L3TagArrayWrite, desc="Write the tag array";
+ }
+
+ // TYPES
+
+ // DirectoryEntry
+ structure(Entry, desc="...", interface="AbstractEntry") {
+ State DirectoryState, desc="Directory state";
+ DataBlock DataBlk, desc="data for the block";
+ NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore";
+ }
+
+ structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+ DataBlock DataBlk, desc="data for the block";
+ MachineID LastSender, desc="Mach which this block came from";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ int NumPendingAcks, desc="num acks expected";
+ MachineID OriginalRequestor, desc="Original Requestor";
+ MachineID WTRequestor, desc="WT Requestor";
+ bool Cached, desc="data hit in Cache";
+ bool MemData, desc="Got MemData?",default="false";
+ bool wtData, desc="Got write through data?",default="false";
+ bool atomicData, desc="Got Atomic op?",default="false";
+ Cycles InitialRequestTime, desc="...";
+ Cycles ForwardRequestTime, desc="...";
+ Cycles ProbeRequestStartTime, desc="...";
+ MachineID LastSender, desc="Mach which this block came from";
+ bool L3Hit, default="false", desc="Was this an L3 hit?";
+ uint64_t probe_id, desc="probe id for lifetime profiling";
+ WriteMask writeMask, desc="outstanding write through mask";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_tbe(TBE a);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
+ Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr));
+
+ if (is_valid(dir_entry)) {
+ return dir_entry;
+ }
+
+ dir_entry := static_cast(Entry, "pointer",
+ directory.allocate(addr, new Entry));
+ return dir_entry;
+ }
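+
+  // Note that the directory entry is allocated on first lookup, so every
+  // address that reaches this controller ends up with a backing entry.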
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if (is_valid(tbe) && tbe.MemData) {
+ DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe);
+ return tbe.DataBlk;
+ }
+ DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr));
+ return getDirectoryEntry(addr).DataBlk;
+ }
+
+ State getState(TBE tbe, CacheEntry entry, Addr addr) {
+ return getDirectoryEntry(addr).DirectoryState;
+ }
+
+ void setState(TBE tbe, CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).DirectoryState := state;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes
+ + functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+    // For this directory all permissions are tracked in the directory entry;
+    // since a block cannot be in the TBE without also being in the directory,
+    // state is kept in one place.
+ if (directory.isPresent(addr)) {
+ return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ // ** OUT_PORTS **
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);
+ out_port(responseNetwork_out, ResponseMsg, responseToCore);
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue);
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe);
+        } else if (in_msg.Type == TriggerType:UnblockWriteThrough) {
+ trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) {
+ if (L3TriggerQueue_in.isReady(clockEdge())) {
+ peek(L3TriggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:L3Hit) {
+ trigger(Event:L3Hit, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ // Unblock Network
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ trigger(Event:CoreUnblock, in_msg.addr, entry, tbe);
+ }
+ }
+ }
+
+ // Core response network
+ in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+ trigger(Event:CPUData, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+ } else {
+ error("Unexpected response type");
+ }
+ }
+ }
+ }
+
+ // off-chip memory request/response is done
+ in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) {
+ if (memQueue_in.isReady(clockEdge())) {
+ peek(memQueue_in, MemoryMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
+ trigger(Event:MemData, in_msg.addr, entry, tbe);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
+ trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them.
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg.Type);
+ error("Invalid message");
+ }
+ }
+ }
+ }
+
+ in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ trigger(Event:WriteThrough, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicDirty, in_msg.addr, entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicClean, in_msg.addr, entry, tbe);
+ }
+ } else {
+ error("Bad request message type");
+ }
+ }
+ }
+ }
+
+ // Actions
+ action(s_sendResponseS, "s", desc="send Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Cached) {
+ out_msg.State := CoherenceState:Shared;
+ } else {
+ out_msg.State := CoherenceState:Exclusive;
+ }
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(m_sendResponseM, "m", desc="send Modified response") {
+ if (tbe.wtData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+    } else {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := false;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ if(tbe.atomicData){
+ out_msg.WTRequestor := tbe.WTRequestor;
+ }
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ if (tbe.atomicData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ }
+ }
+ }
+
+ action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := true;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ queueMemoryWrite(machineID, address, to_memory_controller_latency,
+ in_msg.DataBlk);
+ }
+ }
+
+ action(l_queueMemRdReq, "lr", desc="Read data from memory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ if (tbe.Dirty == false) {
+ tbe.DataBlk := entry.DataBlk;
+ }
+ tbe.LastSender := entry.LastSender;
+ tbe.L3Hit := true;
+ tbe.MemData := true;
+ L3CacheMemory.deallocate(address);
+ } else {
+ queueMemoryRead(machineID, address, to_memory_controller_latency);
+ }
+ }
+ }
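+
+  // On an L3 tag hit the block is pulled into the TBE, the L3 copy is
+  // deallocated, and an L3Hit trigger is scheduled after l3_hit_latency to
+  // model the hit; otherwise the read goes to the memory controller.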
+
+ action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+
+ // add relevant TCC node to list. This replaces all TCPs and SQCs
+ if (((in_msg.Type == CoherenceRequestType:WriteThrough ||
+ in_msg.Type == CoherenceRequestType:Atomic) &&
+ in_msg.NoWriteConflict) ||
+ CPUonly) {
+ } else if (noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ out_msg.Destination.add(mapAddressToRange(address,
+ MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
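+
+  // Invalidating probes go to every CorePair plus the relevant TCC/TCCdir
+  // (skipped for non-conflicting write-throughs/atomics and CPU-only
+  // systems).  The requestor is removed from the destination set,
+  // NumPendingAcks is taken from its final size, and AcksComplete is
+  // triggered immediately if no acks are expected.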
+
+ action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if (noTCCdir || CPUonly) {
+ //Don't need to notify TCC about reads
+ } else {
+ out_msg.Destination.add(mapAddressToRange(address,
+ MachineType:TCCdir,
+ TCC_select_low_bit, TCC_select_num_bits));
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ }
+ if (noTCCdir && !CPUonly) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+        DPRINTF(RubySlicc, "%s\n", out_msg);
+ APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if (noTCCdir && !CPUonly) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ if (!noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,
+ MachineType:TCCdir,
+ TCC_select_low_bit,
+ TCC_select_num_bits));
+ }
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(d_writeDataToMemory, "d", desc="Write data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+ if (tbe.Dirty == false) {
+ // have to update the TBE, too, because of how this
+ // directory deals with functional writes
+ tbe.DataBlk := in_msg.DataBlk;
+ }
+ }
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.wtData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ if (in_msg.Type == CoherenceRequestType:Atomic) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.atomicData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlk.copyPartial(in_msg.DataBlk,in_msg.writeMask);
+ tbe.Dirty := true;
+ }
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.Cached := in_msg.ForceShared;
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
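+
+  // The TBE is seeded with the current directory data (used for writebacks
+  // and functional accesses); write-throughs additionally merge their
+  // partial data under writeMask and mark the TBE dirty up front.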
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(wd_writeBackData, "wd", desc="Write back data if needed") {
+ if (tbe.wtData) {
+ getDirectoryEntry(address).DataBlk.copyPartial(tbe.DataBlk, tbe.writeMask);
+ } else if (tbe.atomicData) {
+ tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,tbe.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ }
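+
+  // Three cases above: write-throughs merge the masked bytes into the
+  // directory copy, atomics apply the operation read-modify-write style,
+  // and clean data is simply copied back from the TBE.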
+
+ action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
+ peek(memQueue_in, MemoryMsg) {
+ if (tbe.wtData == true) {
+ // do nothing
+ } else if (tbe.Dirty == false) {
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk;
+ }
+ tbe.MemData := true;
+ }
+ }
+
+ action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ if (tbe.wtData) {
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ tbe.writeMask.fillMask();
+ } else if (tbe.Dirty) {
+ if(tbe.atomicData == false && tbe.wtData == false) {
+ DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+ assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data
+ }
+ } else {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ tbe.LastSender := in_msg.Sender;
+ }
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
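+
+  // Dirty probe data is merged under the write mask for write-throughs,
+  // sanity-checked against duplicate dirty responses otherwise, or stored
+  // wholesale when the TBE is still clean; a Hit response marks the block
+  // as cached elsewhere so the final response can be Shared instead of
+  // Exclusive.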
+
+ action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty ");
+ }
+ }
+
+ action(x_decrementAcks, "x", desc="decrement Acks pending") {
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ APPEND_TRANSITION_COMMENT(" Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ action(o_checkForCompletion, "o", desc="check for ack completion") {
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+ }
+ }
+
+ action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := in_msg.DataBlk;
+
+ entry.LastSender := in_msg.Sender;
+ }
+ }
+ }
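+
+  // Writeback data is installed in the L3; if no way is available, the
+  // victim block is written out to memory and its L3 entry deallocated
+  // before the new block is allocated.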
+
+ action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
+ if ((tbe.wtData || tbe.atomicData) && useL3OnWT) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ }
+ }
+ }
+
+  action(sf_setForwardReqTime, "sf", desc="Set the forward request time") {
+ tbe.ForwardRequestTime := curCycle();
+ }
+
+ action(dl_deallocateL3, "dl", desc="deallocate the L3 block") {
+ L3CacheMemory.deallocate(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pm_popMemQueue, "pm", desc="pop mem queue") {
+ memQueue_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") {
+ L3TriggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pu_popUnblockQueue, "pu", desc="pop unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(zz_recycleRequestQueue, "zz", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(yy_recycleResponseQueue, "yy", desc="recycle response queue") {
+ responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
+ stall_and_wait(requestNetwork_in, address);
+ }
+
+ action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") {
+ wakeUpBuffers(address);
+ }
+
+ action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+  action(z_stall, "z", desc="stall") {
+ }
+
+ // TRANSITIONS
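+  // State naming: BS_*, BM_* and B_* lead to Shared, Modified and
+  // Exclusive/Shared responses respectively; *_PM states wait on both probes
+  // and memory, *_M on memory only, and *_Pm on probes with the memory (or
+  // L3) data already in hand.
+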
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
+ st_stallAndWaitRequest;
+ }
+
+ // It may be possible to save multiple invalidations here!
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, {Atomic, WriteThrough}) {
+ st_stallAndWaitRequest;
+ }
+
+
+ // transitions from U
+ transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead} {
+ t_allocateTBE;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, RdBlk, B_PM) {L3TagArrayRead}{
+ t_allocateTBE;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ p_popRequestQueue;
+ }
+
+ transition(U, CtoD, BP) {L3TagArrayRead} {
+ t_allocateTBE;
+ ic_probeInvCore;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicDirty, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicClean, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition(BL, {VicDirty, VicClean}) {
+ zz_recycleRequestQueue;
+ }
+
+ transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
+ d_writeDataToMemory;
+ al_allocateL3Block;
+ wa_wakeUpDependents;
+ dt_deallocateTBE;
+ pr_popResponseQueue;
+ }
+
+ transition(BL, StaleWB, U) {L3TagArrayWrite} {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm}, {VicDirty, VicClean}) {
+ z_stall;
+ }
+
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, WBAck) {
+ pm_popMemQueue;
+ }
+
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B}, StaleVicDirty) {
+ rv_removeVicDirtyIgnore;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({B}, CoreUnblock, U) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(B, UnblockWriteThrough, U) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_PM, MemData, BS_Pm) {} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_PM, MemData, BM_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_PM, MemData, B_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_PM, L3Hit, BS_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_PM, L3Hit, BM_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_PM, L3Hit, B_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, BP}, CPUPrbResp) {
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ o_checkForCompletion;
+ pr_popResponseQueue;
+ }
+
+ transition(BS_PM, ProbeAcksComplete, BS_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_PM, ProbeAcksComplete, BM_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_PM, ProbeAcksComplete, B_M){} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+  transition(BP, ProbeAcksComplete, B){L3TagArrayWrite, L3DataArrayWrite} {
+ sf_setForwardReqTime;
+ c_sendResponseCtoD;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-msg.sm b/src/mem/protocol/MOESI_AMD_Base-msg.sm
new file mode 100644
index 000000000..ff8842369
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-msg.sm
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2010-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+
+enumeration(CoherenceRequestType, desc="Coherence Request Types") {
+ // CPU Request Types ONLY
+ RdBlk, desc="Read Blk";
+ RdBlkM, desc="Read Blk Modified";
+ RdBlkS, desc="Read Blk Shared";
+ CtoD, desc="Change To Dirty";
+ VicClean, desc="L2 clean eviction";
+ VicDirty, desc="L2 dirty eviction";
+ Atomic, desc="Upper level atomic";
+ AtomicWriteBack, desc="Upper level atomic writeback";
+ WriteThrough, desc="Ordered WriteThrough w/Data";
+ WriteThroughFifo, desc="WriteThrough with no data";
+ WriteThroughDummy, desc="WriteThrough with no data for atomic operation";
+ WriteFlush, desc="Release Flush";
+
+ WrCancel, desc="want to cancel WB to Memory"; // should this be here?
+
+ WBApproval, desc="WB Approval";
+
+ // Messages between Dir and R-Dir
+ ForceInv, desc="Send invalidate to the block";
+ ForceDowngrade, desc="Send downgrade to the block";
+ Unblock, desc="Used to let the dir know a message has been sunk";
+
+ // Messages between R-Dir and R-Buffer
+ PrivateNotify, desc="Let region buffer know it has private access";
+ SharedNotify, desc="Let region buffer know it has shared access";
+ WbNotify, desc="Let region buffer know it saw its wb request";
+ Downgrade, desc="Force the region buffer to downgrade to shared";
+ // Response to R-Dir (probably should be on a different network, but
+ // I need it to be ordered with respect to requests)
+ InvAck, desc="Let the R-Dir know when the inv has occurred";
+
+ PrivateRequest, desc="R-buf wants the region in private";
+ UpgradeRequest, desc="R-buf wants to upgrade the region to private";
+ SharedRequest, desc="R-buf wants the region in shared (could respond with private)";
+ CleanWbRequest, desc="R-buf wants to deallocate clean region";
+
+ NA, desc="So we don't get segfaults";
+}
+
+enumeration(ProbeRequestType, desc="Probe Request Types") {
+ PrbDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+ PrbInv, desc="Probe to Invalidate";
+
+ // For regions
+ PrbRepl, desc="Force the cache to do a replacement";
+ PrbRegDowngrade, desc="Probe for Status"; // EtoS, MtoO, StoS
+ PrbAtomic, desc="Forwarded Atomic Operation";
+}
+
+
+enumeration(CoherenceResponseType, desc="Coherence Response Types") {
+ NBSysResp, desc="Northbridge response to CPU Rd request";
+ NBSysWBAck, desc="Northbridge response ok to WB";
+ TDSysResp, desc="TCC directory response to CPU Rd request";
+ TDSysWBAck, desc="TCC directory response ok to WB";
+ TDSysWBNack, desc="TCC directory response ok to drop";
+ CPUPrbResp, desc="CPU Probe Response";
+ CPUData, desc="CPU Data";
+ StaleNotif, desc="Notification of Stale WBAck, No data to writeback";
+ CPUCancelWB, desc="want to cancel WB to Memory";
+ MemData, desc="Data from Memory";
+
+ // for regions
+ PrivateAck, desc="Ack that r-buf received private notify";
+ RegionWbAck, desc="Writeback Ack that r-buf completed deallocation";
+ DirReadyAck, desc="Directory (mem ctrl)<->region dir handshake";
+}
+
+enumeration(CoherenceState, default="CoherenceState_NA", desc="Coherence State") {
+ Modified, desc="Modified";
+ Owned, desc="Owned state";
+ Exclusive, desc="Exclusive";
+ Shared, desc="Shared";
+ NA, desc="NA";
+}
+
+structure(CPURequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ Addr DemandAddress, desc="Physical block address for this request";
+ CoherenceRequestType Type, desc="Type of request";
+ DataBlock DataBlk, desc="data for the cache line"; // only for WB
+ bool Dirty, desc="whether WB data is dirty"; // only for WB
+ MachineID Requestor, desc="Node who initiated the request";
+ NetDest Destination, desc="Multicast destination mask";
+ bool Shared, desc="For CPU_WrVicBlk, vic is O not M. For CPU_ClVicBlk, vic is S";
+ MessageSizeType MessageSize, desc="size category of the message";
+ Cycles InitialRequestTime, desc="time the initial request was sent from the L1Cache";
+ Cycles ForwardRequestTime, desc="time the dir forwarded the request";
+ Cycles ProbeRequestStartTime, desc="the time the dir started the probe request";
+ bool DemandRequest, default="false", desc="For profiling purposes";
+
+ NetDest Sharers, desc="Caches that may have a valid copy of the data";
+ bool ForceShared, desc="R-dir knows it is shared, pass on so it sends an S copy, not E";
+ bool Private, default="false", desc="Requestor already has private permissions, no need for dir check";
+ bool CtoDSinked, default="false", desc="True if the previously sent CtoD must have been sunk";
+
+ bool NoAckNeeded, default="false", desc="True if region buffer doesn't need to ack";
+ int Acks, default="0", desc="Acks that the dir (mem ctrl) should expect to receive";
+ CoherenceRequestType OriginalType, default="CoherenceRequestType_NA", desc="Type of request from core fwded through region buffer";
+ WriteMask writeMask, desc="Write Through Data";
+ MachineID WTRequestor, desc="Node who initiated the write through";
+ HSAScope scope, default="HSAScope_SYSTEM", desc="Request Scope";
+ int wfid, default="0", desc="wavefront id";
+ bool NoWriteConflict, default="true", desc="write collided with CAB entry";
+ int ProgramCounter, desc="PC that accesses this block";
+
+ bool functionalRead(Packet *pkt) {
+ // Only PUTX messages contain the data block
+ if (Type == CoherenceRequestType:VicDirty) {
+ return testAndRead(addr, DataBlk, pkt);
+ }
+
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return testAndWrite(addr, DataBlk, pkt);
+ }
+}
+
+structure(NBProbeRequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ ProbeRequestType Type, desc="NB_PrbNxtState signal";
+ bool ReturnData, desc="Indicates CPU should return data";
+ NetDest Destination, desc="Node to whom the data is sent";
+ MessageSizeType MessageSize, desc="size category of the message";
+ bool DemandRequest, default="false", desc="demand request, requesting 3-hop transfer";
+ Addr DemandAddress, desc="Demand block address for a region request";
+ MachineID Requestor, desc="Requestor id for 3-hop requests";
+ bool NoAckNeeded, default="false", desc="For short circuiting acks";
+ int ProgramCounter, desc="PC that accesses this block";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+
+}
+
+structure(TDProbeRequestMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ ProbeRequestType Type, desc="TD_PrbNxtState signal";
+ bool ReturnData, desc="Indicates CPU should return data";
+ bool localCtoD, desc="Indicates CtoD is within the GPU hierarchy (aka TCC subtree)";
+ NetDest Destination, desc="Node to whom the data is sent";
+ MessageSizeType MessageSize, desc="size category of the message";
+ int Phase, desc="Synchronization Phase";
+ int wfid, desc="wavefront id for Release";
+ MachineID Requestor, desc="Node who initiated the request";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+}
+
+// Response Messages seemed to be easily munged into one type
+structure(ResponseMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ CoherenceResponseType Type, desc="NB Sys Resp or CPU Response to Probe";
+ MachineID Sender, desc="Node who sent the data";
+ NetDest Destination, desc="Node to whom the data is sent";
+ // Begin Used Only By CPU Response
+ DataBlock DataBlk, desc="data for the cache line";
+ bool Hit, desc="probe hit valid line";
+ bool Shared, desc="True if S, or if NB Probe ReturnData==1 && O";
+ bool Dirty, desc="Is the data dirty (different than memory)?";
+ bool Ntsl, desc="indicates probed line will be invalid after probe";
+ bool UntransferredOwner, desc="pending confirmation of ownership change";
+ // End Used Only By CPU Response
+
+ // Begin NB Response Only
+ CoherenceState State, default=CoherenceState_NA, desc="What state the returned data from NB should be in";
+ bool CtoD, desc="was the originator a CtoD?";
+ // End NB Response Only
+
+ // Normally if a block gets hit by a probe while waiting to be written back,
+ // you flip the NbReqShared signal (part of the CPURequest signal group).
+ // But since this is in packets and I don't want to send a separate packet,
+ // let's just send this signal back with the data instead
+ bool NbReqShared, desc="modification of Shared field from initial request, e.g. hit by shared probe";
+
+ MessageSizeType MessageSize, desc="size category of the message";
+ Cycles InitialRequestTime, desc="time the initial request was sent from the L1Cache";
+ Cycles ForwardRequestTime, desc="time the dir forwarded the request";
+ Cycles ProbeRequestStartTime, desc="the time the dir started the probe request";
+ bool DemandRequest, default="false", desc="For profiling purposes";
+
+ bool L3Hit, default="false", desc="Did memory or L3 supply the data?";
+ MachineID OriginalResponder, desc="Mach which wrote the data to the L3";
+ MachineID WTRequestor, desc="Node who started the writethrough";
+
+ bool NotCached, default="false", desc="True when the Region buffer has already evicted the line";
+
+ bool NoAckNeeded, default="false", desc="For short circuiting acks";
+ bool isValid, default="false", desc="Is acked block valid";
+ int wfid, default="0", desc="wavefront id";
+ int Phase, desc="Synchronization Phase";
+
+ int ProgramCounter, desc="PC that issues this request";
+ bool mispred, desc="tell TCP if the block should not be bypassed";
+
+
+ bool functionalRead(Packet *pkt) {
+ // Only PUTX messages contain the data block
+ if (Type == CoherenceResponseType:CPUData ||
+ Type == CoherenceResponseType:MemData) {
+ return testAndRead(addr, DataBlk, pkt);
+ }
+
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return testAndWrite(addr, DataBlk, pkt);
+ }
+}
+
+structure(UnblockMsg, desc="...", interface="Message") {
+ Addr addr, desc="Physical address for this request";
+ NetDest Destination, desc="Destination (always directory)";
+ MessageSizeType MessageSize, desc="size category of the message";
+ MachineID Sender, desc="Node who sent the data";
+ bool currentOwner, default="false", desc="Is the sender the current owner";
+ bool DoneAck, default="false", desc="Is this a done ack?";
+ bool Dirty, default="false", desc="Was block dirty when evicted";
+ bool wasValid, default="false", desc="Was block valid when evicted";
+ bool valid, default="false", desc="Is block valid";
+ bool validToInvalid, default="false", desc="Did the block go from valid to invalid on eviction";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+}
+
+enumeration(TriggerType, desc="Trigger Type") {
+ L2_to_L1, desc="L2 to L1 fill";
+ AcksComplete, desc="NB received all needed Acks";
+
+ // For regions
+ InvNext, desc="Invalidate the next block";
+ PrivateAck, desc="Loopback ack for machines with no Region Buffer";
+ AllOutstanding, desc="All outstanding requests have finished";
+ L3Hit, desc="L3 hit in dir";
+
+ // For region directory once the directory is blocked
+ InvRegion, desc="Invalidate region";
+ DowngradeRegion, desc="downgrade region";
+ //For writethrough
+ UnblockWriteThrough, desc="unblock";
+ WriteData, desc="Write to full cacheblock data";
+ WriteDone, desc="Sequencer says that write is done";
+ AtomicDone, desc="Atomic is done";
+}
+
+enumeration(CacheId, desc="Which Cache in the Core") {
+ L1I, desc="L1 I-cache";
+ L1D0, desc="L1 D-cache cluster 0";
+ L1D1, desc="L1 D-cache cluster 1";
+ NA, desc="Default";
+}
+
+structure(TriggerMsg, desc="...", interface="Message") {
+ Addr addr, desc="Address";
+ TriggerType Type, desc="Type of trigger";
+ CacheId Dest, default="CacheId_NA", desc="Cache to invalidate";
+ int ProgramCounter, desc="PC that accesses this block";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+
+}
+
+enumeration(FifoType, desc="Fifo Type") {
+ WriteDummy, desc="Dummy Write for atomic operation";
+ WriteThrough, desc="simple writethrough request";
+ WriteFlush, desc="synchronization message";
+}
+
+structure(FifoMsg, desc="...", interface="Message") {
+ Addr addr, desc="Address";
+ FifoType Type, desc="WriteThrough/WriteFlush";
+ int wfid, default="0",desc="wavefront id";
+ MachineID Requestor, desc="Flush Requestor";
+ MachineID oRequestor, desc="original Flush Requestor";
+
+ bool functionalRead(Packet *pkt) {
+ return false;
+ }
+
+ bool functionalWrite(Packet *pkt) {
+ // No check on message type required since the protocol should
+ // read data from those messages that contain the block
+ return false;
+ }
+
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm
new file mode 100644
index 000000000..f545c2fa7
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base-probeFilter.sm
@@ -0,0 +1,1408 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu, Sooraj Puthoor
+ */
+
+/*
+ * This file is based on MOESI_AMD_Base.sm
+ * Differences with AMD base protocol
+ * -- Uses a probe filter memory to track sharers.
+ * -- The probe filter can be inclusive or non-inclusive
+ * -- Only two sharers tracked. Sharers are a) GPU and/or b) CPU
+ * -- If sharer information available, the sharer is probed
+ * -- If sharer information not available, probes are broadcast
+ */
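+
+/*
+ * A rough sketch of how probe targets are chosen from the filter state, in
+ * terms of the helpers and actions defined below (isCPUSharer, isGPUSharer,
+ * dc_probeInvCoreData, sc_probeShrCoreData):
+ *
+ *   if (isCPUSharer(address))  -> broadcast the probe to MachineType:CorePair
+ *   if (isGPUSharer(address))  -> add the TCC directory (or the TCC itself
+ *                                 when noTCCdir is set) to the destination
+ *
+ * An entry in state NT (not tracked) reports both isCPUSharer and isGPUSharer
+ * as true, so an untracked block is effectively broadcast-probed.
+ */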
+
+machine(MachineType:Directory, "AMD Baseline protocol")
+: DirectoryMemory * directory;
+ CacheMemory * L3CacheMemory;
+ CacheMemory * ProbeFilterMemory;
+ Cycles response_latency := 5;
+ Cycles l3_hit_latency := 50;
+ bool noTCCdir := "False";
+ bool CAB_TCC := "False";
+ int TCC_select_num_bits:=1;
+ bool useL3OnWT := "False";
+ bool inclusiveDir := "True";
+ Cycles to_memory_controller_latency := 1;
+
+ // From the Cores
+ MessageBuffer * requestFromCores, network="From", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseFromCores, network="From", virtual_network="2", ordered="false", vnet_type="response";
+ MessageBuffer * unblockFromCores, network="From", virtual_network="4", ordered="false", vnet_type="unblock";
+
+ MessageBuffer * probeToCore, network="To", virtual_network="0", ordered="false", vnet_type="request";
+ MessageBuffer * responseToCore, network="To", virtual_network="2", ordered="false", vnet_type="response";
+
+ MessageBuffer * triggerQueue, ordered="true";
+ MessageBuffer * L3triggerQueue, ordered="true";
+ MessageBuffer * responseFromMemory;
+{
+ // STATES
+ state_declaration(State, desc="Directory states", default="Directory_State_U") {
+ U, AccessPermission:Backing_Store, desc="unblocked";
+ BL, AccessPermission:Busy, desc="got L3 WB request";
+ // BL is Busy because it is busy waiting for the data
+ // which is possibly in the network. The cache which evicted the data
+ // might have moved to some other state after doing the eviction
+ // BS==> Received a read request; has not requested ownership
+ // B==> Received a read request; has requested ownership
+ // BM==> Received a modification request
+ B_P, AccessPermission:Backing_Store, desc="Back invalidation, waiting for probes";
+ BS_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BM_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ B_M, AccessPermission:Backing_Store, desc="blocked waiting for memory";
+ BP, AccessPermission:Backing_Store, desc="blocked waiting for probes, no need for memory";
+ BS_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BM_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ B_PM, AccessPermission:Backing_Store, desc="blocked waiting for probes and Memory";
+ BS_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ BM_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B_Pm, AccessPermission:Backing_Store, desc="blocked waiting for probes, already got memory";
+ B, AccessPermission:Backing_Store, desc="sent response, blocked until ack";
+ }
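+
+ // One example flow for a shared read, traced through the transitions below:
+ //   U --RdBlkS--> BS_PM             (memory read queued, shared probe sent)
+ //   BS_PM --MemData--> BS_Pm        (or --ProbeAcksComplete--> BS_M)
+ //   BS_Pm --ProbeAcksComplete--> B  (or BS_M --MemData--> B)
+ //   B --CoreUnblock--> U
+ // L3 hits take the analogous L3Hit transitions instead of MemData.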
+
+ // Events
+ enumeration(Event, desc="Directory events") {
+ // CPU requests
+ RdBlkS, desc="...";
+ RdBlkM, desc="...";
+ RdBlk, desc="...";
+ CtoD, desc="...";
+ WriteThrough, desc="WriteThrough Message";
+ Atomic, desc="Atomic Message";
+
+ // writebacks
+ VicDirty, desc="...";
+ VicClean, desc="...";
+ CPUData, desc="WB data from CPU";
+ StaleWB, desc="Notification that WB has been superseded by a probe";
+
+ // probe responses
+ CPUPrbResp, desc="Probe Response Msg";
+
+ ProbeAcksComplete, desc="Probe Acks Complete";
+
+ L3Hit, desc="Hit in L3 return data to core";
+
+ // Replacement
+ PF_Repl, desc="Replace address from probe filter";
+
+ // Memory Controller
+ MemData, desc="Fetched data from memory arrives";
+ WBAck, desc="Writeback Ack from memory arrives";
+
+ CoreUnblock, desc="Core received data, unblock";
+ UnblockWriteThrough, desc="Unblock because of writethrough request finishing";
+
+ StaleVicDirty, desc="Core invalidated before VicDirty processed";
+ }
+
+ enumeration(RequestType, desc="To communicate stats from transitions to recordStats") {
+ L3DataArrayRead, desc="Read the data array";
+ L3DataArrayWrite, desc="Write the data array";
+ L3TagArrayRead, desc="Read the tag array";
+ L3TagArrayWrite, desc="Write the tag array";
+
+ PFTagArrayRead, desc="Read the probe filter tag array";
+ PFTagArrayWrite, desc="Write the probe filter tag array";
+ }
+
+ // TYPES
+
+ enumeration(ProbeFilterState, desc="") {
+ T, desc="Tracked";
+ NT, desc="Not tracked";
+ B, desc="Blocked, This entry is being replaced";
+ }
+
+ // DirectoryEntry
+ structure(Entry, desc="...", interface="AbstractEntry") {
+ State DirectoryState, desc="Directory state";
+ DataBlock DataBlk, desc="data for the block";
+ NetDest VicDirtyIgnore, desc="VicDirty coming from whom to ignore";
+ }
+
+ structure(CacheEntry, desc="...", interface="AbstractCacheEntry") {
+ DataBlock DataBlk, desc="data for the block";
+ MachineID LastSender, desc="Mach which this block came from";
+ ProbeFilterState pfState, desc="ProbeFilter state",default="Directory_ProbeFilterState_NT";
+ bool isOnCPU, desc="Block valid in the CPU complex",default="false";
+ bool isOnGPU, desc="Block valid in the GPU complex",default="false";
+ }
+
+ structure(TBE, desc="...") {
+ State TBEState, desc="Transient state";
+ DataBlock DataBlk, desc="data for the block";
+ bool Dirty, desc="Is the data dirty?";
+ int NumPendingAcks, desc="num acks expected";
+ MachineID OriginalRequestor, desc="Original Requestor";
+ MachineID WTRequestor, desc="WT Requestor";
+ bool Cached, desc="data hit in Cache";
+ bool MemData, desc="Got MemData?",default="false";
+ bool wtData, desc="Got write through data?",default="false";
+ bool atomicData, desc="Got Atomic op?",default="false";
+ Cycles InitialRequestTime, desc="...";
+ Cycles ForwardRequestTime, desc="...";
+ Cycles ProbeRequestStartTime, desc="...";
+ MachineID LastSender, desc="Mach which this block came from";
+ bool L3Hit, default="false", desc="Was this an L3 hit?";
+ uint64_t probe_id, desc="probe id for lifetime profiling";
+ WriteMask writeMask, desc="outstanding write through mask";
+ Addr demandAddress, desc="Address of demand request which caused probe filter eviction";
+ }
+
+ structure(TBETable, external="yes") {
+ TBE lookup(Addr);
+ void allocate(Addr);
+ void deallocate(Addr);
+ bool isPresent(Addr);
+ }
+
+ TBETable TBEs, template="<Directory_TBE>", constructor="m_number_of_TBEs";
+
+ int TCC_select_low_bit, default="RubySystem::getBlockSizeBits()";
+
+ Tick clockEdge();
+ Tick cyclesToTicks(Cycles c);
+
+ void set_tbe(TBE a);
+ void unset_tbe();
+ void wakeUpAllBuffers();
+ void wakeUpBuffers(Addr a);
+ Cycles curCycle();
+
+ Entry getDirectoryEntry(Addr addr), return_by_pointer="yes" {
+ Entry dir_entry := static_cast(Entry, "pointer", directory.lookup(addr));
+
+ if (is_valid(dir_entry)) {
+ //DPRINTF(RubySlicc, "Getting entry %s: %s\n", addr, dir_entry.DataBlk);
+ return dir_entry;
+ }
+
+ dir_entry := static_cast(Entry, "pointer",
+ directory.allocate(addr, new Entry));
+ return dir_entry;
+ }
+
+ DataBlock getDataBlock(Addr addr), return_by_ref="yes" {
+ TBE tbe := TBEs.lookup(addr);
+ if (is_valid(tbe) && tbe.MemData) {
+ DPRINTF(RubySlicc, "Returning DataBlk from TBE %s:%s\n", addr, tbe);
+ return tbe.DataBlk;
+ }
+ DPRINTF(RubySlicc, "Returning DataBlk from Dir %s:%s\n", addr, getDirectoryEntry(addr));
+ return getDirectoryEntry(addr).DataBlk;
+ }
+
+ State getState(TBE tbe, CacheEntry entry, Addr addr) {
+ CacheEntry probeFilterEntry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(addr));
+ if (inclusiveDir) {
+ if (is_valid(probeFilterEntry) && probeFilterEntry.pfState == ProbeFilterState:B) {
+ return State:B_P;
+ }
+ }
+ return getDirectoryEntry(addr).DirectoryState;
+ }
+
+ void setState(TBE tbe, CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).DirectoryState := state;
+ }
+
+ void functionalRead(Addr addr, Packet *pkt) {
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ testAndRead(addr, tbe.DataBlk, pkt);
+ } else {
+ functionalMemoryRead(pkt);
+ }
+ }
+
+ int functionalWrite(Addr addr, Packet *pkt) {
+ int num_functional_writes := 0;
+
+ TBE tbe := TBEs.lookup(addr);
+ if(is_valid(tbe)) {
+ num_functional_writes := num_functional_writes +
+ testAndWrite(addr, tbe.DataBlk, pkt);
+ }
+
+ num_functional_writes := num_functional_writes +
+ functionalMemoryWrite(pkt);
+ return num_functional_writes;
+ }
+
+ AccessPermission getAccessPermission(Addr addr) {
+ // For this Directory, all permissions are just tracked in Directory, since
+ // it's not possible to have something in TBE but not Dir, just keep track
+ // of state all in one place.
+ if (directory.isPresent(addr)) {
+ return Directory_State_to_permission(getDirectoryEntry(addr).DirectoryState);
+ }
+
+ return AccessPermission:NotPresent;
+ }
+
+ void setAccessPermission(CacheEntry entry, Addr addr, State state) {
+ getDirectoryEntry(addr).changePermission(Directory_State_to_permission(state));
+ }
+
+ void recordRequestType(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayRead, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:DataArrayWrite, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ L3CacheMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ } else if (request_type == RequestType:PFTagArrayRead) {
+ ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayRead, addr);
+ } else if (request_type == RequestType:PFTagArrayWrite) {
+ ProbeFilterMemory.recordRequestType(CacheRequestType:TagArrayWrite, addr);
+ }
+ }
+
+ bool checkResourceAvailable(RequestType request_type, Addr addr) {
+ if (request_type == RequestType:L3DataArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3DataArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:DataArray, addr);
+ } else if (request_type == RequestType:L3TagArrayRead) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:L3TagArrayWrite) {
+ return L3CacheMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:PFTagArrayRead) {
+ return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else if (request_type == RequestType:PFTagArrayWrite) {
+ return ProbeFilterMemory.checkResourceAvailable(CacheResourceType:TagArray, addr);
+ } else {
+ error("Invalid RequestType type in checkResourceAvailable");
+ return true;
+ }
+ }
+
+ bool isNotPresentProbeFilter(Addr address) {
+ if (ProbeFilterMemory.isTagPresent(address) ||
+ ProbeFilterMemory.cacheAvail(address)) {
+ return false;
+ }
+ return true;
+ }
+
+ bool isGPUSharer(Addr address) {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ if (entry.pfState == ProbeFilterState:NT) {
+ return true;
+ } else if (entry.isOnGPU){
+ return true;
+ }
+ return false;
+ }
+
+ bool isCPUSharer(Addr address) {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ if (entry.pfState == ProbeFilterState:NT) {
+ return true;
+ } else if (entry.isOnCPU){
+ return true;
+ }
+ return false;
+ }
+
+
+ // ** OUT_PORTS **
+ out_port(probeNetwork_out, NBProbeRequestMsg, probeToCore);
+ out_port(responseNetwork_out, ResponseMsg, responseToCore);
+
+ out_port(triggerQueue_out, TriggerMsg, triggerQueue);
+ out_port(L3TriggerQueue_out, TriggerMsg, L3triggerQueue);
+
+ // ** IN_PORTS **
+
+ // Trigger Queue
+ in_port(triggerQueue_in, TriggerMsg, triggerQueue, rank=5) {
+ if (triggerQueue_in.isReady(clockEdge())) {
+ peek(triggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:AcksComplete) {
+ trigger(Event:ProbeAcksComplete, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == TriggerType:UnblockWriteThrough) {
+ trigger(Event:UnblockWriteThrough, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ in_port(L3TriggerQueue_in, TriggerMsg, L3triggerQueue, rank=4) {
+ if (L3TriggerQueue_in.isReady(clockEdge())) {
+ peek(L3TriggerQueue_in, TriggerMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == TriggerType:L3Hit) {
+ trigger(Event:L3Hit, in_msg.addr, entry, tbe);
+ } else {
+ error("Unknown trigger msg");
+ }
+ }
+ }
+ }
+
+ // Unblock Network
+ in_port(unblockNetwork_in, UnblockMsg, unblockFromCores, rank=3) {
+ if (unblockNetwork_in.isReady(clockEdge())) {
+ peek(unblockNetwork_in, UnblockMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ trigger(Event:CoreUnblock, in_msg.addr, entry, tbe);
+ }
+ }
+ }
+
+ // Core response network
+ in_port(responseNetwork_in, ResponseMsg, responseFromCores, rank=2) {
+ if (responseNetwork_in.isReady(clockEdge())) {
+ peek(responseNetwork_in, ResponseMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == CoherenceResponseType:CPUPrbResp) {
+ trigger(Event:CPUPrbResp, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:CPUData) {
+ trigger(Event:CPUData, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceResponseType:StaleNotif) {
+ trigger(Event:StaleWB, in_msg.addr, entry, tbe);
+ } else {
+ error("Unexpected response type");
+ }
+ }
+ }
+ }
+
+ // off-chip memory request/response is done
+ in_port(memQueue_in, MemoryMsg, responseFromMemory, rank=1) {
+ if (memQueue_in.isReady(clockEdge())) {
+ peek(memQueue_in, MemoryMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (in_msg.Type == MemoryRequestType:MEMORY_READ) {
+ trigger(Event:MemData, in_msg.addr, entry, tbe);
+ DPRINTF(RubySlicc, "%s\n", in_msg);
+ } else if (in_msg.Type == MemoryRequestType:MEMORY_WB) {
+ trigger(Event:WBAck, in_msg.addr, entry, tbe); // ignore WBAcks, don't care about them.
+ } else {
+ DPRINTF(RubySlicc, "%s\n", in_msg.Type);
+ error("Invalid message");
+ }
+ }
+ }
+ }
+
+ in_port(requestNetwork_in, CPURequestMsg, requestFromCores, rank=0) {
+ if (requestNetwork_in.isReady(clockEdge())) {
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBE tbe := TBEs.lookup(in_msg.addr);
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(in_msg.addr));
+ if (inclusiveDir && isNotPresentProbeFilter(in_msg.addr)) {
+ Addr victim := ProbeFilterMemory.cacheProbe(in_msg.addr);
+ tbe := TBEs.lookup(victim);
+ entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(victim));
+ trigger(Event:PF_Repl, victim, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlk) {
+ trigger(Event:RdBlk, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkS) {
+ trigger(Event:RdBlkS, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ trigger(Event:RdBlkM, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ trigger(Event:WriteThrough, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ trigger(Event:Atomic, in_msg.addr, entry, tbe);
+ } else if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicDirty for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicDirty from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicDirty, in_msg.addr, entry, tbe);
+ }
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ if (getDirectoryEntry(in_msg.addr).VicDirtyIgnore.isElement(in_msg.Requestor)) {
+ DPRINTF(RubySlicc, "Dropping VicClean for address %s\n", in_msg.addr);
+ trigger(Event:StaleVicDirty, in_msg.addr, entry, tbe);
+ } else {
+ DPRINTF(RubySlicc, "Got VicClean from %s on %s\n", in_msg.Requestor, in_msg.addr);
+ trigger(Event:VicClean, in_msg.addr, entry, tbe);
+ }
+ } else {
+ error("Bad request message type");
+ }
+ }
+ }
+ }
+
+ // Actions
+ action(s_sendResponseS, "s", desc="send Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Shared;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(es_sendResponseES, "es", desc="send Exclusive or Shared response") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ if (tbe.Cached) {
+ out_msg.State := CoherenceState:Shared;
+ } else {
+ out_msg.State := CoherenceState:Exclusive;
+ }
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ // Write-throughs and atomics do not send an unblock ack back to the
+ // directory. Hence, the directory has to generate a self-unblocking
+ // message. Additionally, a write-through does not require data
+ // in its response. Hence, write-through is treated separately from
+ // write-back and atomics.
+ action(m_sendResponseM, "m", desc="send Modified response") {
+ if (tbe.wtData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ } else {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ if (tbe.L3Hit) {
+ out_msg.Sender := createMachineID(MachineType:L3Cache, intToID(0));
+ } else {
+ out_msg.Sender := machineID;
+ }
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.DataBlk := tbe.DataBlk;
+ out_msg.MessageSize := MessageSizeType:Response_Data;
+ out_msg.Dirty := tbe.Dirty;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := false;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := tbe.ForwardRequestTime;
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ out_msg.OriginalResponder := tbe.LastSender;
+ if(tbe.atomicData){
+ out_msg.WTRequestor := tbe.WTRequestor;
+ }
+ out_msg.L3Hit := tbe.L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ if (tbe.atomicData) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:UnblockWriteThrough;
+ }
+ }
+ }
+ }
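+
+ // Minimal trace of the self-unblock path described above, for a
+ // write-through whose probe acks have already been collected:
+ //   m_sendResponseM          -> enqueues TriggerType:UnblockWriteThrough
+ //   (B, UnblockWriteThrough) -> wa_wakeUpDependents; pt_popTriggerQueue -> U
+ // i.e. the directory unblocks itself rather than waiting for a CoreUnblock.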
+
+ action(c_sendResponseCtoD, "c", desc="send CtoD Ack") {
+ enqueue(responseNetwork_out, ResponseMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysResp;
+ out_msg.Sender := machineID;
+ out_msg.Destination.add(tbe.OriginalRequestor);
+ out_msg.MessageSize := MessageSizeType:Response_Control;
+ out_msg.Dirty := false;
+ out_msg.State := CoherenceState:Modified;
+ out_msg.CtoD := true;
+ out_msg.InitialRequestTime := tbe.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := tbe.ProbeRequestStartTime;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ }
+
+ action(w_sendResponseWBAck, "w", desc="send WB Ack") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(responseNetwork_out, ResponseMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := CoherenceResponseType:NBSysWBAck;
+ out_msg.Destination.add(in_msg.Requestor);
+ out_msg.WTRequestor := in_msg.WTRequestor;
+ out_msg.Sender := machineID;
+ out_msg.MessageSize := MessageSizeType:Writeback_Control;
+ out_msg.InitialRequestTime := in_msg.InitialRequestTime;
+ out_msg.ForwardRequestTime := curCycle();
+ out_msg.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(l_queueMemWBReq, "lq", desc="Write WB data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ queueMemoryWrite(machineID, address, to_memory_controller_latency,
+ in_msg.DataBlk);
+ }
+ }
+
+ action(l_queueMemRdReq, "lr", desc="Read data from memory") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ enqueue(L3TriggerQueue_out, TriggerMsg, l3_hit_latency) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:L3Hit;
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ }
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ tbe.DataBlk := entry.DataBlk;
+ tbe.LastSender := entry.LastSender;
+ tbe.L3Hit := true;
+ tbe.MemData := true;
+ L3CacheMemory.deallocate(address);
+ } else {
+ queueMemoryRead(machineID, address, to_memory_controller_latency);
+ }
+ }
+ }
+
+ action(dc_probeInvCoreData, "dc", desc="probe inv cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ if(isCPUSharer(address)) {
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ }
+
+ // add relevant TCC node to list. This replaces all TCPs and SQCs
+ if(isGPUSharer(address)) {
+ if ((in_msg.Type == CoherenceRequestType:WriteThrough ||
+ in_msg.Type == CoherenceRequestType:Atomic) &&
+ in_msg.NoWriteConflict) {
+ // Don't Include TCCs unless there was write-CAB conflict in the TCC
+ } else if(noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ out_msg.Destination.add(map_Address_to_TCCdir(address));
+ }
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ APPEND_TRANSITION_COMMENT(" dc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(bp_backProbe, "bp", desc="back probe") {
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ if(isCPUSharer(address)) {
+ // won't be realistic for multisocket
+ out_msg.Destination.broadcast(MachineType:CorePair);
+ }
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if(isGPUSharer(address)) {
+ if (noTCCdir) {
+ //Don't need to notify TCC about reads
+ } else {
+ out_msg.Destination.add(map_Address_to_TCCdir(address));
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ }
+ if (noTCCdir && CAB_TCC) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ }
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ APPEND_TRANSITION_COMMENT(" - back probe");
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+
+ action(sc_probeShrCoreData, "sc", desc="probe shared cores, return data") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbDowngrade;
+ out_msg.ReturnData := true;
+ out_msg.MessageSize := MessageSizeType:Control;
+ if(isCPUSharer(address)) {
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ }
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if(isGPUSharer(address)) {
+ if (noTCCdir) {
+ //Don't need to notify TCC about reads
+ } else {
+ out_msg.Destination.add(map_Address_to_TCCdir(address));
+ tbe.NumPendingAcks := tbe.NumPendingAcks + 1;
+ }
+ if (noTCCdir && CAB_TCC) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ }
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ DPRINTF(RubySlicc, "%s\n", (out_msg));
+ APPEND_TRANSITION_COMMENT(" sc: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(ic_probeInvCore, "ic", desc="probe invalidate core, no return data needed") {
+ peek(requestNetwork_in, CPURequestMsg) { // not the right network?
+ enqueue(probeNetwork_out, NBProbeRequestMsg, response_latency) {
+ out_msg.addr := address;
+ out_msg.Type := ProbeRequestType:PrbInv;
+ out_msg.ReturnData := false;
+ out_msg.MessageSize := MessageSizeType:Control;
+ if(isCPUSharer(address)) {
+ out_msg.Destination.broadcast(MachineType:CorePair); // won't be realistic for multisocket
+ }
+
+ // add relevant TCC node to the list. This replaces all TCPs and SQCs
+ if(isGPUSharer(address)) {
+ if (noTCCdir) {
+ out_msg.Destination.add(mapAddressToRange(address,MachineType:TCC,
+ TCC_select_low_bit, TCC_select_num_bits));
+ } else {
+ out_msg.Destination.add(map_Address_to_TCCdir(address));
+ }
+ }
+ out_msg.Destination.remove(in_msg.Requestor);
+ tbe.NumPendingAcks := out_msg.Destination.count();
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" ic: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ DPRINTF(RubySlicc, "%s\n", out_msg);
+ tbe.ProbeRequestStartTime := curCycle();
+ }
+ }
+ }
+
+ action(sm_setMRU, "sm", desc="set probe filter entry as MRU") {
+ ProbeFilterMemory.setMRU(address);
+ }
+
+ action(d_writeDataToMemory, "d", desc="Write data to memory") {
+ peek(responseNetwork_in, ResponseMsg) {
+ getDirectoryEntry(address).DataBlk := in_msg.DataBlk;
+ DPRINTF(RubySlicc, "Writing Data: %s to address %s\n", in_msg.DataBlk,
+ in_msg.addr);
+ }
+ }
+
+ action(te_allocateTBEForEviction, "te", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ tbe.writeMask.clear();
+ tbe.wtData := false;
+ tbe.atomicData := false;
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ tbe.NumPendingAcks := 0;
+ }
+
+ action(t_allocateTBE, "t", desc="allocate TBE Entry") {
+ check_allocate(TBEs);
+ peek(requestNetwork_in, CPURequestMsg) {
+ TBEs.allocate(address);
+ set_tbe(TBEs.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.wtData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ if (in_msg.Type == CoherenceRequestType:Atomic) {
+ tbe.writeMask.clear();
+ tbe.writeMask.orMask(in_msg.writeMask);
+ tbe.atomicData := true;
+ tbe.WTRequestor := in_msg.WTRequestor;
+ tbe.LastSender := in_msg.Requestor;
+ }
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk; // Data only for WBs
+ tbe.Dirty := false;
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ tbe.DataBlk.copyPartial(in_msg.DataBlk,tbe.writeMask);
+ tbe.Dirty := false;
+ }
+ tbe.OriginalRequestor := in_msg.Requestor;
+ tbe.NumPendingAcks := 0;
+ tbe.Cached := in_msg.ForceShared;
+ tbe.InitialRequestTime := in_msg.InitialRequestTime;
+ }
+ }
+
+ action(dt_deallocateTBE, "dt", desc="deallocate TBE Entry") {
+ if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ TBEs.deallocate(address);
+ unset_tbe();
+ }
+
+ action(wd_writeBackData, "wd", desc="Write back data if needed") {
+ if (tbe.wtData) {
+ DataBlock tmp := getDirectoryEntry(address).DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.atomicData) {
+ tbe.DataBlk.atomicPartial(getDirectoryEntry(address).DataBlk,
+ tbe.writeMask);
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ } else if (tbe.Dirty == false) {
+ getDirectoryEntry(address).DataBlk := tbe.DataBlk;
+ }
+ }
+
+ action(mt_writeMemDataToTBE, "mt", desc="write Mem data to TBE") {
+ peek(memQueue_in, MemoryMsg) {
+ if (tbe.wtData == true) {
+ // DO Nothing (already have the directory data)
+ } else if (tbe.Dirty == false) {
+ tbe.DataBlk := getDirectoryEntry(address).DataBlk;
+ }
+ tbe.MemData := true;
+ }
+ }
+
+ action(y_writeProbeDataToTBE, "y", desc="write Probe Data to TBE") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (in_msg.Dirty) {
+ DPRINTF(RubySlicc, "Got dirty data for %s from %s\n", address, in_msg.Sender);
+ DPRINTF(RubySlicc, "Data is %s\n", in_msg.DataBlk);
+ if (tbe.wtData) {
+ DataBlock tmp := in_msg.DataBlk;
+ tmp.copyPartial(tbe.DataBlk,tbe.writeMask);
+ tbe.DataBlk := tmp;
+ } else if (tbe.Dirty) {
+ if(tbe.atomicData == false && tbe.wtData == false) {
+ DPRINTF(RubySlicc, "Got double data for %s from %s\n", address, in_msg.Sender);
+ assert(tbe.DataBlk == in_msg.DataBlk); // in case of double data
+ }
+ } else {
+ tbe.DataBlk := in_msg.DataBlk;
+ tbe.Dirty := in_msg.Dirty;
+ tbe.LastSender := in_msg.Sender;
+ }
+ }
+ if (in_msg.Hit) {
+ tbe.Cached := true;
+ }
+ }
+ }
+
+ action(mwc_markSinkWriteCancel, "mwc", desc="Mark to sink impending VicDirty") {
+ peek(responseNetwork_in, ResponseMsg) {
+ DPRINTF(RubySlicc, "Write cancel bit set on address %s\n", address);
+ getDirectoryEntry(address).VicDirtyIgnore.add(in_msg.Sender);
+ APPEND_TRANSITION_COMMENT(" setting bit to sink VicDirty ");
+ }
+ }
+
+ action(x_decrementAcks, "x", desc="decrement Acks pending") {
+ tbe.NumPendingAcks := tbe.NumPendingAcks - 1;
+ APPEND_TRANSITION_COMMENT(" Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
+
+ action(o_checkForCompletion, "o", desc="check for ack completion") {
+ if (tbe.NumPendingAcks == 0) {
+ enqueue(triggerQueue_out, TriggerMsg, 1) {
+ out_msg.addr := address;
+ out_msg.Type := TriggerType:AcksComplete;
+ }
+ }
+ APPEND_TRANSITION_COMMENT(" Check: Acks remaining: ");
+ APPEND_TRANSITION_COMMENT(tbe.NumPendingAcks);
+ }
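+
+ // Worked example of the ack counting (a sketch; the count of 2 is only
+ // illustrative): a probe action sets tbe.NumPendingAcks to
+ // out_msg.Destination.count(), e.g. 2 for a CorePair plus a TCC directory.
+ // Each CPUPrbResp then runs x_decrementAcks (2 -> 1 -> 0) followed by
+ // o_checkForCompletion, which enqueues TriggerType:AcksComplete once the
+ // count reaches zero, firing the ProbeAcksComplete transitions below.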
+
+ action(rv_removeVicDirtyIgnore, "rv", desc="Remove ignored core") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ getDirectoryEntry(address).VicDirtyIgnore.remove(in_msg.Requestor);
+ }
+ }
+
+ action(al_allocateL3Block, "al", desc="allocate the L3 block on WB") {
+ peek(responseNetwork_in, ResponseMsg) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := in_msg.DataBlk;
+ entry.LastSender := in_msg.Sender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := in_msg.DataBlk;
+
+ entry.LastSender := in_msg.Sender;
+ }
+ }
+ }
+
+ action(alwt_allocateL3BlockOnWT, "alwt", desc="allocate the L3 block on WT") {
+ if ((tbe.wtData || tbe.atomicData) && useL3OnWT) {
+ if (L3CacheMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.lookup(address));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 (hit) ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ } else {
+ if (L3CacheMemory.cacheAvail(address) == false) {
+ Addr victim := L3CacheMemory.cacheProbe(address);
+ CacheEntry victim_entry := static_cast(CacheEntry, "pointer",
+ L3CacheMemory.lookup(victim));
+ queueMemoryWrite(machineID, victim, to_memory_controller_latency,
+ victim_entry.DataBlk);
+ L3CacheMemory.deallocate(victim);
+ }
+ assert(L3CacheMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", L3CacheMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" al wrote data to L3 ");
+ entry.DataBlk := tbe.DataBlk;
+ entry.LastSender := tbe.LastSender;
+ }
+ }
+ }
+
+ action(apf_allocateProbeFilterEntry, "apf", desc="Allocate probe filter entry") {
+ if (!ProbeFilterMemory.isTagPresent(address)) {
+ if (inclusiveDir) {
+ assert(ProbeFilterMemory.cacheAvail(address));
+ } else if (ProbeFilterMemory.cacheAvail(address) == false) {
+ Addr victim := ProbeFilterMemory.cacheProbe(address);
+ ProbeFilterMemory.deallocate(victim);
+ }
+ assert(ProbeFilterMemory.cacheAvail(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.allocate(address, new CacheEntry));
+ APPEND_TRANSITION_COMMENT(" allocating a new probe filter entry");
+ entry.pfState := ProbeFilterState:NT;
+ if (inclusiveDir) {
+ entry.pfState := ProbeFilterState:T;
+ }
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ }
+ }
+
+ action(mpfe_markPFEntryForEviction, "mpfe", desc="Mark this PF entry is being evicted") {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ entry.pfState := ProbeFilterState:B;
+ peek(requestNetwork_in, CPURequestMsg) {
+ tbe.demandAddress := in_msg.addr;
+ }
+ }
+
+ action(we_wakeUpEvictionDependents, "we", desc="Wake up requests waiting for demand address and victim address") {
+ wakeUpBuffers(address);
+ wakeUpBuffers(tbe.demandAddress);
+ }
+
+ action(dpf_deallocateProbeFilter, "dpf", desc="deallocate PF entry") {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ ProbeFilterMemory.deallocate(address);
+ }
+
+ action(upf_updateProbeFilter, "upf", desc="") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ assert(ProbeFilterMemory.isTagPresent(address));
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ if (in_msg.Type == CoherenceRequestType:WriteThrough) {
+ entry.pfState := ProbeFilterState:T;
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ } else if (in_msg.Type == CoherenceRequestType:Atomic) {
+ entry.pfState := ProbeFilterState:T;
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ } else if (in_msg.Type == CoherenceRequestType:RdBlkM) {
+ entry.pfState := ProbeFilterState:T;
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ } else if (in_msg.Type == CoherenceRequestType:CtoD) {
+ entry.pfState := ProbeFilterState:T;
+ entry.isOnCPU := false;
+ entry.isOnGPU := false;
+ }
+ if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {
+ entry.isOnCPU := true;
+ } else {
+ entry.isOnGPU := true;
+ }
+ }
+ }
+
+ action(rmcd_removeSharerConditional, "rmcd", desc="remove sharer from probe Filter, conditional") {
+ peek(requestNetwork_in, CPURequestMsg) {
+ if (ProbeFilterMemory.isTagPresent(address)) {
+ CacheEntry entry := static_cast(CacheEntry, "pointer", ProbeFilterMemory.lookup(address));
+ if(machineIDToMachineType(in_msg.Requestor) == MachineType:CorePair) {//CorePair has inclusive L2
+ if (in_msg.Type == CoherenceRequestType:VicDirty) {
+ entry.isOnCPU := false;
+ } else if (in_msg.Type == CoherenceRequestType:VicClean) {
+ entry.isOnCPU := false;
+ }
+ }
+ }
+ }
+ }
+
+ action(sf_setForwardReqTime, "sf", desc="...") {
+ tbe.ForwardRequestTime := curCycle();
+ }
+
+ action(dl_deallocateL3, "dl", desc="deallocate the L3 block") {
+ L3CacheMemory.deallocate(address);
+ }
+
+ action(p_popRequestQueue, "p", desc="pop request queue") {
+ requestNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pr_popResponseQueue, "pr", desc="pop response queue") {
+ responseNetwork_in.dequeue(clockEdge());
+ }
+
+ action(pm_popMemQueue, "pm", desc="pop mem queue") {
+ memQueue_in.dequeue(clockEdge());
+ }
+
+ action(pt_popTriggerQueue, "pt", desc="pop trigger queue") {
+ triggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(ptl_popTriggerQueue, "ptl", desc="pop L3 trigger queue") {
+ L3TriggerQueue_in.dequeue(clockEdge());
+ }
+
+ action(pu_popUnblockQueue, "pu", desc="pop unblock queue") {
+ unblockNetwork_in.dequeue(clockEdge());
+ }
+
+ action(zz_recycleRequestQueue, "zz", desc="recycle request queue") {
+ requestNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(yy_recycleResponseQueue, "yy", desc="recycle response queue") {
+ responseNetwork_in.recycle(clockEdge(), cyclesToTicks(recycle_latency));
+ }
+
+ action(st_stallAndWaitRequest, "st", desc="Stall and wait on the address") {
+ stall_and_wait(requestNetwork_in, address);
+ }
+
+ action(wa_wakeUpDependents, "wa", desc="Wake up any requests waiting for this address") {
+ wakeUpBuffers(address);
+ }
+
+ action(wa_wakeUpAllDependents, "waa", desc="Wake up any requests waiting for this region") {
+ wakeUpAllBuffers();
+ }
+
+ action(z_stall, "z", desc="...") {
+ }
+
+ // TRANSITIONS
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {RdBlkS, RdBlkM, RdBlk, CtoD}) {
+ st_stallAndWaitRequest;
+ }
+
+ // It may be possible to save multiple invalidations here!
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, {Atomic, WriteThrough}) {
+ st_stallAndWaitRequest;
+ }
+
+
+ // transitions from U
+ transition(U, PF_Repl, B_P) {PFTagArrayRead, PFTagArrayWrite}{
+ te_allocateTBEForEviction;
+ apf_allocateProbeFilterEntry;
+ bp_backProbe;
+ sm_setMRU;
+ mpfe_markPFEntryForEviction;
+ }
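+
+ // Sketch of the inclusive-directory eviction path (PF_Repl is triggered
+ // from requestNetwork_in when the filter has no matching tag and no free
+ // way for the demand address): the victim address gets a TBE
+ // (te_allocateTBEForEviction) and a back probe (bp_backProbe), its PF entry
+ // is marked B and the demand address is remembered in tbe.demandAddress
+ // (mpfe_markPFEntryForEviction); probe responses are then collected in B_P
+ // via the CPUPrbResp transition below.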
+
+ transition(U, {RdBlkS}, BS_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, WriteThrough, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ w_sendResponseWBAck;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, Atomic, BM_PM) {L3TagArrayRead, L3TagArrayWrite, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, {RdBlkM}, BM_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ dc_probeInvCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, RdBlk, B_PM) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite}{
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ l_queueMemRdReq;
+ sc_probeShrCoreData;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, CtoD, BP) {L3TagArrayRead, PFTagArrayRead, PFTagArrayWrite} {
+ t_allocateTBE;
+ apf_allocateProbeFilterEntry;
+ ic_probeInvCore;
+ sm_setMRU;
+ upf_updateProbeFilter;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicDirty, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ rmcd_removeSharerConditional;
+ p_popRequestQueue;
+ }
+
+ transition(U, VicClean, BL) {L3TagArrayRead} {
+ t_allocateTBE;
+ w_sendResponseWBAck;
+ rmcd_removeSharerConditional;
+ p_popRequestQueue;
+ }
+
+ transition(BL, {VicDirty, VicClean}) {
+ zz_recycleRequestQueue;
+ }
+
+ transition(BL, CPUData, U) {L3TagArrayWrite, L3DataArrayWrite} {
+ d_writeDataToMemory;
+ al_allocateL3Block;
+ wa_wakeUpDependents;
+ dt_deallocateTBE;
+    // l_queueMemWBReq; // why need an ack? Esp. with DRAMSim, just put it in
+    //                  // the queue; no ack is needed
+ pr_popResponseQueue;
+ }
+
+ transition(BL, StaleWB, U) {L3TagArrayWrite} {
+ dt_deallocateTBE;
+ wa_wakeUpAllDependents;
+ pr_popResponseQueue;
+ }
+
+ transition({B, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P}, {VicDirty, VicClean}) {
+ z_stall;
+ }
+
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, WBAck) {
+ pm_popMemQueue;
+ }
+
+ transition({BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, PF_Repl) {
+ zz_recycleRequestQueue;
+ }
+
+ transition({U, BL, BS_M, BM_M, B_M, BP, BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, B}, StaleVicDirty) {
+ rv_removeVicDirtyIgnore;
+ w_sendResponseWBAck;
+ p_popRequestQueue;
+ }
+
+ transition({B}, CoreUnblock, U) {
+ wa_wakeUpDependents;
+ pu_popUnblockQueue;
+ }
+
+ transition(B, UnblockWriteThrough, U) {
+ wa_wakeUpDependents;
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_PM, MemData, BS_Pm) {} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_PM, MemData, BM_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_PM, MemData, B_Pm){} {
+ mt_writeMemDataToTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_PM, L3Hit, BS_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_PM, L3Hit, BM_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_PM, L3Hit, B_Pm) {} {
+ ptl_popTriggerQueue;
+ }
+
+ transition(BS_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BM_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(B_M, MemData, B){L3TagArrayWrite, L3DataArrayWrite} {
+ mt_writeMemDataToTBE;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pm_popMemQueue;
+ }
+
+ transition(BS_M, L3Hit, B) {L3TagArrayWrite, L3DataArrayWrite} {
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(BM_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition(B_M, L3Hit, B) {L3DataArrayWrite, L3TagArrayWrite} {
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ ptl_popTriggerQueue;
+ }
+
+ transition({BS_PM, BM_PM, B_PM, BS_Pm, BM_Pm, B_Pm, B_P, BP}, CPUPrbResp) {
+ y_writeProbeDataToTBE;
+ x_decrementAcks;
+ o_checkForCompletion;
+ pr_popResponseQueue;
+ }
+
+ transition(BS_PM, ProbeAcksComplete, BS_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_PM, ProbeAcksComplete, BM_M) {} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_PM, ProbeAcksComplete, B_M){} {
+ sf_setForwardReqTime;
+ pt_popTriggerQueue;
+ }
+
+ transition(BS_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ s_sendResponseS;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(BM_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ m_sendResponseM;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_Pm, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ es_sendResponseES;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+ transition(B_P, ProbeAcksComplete, U) {
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ we_wakeUpEvictionDependents;
+ dpf_deallocateProbeFilter;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+
+  transition(BP, ProbeAcksComplete, B){L3DataArrayWrite, L3TagArrayWrite} {
+ sf_setForwardReqTime;
+ c_sendResponseCtoD;
+ wd_writeBackData;
+ alwt_allocateL3BlockOnWT;
+ dt_deallocateTBE;
+ pt_popTriggerQueue;
+ }
+}
diff --git a/src/mem/protocol/MOESI_AMD_Base.slicc b/src/mem/protocol/MOESI_AMD_Base.slicc
new file mode 100644
index 000000000..b38145246
--- /dev/null
+++ b/src/mem/protocol/MOESI_AMD_Base.slicc
@@ -0,0 +1,6 @@
+protocol "MOESI_AMD_Base";
+include "RubySlicc_interfaces.slicc";
+include "MOESI_AMD_Base-msg.sm";
+include "MOESI_AMD_Base-CorePair.sm";
+include "MOESI_AMD_Base-L3cache.sm";
+include "MOESI_AMD_Base-dir.sm";
diff --git a/src/mem/protocol/RubySlicc_ComponentMapping.sm b/src/mem/protocol/RubySlicc_ComponentMapping.sm
index a72492b42..e1d7c4399 100644
--- a/src/mem/protocol/RubySlicc_ComponentMapping.sm
+++ b/src/mem/protocol/RubySlicc_ComponentMapping.sm
@@ -37,7 +37,10 @@ MachineID mapAddressToRange(Addr addr, MachineType type,
NetDest broadcast(MachineType type);
MachineID map_Address_to_DMA(Addr addr);
MachineID map_Address_to_Directory(Addr addr);
+MachineID map_Address_to_RegionDir(Addr addr);
NodeID map_Address_to_DirectoryNode(Addr addr);
+MachineID map_Address_to_TCCdir(Addr addr);
+NodeID map_Address_to_TCCdirNode(Addr addr);
NodeID machineIDToNodeID(MachineID machID);
NodeID machineIDToVersion(MachineID machID);
MachineType machineIDToMachineType(MachineID machID);
diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm
index 5ee26d65c..c743ebe28 100644
--- a/src/mem/protocol/RubySlicc_Exports.sm
+++ b/src/mem/protocol/RubySlicc_Exports.sm
@@ -62,7 +62,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt);
// AccessPermission
// The following five states define the access permission of all memory blocks.
-// These permissions have multiple uses. They coordinate locking and
+// These permissions have multiple uses. They coordinate locking and
// synchronization primitives, as well as enable functional accesses.
// One should not need to add any additional permission values and it is very
// risky to do so.
@@ -73,7 +73,7 @@ enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent")
Read_Write, desc="block is Read/Write";
// Possibly Invalid data
- // The maybe stale permission indicates that accordingly to the protocol,
+  // The maybe stale permission indicates that, according to the protocol,
// there is no guarantee the block contains valid data. However, functional
// writes should update the block because a dataless PUT request may
// revalidate the block's data.
@@ -227,6 +227,13 @@ enumeration(MachineType, desc="...", default="MachineType_NULL") {
Collector, desc="Collector Mach";
L1Cache_wCC, desc="L1 Cache Mach to track cache-to-cache transfer (used for miss latency profile)";
L2Cache_wCC, desc="L2 Cache Mach to track cache-to-cache transfer (used for miss latency profile)";
+ CorePair, desc="Cache Mach (2 cores, Private L1Ds, Shared L1I & L2)";
+ TCP, desc="GPU L1 Data Cache (Texture Cache per Pipe)";
+ TCC, desc="GPU L2 Shared Cache (Texture Cache per Channel)";
+ TCCdir, desc="Directory at the GPU L2 Cache (TCC)";
+ SQC, desc="GPU L1 Instr Cache (Sequencer Cache)";
+ RegionDir, desc="Region-granular directory";
+  RegionBuffer, desc="Region buffer for CPU and GPU";
NULL, desc="null mach type";
}
diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm
index a6c57e1b0..b8d284725 100644
--- a/src/mem/protocol/RubySlicc_Types.sm
+++ b/src/mem/protocol/RubySlicc_Types.sm
@@ -31,8 +31,8 @@
//
// **PLEASE NOTE!** When adding objects to this file you must also add a line
-// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh
-// file will not be copied to the protocol directory and you will encounter a
+// in the src/mem/ruby/SConscript file. Otherwise the external object's .hh
+// file will not be copied to the protocol directory and you will encounter an
// undefined declaration error.
//
@@ -95,6 +95,8 @@ structure (NetDest, external = "yes", non_obj="yes") {
bool intersectionIsEmpty(Set);
bool intersectionIsEmpty(NetDest);
MachineID smallestElement(MachineType);
+ NetDest OR(NetDest);
+ NetDest AND(NetDest);
}
structure (Sequencer, external = "yes") {
@@ -117,6 +119,44 @@ structure (Sequencer, external = "yes") {
void invalidateSC(Addr);
}
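+// SLICC-visible interface of the GPU coalescer: the protocol machines invoke
+// these callbacks when coalesced reads, writes, and evictions complete.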
+structure (GPUCoalescer, external = "yes") {
+ void readCallback(Addr, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void writeCallback(Addr, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void checkCoherence(Addr);
+ void evictionCallback(Addr);
+ void recordCPReadCallBack(MachineID, MachineID);
+ void recordCPWriteCallBack(MachineID, MachineID);
+}
+
+structure (VIPERCoalescer, external = "yes") {
+ void readCallback(Addr, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void readCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void writeCallback(Addr, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles);
+ void writeCallback(Addr, MachineType, DataBlock,
+ Cycles, Cycles, Cycles, bool);
+ void invCallback(Addr);
+ void wbCallback(Addr);
+ void checkCoherence(Addr);
+ void evictionCallback(Addr);
+}
+
structure(RubyRequest, desc="...", interface="Message", external="yes") {
Addr LineAddress, desc="Line address for this request";
Addr PhysicalAddress, desc="Physical address for this request";
@@ -161,6 +201,7 @@ structure (CacheMemory, external = "yes") {
Cycles getTagLatency();
Cycles getDataLatency();
void setMRU(Addr);
+ void setMRU(Addr, int);
void setMRU(AbstractCacheEntry);
void recordRequestType(CacheRequestType, Addr);
bool checkResourceAvailable(CacheResourceType, Addr);
diff --git a/src/mem/protocol/SConsopts b/src/mem/protocol/SConsopts
index ca432a73e..47b36e276 100644
--- a/src/mem/protocol/SConsopts
+++ b/src/mem/protocol/SConsopts
@@ -33,6 +33,11 @@ import os
Import('*')
all_protocols.extend([
+ 'GPU_VIPER',
+ 'GPU_VIPER_Baseline',
+ 'GPU_VIPER_Region',
+ 'GPU_RfO',
+ 'MOESI_AMD_Base',
'MESI_Two_Level',
'MESI_Three_Level',
'MI_example',
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript
index 16e932432..82a16c9b0 100644
--- a/src/mem/ruby/SConscript
+++ b/src/mem/ruby/SConscript
@@ -124,13 +124,20 @@ MakeInclude('common/Set.hh')
MakeInclude('common/WriteMask.hh')
MakeInclude('filters/AbstractBloomFilter.hh')
MakeInclude('network/MessageBuffer.hh')
-MakeInclude('structures/Prefetcher.hh')
MakeInclude('structures/CacheMemory.hh')
-MakeInclude('system/DMASequencer.hh')
MakeInclude('structures/DirectoryMemory.hh')
-MakeInclude('structures/WireBuffer.hh')
MakeInclude('structures/PerfectCacheMemory.hh')
MakeInclude('structures/PersistentTable.hh')
-MakeInclude('system/Sequencer.hh')
+MakeInclude('structures/Prefetcher.hh')
MakeInclude('structures/TBETable.hh')
MakeInclude('structures/TimerTable.hh')
+MakeInclude('structures/WireBuffer.hh')
+MakeInclude('system/DMASequencer.hh')
+MakeInclude('system/Sequencer.hh')
+
+# External types: if a header is referenced as
+# '#include "mem/protocol/header.hh"' in any generated file, add a
+# MakeInclude for it at the bottom of this list.
+# generated_dir = Dir('../protocol')
+MakeInclude('system/GPUCoalescer.hh')
+MakeInclude('system/VIPERCoalescer.hh')
diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc
index b3b37e5a6..7d3f20982 100644
--- a/src/mem/ruby/profiler/Profiler.cc
+++ b/src/mem/ruby/profiler/Profiler.cc
@@ -269,7 +269,7 @@ Profiler::collateStats()
it != m_ruby_system->m_abstract_controls[i].end(); ++it) {
AbstractController *ctr = (*it).second;
- Sequencer *seq = ctr->getSequencer();
+ Sequencer *seq = ctr->getCPUSequencer();
if (seq != NULL) {
m_outstandReqHist.add(seq->getOutstandReqHist());
}
@@ -282,7 +282,7 @@ Profiler::collateStats()
it != m_ruby_system->m_abstract_controls[i].end(); ++it) {
AbstractController *ctr = (*it).second;
- Sequencer *seq = ctr->getSequencer();
+ Sequencer *seq = ctr->getCPUSequencer();
if (seq != NULL) {
// add all the latencies
m_latencyHist.add(seq->getLatencyHist());
diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
index 926556781..cbd068c04 100644
--- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
+++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
@@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry
virtual DataBlock& getDataBlk()
{ panic("getDataBlk() not implemented!"); }
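+    // Number of valid sub-blocks held by this entry; exposed so that
+    // CacheMemory::getReplacementWeight() can use it as an occupancy weight.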
+ int validBlocks;
+ virtual int& getNumValidBlocks()
+ {
+ return validBlocks;
+ }
+
// Functions for locking and unlocking the cache entry. These are required
// for supporting atomic memory accesses.
void setLocked(int context);
diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc
index 93fe50c88..458fde5bc 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.cc
+++ b/src/mem/ruby/slicc_interface/AbstractController.cc
@@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr)
}
}
+bool
+AbstractController::isBlocked(Addr addr)
+{
+ return (m_block_map.count(addr) > 0);
+}
+
BaseMasterPort &
AbstractController::getMasterPort(const std::string &if_name,
PortID idx)
diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh
index 383507eed..4488ee3f4 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer
// return instance name
void blockOnQueue(Addr, MessageBuffer*);
void unblock(Addr);
+ bool isBlocked(Addr);
virtual MessageBuffer* getMandatoryQueue() const = 0;
virtual MessageBuffer* getMemoryQueue() const = 0;
@@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer
virtual void regStats();
virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0;
- virtual Sequencer* getSequencer() const = 0;
+ virtual Sequencer* getCPUSequencer() const = 0;
//! These functions are used by ruby system to read/write the data blocks
//! that exist with in the controller.
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
index 46071335e..cdedc2e14 100644
--- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
+++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
@@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr)
return DirectoryMemory::mapAddressToDirectoryVersion(addr);
}
+inline NodeID
+map_Address_to_TCCdirNode(Addr addr)
+{
+ return DirectoryMemory::mapAddressToDirectoryVersion(addr);
+}
+
// used to determine the home directory
// returns a value between 0 and total_directories_within_the_system
inline MachineID
@@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr)
return mach;
}
+inline MachineID
+map_Address_to_RegionDir(Addr addr)
+{
+ MachineID mach = {MachineType_RegionDir,
+ map_Address_to_DirectoryNode(addr)};
+ return mach;
+}
+
+inline MachineID
+map_Address_to_TCCdir(Addr addr)
+{
+ MachineID mach =
+ {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)};
+ return mach;
+}
+
inline NetDest
broadcast(MachineType type)
{
@@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id)
return mach;
}
+inline MachineID
+MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node)
+{
+ MachineID mach = {type, node};
+ return mach;
+}
+
#endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index a8a3ba949..45fb85d05 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -35,6 +35,7 @@
#include "mem/protocol/AccessPermission.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
+#include "mem/ruby/system/WeightedLRUPolicy.hh"
using namespace std;
@@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p)
m_start_index_bit = p->start_index_bit;
m_is_instruction_only_cache = p->is_icache;
m_resource_stalls = p->resourceStalls;
+ m_block_size = p->block_size; // may be 0 at this point. Updated in init()
}
void
CacheMemory::init()
{
- m_cache_num_sets = (m_cache_size / m_cache_assoc) /
- RubySystem::getBlockSizeBytes();
+ if (m_block_size == 0) {
+ m_block_size = RubySystem::getBlockSizeBytes();
+ }
+ m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size;
assert(m_cache_num_sets > 1);
m_cache_num_set_bits = floorLog2(m_cache_num_sets);
assert(m_cache_num_set_bits > 0);
- m_cache.resize(m_cache_num_sets);
- for (int i = 0; i < m_cache_num_sets; i++) {
- m_cache[i].resize(m_cache_assoc);
- for (int j = 0; j < m_cache_assoc; j++) {
- m_cache[i][j] = NULL;
- }
- }
+ m_cache.resize(m_cache_num_sets,
+ std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr));
}
CacheMemory::~CacheMemory()
{
- if (m_replacementPolicy_ptr != NULL)
+ if (m_replacementPolicy_ptr)
delete m_replacementPolicy_ptr;
for (int i = 0; i < m_cache_num_sets; i++) {
for (int j = 0; j < m_cache_assoc; j++) {
@@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e)
}
void
+CacheMemory::setMRU(Addr address, int occupancy)
+{
+ int64_t cacheSet = addressToCacheSet(address);
+ int loc = findTagInSet(cacheSet, address);
+
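+    // If the block is present, touch it in the replacement policy; the
+    // WeightedLRU policy additionally records the given occupancy.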
+    if (loc != -1) {
+ if (m_replacementPolicy_ptr->useOccupancy()) {
+ (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))->
+ touch(cacheSet, loc, curTick(), occupancy);
+ } else {
+ m_replacementPolicy_ptr->
+ touch(cacheSet, loc, curTick());
+ }
+ }
+}
+
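+// The replacement weight of an entry is the number of valid blocks it holds;
+// an empty way weighs zero.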
+int
+CacheMemory::getReplacementWeight(int64_t set, int64_t loc)
+{
+ assert(set < m_cache_num_sets);
+ assert(loc < m_cache_assoc);
+ int ret = 0;
+    if (m_cache[set][loc] != NULL) {
+ ret = m_cache[set][loc]->getNumValidBlocks();
+ assert(ret >= 0);
+ }
+
+ return ret;
+}
+
+void
CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const
{
uint64_t warmedUpBlocks = 0;
diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh
index 72805b32b..5b30505d3 100644
--- a/src/mem/ruby/structures/CacheMemory.hh
+++ b/src/mem/ruby/structures/CacheMemory.hh
@@ -106,7 +106,8 @@ class CacheMemory : public SimObject
// Set this address to most recently used
void setMRU(Addr address);
- // Set this entry to most recently used
+ void setMRU(Addr addr, int occupancy);
+ int getReplacementWeight(int64_t set, int64_t loc);
void setMRU(const AbstractCacheEntry *e);
// Functions for locking and unlocking cache lines corresponding to the
@@ -146,6 +147,7 @@ class CacheMemory : public SimObject
Stats::Scalar numDataArrayStalls;
int getCacheSize() const { return m_cache_size; }
+ int getCacheAssoc() const { return m_cache_assoc; }
int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; }
Addr getAddressAtIdx(int idx) const;
@@ -182,6 +184,7 @@ class CacheMemory : public SimObject
int m_cache_assoc;
int m_start_index_bit;
bool m_resource_stalls;
+ int m_block_size;
};
std::ostream& operator<<(std::ostream& out, const CacheMemory& obj);
diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py
index 4eb87ac74..9fc4726b0 100644
--- a/src/mem/ruby/structures/RubyCache.py
+++ b/src/mem/ruby/structures/RubyCache.py
@@ -42,6 +42,7 @@ class RubyCache(SimObject):
"")
start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line");
is_icache = Param.Bool(False, "is instruction only cache");
+ block_size = Param.MemorySize("0B", "block size in bytes. 0 means default RubyBlockSize")
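+    # For example, a config script could give a GPU cache a line size that
+    # differs from the global RubyBlockSize (hypothetical values):
+    #   tcp_cache = RubyCache(size = '16kB', assoc = 16, block_size = '32B')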
dataArrayBanks = Param.Int(1, "Number of banks for the data array")
tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
new file mode 100644
index 000000000..db279bd3a
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/GPUCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "debug/ProtocolTrace.hh"
+#include "debug/RubyPort.hh"
+#include "debug/RubyStats.hh"
+#include "gpu-compute/shader.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/DataBlock.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/RubyGPUCoalescer.hh"
+
+using namespace std;
+
+GPUCoalescer *
+RubyGPUCoalescerParams::create()
+{
+ return new GPUCoalescer(this);
+}
+
+HSAScope
+reqScopeToHSAScope(Request* req)
+{
+ HSAScope accessScope = HSAScope_UNSPECIFIED;
+ if (req->isScoped()) {
+ if (req->isWavefrontScope()) {
+ accessScope = HSAScope_WAVEFRONT;
+ } else if (req->isWorkgroupScope()) {
+ accessScope = HSAScope_WORKGROUP;
+ } else if (req->isDeviceScope()) {
+ accessScope = HSAScope_DEVICE;
+ } else if (req->isSystemScope()) {
+ accessScope = HSAScope_SYSTEM;
+ } else {
+ fatal("Bad scope type");
+ }
+ }
+ return accessScope;
+}
+
+HSASegment
+reqSegmentToHSASegment(Request* req)
+{
+ HSASegment accessSegment = HSASegment_GLOBAL;
+
+ if (req->isGlobalSegment()) {
+ accessSegment = HSASegment_GLOBAL;
+ } else if (req->isGroupSegment()) {
+ accessSegment = HSASegment_GROUP;
+ } else if (req->isPrivateSegment()) {
+ accessSegment = HSASegment_PRIVATE;
+ } else if (req->isKernargSegment()) {
+ accessSegment = HSASegment_KERNARG;
+ } else if (req->isReadonlySegment()) {
+ accessSegment = HSASegment_READONLY;
+ } else if (req->isSpillSegment()) {
+ accessSegment = HSASegment_SPILL;
+ } else if (req->isArgSegment()) {
+ accessSegment = HSASegment_ARG;
+ } else {
+ fatal("Bad segment type");
+ }
+
+ return accessSegment;
+}
+
+GPUCoalescer::GPUCoalescer(const Params *p)
+ : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
+{
+ m_store_waiting_on_load_cycles = 0;
+ m_store_waiting_on_store_cycles = 0;
+ m_load_waiting_on_store_cycles = 0;
+ m_load_waiting_on_load_cycles = 0;
+
+ m_outstanding_count = 0;
+
+ m_max_outstanding_requests = 0;
+ m_deadlock_threshold = 0;
+ m_instCache_ptr = nullptr;
+ m_dataCache_ptr = nullptr;
+
+ m_instCache_ptr = p->icache;
+ m_dataCache_ptr = p->dcache;
+ m_max_outstanding_requests = p->max_outstanding_requests;
+ m_deadlock_threshold = p->deadlock_threshold;
+
+ assert(m_max_outstanding_requests > 0);
+ assert(m_deadlock_threshold > 0);
+ assert(m_instCache_ptr);
+ assert(m_dataCache_ptr);
+
+ m_data_cache_hit_latency = p->dcache_hit_latency;
+
+ m_usingNetworkTester = p->using_network_tester;
+ assumingRfOCoherence = p->assume_rfo;
+}
+
+GPUCoalescer::~GPUCoalescer()
+{
+}
+
+void
+GPUCoalescer::wakeup()
+{
+ // Check for deadlock of any of the requests
+ Cycles current_time = curCycle();
+
+ // Check across all outstanding requests
+ int total_outstanding = 0;
+
+ RequestTable::iterator read = m_readRequestTable.begin();
+ RequestTable::iterator read_end = m_readRequestTable.end();
+ for (; read != read_end; ++read) {
+ GPUCoalescerRequest* request = read->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_readRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_readRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+              (current_time - request->issue_time) * clockPeriod());
+ }
+
+ RequestTable::iterator write = m_writeRequestTable.begin();
+ RequestTable::iterator write_end = m_writeRequestTable.end();
+ for (; write != write_end; ++write) {
+ GPUCoalescerRequest* request = write->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_writeRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time) * clockPeriod());
+ }
+
+ total_outstanding += m_writeRequestTable.size();
+ total_outstanding += m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ if (m_outstanding_count > 0) {
+ // If there are still outstanding requests, keep checking
+ schedule(deadlockCheckEvent,
+ m_deadlock_threshold * clockPeriod() +
+ curTick());
+ }
+}
+
+void
+GPUCoalescer::resetStats()
+{
+ m_latencyHist.reset();
+ m_missLatencyHist.reset();
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_typeLatencyHist[i]->reset();
+ m_missTypeLatencyHist[i]->reset();
+ for (int j = 0; j < MachineType_NUM; j++) {
+ m_missTypeMachLatencyHist[i][j]->reset();
+ }
+ }
+
+ for (int i = 0; i < MachineType_NUM; i++) {
+ m_missMachLatencyHist[i]->reset();
+
+ m_IssueToInitialDelayHist[i]->reset();
+ m_InitialToForwardDelayHist[i]->reset();
+ m_ForwardToFirstResponseDelayHist[i]->reset();
+ m_FirstResponseToCompletionDelayHist[i]->reset();
+ }
+}
+
+void
+GPUCoalescer::printProgress(ostream& out) const
+{
+}
+
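+// Decide whether a new request can be accepted right now: the mandatory queue
+// must have a free slot, the controller must not have the line blocked, and
+// there must be no outstanding read or write to the same cache line (requests
+// are not coalesced across cycles, so such accesses return Aliased).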
+RequestStatus
+GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
+{
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
+ return RequestStatus_BufferFull;
+ }
+
+    if (m_controller->isBlocked(line_addr) &&
+ request_type != RubyRequestType_Locked_RMW_Write) {
+ return RequestStatus_Aliased;
+ }
+
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ // Check if there is any outstanding read request for the same
+ // cache line.
+ if (m_readRequestTable.count(line_addr) > 0) {
+ m_store_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ // There is an outstanding write request for the cache line
+ m_store_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+ } else {
+ // Check if there is any outstanding write request for the same
+ // cache line.
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ m_load_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_readRequestTable.count(line_addr) > 0) {
+ // There is an outstanding read request for the cache line
+ m_load_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+ }
+
+ return RequestStatus_Ready;
+
+}
+
+
+
+// sets the kernelEndList
+void
+GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
+{
+    // Don't know if this will happen or is possible,
+    // but I just want to be careful so it doesn't become
+    // a simulator hang in the future
+ DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
+ assert(kernelEndList.count(wavefront_id) == 0);
+
+ kernelEndList[wavefront_id] = pkt;
+ DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
+ kernelEndList.size());
+}
+
+
+// Insert the request into the correct request table. Return true if
+// the entry was already present.
+bool
+GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
+{
+ assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
+ pkt->req->isLockedRMW() ||
+ !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
+
+ int total_outstanding M5_VAR_USED =
+ m_writeRequestTable.size() + m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ // See if we should schedule a deadlock check
+ if (deadlockCheckEvent.scheduled() == false) {
+ schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
+ }
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ pair<RequestTable::iterator, bool> r =
+ m_writeRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting write request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ } else {
+ pair<RequestTable::iterator, bool> r =
+ m_readRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting read request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ }
+
+ m_outstandReqHist.sample(m_outstanding_count);
+
+ total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
+ assert(m_outstanding_count == total_outstanding);
+
+ return false;
+}
+
+void
+GPUCoalescer::markRemoved()
+{
+ m_outstanding_count--;
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+}
+
+void
+GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
+{
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+
+ Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
+ if ((srequest->m_type == RubyRequestType_ST) ||
+ (srequest->m_type == RubyRequestType_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_RMW_Write) ||
+ (srequest->m_type == RubyRequestType_Load_Linked) ||
+ (srequest->m_type == RubyRequestType_Store_Conditional) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
+ m_writeRequestTable.erase(line_addr);
+ } else {
+ m_readRequestTable.erase(line_addr);
+ }
+
+ markRemoved();
+}
+
+bool
+GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
+{
+ //
+ // The success flag indicates whether the LLSC operation was successful.
+ // LL ops will always succeed, but SC may fail if the cache line is no
+ // longer locked.
+ //
+ bool success = true;
+ if (request->m_type == RubyRequestType_Store_Conditional) {
+ if (!m_dataCache_ptr->isLocked(address, m_version)) {
+ //
+ // For failed SC requests, indicate the failure to the cpu by
+ // setting the extra data to zero.
+ //
+ request->pkt->req->setExtraData(0);
+ success = false;
+ } else {
+ //
+ // For successful SC requests, indicate the success to the cpu by
+ // setting the extra data to one.
+ //
+ request->pkt->req->setExtraData(1);
+ }
+ //
+ // Independent of success, all SC operations must clear the lock
+ //
+ m_dataCache_ptr->clearLocked(address);
+ } else if (request->m_type == RubyRequestType_Load_Linked) {
+ //
+ // Note: To fully follow Alpha LLSC semantics, should the LL clear any
+ // previously locked cache lines?
+ //
+ m_dataCache_ptr->setLocked(address, m_version);
+ } else if ((m_dataCache_ptr->isTagPresent(address)) &&
+ (m_dataCache_ptr->isLocked(address, m_version))) {
+ //
+ // Normal writes should clear the locked address
+ //
+ m_dataCache_ptr->clearLocked(address);
+ }
+ return success;
+}
+
+void
+GPUCoalescer::writeCallback(Addr address, DataBlock& data)
+{
+ writeCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data)
+{
+ writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime)
+{
+ writeCallback(address, mach, data,
+ initialRequestTime, forwardRequestTime, firstResponseTime,
+ false);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+
+ DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
+ assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+ RequestTable::iterator i = m_writeRequestTable.find(address);
+ assert(i != m_writeRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_writeRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_ST) ||
+ (request->m_type == RubyRequestType_ATOMIC) ||
+ (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request->m_type == RubyRequestType_RMW_Read) ||
+ (request->m_type == RubyRequestType_RMW_Write) ||
+ (request->m_type == RubyRequestType_Load_Linked) ||
+ (request->m_type == RubyRequestType_Store_Conditional) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Write) ||
+ (request->m_type == RubyRequestType_FLUSH));
+
+
+ //
+ // For Alpha, properly handle LL, SC, and write requests with respect to
+ // locked cache blocks.
+ //
+    // Not valid for Network_test protocol
+ //
+ bool success = true;
+    if (!m_usingNetworkTester)
+ success = handleLlsc(address, request);
+
+ if (request->m_type == RubyRequestType_Locked_RMW_Read) {
+ m_controller->blockOnQueue(address, m_mandatory_q_ptr);
+ } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
+ m_controller->unblock(address);
+ }
+
+ hitCallback(request, mach, data, success,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
+void
+GPUCoalescer::readCallback(Addr address, DataBlock& data)
+{
+ readCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data)
+{
+ readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime)
+{
+
+ readCallback(address, mach, data,
+ initialRequestTime, forwardRequestTime, firstResponseTime,
+ false);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+ assert(m_readRequestTable.count(makeLineAddress(address)));
+
+ DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
+ RequestTable::iterator i = m_readRequestTable.find(address);
+ assert(i != m_readRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_readRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_LD) ||
+ (request->m_type == RubyRequestType_IFETCH));
+
+ hitCallback(request, mach, data, true,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
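+// Completion path shared by reads and writes: update the MRU state of the
+// cache, record latency statistics, then copy data to or from every packet
+// coalesced onto this cache line and hand each one back to its RubyPort.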
+void
+GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
+ MachineType mach,
+ DataBlock& data,
+ bool success,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ PacketPtr pkt = srequest->pkt;
+ Addr request_address = pkt->getAddr();
+ Addr request_line_address = makeLineAddress(request_address);
+
+ RubyRequestType type = srequest->m_type;
+
+ // Set this cache entry to the most recently used
+ if (type == RubyRequestType_IFETCH) {
+ if (m_instCache_ptr->isTagPresent(request_line_address))
+ m_instCache_ptr->setMRU(request_line_address);
+ } else {
+ if (m_dataCache_ptr->isTagPresent(request_line_address))
+ m_dataCache_ptr->setMRU(request_line_address);
+ }
+
+ recordMissLatency(srequest, mach,
+ initialRequestTime,
+ forwardRequestTime,
+ firstResponseTime,
+ success, isRegion);
+    // update the data
+    //
+    // MUST DO THIS FOR EACH REQUEST IN THE COALESCER
+ int len = reqCoalescer[request_line_address].size();
+ std::vector<PacketPtr> mylist;
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+ assert(type ==
+ reqCoalescer[request_line_address][i].second[PrimaryType]);
+ request_address = pkt->getAddr();
+ request_line_address = makeLineAddress(pkt->getAddr());
+ if (pkt->getPtr<uint8_t>()) {
+ if ((type == RubyRequestType_LD) ||
+ (type == RubyRequestType_ATOMIC) ||
+ (type == RubyRequestType_ATOMIC_RETURN) ||
+ (type == RubyRequestType_IFETCH) ||
+ (type == RubyRequestType_RMW_Read) ||
+ (type == RubyRequestType_Locked_RMW_Read) ||
+ (type == RubyRequestType_Load_Linked)) {
+ memcpy(pkt->getPtr<uint8_t>(),
+ data.getData(getOffset(request_address),
+ pkt->getSize()),
+ pkt->getSize());
+ } else {
+ data.setData(pkt->getPtr<uint8_t>(),
+ getOffset(request_address), pkt->getSize());
+ }
+ } else {
+ DPRINTF(MemoryAccess,
+                    "WARNING. Data not transferred from Ruby to M5 for type " \
+ "%s\n",
+ RubyRequestType_to_string(type));
+ }
+
+ // If using the RubyTester, update the RubyTester sender state's
+        // subBlock with the received data. The tester will later access
+        // this state.
+        // Note: RubyPort will access its sender state before the
+ // RubyTester.
+ if (m_usingRubyTester) {
+ RubyPort::SenderState *requestSenderState =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+ RubyTester::SenderState* testerSenderState =
+ safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+ testerSenderState->subBlock.mergeFrom(data);
+ }
+
+ mylist.push_back(pkt);
+ }
+ delete srequest;
+ reqCoalescer.erase(request_line_address);
+ assert(!reqCoalescer.count(request_line_address));
+
+
+
+ completeHitCallback(mylist, len);
+}
+
+bool
+GPUCoalescer::empty() const
+{
+ return m_writeRequestTable.empty() && m_readRequestTable.empty();
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If the request can be coalesced, it is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued.
+// If this is the first request to a cacheline, the request is added both to
+// the newRequests queue and to the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
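+// reqCoalescer maps a line address to the (packet, {primary type, secondary
+// type}) entries coalesced onto that line in the current cycle.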
+RequestStatus
+GPUCoalescer::makeRequest(PacketPtr pkt)
+{
+ // Check for GPU Barrier Kernel End or Kernel Begin
+ // Leave these to be handled by the child class
+ // Kernel End/Barrier = isFlush + isRelease
+ // Kernel Begin = isFlush + isAcquire
+ if (pkt->req->isKernel()) {
+        if (pkt->req->isAcquire()) {
+ // This is a Kernel Begin leave handling to
+ // virtual xCoalescer::makeRequest
+ return RequestStatus_Issued;
+        } else if (pkt->req->isRelease()) {
+ // This is a Kernel End leave handling to
+ // virtual xCoalescer::makeRequest
+ // If we are here then we didn't call
+ // a virtual version of this function
+ // so we will also schedule the callback
+ int wf_id = 0;
+ if (pkt->req->hasContextId()) {
+ wf_id = pkt->req->contextId();
+ }
+ insertKernel(wf_id, pkt);
+ newKernelEnds.push_back(wf_id);
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
+ }
+ return RequestStatus_Issued;
+ }
+ }
+
+ // If number of outstanding requests greater than the max allowed,
+ // return RequestStatus_BufferFull. This logic can be extended to
+ // support proper backpressure.
+ if (m_outstanding_count >= m_max_outstanding_requests) {
+ return RequestStatus_BufferFull;
+ }
+
+ RubyRequestType primary_type = RubyRequestType_NULL;
+ RubyRequestType secondary_type = RubyRequestType_NULL;
+
+ if (pkt->isLLSC()) {
+ //
+ // Alpha LL/SC instructions need to be handled carefully by the cache
+ // coherence protocol to ensure they follow the proper semantics. In
+ // particular, by identifying the operations as atomic, the protocol
+ // should understand that migratory sharing optimizations should not
+ // be performed (i.e. a load between the LL and SC should not steal
+ // away exclusive permission).
+ //
+ if (pkt->isWrite()) {
+ primary_type = RubyRequestType_Store_Conditional;
+ } else {
+ assert(pkt->isRead());
+ primary_type = RubyRequestType_Load_Linked;
+ }
+ secondary_type = RubyRequestType_ATOMIC;
+ } else if (pkt->req->isLockedRMW()) {
+ //
+ // x86 locked instructions are translated to store cache coherence
+ // requests because these requests should always be treated as read
+ // exclusive operations and should leverage any migratory sharing
+ // optimization built into the protocol.
+ //
+ if (pkt->isWrite()) {
+ primary_type = RubyRequestType_Locked_RMW_Write;
+ } else {
+ assert(pkt->isRead());
+ primary_type = RubyRequestType_Locked_RMW_Read;
+ }
+ secondary_type = RubyRequestType_ST;
+ } else if (pkt->isAtomicOp()) {
+ //
+ // GPU Atomic Operation
+ //
+ primary_type = RubyRequestType_ATOMIC;
+ secondary_type = RubyRequestType_ATOMIC;
+ } else {
+ if (pkt->isRead()) {
+ if (pkt->req->isInstFetch()) {
+ primary_type = secondary_type = RubyRequestType_IFETCH;
+ } else {
+#if THE_ISA == X86_ISA
+ uint32_t flags = pkt->req->getFlags();
+ bool storeCheck = flags &
+ (TheISA::StoreCheck << TheISA::FlagShift);
+#else
+ bool storeCheck = false;
+#endif // X86_ISA
+ if (storeCheck) {
+ primary_type = RubyRequestType_RMW_Read;
+ secondary_type = RubyRequestType_ST;
+ } else {
+ primary_type = secondary_type = RubyRequestType_LD;
+ }
+ }
+ } else if (pkt->isWrite()) {
+ //
+ // Note: M5 packets do not differentiate ST from RMW_Write
+ //
+ primary_type = secondary_type = RubyRequestType_ST;
+ } else if (pkt->isFlush()) {
+ primary_type = secondary_type = RubyRequestType_FLUSH;
+ } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
+ if (assumingRfOCoherence) {
+ // If we reached here, this request must be a memFence
+                // and, since the protocol implements RfO, the coalescer can
+                // assume sequential consistency and schedule the callback
+                // immediately.
+ // Currently the code implements fence callbacks
+ // by reusing the mechanism for kernel completions.
+ // This should be fixed.
+ int wf_id = 0;
+ if (pkt->req->hasContextId()) {
+ wf_id = pkt->req->contextId();
+ }
+ insertKernel(wf_id, pkt);
+ newKernelEnds.push_back(wf_id);
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
+ }
+ return RequestStatus_Issued;
+ } else {
+ // If not RfO, return issued here and let the child coalescer
+ // take care of it.
+ return RequestStatus_Issued;
+ }
+ } else {
+ panic("Unsupported ruby packet type\n");
+ }
+ }
+
+ // Check if there is any pending request to this cache line from
+ // previous cycles.
+ // If there is a pending request, return aliased. Since coalescing
+ // across time is not permitted, aliased requests are not coalesced.
+ // If a request for this address has already been issued, we must block
+ RequestStatus status = getRequestStatus(pkt, primary_type);
+ if (status != RequestStatus_Ready)
+ return status;
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ // Check if this request can be coalesced with previous
+ // requests from this cycle.
+ if (!reqCoalescer.count(line_addr)) {
+ // This is the first access to this cache line.
+ // A new request to the memory subsystem has to be
+ // made in the next cycle for this cache line, so
+ // add this line addr to the "newRequests" queue
+ newRequests.push_back(line_addr);
+
+ // There was a request to this cache line in this cycle,
+ // let us see if we can coalesce this request with the previous
+ // requests from this cycle
+ } else if (primary_type !=
+ reqCoalescer[line_addr][0].second[PrimaryType]) {
+ // can't coalesce loads, stores and atomics!
+ return RequestStatus_Aliased;
+ } else if (pkt->req->isLockedRMW() ||
+ reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
+ // can't coalesce locked accesses, but can coalesce atomics!
+ return RequestStatus_Aliased;
+ } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
+ pkt->req->contextId() !=
+ reqCoalescer[line_addr][0].first->req->contextId()) {
+ // can't coalesce releases from different wavefronts
+ return RequestStatus_Aliased;
+ }
+
+ // in addition to the packet, we need to save both request types
+ reqCoalescer[line_addr].push_back(
+ RequestDesc(pkt, std::vector<RubyRequestType>()) );
+ reqCoalescer[line_addr].back().second.push_back(primary_type);
+ reqCoalescer[line_addr].back().second.push_back(secondary_type);
+ if (!issueEvent.scheduled())
+ schedule(issueEvent, curTick());
+ // TODO: issue hardware prefetches here
+ return RequestStatus_Issued;
+}
+
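+// Build a single RubyRequest covering every packet coalesced onto this cache
+// line: writes are merged into one DataBlock, an access mask marks the bytes
+// touched, atomic ops are collected with their line offsets, and the
+// resulting message is enqueued on the mandatory queue.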
+void
+GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
+{
+
+ int proc_id = -1;
+ if (pkt != NULL && pkt->req->hasContextId()) {
+ proc_id = pkt->req->contextId();
+ }
+
+ // If valid, copy the pc to the ruby request
+ Addr pc = 0;
+ if (pkt->req->hasPC()) {
+ pc = pkt->req->getPC();
+ }
+
+    // At the moment, setting scopes only matters for GPU spill space
+    // accesses, i.e. pkt->req->isStack(). That scope is REPLACE since
+    // spill data does not need to be flushed at the end of a kernel;
+    // private and local data may need to be visible at the end of
+    // the kernel.
+ HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
+ HSAScope accessScope = reqScopeToHSAScope(pkt->req);
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ // Creating WriteMask that records written bytes
+ // and atomic operations. This enables partial writes
+ // and partial reads of those writes
+ DataBlock dataBlock;
+ dataBlock.clear();
+ uint32_t blockSize = RubySystem::getBlockSizeBytes();
+ std::vector<bool> accessMask(blockSize,false);
+ std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
+ uint32_t tableSize = reqCoalescer[line_addr].size();
+ for (int i = 0; i < tableSize; i++) {
+ PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
+ uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
+ uint32_t tmpSize = tmpPkt->getSize();
+ if (tmpPkt->isAtomicOp()) {
+ std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
+ tmpPkt->getAtomicOp());
+ atomicOps.push_back(tmpAtomicOp);
+        } else if (tmpPkt->isWrite()) {
+ dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
+ tmpOffset, tmpSize);
+ }
+ for (int j = 0; j < tmpSize; j++) {
+ accessMask[tmpOffset + j] = true;
+ }
+ }
+ std::shared_ptr<RubyRequest> msg;
+ if (pkt->isAtomicOp()) {
+ msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+ pkt->getPtr<uint8_t>(),
+ pkt->getSize(), pc, secondary_type,
+ RubyAccessMode_Supervisor, pkt,
+ PrefetchBit_No, proc_id, 100,
+ blockSize, accessMask,
+ dataBlock, atomicOps,
+ accessScope, accessSegment);
+ } else {
+ msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+ pkt->getPtr<uint8_t>(),
+ pkt->getSize(), pc, secondary_type,
+ RubyAccessMode_Supervisor, pkt,
+ PrefetchBit_No, proc_id, 100,
+ blockSize, accessMask,
+ dataBlock,
+ accessScope, accessSegment);
+ }
+ DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
+ curTick(), m_version, "Coal", "Begin", "", "",
+ printAddress(msg->getPhysicalAddress()),
+ RubyRequestType_to_string(secondary_type));
+
+ fatal_if(secondary_type == RubyRequestType_IFETCH,
+ "there should not be any I-Fetch requests in the GPU Coalescer");
+
+ // Send the message to the cache controller
+ fatal_if(m_data_cache_hit_latency == 0,
+ "should not have a latency of zero");
+
+ assert(m_mandatory_q_ptr);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+}
+
+template <class KEY, class VALUE>
+std::ostream &
+operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
+{
+ out << "[";
+ for (auto i = map.begin(); i != map.end(); ++i)
+ out << " " << i->first << "=" << i->second;
+ out << " ]";
+
+ return out;
+}
+
+void
+GPUCoalescer::print(ostream& out) const
+{
+ out << "[GPUCoalescer: " << m_version
+ << ", outstanding requests: " << m_outstanding_count
+ << ", read request table: " << m_readRequestTable
+ << ", write request table: " << m_writeRequestTable
+ << "]";
+}
+
+// this can be called from setState whenever coherence permissions are
+// upgraded when invoked, coherence violations will be checked for the
+// given block
+void
+GPUCoalescer::checkCoherence(Addr addr)
+{
+#ifdef CHECK_COHERENCE
+ m_ruby_system->checkGlobalCoherenceInvariant(addr);
+#endif
+}
+
+void
+GPUCoalescer::recordRequestType(SequencerRequestType requestType)
+{
+ DPRINTF(RubyStats, "Recorded statistic: %s\n",
+ SequencerRequestType_to_string(requestType));
+}
+
+GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
+ : Event(Progress_Event_Pri), seq(_seq)
+{
+}
+
+
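+// Runs from the IssueEvent: issue one Ruby request per cache line recorded in
+// newRequests this cycle, then fire any pending kernel-end callbacks.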
+void
+GPUCoalescer::completeIssue()
+{
+ // newRequests has the cacheline addresses of all the
+ // requests which need to be issued to the memory subsystem
+ // in this cycle
+ int len = newRequests.size();
+ DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
+ for (int i = 0; i < len; ++i) {
+        // Get the requests from the reqCoalescer table. Get only the
+        // first request for each cacheline; the remaining requests
+        // can be coalesced with the first request, so only
+        // one request is issued per cacheline.
+ RequestDesc info = reqCoalescer[newRequests[i]][0];
+ PacketPtr pkt = info.first;
+ DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
+ i, pkt->req->getPaddr());
+ // Insert this request to the read/writeRequestTables. These tables
+ // are used to track aliased requests in makeRequest subroutine
+ bool found = insertRequest(pkt, info.second[PrimaryType]);
+
+ if (found) {
+ panic("GPUCoalescer::makeRequest should never be called if the "
+ "request is already outstanding\n");
+ }
+
+ // Issue request to ruby subsystem
+ issueRequest(pkt, info.second[SecondaryType]);
+ }
+ newRequests.clear();
+
+ // have Kernel End releases been issued this cycle
+ len = newKernelEnds.size();
+ for (int i = 0; i < len; i++) {
+ kernelCallback(newKernelEnds[i]);
+ }
+ newKernelEnds.clear();
+}
+
+void
+GPUCoalescer::IssueEvent::process()
+{
+ seq->completeIssue();
+}
+
+const char *
+GPUCoalescer::IssueEvent::description() const
+{
+ return "Issue coalesced request";
+}
+
+void
+GPUCoalescer::evictionCallback(Addr address)
+{
+ ruby_eviction_callback(address);
+}
+
+void
+GPUCoalescer::kernelCallback(int wavefront_id)
+{
+ assert(kernelEndList.count(wavefront_id));
+
+ ruby_hit_callback(kernelEndList[wavefront_id]);
+
+ kernelEndList.erase(wavefront_id);
+}
+
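+// Completion path for GPU atomics: like hitCallback, but the data returned to
+// each coalesced packet is the memory value from before the atomic op, and no
+// MRU update is performed because atomics are done in memory, not in the cache.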
+void
+GPUCoalescer::atomicCallback(Addr address,
+ MachineType mach,
+ const DataBlock& data)
+{
+ assert(address == makeLineAddress(address));
+
+ DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
+ assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+ RequestTable::iterator i = m_writeRequestTable.find(address);
+ assert(i != m_writeRequestTable.end());
+ GPUCoalescerRequest* srequest = i->second;
+
+ m_writeRequestTable.erase(i);
+ markRemoved();
+
+ assert((srequest->m_type == RubyRequestType_ATOMIC) ||
+ (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
+ (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+
+
+ // Atomics don't write to cache, so there is no MRU update...
+
+ recordMissLatency(srequest, mach,
+ srequest->issue_time, Cycles(0), Cycles(0), true, false);
+
+ PacketPtr pkt = srequest->pkt;
+ Addr request_address = pkt->getAddr();
+ Addr request_line_address = makeLineAddress(pkt->getAddr());
+
+ int len = reqCoalescer[request_line_address].size();
+ std::vector<PacketPtr> mylist;
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+ assert(srequest->m_type ==
+ reqCoalescer[request_line_address][i].second[PrimaryType]);
+ request_address = (pkt->getAddr());
+ request_line_address = makeLineAddress(request_address);
+ if (pkt->getPtr<uint8_t>() &&
+ srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
+ /* atomics are done in memory, and return the data *before* the atomic op... */
+ memcpy(pkt->getPtr<uint8_t>(),
+ data.getData(getOffset(request_address),
+ pkt->getSize()),
+ pkt->getSize());
+ } else {
+ DPRINTF(MemoryAccess,
+                    "WARNING. Data not transferred from Ruby to M5 for type " \
+ "%s\n",
+ RubyRequestType_to_string(srequest->m_type));
+ }
+
+ // If using the RubyTester, update the RubyTester sender state's
+        // subBlock with the received data. The tester will later access
+        // this state.
+        // Note: RubyPort will access its sender state before the
+ // RubyTester.
+ if (m_usingRubyTester) {
+ RubyPort::SenderState *requestSenderState =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+ RubyTester::SenderState* testerSenderState =
+ safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+ testerSenderState->subBlock.mergeFrom(data);
+ }
+
+ mylist.push_back(pkt);
+ }
+ delete srequest;
+ reqCoalescer.erase(request_line_address);
+ assert(!reqCoalescer.count(request_line_address));
+
+ completeHitCallback(mylist, len);
+}
+
+void
+GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if (myMachID == senderMachID) {
+        CP_TCPLdHits++;
+    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
+        CP_TCPLdTransfers++;
+    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
+ CP_TCCLdHits++;
+ } else {
+ CP_LdMiss++;
+ }
+}
+
+void
+GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
+{
+ if (myMachID == senderMachID) {
+ CP_TCPStHits++;
+ } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
+ CP_TCPStTransfers++;
+ } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
+ CP_TCCStHits++;
+ } else {
+ CP_StMiss++;
+ }
+}
+
+void
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+{
+ for (int i = 0; i < len; ++i) {
+ RubyPort::SenderState *ss =
+ safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
+ MemSlavePort *port = ss->port;
+ assert(port != NULL);
+
+ mylist[i]->senderState = ss->predecessor;
+ delete ss;
+ port->hitCallback(mylist[i]);
+ trySendRetries();
+ }
+
+ testDrainComplete();
+}
+
+PacketPtr
+GPUCoalescer::mapAddrToPkt(Addr address)
+{
+ RequestTable::iterator i = m_readRequestTable.find(address);
+ assert(i != m_readRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+ return request->pkt;
+}
+
+void
+GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+ MachineType mach,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool success, bool isRegion)
+{
+ RubyRequestType type = srequest->m_type;
+ Cycles issued_time = srequest->issue_time;
+ Cycles completion_time = curCycle();
+ assert(completion_time >= issued_time);
+ Cycles total_lat = completion_time - issued_time;
+
+ // cache stats (valid for RfO protocol only)
+ if (mach == MachineType_TCP) {
+ if (type == RubyRequestType_LD) {
+ GPU_TCPLdHits++;
+ } else {
+ GPU_TCPStHits++;
+ }
+ } else if (mach == MachineType_L1Cache_wCC) {
+ if (type == RubyRequestType_LD) {
+ GPU_TCPLdTransfers++;
+ } else {
+ GPU_TCPStTransfers++;
+ }
+ } else if (mach == MachineType_TCC) {
+ if (type == RubyRequestType_LD) {
+ GPU_TCCLdHits++;
+ } else {
+ GPU_TCCStHits++;
+ }
+ } else {
+ if (type == RubyRequestType_LD) {
+ GPU_LdMiss++;
+ } else {
+ GPU_StMiss++;
+ }
+ }
+
+ // Profile all access latency, even zero latency accesses
+ m_latencyHist.sample(total_lat);
+ m_typeLatencyHist[type]->sample(total_lat);
+
+ // Profile the miss latency for all non-zero demand misses
+ if (total_lat != Cycles(0)) {
+ m_missLatencyHist.sample(total_lat);
+ m_missTypeLatencyHist[type]->sample(total_lat);
+
+ if (mach != MachineType_NUM) {
+ m_missMachLatencyHist[mach]->sample(total_lat);
+ m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
+
+ if ((issued_time <= initialRequestTime) &&
+ (initialRequestTime <= forwardRequestTime) &&
+ (forwardRequestTime <= firstResponseTime) &&
+ (firstResponseTime <= completion_time)) {
+
+ m_IssueToInitialDelayHist[mach]->sample(
+ initialRequestTime - issued_time);
+ m_InitialToForwardDelayHist[mach]->sample(
+ forwardRequestTime - initialRequestTime);
+ m_ForwardToFirstResponseDelayHist[mach]->sample(
+ firstResponseTime - forwardRequestTime);
+ m_FirstResponseToCompletionDelayHist[mach]->sample(
+ completion_time - firstResponseTime);
+ }
+ }
+
+ }
+
+ DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
+ curTick(), m_version, "Coal",
+ success ? "Done" : "SC_Failed", "", "",
+ printAddress(srequest->pkt->getAddr()), total_lat);
+}
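
The four delay histograms above are only sampled when the timestamps are monotonically ordered, in which case they partition the total miss latency exactly. A small worked example with assumed cycle values (not taken from the patch):

    // Worked example of the miss-latency breakdown.
    #include <cassert>

    int main()
    {
        // Assumed cycle numbers for one request.
        int issued = 100, initial = 110, forward = 130, first_resp = 170,
            completion = 200;

        int issue_to_initial   = initial - issued;        // 10
        int initial_to_forward = forward - initial;       // 20
        int forward_to_first   = first_resp - forward;    // 40
        int first_to_complete  = completion - first_resp; // 30

        // The segments cover the whole miss latency exactly once.
        assert(issue_to_initial + initial_to_forward +
               forward_to_first + first_to_complete == completion - issued);
        return 0;
    }
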
+
+void
+GPUCoalescer::regStats()
+{
+ // These statistical variables are not for display.
+ // The profiler will collate these across different
+ // coalescers and display those collated statistics.
+ m_outstandReqHist.init(10);
+ m_latencyHist.init(10);
+ m_missLatencyHist.init(10);
+
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_typeLatencyHist.push_back(new Stats::Histogram());
+ m_typeLatencyHist[i]->init(10);
+
+ m_missTypeLatencyHist.push_back(new Stats::Histogram());
+ m_missTypeLatencyHist[i]->init(10);
+ }
+
+ for (int i = 0; i < MachineType_NUM; i++) {
+ m_missMachLatencyHist.push_back(new Stats::Histogram());
+ m_missMachLatencyHist[i]->init(10);
+
+ m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
+ m_IssueToInitialDelayHist[i]->init(10);
+
+ m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
+ m_InitialToForwardDelayHist[i]->init(10);
+
+ m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
+ m_ForwardToFirstResponseDelayHist[i]->init(10);
+
+ m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
+ m_FirstResponseToCompletionDelayHist[i]->init(10);
+ }
+
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
+
+ for (int j = 0; j < MachineType_NUM; j++) {
+ m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
+ m_missTypeMachLatencyHist[i][j]->init(10);
+ }
+ }
+
+ // GPU cache stats
+ GPU_TCPLdHits
+ .name(name() + ".gpu_tcp_ld_hits")
+ .desc("loads that hit in the TCP")
+ ;
+ GPU_TCPLdTransfers
+ .name(name() + ".gpu_tcp_ld_transfers")
+ .desc("TCP to TCP load transfers")
+ ;
+ GPU_TCCLdHits
+ .name(name() + ".gpu_tcc_ld_hits")
+ .desc("loads that hit in the TCC")
+ ;
+ GPU_LdMiss
+ .name(name() + ".gpu_ld_misses")
+ .desc("loads that miss in the GPU")
+ ;
+
+ GPU_TCPStHits
+ .name(name() + ".gpu_tcp_st_hits")
+ .desc("stores that hit in the TCP")
+ ;
+ GPU_TCPStTransfers
+ .name(name() + ".gpu_tcp_st_transfers")
+ .desc("TCP to TCP store transfers")
+ ;
+ GPU_TCCStHits
+ .name(name() + ".gpu_tcc_st_hits")
+ .desc("stores that hit in the TCC")
+ ;
+ GPU_StMiss
+ .name(name() + ".gpu_st_misses")
+ .desc("stores that miss in the GPU")
+ ;
+
+ // CP cache stats
+ CP_TCPLdHits
+ .name(name() + ".cp_tcp_ld_hits")
+ .desc("loads that hit in the TCP")
+ ;
+ CP_TCPLdTransfers
+ .name(name() + ".cp_tcp_ld_transfers")
+ .desc("TCP to TCP load transfers")
+ ;
+ CP_TCCLdHits
+ .name(name() + ".cp_tcc_ld_hits")
+ .desc("loads that hit in the TCC")
+ ;
+ CP_LdMiss
+ .name(name() + ".cp_ld_misses")
+ .desc("loads that miss in the GPU")
+ ;
+
+ CP_TCPStHits
+ .name(name() + ".cp_tcp_st_hits")
+ .desc("stores that hit in the TCP")
+ ;
+ CP_TCPStTransfers
+ .name(name() + ".cp_tcp_st_transfers")
+ .desc("TCP to TCP store transfers")
+ ;
+ CP_TCCStHits
+ .name(name() + ".cp_tcc_st_hits")
+ .desc("stores that hit in the TCC")
+ ;
+ CP_StMiss
+ .name(name() + ".cp_st_misses")
+ .desc("stores that miss in the GPU")
+ ;
+}
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
new file mode 100644
index 000000000..dbd47059c
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+
+#include <iostream>
+#include <unordered_map>
+
+#include "base/statistics.hh"
+#include "mem/protocol/HSAScope.hh"
+#include "mem/protocol/HSASegment.hh"
+#include "mem/protocol/PrefetchBit.hh"
+#include "mem/protocol/RubyAccessMode.hh"
+#include "mem/protocol/RubyRequestType.hh"
+#include "mem/protocol/SequencerRequestType.hh"
+#include "mem/request.hh"
+#include "mem/ruby/common/Address.hh"
+#include "mem/ruby/common/Consumer.hh"
+#include "mem/ruby/system/RubyPort.hh"
+
+class DataBlock;
+class CacheMsg;
+class MachineID;
+class CacheMemory;
+
+class RubyGPUCoalescerParams;
+
+HSAScope reqScopeToHSAScope(Request* req);
+HSASegment reqSegmentToHSASegment(Request* req);
+
+struct GPUCoalescerRequest
+{
+ PacketPtr pkt;
+ RubyRequestType m_type;
+ Cycles issue_time;
+
+ GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
+ Cycles _issue_time)
+ : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
+ {}
+};
+
+std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
+
+class GPUCoalescer : public RubyPort
+{
+ public:
+ typedef RubyGPUCoalescerParams Params;
+ GPUCoalescer(const Params *);
+ ~GPUCoalescer();
+
+ // Public Methods
+ void wakeup(); // Used only for deadlock detection
+
+ void printProgress(std::ostream& out) const;
+ void resetStats();
+ void collateStats();
+ void regStats();
+
+ void writeCallback(Addr address, DataBlock& data);
+
+ void writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data);
+
+ void writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion);
+
+ void writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime);
+
+ void readCallback(Addr address, DataBlock& data);
+
+ void readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data);
+
+ void readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime);
+
+ void readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion);
+ /* atomics need their own callback because the data
+ might be const coming from SLICC */
+ void atomicCallback(Addr address,
+ MachineType mach,
+ const DataBlock& data);
+
+ void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
+ void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
+
+ // Alternate implementations in VIPER Coalescer
+ virtual RequestStatus makeRequest(PacketPtr pkt);
+
+ int outstandingCount() const { return m_outstanding_count; }
+
+ bool
+ isDeadlockEventScheduled() const
+ {
+ return deadlockCheckEvent.scheduled();
+ }
+
+ void
+ descheduleDeadlockEvent()
+ {
+ deschedule(deadlockCheckEvent);
+ }
+
+ bool empty() const;
+
+ void print(std::ostream& out) const;
+ void checkCoherence(Addr address);
+
+ void markRemoved();
+ void removeRequest(GPUCoalescerRequest* request);
+ void evictionCallback(Addr address);
+ void completeIssue();
+
+ void insertKernel(int wavefront_id, PacketPtr pkt);
+
+ void recordRequestType(SequencerRequestType requestType);
+ Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
+
+ Stats::Histogram& getLatencyHist() { return m_latencyHist; }
+ Stats::Histogram& getTypeLatencyHist(uint32_t t)
+ { return *m_typeLatencyHist[t]; }
+
+ Stats::Histogram& getMissLatencyHist()
+ { return m_missLatencyHist; }
+ Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
+ { return *m_missTypeLatencyHist[t]; }
+
+ Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
+ { return *m_missMachLatencyHist[t]; }
+
+ Stats::Histogram&
+ getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
+ { return *m_missTypeMachLatencyHist[r][t]; }
+
+ Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
+ { return *m_IssueToInitialDelayHist[t]; }
+
+ Stats::Histogram&
+ getInitialToForwardDelayHist(const MachineType t) const
+ { return *m_InitialToForwardDelayHist[t]; }
+
+ Stats::Histogram&
+ getForwardRequestToFirstResponseHist(const MachineType t) const
+ { return *m_ForwardToFirstResponseDelayHist[t]; }
+
+ Stats::Histogram&
+ getFirstResponseToCompletionDelayHist(const MachineType t) const
+ { return *m_FirstResponseToCompletionDelayHist[t]; }
+
+ // Changed to protected to enable inheritance by VIPER Coalescer
+ protected:
+ bool tryCacheAccess(Addr addr, RubyRequestType type,
+ Addr pc, RubyAccessMode access_mode,
+ int size, DataBlock*& data_ptr);
+ // Alternate implementations in VIPER Coalescer
+ virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
+
+ void kernelCallback(int wavefront_id);
+
+ void hitCallback(GPUCoalescerRequest* request,
+ MachineType mach,
+ DataBlock& data,
+ bool success,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion);
+ void recordMissLatency(GPUCoalescerRequest* request,
+ MachineType mach,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool success, bool isRegion);
+ void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
+ PacketPtr mapAddrToPkt(Addr address);
+
+
+ RequestStatus getRequestStatus(PacketPtr pkt,
+ RubyRequestType request_type);
+ bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
+
+ bool handleLlsc(Addr address, GPUCoalescerRequest* request);
+
+ // Copy constructor and assignment operator declared but not defined
+ // (the coalescer is non-copyable)
+ GPUCoalescer(const GPUCoalescer& obj);
+ GPUCoalescer& operator=(const GPUCoalescer& obj);
+
+ class IssueEvent : public Event
+ {
+ private:
+ GPUCoalescer *seq;
+ public:
+ IssueEvent(GPUCoalescer *_seq);
+ void process();
+ const char *description() const;
+ };
+
+ IssueEvent issueEvent;
+
+
+ // Changed to protected to enable inheritance by VIPER Coalescer
+ protected:
+ int m_max_outstanding_requests;
+ int m_deadlock_threshold;
+
+ CacheMemory* m_dataCache_ptr;
+ CacheMemory* m_instCache_ptr;
+
+ // The hit latency for this GPU data cache, assessed at the beginning of
+ // each access. This mirrors the data cache hit latency in Sequencer,
+ // since the coalescer fills a very similar role.
+ Cycles m_data_cache_hit_latency;
+
+ // We need to track both the primary and secondary request types.
+ // The secondary request type comprises a subset of RubyRequestTypes that
+ // are understood by the L1 Controller. A primary request type can be any
+ // RubyRequestType.
+ enum {PrimaryType, SecondaryType};
+ typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc;
+ typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable;
+ CoalescingTable reqCoalescer;
+ std::vector<Addr> newRequests;
+
+ typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
+ RequestTable m_writeRequestTable;
+ RequestTable m_readRequestTable;
+ // Global outstanding request count, across all request tables
+ int m_outstanding_count;
+ bool m_deadlock_check_scheduled;
+ std::unordered_map<int, PacketPtr> kernelEndList;
+ std::vector<int> newKernelEnds;
+
+ int m_store_waiting_on_load_cycles;
+ int m_store_waiting_on_store_cycles;
+ int m_load_waiting_on_store_cycles;
+ int m_load_waiting_on_load_cycles;
+
+ bool m_usingNetworkTester;
+
+ class GPUCoalescerWakeupEvent : public Event
+ {
+ private:
+ GPUCoalescer *m_GPUCoalescer_ptr;
+
+ public:
+ GPUCoalescerWakeupEvent(GPUCoalescer *_seq) :
+ m_GPUCoalescer_ptr(_seq) {}
+ void process() { m_GPUCoalescer_ptr->wakeup(); }
+ const char *description() const
+ {
+ return "GPUCoalescer deadlock check";
+ }
+ };
+
+ GPUCoalescerWakeupEvent deadlockCheckEvent;
+ bool assumingRfOCoherence;
+
+ // m5 style stats for TCP hit/miss counts
+ Stats::Scalar GPU_TCPLdHits;
+ Stats::Scalar GPU_TCPLdTransfers;
+ Stats::Scalar GPU_TCCLdHits;
+ Stats::Scalar GPU_LdMiss;
+
+ Stats::Scalar GPU_TCPStHits;
+ Stats::Scalar GPU_TCPStTransfers;
+ Stats::Scalar GPU_TCCStHits;
+ Stats::Scalar GPU_StMiss;
+
+ Stats::Scalar CP_TCPLdHits;
+ Stats::Scalar CP_TCPLdTransfers;
+ Stats::Scalar CP_TCCLdHits;
+ Stats::Scalar CP_LdMiss;
+
+ Stats::Scalar CP_TCPStHits;
+ Stats::Scalar CP_TCPStTransfers;
+ Stats::Scalar CP_TCCStHits;
+ Stats::Scalar CP_StMiss;
+
+ //! Histogram for number of outstanding requests per cycle.
+ Stats::Histogram m_outstandReqHist;
+
+ //! Histogram for holding latency profile of all requests.
+ Stats::Histogram m_latencyHist;
+ std::vector<Stats::Histogram *> m_typeLatencyHist;
+
+ //! Histogram for holding latency profile of all requests that
+ //! miss in the controller connected to this sequencer.
+ Stats::Histogram m_missLatencyHist;
+ std::vector<Stats::Histogram *> m_missTypeLatencyHist;
+
+ //! Histograms for profiling the latencies for requests that
+ //! required external messages.
+ std::vector<Stats::Histogram *> m_missMachLatencyHist;
+ std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
+
+ //! Histograms for recording the breakdown of miss latency
+ std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
+ std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
+ std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
+ std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+};
+
+inline std::ostream&
+operator<<(std::ostream& out, const GPUCoalescer& obj)
+{
+ obj.print(out);
+ out << std::flush;
+ return out;
+}
+
+#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+
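
To make the RequestDesc/CoalescingTable shape above concrete, here is a minimal sketch with stand-in types; the pairing of an ATOMIC_RETURN primary type with a plain ATOMIC secondary type is only an illustrative example of "primary can be anything, secondary is what the L1 controller understands", not something taken from this hunk:

    // Sketch of reqCoalescer: one entry per cache line, one RequestDesc per
    // coalesced packet, each carrying a {primary, secondary} request type.
    #include <cassert>
    #include <cstdint>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    enum ReqType { LD, ATOMIC, ATOMIC_RETURN };
    enum { PrimaryType, SecondaryType };

    using Addr = uint64_t;
    using RequestDesc = std::pair<int /* packet id */, std::vector<ReqType>>;

    int main()
    {
        std::unordered_map<Addr, std::vector<RequestDesc>> reqCoalescer;

        Addr line = 0x1000;  // requests to the same cache line coalesce
        reqCoalescer[line].push_back({1, {LD, LD}});
        reqCoalescer[line].push_back({2, {ATOMIC_RETURN, ATOMIC}});

        assert(reqCoalescer.size() == 1);        // one line entry
        assert(reqCoalescer[line].size() == 2);  // two coalesced packets
        assert(reqCoalescer[line][1].second[SecondaryType] == ATOMIC);
        return 0;
    }
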
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
new file mode 100644
index 000000000..0c19f875d
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Steve Reinhardt
+# Brad Beckmann
+
+from m5.params import *
+from m5.proxy import *
+from Sequencer import *
+
+class RubyGPUCoalescer(RubySequencer):
+ type = 'RubyGPUCoalescer'
+ cxx_class = 'GPUCoalescer'
+ cxx_header = "mem/ruby/system/GPUCoalescer.hh"
+
+ # max_outstanding_requests = (wavefront slots) x (wavefront size)
+ #                          = 40 x 64 = 2560
+ max_outstanding_requests = Param.Int(40*64,
+ "max requests (incl. prefetches) outstanding")
+ assume_rfo = Param.Bool(True, "assume protocol implements Read for "
+ "Ownership coherence")
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 5a5f528bb..bf4002126 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p)
memSlavePort(csprintf("%s-mem-slave-port", name()), this,
p->ruby_system->getAccessBackingStore(), -1,
p->no_retry_on_stall),
- gotAddrRanges(p->port_master_connection_count)
+ gotAddrRanges(p->port_master_connection_count),
+ m_isCPUSequencer(p->is_cpu_sequencer)
{
assert(m_version != -1);
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
index 07e0fde5a..6bd92b654 100644
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -167,6 +167,8 @@ class RubyPort : public MemObject
uint32_t getId() { return m_version; }
DrainState drain() override;
+ bool isCPUSequencer() { return m_isCPUSequencer; }
+
protected:
void trySendRetries();
void ruby_hit_callback(PacketPtr pkt);
@@ -218,6 +220,8 @@ class RubyPort : public MemObject
// that should be called when the Sequencer becomes available after a stall.
//
std::vector<MemSlavePort *> retryList;
+
+ bool m_isCPUSequencer;
};
#endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__
diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc
index 1ecd2e098..e1717e519 100644
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
Sequencer* sequencer_ptr = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
- sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
+ sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer());
if (sequencer_ptr == NULL) {
sequencer_ptr = sequencer_map[cntrl];
}
diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript
index 8c5077362..b67311bca 100644
--- a/src/mem/ruby/system/SConscript
+++ b/src/mem/ruby/system/SConscript
@@ -33,12 +33,22 @@ Import('*')
if env['PROTOCOL'] == 'None':
Return()
+if env['BUILD_GPU']:
+ SimObject('GPUCoalescer.py')
SimObject('RubySystem.py')
SimObject('Sequencer.py')
+SimObject('WeightedLRUReplacementPolicy.py')
+if env['BUILD_GPU']:
+ SimObject('VIPERCoalescer.py')
Source('CacheRecorder.cc')
Source('DMASequencer.cc')
+if env['BUILD_GPU']:
+ Source('GPUCoalescer.cc')
Source('RubyPort.cc')
Source('RubyPortProxy.cc')
Source('RubySystem.cc')
Source('Sequencer.cc')
+if env['BUILD_GPU']:
+ Source('VIPERCoalescer.cc')
+Source('WeightedLRUPolicy.cc')
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 50418c700..c2727b41d 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p)
m_max_outstanding_requests = p->max_outstanding_requests;
m_deadlock_threshold = p->deadlock_threshold;
+ m_coreId = p->coreid; // for tracking the two CorePair sequencers
assert(m_max_outstanding_requests > 0);
assert(m_deadlock_threshold > 0);
assert(m_instCache_ptr != NULL);
@@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
ContextID proc_id = pkt->req->hasContextId() ?
pkt->req->contextId() : InvalidContextID;
+ ContextID core_id = coreId();
+
// If valid, copy the pc to the ruby request
Addr pc = 0;
if (pkt->req->hasPC()) {
@@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
nullptr : pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, secondary_type,
RubyAccessMode_Supervisor, pkt,
- PrefetchBit_No, proc_id);
+ PrefetchBit_No, proc_id, core_id);
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n",
curTick(), m_version, "Seq", "Begin", "", "",
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index 47af7ea1e..2a2f49587 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -99,6 +99,7 @@ class Sequencer : public RubyPort
void markRemoved();
void evictionCallback(Addr address);
void invalidateSC(Addr address);
+ int coreId() const { return m_coreId; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
@@ -198,6 +199,8 @@ class Sequencer : public RubyPort
Stats::Scalar m_load_waiting_on_store;
Stats::Scalar m_load_waiting_on_load;
+ int m_coreId;
+
bool m_usingNetworkTester;
//! Histogram for number of outstanding requests per cycle.
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py
index 7c90eb29c..d6ee0aa2f 100644
--- a/src/mem/ruby/system/Sequencer.py
+++ b/src/mem/ruby/system/Sequencer.py
@@ -32,54 +32,58 @@ from m5.proxy import *
from MemObject import MemObject
class RubyPort(MemObject):
- type = 'RubyPort'
- abstract = True
- cxx_header = "mem/ruby/system/RubyPort.hh"
- version = Param.Int(0, "")
+ type = 'RubyPort'
+ abstract = True
+ cxx_header = "mem/ruby/system/RubyPort.hh"
+ version = Param.Int(0, "")
- slave = VectorSlavePort("CPU slave port")
- master = VectorMasterPort("CPU master port")
- pio_master_port = MasterPort("Ruby mem master port")
- mem_master_port = MasterPort("Ruby mem master port")
- pio_slave_port = SlavePort("Ruby pio slave port")
- mem_slave_port = SlavePort("Ruby memory port")
+ slave = VectorSlavePort("CPU slave port")
+ master = VectorMasterPort("CPU master port")
+ pio_master_port = MasterPort("Ruby mem master port")
+ mem_master_port = MasterPort("Ruby mem master port")
+ pio_slave_port = SlavePort("Ruby pio slave port")
+ mem_slave_port = SlavePort("Ruby memory port")
- using_ruby_tester = Param.Bool(False, "")
- no_retry_on_stall = Param.Bool(False, "")
- ruby_system = Param.RubySystem(Parent.any, "")
- system = Param.System(Parent.any, "system object")
- support_data_reqs = Param.Bool(True, "data cache requests supported")
- support_inst_reqs = Param.Bool(True, "inst cache requests supported")
+ using_ruby_tester = Param.Bool(False, "")
+ no_retry_on_stall = Param.Bool(False, "")
+ ruby_system = Param.RubySystem(Parent.any, "")
+ system = Param.System(Parent.any, "system object")
+ support_data_reqs = Param.Bool(True, "data cache requests supported")
+ support_inst_reqs = Param.Bool(True, "inst cache requests supported")
+ is_cpu_sequencer = Param.Bool(True, "connected to a cpu")
class RubyPortProxy(RubyPort):
- type = 'RubyPortProxy'
- cxx_header = "mem/ruby/system/RubyPortProxy.hh"
+ type = 'RubyPortProxy'
+ cxx_header = "mem/ruby/system/RubyPortProxy.hh"
class RubySequencer(RubyPort):
- type = 'RubySequencer'
- cxx_class = 'Sequencer'
- cxx_header = "mem/ruby/system/Sequencer.hh"
+ type = 'RubySequencer'
+ cxx_class = 'Sequencer'
+ cxx_header = "mem/ruby/system/Sequencer.hh"
- icache = Param.RubyCache("")
- dcache = Param.RubyCache("")
- # Cache latencies currently assessed at the beginning of each access
- # NOTE: Setting these values to a value greater than one will result in
- # O3 CPU pipeline bubbles and negatively impact performance
- # TODO: Latencies should be migrated into each top-level cache controller
- icache_hit_latency = Param.Cycles(1, "Inst cache hit latency")
- dcache_hit_latency = Param.Cycles(1, "Data cache hit latency")
- max_outstanding_requests = Param.Int(16,
- "max requests (incl. prefetches) outstanding")
- deadlock_threshold = Param.Cycles(500000,
- "max outstanding cycles for a request before deadlock/livelock declared")
- using_network_tester = Param.Bool(False, "")
+ icache = Param.RubyCache("")
+ dcache = Param.RubyCache("")
+ # Cache latencies currently assessed at the beginning of each access
+ # NOTE: Setting these values to a value greater than one will result in
+ # O3 CPU pipeline bubbles and negatively impact performance
+ # TODO: Latencies should be migrated into each top-level cache controller
+ icache_hit_latency = Param.Cycles(1, "Inst cache hit latency")
+ dcache_hit_latency = Param.Cycles(1, "Data cache hit latency")
+ max_outstanding_requests = Param.Int(16,
+ "max requests (incl. prefetches) outstanding")
+ deadlock_threshold = Param.Cycles(500000,
+ "max outstanding cycles for a request before deadlock/livelock declared")
+ using_network_tester = Param.Bool(False, "")
+ # id used by protocols that support multiple sequencers per controller
+ # 99 is the dummy default value
+ coreid = Param.Int(99, "CorePair core id")
class DMASequencer(MemObject):
- type = 'DMASequencer'
- cxx_header = "mem/ruby/system/DMASequencer.hh"
+ type = 'DMASequencer'
+ cxx_header = "mem/ruby/system/DMASequencer.hh"
- version = Param.Int(0, "")
- slave = SlavePort("Device slave port")
- using_ruby_tester = Param.Bool(False, "")
- ruby_system = Param.RubySystem(Parent.any, "")
- system = Param.System(Parent.any, "system object")
+ version = Param.Int(0, "")
+ slave = SlavePort("Device slave port")
+ using_ruby_tester = Param.Bool(False, "")
+ ruby_system = Param.RubySystem(Parent.any, "")
+ system = Param.System(Parent.any, "system object")
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
new file mode 100644
index 000000000..ca91f2723
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+#endif // X86_ISA
+#include "mem/ruby/system/VIPERCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/VIPERCoalescer.hh"
+
+using namespace std;
+
+VIPERCoalescer *
+VIPERCoalescerParams::create()
+{
+ return new VIPERCoalescer(this);
+}
+
+VIPERCoalescer::VIPERCoalescer(const Params *p)
+ : GPUCoalescer(p)
+{
+ m_max_wb_per_cycle = p->max_wb_per_cycle;
+ m_max_inv_per_cycle = p->max_inv_per_cycle;
+ m_outstanding_inv = 0;
+ m_outstanding_wb = 0;
+}
+
+VIPERCoalescer::~VIPERCoalescer()
+{
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If the request can be coalesced, it is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued.
+// If this is the first request to a cacheline, the request is added to both
+// the newRequests queue and the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
+RequestStatus
+VIPERCoalescer::makeRequest(PacketPtr pkt)
+{
+ if (m_outstanding_wb | m_outstanding_inv) {
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks and %d Invalidatons\n",
+ m_outstanding_wb, m_outstanding_inv);
+ }
+ // Are we in the middle of a release?
+ if ((m_outstanding_wb) > 0) {
+ if (pkt->req->isKernel()) {
+ // Everything is fine:
+ // Barriers and Kernel End can coalesce.
+ // If it is a Kernel Begin, flush the cache.
+ if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) {
+ invL1();
+ }
+
+ if (pkt->req->isRelease()) {
+ insertKernel(pkt->req->contextId(), pkt);
+ }
+
+ return RequestStatus_Issued;
+ }
+// return RequestStatus_Aliased;
+ } else if (pkt->req->isKernel() && pkt->req->isRelease()) {
+ // Flush Dirty Data on Kernel End
+ // isKernel + isRelease
+ insertKernel(pkt->req->contextId(), pkt);
+ wbL1();
+ if (m_outstanding_wb == 0) {
+ for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+ newKernelEnds.push_back(it->first);
+ }
+ completeIssue();
+ }
+ return RequestStatus_Issued;
+ }
+ RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt);
+ if (requestStatus != RequestStatus_Issued) {
+ // Request not issued;
+ // enqueue a retry
+ DPRINTF(GPUCoalescer, "Request not issued by GPUCoalescer\n");
+ return requestStatus;
+ } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+ // Invalidate clean Data on Kernel Begin
+ // isKernel + isAcquire
+ invL1();
+ } else if (pkt->req->isAcquire() && pkt->req->isRelease()) {
+ // Deschedule the AtomicAcqRel and
+ // Flush and Invalidate the L1 cache
+ invwbL1();
+ if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
+ DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
+ deschedule(issueEvent);
+ }
+ } else if (pkt->req->isRelease()) {
+ // Deschedule the StoreRel and
+ // Flush the L1 cache
+ wbL1();
+ if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
+ DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
+ deschedule(issueEvent);
+ }
+ } else if (pkt->req->isAcquire()) {
+ // LoadAcq or AtomicAcq
+ // Invalidate the L1 cache
+ invL1();
+ }
+ // Request was successful
+ if (m_outstanding_wb == 0) {
+ if (!issueEvent.scheduled()) {
+ DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n");
+ schedule(issueEvent, curTick());
+ }
+ }
+ return RequestStatus_Issued;
+}
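
Per the comment above makeRequest, a request that cannot be coalesced comes back as RequestStatus_Aliased and must be reissued by the caller. A hedged, caller-side sketch of that retry contract follows; the names and the retry policy are hypothetical stand-ins, not taken from the patch:

    // Hypothetical caller-side handling of an Aliased status.
    #include <cassert>
    #include <queue>

    enum class Status { Issued, Aliased };

    // Stand-in for makeRequest(): a line with an uncoalescable pending
    // request reports Aliased and the packet must be sent again later.
    Status try_issue(bool line_busy)
    {
        return line_busy ? Status::Aliased : Status::Issued;
    }

    int main()
    {
        std::queue<int> retry_q;           // packets waiting to be reissued
        int pkt = 42;

        if (try_issue(/*line_busy=*/true) == Status::Aliased)
            retry_q.push(pkt);             // reissue later, e.g. on a retry

        assert(retry_q.size() == 1);
        return 0;
    }
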
+
+void
+VIPERCoalescer::wbCallback(Addr addr)
+{
+ m_outstanding_wb--;
+ // if the L1 flush is complete,
+ // attempt to schedule issueEvent
+ assert(((int) m_outstanding_wb) >= 0);
+ if (m_outstanding_wb == 0) {
+ for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+ newKernelEnds.push_back(it->first);
+ }
+ completeIssue();
+ }
+ trySendRetries();
+}
+
+void
+VIPERCoalescer::invCallback(Addr addr)
+{
+ m_outstanding_inv--;
+ // if the L1 flush is complete,
+ // attempt to schedule issueEvent.
+ // This probably won't happen, since
+ // we don't wait on cache invalidations
+ if (m_outstanding_wb == 0) {
+ for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+ newKernelEnds.push_back(it->first);
+ }
+ completeIssue();
+ }
+ trySendRetries();
+}
+
+/**
+ * Invalidate L1 cache (Acquire)
+ */
+void
+VIPERCoalescer::invL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ DPRINTF(GPUCoalescer,
+ "There are %d Invalidations outstanding before Cache Walk\n",
+ m_outstanding_inv);
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Evict Read-only data
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_inv++;
+ }
+ DPRINTF(GPUCoalescer,
+ "There are %d Invalidatons outstanding after Cache Walk\n",
+ m_outstanding_inv);
+}
+
+/**
+ * Writeback L1 cache (Release)
+ */
+void
+VIPERCoalescer::wbL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks outstanding before Cache Walk\n",
+ m_outstanding_wb);
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Write dirty data back
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_FLUSH, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_wb++;
+ }
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks outstanding after Cache Walk\n",
+ m_outstanding_wb);
+}
+
+/**
+ * Invalidate and Writeback L1 cache (Acquire&Release)
+ */
+void
+VIPERCoalescer::invwbL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Evict Read-only data
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_inv++;
+ }
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Write dirty data back
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_FLUSH, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_wb++;
+ }
+}
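
The release/acquire flushes above rely on simple outstanding-operation counters: each cache walk enqueues one request per block and bumps a counter, each callback decrements it, and issue resumes only when the counter drains to zero. A toy sketch of that bookkeeping with stand-in names (not the simulator's types):

    // Sketch of the writeback/invalidate drain-to-zero pattern.
    #include <cassert>

    struct FlushTracker {
        unsigned outstanding = 0;
        bool resumed = false;

        void start_walk(int num_blocks) { outstanding += num_blocks; }

        void callback()                      // one per completed flush
        {
            assert(outstanding > 0);
            if (--outstanding == 0)
                resumed = true;              // i.e., completeIssue()
        }
    };

    int main()
    {
        FlushTracker t;
        t.start_walk(3);                     // wbL1() over 3 cache blocks
        t.callback(); t.callback();
        assert(!t.resumed);                  // still draining
        t.callback();
        assert(t.resumed);                   // issue resumes at zero
        return 0;
    }
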
diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh
new file mode 100644
index 000000000..af6e44e7f
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.hh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+
+#include <iostream>
+
+#include "mem/protocol/PrefetchBit.hh"
+#include "mem/protocol/RubyAccessMode.hh"
+#include "mem/protocol/RubyRequestType.hh"
+#include "mem/ruby/common/Address.hh"
+#include "mem/ruby/common/Consumer.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
+#include "mem/ruby/system/RubyPort.hh"
+
+class DataBlock;
+class CacheMsg;
+class MachineID;
+class CacheMemory;
+
+class VIPERCoalescerParams;
+
+class VIPERCoalescer : public GPUCoalescer
+{
+ public:
+ typedef VIPERCoalescerParams Params;
+ VIPERCoalescer(const Params *);
+ ~VIPERCoalescer();
+ void wbCallback(Addr address);
+ void invCallback(Addr address);
+ RequestStatus makeRequest(PacketPtr pkt);
+ private:
+ void invL1();
+ void wbL1();
+ void invwbL1();
+ uint64_t m_outstanding_inv;
+ uint64_t m_outstanding_wb;
+ uint64_t m_max_inv_per_cycle;
+ uint64_t m_max_wb_per_cycle;
+};
+#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+
diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py
new file mode 100644
index 000000000..05c74386f
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Steve Reinhardt
+# Brad Beckmann
+
+from m5.params import *
+from m5.proxy import *
+from GPUCoalescer import *
+
+class VIPERCoalescer(RubyGPUCoalescer):
+ type = 'VIPERCoalescer'
+ cxx_class = 'VIPERCoalescer'
+ cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
+ max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
+ max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
+ assume_rfo = False
diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc
new file mode 100644
index 000000000..5baa4d9a5
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUPolicy.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Derek Hower
+ */
+
+#include "mem/ruby/system/WeightedLRUPolicy.hh"
+
+WeightedLRUPolicy::WeightedLRUPolicy(const Params* p)
+ : AbstractReplacementPolicy(p), m_cache(p->cache)
+{
+ m_last_occ_ptr = new int*[m_num_sets];
+ for (unsigned i = 0; i < m_num_sets; i++) {
+ m_last_occ_ptr[i] = new int[m_assoc];
+ for (unsigned j = 0; j < m_assoc; j++) {
+ m_last_occ_ptr[i][j] = 0;
+ }
+ }
+}
+
+WeightedLRUPolicy *
+WeightedLRUReplacementPolicyParams::create()
+{
+ return new WeightedLRUPolicy(this);
+}
+
+WeightedLRUPolicy::~WeightedLRUPolicy()
+{
+ if (m_last_occ_ptr != NULL) {
+ for (unsigned i = 0; i < m_num_sets; i++) {
+ if (m_last_occ_ptr[i] != NULL) {
+ delete[] m_last_occ_ptr[i];
+ }
+ }
+ delete[] m_last_occ_ptr;
+ }
+}
+
+void
+WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time)
+{
+ assert(index >= 0 && index < m_assoc);
+ assert(set >= 0 && set < m_num_sets);
+
+ m_last_ref_ptr[set][index] = time;
+}
+
+void
+WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy)
+{
+ assert(index >= 0 && index < m_assoc);
+ assert(set >= 0 && set < m_num_sets);
+
+ m_last_ref_ptr[set][index] = time;
+ m_last_occ_ptr[set][index] = occupancy;
+}
+
+int64_t
+WeightedLRUPolicy::getVictim(int64_t set) const
+{
+ Tick time, smallest_time;
+ int64_t smallest_index;
+
+ smallest_index = 0;
+ smallest_time = m_last_ref_ptr[set][0];
+ // The weight is the occupancy, not the reference time.
+ int smallest_weight = m_last_occ_ptr[set][0];
+
+ for (unsigned i = 1; i < m_assoc; i++) {
+
+ int weight = m_last_occ_ptr[set][i];
+ if (weight < smallest_weight) {
+ smallest_weight = weight;
+ smallest_index = i;
+ smallest_time = m_last_ref_ptr[set][i];
+ } else if (weight == smallest_weight) {
+ time = m_last_ref_ptr[set][i];
+ if (time < smallest_time) {
+ smallest_index = i;
+ smallest_time = time;
+ }
+ }
+ }
+ return smallest_index;
+}
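
A short walk-through of the victim selection above, with made-up occupancies and reference times: the way with the smallest occupancy wins, and the older reference time breaks ties.

    // Worked example of WeightedLRUPolicy::getVictim() on one set.
    #include <cassert>

    int main()
    {
        const int assoc = 4;
        int occ[assoc]   = {3, 1, 1, 2};       // m_last_occ_ptr[set][*]
        long tick[assoc] = {100, 40, 90, 10};  // m_last_ref_ptr[set][*]

        int victim = 0, best_occ = occ[0];
        long best_tick = tick[0];
        for (int i = 1; i < assoc; ++i) {
            if (occ[i] < best_occ ||
                (occ[i] == best_occ && tick[i] < best_tick)) {
                victim = i;
                best_occ = occ[i];
                best_tick = tick[i];
            }
        }
        // Ways 1 and 2 tie on occupancy (1); way 1 is older (40 < 90).
        assert(victim == 1);
        return 0;
    }
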
diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh
new file mode 100644
index 000000000..3150779b2
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUPolicy.hh
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
+#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
+
+#include "mem/ruby/structures/AbstractReplacementPolicy.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "params/WeightedLRUReplacementPolicy.hh"
+
+/* Occupancy-weighted LRU replacement policy: evict the least-occupied way,
+   with LRU breaking ties */
+
+class WeightedLRUPolicy : public AbstractReplacementPolicy
+{
+ public:
+ typedef WeightedLRUReplacementPolicyParams Params;
+ WeightedLRUPolicy(const Params* p);
+ ~WeightedLRUPolicy();
+
+ void touch(int64_t set, int64_t way, Tick time);
+ void touch(int64_t set, int64_t way, Tick time, int occupancy);
+ int64_t getVictim(int64_t set) const override;
+
+ bool useOccupancy() const { return true; }
+
+ CacheMemory * m_cache;
+ int **m_last_occ_ptr;
+};
+
+#endif // __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py
new file mode 100644
index 000000000..e7de33496
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Derek Hower
+#
+
+from m5.params import *
+from m5.proxy import *
+from MemObject import MemObject
+from ReplacementPolicy import ReplacementPolicy
+
+class WeightedLRUReplacementPolicy(ReplacementPolicy):
+ type = "WeightedLRUReplacementPolicy"
+ cxx_class = "WeightedLRUPolicy"
+ cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh"
+ cache = Param.RubyCache("")
diff --git a/src/mem/slicc/symbols/StateMachine.py b/src/mem/slicc/symbols/StateMachine.py
index a530307ee..fc3f32c3d 100644
--- a/src/mem/slicc/symbols/StateMachine.py
+++ b/src/mem/slicc/symbols/StateMachine.py
@@ -35,13 +35,17 @@ import re
python_class_map = {
"int": "Int",
+ "NodeID": "Int",
"uint32_t" : "UInt32",
"std::string": "String",
"bool": "Bool",
"CacheMemory": "RubyCache",
"WireBuffer": "RubyWireBuffer",
"Sequencer": "RubySequencer",
+ "GPUCoalescer" : "RubyGPUCoalescer",
+ "VIPERCoalescer" : "VIPERCoalescer",
"DirectoryMemory": "RubyDirectoryMemory",
+ "PerfectCacheMemory": "RubyPerfectCacheMemory",
"MemoryControl": "MemoryControl",
"MessageBuffer": "MessageBuffer",
"DMASequencer": "DMASequencer",
@@ -305,7 +309,7 @@ class $c_ident : public AbstractController
void collateStats();
void recordCacheTrace(int cntrl, CacheRecorder* tr);
- Sequencer* getSequencer() const;
+ Sequencer* getCPUSequencer() const;
int functionalWriteBuffers(PacketPtr&);
@@ -527,8 +531,14 @@ $c_ident::$c_ident(const Params *p)
else:
code('m_${{param.ident}} = p->${{param.ident}};')
- if re.compile("sequencer").search(param.ident):
- code('m_${{param.ident}}_ptr->setController(this);')
+ if re.compile("sequencer").search(param.ident) or \
+ param.type_ast.type.c_ident == "GPUCoalescer" or \
+ param.type_ast.type.c_ident == "VIPERCoalescer":
+ code('''
+if (m_${{param.ident}}_ptr != NULL) {
+ m_${{param.ident}}_ptr->setController(this);
+}
+''')
code('''
@@ -670,6 +680,28 @@ $c_ident::init()
assert(param.pointer)
seq_ident = "m_%s_ptr" % param.ident
+ if seq_ident != "NULL":
+ code('''
+Sequencer*
+$c_ident::getCPUSequencer() const
+{
+ if (NULL != $seq_ident && $seq_ident->isCPUSequencer()) {
+ return $seq_ident;
+ } else {
+ return NULL;
+ }
+}
+''')
+ else:
+ code('''
+
+Sequencer*
+$c_ident::getCPUSequencer() const
+{
+ return NULL;
+}
+''')
+
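
To make the emitted accessor concrete: for a hypothetical controller class TCP_Controller whose sequencer-like parameter is named 'coalescer', the template above generates code with the shape mocked up below. The stand-in classes, the controller name, and the assumption that GPU configurations leave is_cpu_sequencer false for coalescers are all illustrative and not taken from this hunk:

    // Self-contained mock of the generated getCPUSequencer() shape.
    #include <cassert>
    #include <cstddef>

    struct Sequencer { virtual bool isCPUSequencer() const { return true; } };

    struct GPUCoalescer : Sequencer {
        // Assumption: GPU configs set is_cpu_sequencer to False for coalescers.
        bool isCPUSequencer() const override { return false; }
    };

    struct TCP_Controller {                  // hypothetical controller name
        Sequencer *m_coalescer_ptr = nullptr;

        Sequencer *getCPUSequencer() const
        {
            // Same structure as the code the template emits: hand back the
            // port only when it is configured as a CPU sequencer.
            if (NULL != m_coalescer_ptr && m_coalescer_ptr->isCPUSequencer())
                return m_coalescer_ptr;
            else
                return NULL;
        }
    };

    int main()
    {
        GPUCoalescer coal;
        TCP_Controller tcp;
        tcp.m_coalescer_ptr = &coal;
        assert(tcp.getCPUSequencer() == NULL);  // coalescer is not a CPU seq
        return 0;
    }
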
code('''
void
@@ -796,12 +828,6 @@ $c_ident::getMemoryQueue() const
return $memq_ident;
}
-Sequencer*
-$c_ident::getSequencer() const
-{
- return $seq_ident;
-}
-
void
$c_ident::print(ostream& out) const
{