summaryrefslogtreecommitdiff
path: root/src/mem/ruby
diff options
context:
space:
mode:
authorTony Gutierrez <anthony.gutierrez@amd.com>2016-01-19 14:28:22 -0500
committerTony Gutierrez <anthony.gutierrez@amd.com>2016-01-19 14:28:22 -0500
commit1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree867510a147cd095f19499d26b7c02d27de4cae9d /src/mem/ruby
parent28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
downloadgem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/mem/ruby')
-rw-r--r--src/mem/ruby/SConscript15
-rw-r--r--src/mem/ruby/profiler/Profiler.cc4
-rw-r--r--src/mem/ruby/slicc_interface/AbstractCacheEntry.hh6
-rw-r--r--src/mem/ruby/slicc_interface/AbstractController.cc6
-rw-r--r--src/mem/ruby/slicc_interface/AbstractController.hh3
-rw-r--r--src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh29
-rw-r--r--src/mem/ruby/structures/CacheMemory.cc50
-rw-r--r--src/mem/ruby/structures/CacheMemory.hh5
-rw-r--r--src/mem/ruby/structures/RubyCache.py1
-rw-r--r--src/mem/ruby/system/GPUCoalescer.cc1397
-rw-r--r--src/mem/ruby/system/GPUCoalescer.hh368
-rw-r--r--src/mem/ruby/system/GPUCoalescer.py48
-rw-r--r--src/mem/ruby/system/RubyPort.cc3
-rw-r--r--src/mem/ruby/system/RubyPort.hh4
-rw-r--r--src/mem/ruby/system/RubySystem.cc2
-rw-r--r--src/mem/ruby/system/SConscript10
-rw-r--r--src/mem/ruby/system/Sequencer.cc5
-rw-r--r--src/mem/ruby/system/Sequencer.hh3
-rw-r--r--src/mem/ruby/system/Sequencer.py86
-rw-r--r--src/mem/ruby/system/VIPERCoalescer.cc287
-rw-r--r--src/mem/ruby/system/VIPERCoalescer.hh75
-rw-r--r--src/mem/ruby/system/VIPERCoalescer.py45
-rw-r--r--src/mem/ruby/system/WeightedLRUPolicy.cc113
-rw-r--r--src/mem/ruby/system/WeightedLRUPolicy.hh62
-rw-r--r--src/mem/ruby/system/WeightedLRUReplacementPolicy.py45
25 files changed, 2610 insertions, 62 deletions
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript
index 16e932432..82a16c9b0 100644
--- a/src/mem/ruby/SConscript
+++ b/src/mem/ruby/SConscript
@@ -124,13 +124,20 @@ MakeInclude('common/Set.hh')
MakeInclude('common/WriteMask.hh')
MakeInclude('filters/AbstractBloomFilter.hh')
MakeInclude('network/MessageBuffer.hh')
-MakeInclude('structures/Prefetcher.hh')
MakeInclude('structures/CacheMemory.hh')
-MakeInclude('system/DMASequencer.hh')
MakeInclude('structures/DirectoryMemory.hh')
-MakeInclude('structures/WireBuffer.hh')
MakeInclude('structures/PerfectCacheMemory.hh')
MakeInclude('structures/PersistentTable.hh')
-MakeInclude('system/Sequencer.hh')
+MakeInclude('structures/Prefetcher.hh')
MakeInclude('structures/TBETable.hh')
MakeInclude('structures/TimerTable.hh')
+MakeInclude('structures/WireBuffer.hh')
+MakeInclude('system/DMASequencer.hh')
+MakeInclude('system/Sequencer.hh')
+
+# External types : Group "mem/protocol" : include "header.hh" to the bottom
+# of this MakeIncludes if it is referenced as
+# <# include "mem/protocol/header.hh"> in any file
+# generated_dir = Dir('../protocol')
+MakeInclude('system/GPUCoalescer.hh')
+MakeInclude('system/VIPERCoalescer.hh')
diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc
index b3b37e5a6..7d3f20982 100644
--- a/src/mem/ruby/profiler/Profiler.cc
+++ b/src/mem/ruby/profiler/Profiler.cc
@@ -269,7 +269,7 @@ Profiler::collateStats()
it != m_ruby_system->m_abstract_controls[i].end(); ++it) {
AbstractController *ctr = (*it).second;
- Sequencer *seq = ctr->getSequencer();
+ Sequencer *seq = ctr->getCPUSequencer();
if (seq != NULL) {
m_outstandReqHist.add(seq->getOutstandReqHist());
}
@@ -282,7 +282,7 @@ Profiler::collateStats()
it != m_ruby_system->m_abstract_controls[i].end(); ++it) {
AbstractController *ctr = (*it).second;
- Sequencer *seq = ctr->getSequencer();
+ Sequencer *seq = ctr->getCPUSequencer();
if (seq != NULL) {
// add all the latencies
m_latencyHist.add(seq->getLatencyHist());
diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
index 926556781..cbd068c04 100644
--- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
+++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh
@@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry
virtual DataBlock& getDataBlk()
{ panic("getDataBlk() not implemented!"); }
+ int validBlocks;
+ virtual int& getNumValidBlocks()
+ {
+ return validBlocks;
+ }
+
// Functions for locking and unlocking the cache entry. These are required
// for supporting atomic memory accesses.
void setLocked(int context);
diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc
index 93fe50c88..458fde5bc 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.cc
+++ b/src/mem/ruby/slicc_interface/AbstractController.cc
@@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr)
}
}
+bool
+AbstractController::isBlocked(Addr addr)
+{
+ return (m_block_map.count(addr) > 0);
+}
+
BaseMasterPort &
AbstractController::getMasterPort(const std::string &if_name,
PortID idx)
diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh
index 383507eed..4488ee3f4 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer
// return instance name
void blockOnQueue(Addr, MessageBuffer*);
void unblock(Addr);
+ bool isBlocked(Addr);
virtual MessageBuffer* getMandatoryQueue() const = 0;
virtual MessageBuffer* getMemoryQueue() const = 0;
@@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer
virtual void regStats();
virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0;
- virtual Sequencer* getSequencer() const = 0;
+ virtual Sequencer* getCPUSequencer() const = 0;
//! These functions are used by ruby system to read/write the data blocks
//! that exist with in the controller.
diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
index 46071335e..cdedc2e14 100644
--- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
+++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh
@@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr)
return DirectoryMemory::mapAddressToDirectoryVersion(addr);
}
+inline NodeID
+map_Address_to_TCCdirNode(Addr addr)
+{
+ return DirectoryMemory::mapAddressToDirectoryVersion(addr);
+}
+
// used to determine the home directory
// returns a value between 0 and total_directories_within_the_system
inline MachineID
@@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr)
return mach;
}
+inline MachineID
+map_Address_to_RegionDir(Addr addr)
+{
+ MachineID mach = {MachineType_RegionDir,
+ map_Address_to_DirectoryNode(addr)};
+ return mach;
+}
+
+inline MachineID
+map_Address_to_TCCdir(Addr addr)
+{
+ MachineID mach =
+ {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)};
+ return mach;
+}
+
inline NetDest
broadcast(MachineType type)
{
@@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id)
return mach;
}
+inline MachineID
+MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node)
+{
+ MachineID mach = {type, node};
+ return mach;
+}
+
#endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index a8a3ba949..45fb85d05 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -35,6 +35,7 @@
#include "mem/protocol/AccessPermission.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
+#include "mem/ruby/system/WeightedLRUPolicy.hh"
using namespace std;
@@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p)
m_start_index_bit = p->start_index_bit;
m_is_instruction_only_cache = p->is_icache;
m_resource_stalls = p->resourceStalls;
+ m_block_size = p->block_size; // may be 0 at this point. Updated in init()
}
void
CacheMemory::init()
{
- m_cache_num_sets = (m_cache_size / m_cache_assoc) /
- RubySystem::getBlockSizeBytes();
+ if (m_block_size == 0) {
+ m_block_size = RubySystem::getBlockSizeBytes();
+ }
+ m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size;
assert(m_cache_num_sets > 1);
m_cache_num_set_bits = floorLog2(m_cache_num_sets);
assert(m_cache_num_set_bits > 0);
- m_cache.resize(m_cache_num_sets);
- for (int i = 0; i < m_cache_num_sets; i++) {
- m_cache[i].resize(m_cache_assoc);
- for (int j = 0; j < m_cache_assoc; j++) {
- m_cache[i][j] = NULL;
- }
- }
+ m_cache.resize(m_cache_num_sets,
+ std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr));
}
CacheMemory::~CacheMemory()
{
- if (m_replacementPolicy_ptr != NULL)
+ if (m_replacementPolicy_ptr)
delete m_replacementPolicy_ptr;
for (int i = 0; i < m_cache_num_sets; i++) {
for (int j = 0; j < m_cache_assoc; j++) {
@@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e)
}
void
+CacheMemory::setMRU(Addr address, int occupancy)
+{
+ int64_t cacheSet = addressToCacheSet(address);
+ int loc = findTagInSet(cacheSet, address);
+
+ if(loc != -1) {
+ if (m_replacementPolicy_ptr->useOccupancy()) {
+ (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))->
+ touch(cacheSet, loc, curTick(), occupancy);
+ } else {
+ m_replacementPolicy_ptr->
+ touch(cacheSet, loc, curTick());
+ }
+ }
+}
+
+int
+CacheMemory::getReplacementWeight(int64_t set, int64_t loc)
+{
+ assert(set < m_cache_num_sets);
+ assert(loc < m_cache_assoc);
+ int ret = 0;
+ if(m_cache[set][loc] != NULL) {
+ ret = m_cache[set][loc]->getNumValidBlocks();
+ assert(ret >= 0);
+ }
+
+ return ret;
+}
+
+void
CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const
{
uint64_t warmedUpBlocks = 0;
diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh
index 72805b32b..5b30505d3 100644
--- a/src/mem/ruby/structures/CacheMemory.hh
+++ b/src/mem/ruby/structures/CacheMemory.hh
@@ -106,7 +106,8 @@ class CacheMemory : public SimObject
// Set this address to most recently used
void setMRU(Addr address);
- // Set this entry to most recently used
+ void setMRU(Addr addr, int occupancy);
+ int getReplacementWeight(int64_t set, int64_t loc);
void setMRU(const AbstractCacheEntry *e);
// Functions for locking and unlocking cache lines corresponding to the
@@ -146,6 +147,7 @@ class CacheMemory : public SimObject
Stats::Scalar numDataArrayStalls;
int getCacheSize() const { return m_cache_size; }
+ int getCacheAssoc() const { return m_cache_assoc; }
int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; }
Addr getAddressAtIdx(int idx) const;
@@ -182,6 +184,7 @@ class CacheMemory : public SimObject
int m_cache_assoc;
int m_start_index_bit;
bool m_resource_stalls;
+ int m_block_size;
};
std::ostream& operator<<(std::ostream& out, const CacheMemory& obj);
diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py
index 4eb87ac74..9fc4726b0 100644
--- a/src/mem/ruby/structures/RubyCache.py
+++ b/src/mem/ruby/structures/RubyCache.py
@@ -42,6 +42,7 @@ class RubyCache(SimObject):
"")
start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line");
is_icache = Param.Bool(False, "is instruction only cache");
+ block_size = Param.MemorySize("0B", "block size in bytes. 0 means default RubyBlockSize")
dataArrayBanks = Param.Int(1, "Number of banks for the data array")
tagArrayBanks = Param.Int(1, "Number of banks for the tag array")
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
new file mode 100644
index 000000000..db279bd3a
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/GPUCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "debug/ProtocolTrace.hh"
+#include "debug/RubyPort.hh"
+#include "debug/RubyStats.hh"
+#include "gpu-compute/shader.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/DataBlock.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/RubyGPUCoalescer.hh"
+
+using namespace std;
+
+GPUCoalescer *
+RubyGPUCoalescerParams::create()
+{
+ return new GPUCoalescer(this);
+}
+
+HSAScope
+reqScopeToHSAScope(Request* req)
+{
+ HSAScope accessScope = HSAScope_UNSPECIFIED;
+ if (req->isScoped()) {
+ if (req->isWavefrontScope()) {
+ accessScope = HSAScope_WAVEFRONT;
+ } else if (req->isWorkgroupScope()) {
+ accessScope = HSAScope_WORKGROUP;
+ } else if (req->isDeviceScope()) {
+ accessScope = HSAScope_DEVICE;
+ } else if (req->isSystemScope()) {
+ accessScope = HSAScope_SYSTEM;
+ } else {
+ fatal("Bad scope type");
+ }
+ }
+ return accessScope;
+}
+
+HSASegment
+reqSegmentToHSASegment(Request* req)
+{
+ HSASegment accessSegment = HSASegment_GLOBAL;
+
+ if (req->isGlobalSegment()) {
+ accessSegment = HSASegment_GLOBAL;
+ } else if (req->isGroupSegment()) {
+ accessSegment = HSASegment_GROUP;
+ } else if (req->isPrivateSegment()) {
+ accessSegment = HSASegment_PRIVATE;
+ } else if (req->isKernargSegment()) {
+ accessSegment = HSASegment_KERNARG;
+ } else if (req->isReadonlySegment()) {
+ accessSegment = HSASegment_READONLY;
+ } else if (req->isSpillSegment()) {
+ accessSegment = HSASegment_SPILL;
+ } else if (req->isArgSegment()) {
+ accessSegment = HSASegment_ARG;
+ } else {
+ fatal("Bad segment type");
+ }
+
+ return accessSegment;
+}
+
+GPUCoalescer::GPUCoalescer(const Params *p)
+ : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
+{
+ m_store_waiting_on_load_cycles = 0;
+ m_store_waiting_on_store_cycles = 0;
+ m_load_waiting_on_store_cycles = 0;
+ m_load_waiting_on_load_cycles = 0;
+
+ m_outstanding_count = 0;
+
+ m_max_outstanding_requests = 0;
+ m_deadlock_threshold = 0;
+ m_instCache_ptr = nullptr;
+ m_dataCache_ptr = nullptr;
+
+ m_instCache_ptr = p->icache;
+ m_dataCache_ptr = p->dcache;
+ m_max_outstanding_requests = p->max_outstanding_requests;
+ m_deadlock_threshold = p->deadlock_threshold;
+
+ assert(m_max_outstanding_requests > 0);
+ assert(m_deadlock_threshold > 0);
+ assert(m_instCache_ptr);
+ assert(m_dataCache_ptr);
+
+ m_data_cache_hit_latency = p->dcache_hit_latency;
+
+ m_usingNetworkTester = p->using_network_tester;
+ assumingRfOCoherence = p->assume_rfo;
+}
+
+GPUCoalescer::~GPUCoalescer()
+{
+}
+
+void
+GPUCoalescer::wakeup()
+{
+ // Check for deadlock of any of the requests
+ Cycles current_time = curCycle();
+
+ // Check across all outstanding requests
+ int total_outstanding = 0;
+
+ RequestTable::iterator read = m_readRequestTable.begin();
+ RequestTable::iterator read_end = m_readRequestTable.end();
+ for (; read != read_end; ++read) {
+ GPUCoalescerRequest* request = read->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_readRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_readRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time)*clockPeriod());
+ }
+
+ RequestTable::iterator write = m_writeRequestTable.begin();
+ RequestTable::iterator write_end = m_writeRequestTable.end();
+ for (; write != write_end; ++write) {
+ GPUCoalescerRequest* request = write->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_writeRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time) * clockPeriod());
+ }
+
+ total_outstanding += m_writeRequestTable.size();
+ total_outstanding += m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ if (m_outstanding_count > 0) {
+ // If there are still outstanding requests, keep checking
+ schedule(deadlockCheckEvent,
+ m_deadlock_threshold * clockPeriod() +
+ curTick());
+ }
+}
+
+void
+GPUCoalescer::resetStats()
+{
+ m_latencyHist.reset();
+ m_missLatencyHist.reset();
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_typeLatencyHist[i]->reset();
+ m_missTypeLatencyHist[i]->reset();
+ for (int j = 0; j < MachineType_NUM; j++) {
+ m_missTypeMachLatencyHist[i][j]->reset();
+ }
+ }
+
+ for (int i = 0; i < MachineType_NUM; i++) {
+ m_missMachLatencyHist[i]->reset();
+
+ m_IssueToInitialDelayHist[i]->reset();
+ m_InitialToForwardDelayHist[i]->reset();
+ m_ForwardToFirstResponseDelayHist[i]->reset();
+ m_FirstResponseToCompletionDelayHist[i]->reset();
+ }
+}
+
+void
+GPUCoalescer::printProgress(ostream& out) const
+{
+}
+
+RequestStatus
+GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
+{
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
+ return RequestStatus_BufferFull;
+ }
+
+ if(m_controller->isBlocked(line_addr) &&
+ request_type != RubyRequestType_Locked_RMW_Write) {
+ return RequestStatus_Aliased;
+ }
+
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ // Check if there is any outstanding read request for the same
+ // cache line.
+ if (m_readRequestTable.count(line_addr) > 0) {
+ m_store_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ // There is an outstanding write request for the cache line
+ m_store_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+ } else {
+ // Check if there is any outstanding write request for the same
+ // cache line.
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ m_load_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_readRequestTable.count(line_addr) > 0) {
+ // There is an outstanding read request for the cache line
+ m_load_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+ }
+
+ return RequestStatus_Ready;
+
+}
+
+
+
+// sets the kernelEndList
+void
+GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
+{
+ // Don't know if this will happen or is possible
+ // but I just want to be careful and not have it become
+ // simulator hang in the future
+ DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
+ assert(kernelEndList.count(wavefront_id) == 0);
+
+ kernelEndList[wavefront_id] = pkt;
+ DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
+ kernelEndList.size());
+}
+
+
+// Insert the request on the correct request table. Return true if
+// the entry was already present.
+bool
+GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
+{
+ assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
+ pkt->req->isLockedRMW() ||
+ !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
+
+ int total_outstanding M5_VAR_USED =
+ m_writeRequestTable.size() + m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ // See if we should schedule a deadlock check
+ if (deadlockCheckEvent.scheduled() == false) {
+ schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
+ }
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ pair<RequestTable::iterator, bool> r =
+ m_writeRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting write request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ } else {
+ pair<RequestTable::iterator, bool> r =
+ m_readRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting read request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ }
+
+ m_outstandReqHist.sample(m_outstanding_count);
+
+ total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
+ assert(m_outstanding_count == total_outstanding);
+
+ return false;
+}
+
+void
+GPUCoalescer::markRemoved()
+{
+ m_outstanding_count--;
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+}
+
+void
+GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
+{
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+
+ Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
+ if ((srequest->m_type == RubyRequestType_ST) ||
+ (srequest->m_type == RubyRequestType_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_RMW_Write) ||
+ (srequest->m_type == RubyRequestType_Load_Linked) ||
+ (srequest->m_type == RubyRequestType_Store_Conditional) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
+ m_writeRequestTable.erase(line_addr);
+ } else {
+ m_readRequestTable.erase(line_addr);
+ }
+
+ markRemoved();
+}
+
+bool
+GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
+{
+ //
+ // The success flag indicates whether the LLSC operation was successful.
+ // LL ops will always succeed, but SC may fail if the cache line is no
+ // longer locked.
+ //
+ bool success = true;
+ if (request->m_type == RubyRequestType_Store_Conditional) {
+ if (!m_dataCache_ptr->isLocked(address, m_version)) {
+ //
+ // For failed SC requests, indicate the failure to the cpu by
+ // setting the extra data to zero.
+ //
+ request->pkt->req->setExtraData(0);
+ success = false;
+ } else {
+ //
+ // For successful SC requests, indicate the success to the cpu by
+ // setting the extra data to one.
+ //
+ request->pkt->req->setExtraData(1);
+ }
+ //
+ // Independent of success, all SC operations must clear the lock
+ //
+ m_dataCache_ptr->clearLocked(address);
+ } else if (request->m_type == RubyRequestType_Load_Linked) {
+ //
+ // Note: To fully follow Alpha LLSC semantics, should the LL clear any
+ // previously locked cache lines?
+ //
+ m_dataCache_ptr->setLocked(address, m_version);
+ } else if ((m_dataCache_ptr->isTagPresent(address)) &&
+ (m_dataCache_ptr->isLocked(address, m_version))) {
+ //
+ // Normal writes should clear the locked address
+ //
+ m_dataCache_ptr->clearLocked(address);
+ }
+ return success;
+}
+
+void
+GPUCoalescer::writeCallback(Addr address, DataBlock& data)
+{
+ writeCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data)
+{
+ writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime)
+{
+ writeCallback(address, mach, data,
+ initialRequestTime, forwardRequestTime, firstResponseTime,
+ false);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+
+ DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
+ assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+ RequestTable::iterator i = m_writeRequestTable.find(address);
+ assert(i != m_writeRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_writeRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_ST) ||
+ (request->m_type == RubyRequestType_ATOMIC) ||
+ (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request->m_type == RubyRequestType_RMW_Read) ||
+ (request->m_type == RubyRequestType_RMW_Write) ||
+ (request->m_type == RubyRequestType_Load_Linked) ||
+ (request->m_type == RubyRequestType_Store_Conditional) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Write) ||
+ (request->m_type == RubyRequestType_FLUSH));
+
+
+ //
+ // For Alpha, properly handle LL, SC, and write requests with respect to
+ // locked cache blocks.
+ //
+ // Not valid for Network_test protocl
+ //
+ bool success = true;
+ if(!m_usingNetworkTester)
+ success = handleLlsc(address, request);
+
+ if (request->m_type == RubyRequestType_Locked_RMW_Read) {
+ m_controller->blockOnQueue(address, m_mandatory_q_ptr);
+ } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
+ m_controller->unblock(address);
+ }
+
+ hitCallback(request, mach, data, success,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
+void
+GPUCoalescer::readCallback(Addr address, DataBlock& data)
+{
+ readCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data)
+{
+ readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime)
+{
+
+ readCallback(address, mach, data,
+ initialRequestTime, forwardRequestTime, firstResponseTime,
+ false);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+ assert(m_readRequestTable.count(makeLineAddress(address)));
+
+ DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
+ RequestTable::iterator i = m_readRequestTable.find(address);
+ assert(i != m_readRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_readRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_LD) ||
+ (request->m_type == RubyRequestType_IFETCH));
+
+ hitCallback(request, mach, data, true,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
+void
+GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
+ MachineType mach,
+ DataBlock& data,
+ bool success,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ PacketPtr pkt = srequest->pkt;
+ Addr request_address = pkt->getAddr();
+ Addr request_line_address = makeLineAddress(request_address);
+
+ RubyRequestType type = srequest->m_type;
+
+ // Set this cache entry to the most recently used
+ if (type == RubyRequestType_IFETCH) {
+ if (m_instCache_ptr->isTagPresent(request_line_address))
+ m_instCache_ptr->setMRU(request_line_address);
+ } else {
+ if (m_dataCache_ptr->isTagPresent(request_line_address))
+ m_dataCache_ptr->setMRU(request_line_address);
+ }
+
+ recordMissLatency(srequest, mach,
+ initialRequestTime,
+ forwardRequestTime,
+ firstResponseTime,
+ success, isRegion);
+ // update the data
+ //
+ // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER
+ int len = reqCoalescer[request_line_address].size();
+ std::vector<PacketPtr> mylist;
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+ assert(type ==
+ reqCoalescer[request_line_address][i].second[PrimaryType]);
+ request_address = pkt->getAddr();
+ request_line_address = makeLineAddress(pkt->getAddr());
+ if (pkt->getPtr<uint8_t>()) {
+ if ((type == RubyRequestType_LD) ||
+ (type == RubyRequestType_ATOMIC) ||
+ (type == RubyRequestType_ATOMIC_RETURN) ||
+ (type == RubyRequestType_IFETCH) ||
+ (type == RubyRequestType_RMW_Read) ||
+ (type == RubyRequestType_Locked_RMW_Read) ||
+ (type == RubyRequestType_Load_Linked)) {
+ memcpy(pkt->getPtr<uint8_t>(),
+ data.getData(getOffset(request_address),
+ pkt->getSize()),
+ pkt->getSize());
+ } else {
+ data.setData(pkt->getPtr<uint8_t>(),
+ getOffset(request_address), pkt->getSize());
+ }
+ } else {
+ DPRINTF(MemoryAccess,
+ "WARNING. Data not transfered from Ruby to M5 for type " \
+ "%s\n",
+ RubyRequestType_to_string(type));
+ }
+
+ // If using the RubyTester, update the RubyTester sender state's
+ // subBlock with the recieved data. The tester will later access
+ // this state.
+ // Note: RubyPort will access it's sender state before the
+ // RubyTester.
+ if (m_usingRubyTester) {
+ RubyPort::SenderState *requestSenderState =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+ RubyTester::SenderState* testerSenderState =
+ safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+ testerSenderState->subBlock.mergeFrom(data);
+ }
+
+ mylist.push_back(pkt);
+ }
+ delete srequest;
+ reqCoalescer.erase(request_line_address);
+ assert(!reqCoalescer.count(request_line_address));
+
+
+
+ completeHitCallback(mylist, len);
+}
+
+bool
+GPUCoalescer::empty() const
+{
+ return m_writeRequestTable.empty() && m_readRequestTable.empty();
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If request can be coalesced, this request is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued;
+// If this is the first request to a cacheline, request is added to both
+// newRequests queue and to the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
+RequestStatus
+GPUCoalescer::makeRequest(PacketPtr pkt)
+{
+ // Check for GPU Barrier Kernel End or Kernel Begin
+ // Leave these to be handled by the child class
+ // Kernel End/Barrier = isFlush + isRelease
+ // Kernel Begin = isFlush + isAcquire
+ if (pkt->req->isKernel()) {
+ if (pkt->req->isAcquire()){
+ // This is a Kernel Begin leave handling to
+ // virtual xCoalescer::makeRequest
+ return RequestStatus_Issued;
+ }else if(pkt->req->isRelease()) {
+ // This is a Kernel End leave handling to
+ // virtual xCoalescer::makeRequest
+ // If we are here then we didn't call
+ // a virtual version of this function
+ // so we will also schedule the callback
+ int wf_id = 0;
+ if (pkt->req->hasContextId()) {
+ wf_id = pkt->req->contextId();
+ }
+ insertKernel(wf_id, pkt);
+ newKernelEnds.push_back(wf_id);
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
+ }
+ return RequestStatus_Issued;
+ }
+ }
+
+ // If number of outstanding requests greater than the max allowed,
+ // return RequestStatus_BufferFull. This logic can be extended to
+ // support proper backpressure.
+ if (m_outstanding_count >= m_max_outstanding_requests) {
+ return RequestStatus_BufferFull;
+ }
+
+ RubyRequestType primary_type = RubyRequestType_NULL;
+ RubyRequestType secondary_type = RubyRequestType_NULL;
+
+ if (pkt->isLLSC()) {
+ //
+ // Alpha LL/SC instructions need to be handled carefully by the cache
+ // coherence protocol to ensure they follow the proper semantics. In
+ // particular, by identifying the operations as atomic, the protocol
+ // should understand that migratory sharing optimizations should not
+ // be performed (i.e. a load between the LL and SC should not steal
+ // away exclusive permission).
+ //
+ if (pkt->isWrite()) {
+ primary_type = RubyRequestType_Store_Conditional;
+ } else {
+ assert(pkt->isRead());
+ primary_type = RubyRequestType_Load_Linked;
+ }
+ secondary_type = RubyRequestType_ATOMIC;
+ } else if (pkt->req->isLockedRMW()) {
+ //
+ // x86 locked instructions are translated to store cache coherence
+ // requests because these requests should always be treated as read
+ // exclusive operations and should leverage any migratory sharing
+ // optimization built into the protocol.
+ //
+ if (pkt->isWrite()) {
+ primary_type = RubyRequestType_Locked_RMW_Write;
+ } else {
+ assert(pkt->isRead());
+ primary_type = RubyRequestType_Locked_RMW_Read;
+ }
+ secondary_type = RubyRequestType_ST;
+ } else if (pkt->isAtomicOp()) {
+ //
+ // GPU Atomic Operation
+ //
+ primary_type = RubyRequestType_ATOMIC;
+ secondary_type = RubyRequestType_ATOMIC;
+ } else {
+ if (pkt->isRead()) {
+ if (pkt->req->isInstFetch()) {
+ primary_type = secondary_type = RubyRequestType_IFETCH;
+ } else {
+#if THE_ISA == X86_ISA
+ uint32_t flags = pkt->req->getFlags();
+ bool storeCheck = flags &
+ (TheISA::StoreCheck << TheISA::FlagShift);
+#else
+ bool storeCheck = false;
+#endif // X86_ISA
+ if (storeCheck) {
+ primary_type = RubyRequestType_RMW_Read;
+ secondary_type = RubyRequestType_ST;
+ } else {
+ primary_type = secondary_type = RubyRequestType_LD;
+ }
+ }
+ } else if (pkt->isWrite()) {
+ //
+ // Note: M5 packets do not differentiate ST from RMW_Write
+ //
+ primary_type = secondary_type = RubyRequestType_ST;
+ } else if (pkt->isFlush()) {
+ primary_type = secondary_type = RubyRequestType_FLUSH;
+ } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
+ if (assumingRfOCoherence) {
+ // If we reached here, this request must be a memFence
+ // and the protocol implements RfO, the coalescer can
+ // assume sequentially consistency and schedule the callback
+ // immediately.
+ // Currently the code implements fence callbacks
+ // by reusing the mechanism for kernel completions.
+ // This should be fixed.
+ int wf_id = 0;
+ if (pkt->req->hasContextId()) {
+ wf_id = pkt->req->contextId();
+ }
+ insertKernel(wf_id, pkt);
+ newKernelEnds.push_back(wf_id);
+ if (!issueEvent.scheduled()) {
+ schedule(issueEvent, curTick());
+ }
+ return RequestStatus_Issued;
+ } else {
+ // If not RfO, return issued here and let the child coalescer
+ // take care of it.
+ return RequestStatus_Issued;
+ }
+ } else {
+ panic("Unsupported ruby packet type\n");
+ }
+ }
+
+ // Check if there is any pending request to this cache line from
+ // previous cycles.
+ // If there is a pending request, return aliased. Since coalescing
+ // across time is not permitted, aliased requests are not coalesced.
+ // If a request for this address has already been issued, we must block
+ RequestStatus status = getRequestStatus(pkt, primary_type);
+ if (status != RequestStatus_Ready)
+ return status;
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ // Check if this request can be coalesced with previous
+ // requests from this cycle.
+ if (!reqCoalescer.count(line_addr)) {
+ // This is the first access to this cache line.
+ // A new request to the memory subsystem has to be
+ // made in the next cycle for this cache line, so
+ // add this line addr to the "newRequests" queue
+ newRequests.push_back(line_addr);
+
+ // There was a request to this cache line in this cycle,
+ // let us see if we can coalesce this request with the previous
+ // requests from this cycle
+ } else if (primary_type !=
+ reqCoalescer[line_addr][0].second[PrimaryType]) {
+ // can't coalesce loads, stores and atomics!
+ return RequestStatus_Aliased;
+ } else if (pkt->req->isLockedRMW() ||
+ reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
+ // can't coalesce locked accesses, but can coalesce atomics!
+ return RequestStatus_Aliased;
+ } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
+ pkt->req->contextId() !=
+ reqCoalescer[line_addr][0].first->req->contextId()) {
+ // can't coalesce releases from different wavefronts
+ return RequestStatus_Aliased;
+ }
+
+ // in addition to the packet, we need to save both request types
+ reqCoalescer[line_addr].push_back(
+ RequestDesc(pkt, std::vector<RubyRequestType>()) );
+ reqCoalescer[line_addr].back().second.push_back(primary_type);
+ reqCoalescer[line_addr].back().second.push_back(secondary_type);
+ if (!issueEvent.scheduled())
+ schedule(issueEvent, curTick());
+ // TODO: issue hardware prefetches here
+ return RequestStatus_Issued;
+}
+
+void
+GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
+{
+
+ int proc_id = -1;
+ if (pkt != NULL && pkt->req->hasContextId()) {
+ proc_id = pkt->req->contextId();
+ }
+
+ // If valid, copy the pc to the ruby request
+ Addr pc = 0;
+ if (pkt->req->hasPC()) {
+ pc = pkt->req->getPC();
+ }
+
+ // At the moment setting scopes only counts
+ // for GPU spill space accesses
+ // which is pkt->req->isStack()
+ // this scope is REPLACE since it
+ // does not need to be flushed at the end
+ // of a kernel Private and local may need
+ // to be visible at the end of the kernel
+ HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
+ HSAScope accessScope = reqScopeToHSAScope(pkt->req);
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ // Creating WriteMask that records written bytes
+ // and atomic operations. This enables partial writes
+ // and partial reads of those writes
+ DataBlock dataBlock;
+ dataBlock.clear();
+ uint32_t blockSize = RubySystem::getBlockSizeBytes();
+ std::vector<bool> accessMask(blockSize,false);
+ std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
+ uint32_t tableSize = reqCoalescer[line_addr].size();
+ for (int i = 0; i < tableSize; i++) {
+ PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
+ uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
+ uint32_t tmpSize = tmpPkt->getSize();
+ if (tmpPkt->isAtomicOp()) {
+ std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
+ tmpPkt->getAtomicOp());
+ atomicOps.push_back(tmpAtomicOp);
+ } else if(tmpPkt->isWrite()) {
+ dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
+ tmpOffset, tmpSize);
+ }
+ for (int j = 0; j < tmpSize; j++) {
+ accessMask[tmpOffset + j] = true;
+ }
+ }
+ std::shared_ptr<RubyRequest> msg;
+ if (pkt->isAtomicOp()) {
+ msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+ pkt->getPtr<uint8_t>(),
+ pkt->getSize(), pc, secondary_type,
+ RubyAccessMode_Supervisor, pkt,
+ PrefetchBit_No, proc_id, 100,
+ blockSize, accessMask,
+ dataBlock, atomicOps,
+ accessScope, accessSegment);
+ } else {
+ msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+ pkt->getPtr<uint8_t>(),
+ pkt->getSize(), pc, secondary_type,
+ RubyAccessMode_Supervisor, pkt,
+ PrefetchBit_No, proc_id, 100,
+ blockSize, accessMask,
+ dataBlock,
+ accessScope, accessSegment);
+ }
+ DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
+ curTick(), m_version, "Coal", "Begin", "", "",
+ printAddress(msg->getPhysicalAddress()),
+ RubyRequestType_to_string(secondary_type));
+
+ fatal_if(secondary_type == RubyRequestType_IFETCH,
+ "there should not be any I-Fetch requests in the GPU Coalescer");
+
+ // Send the message to the cache controller
+ fatal_if(m_data_cache_hit_latency == 0,
+ "should not have a latency of zero");
+
+ assert(m_mandatory_q_ptr);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+}
+
+template <class KEY, class VALUE>
+std::ostream &
+operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
+{
+ out << "[";
+ for (auto i = map.begin(); i != map.end(); ++i)
+ out << " " << i->first << "=" << i->second;
+ out << " ]";
+
+ return out;
+}
+
+void
+GPUCoalescer::print(ostream& out) const
+{
+ out << "[GPUCoalescer: " << m_version
+ << ", outstanding requests: " << m_outstanding_count
+ << ", read request table: " << m_readRequestTable
+ << ", write request table: " << m_writeRequestTable
+ << "]";
+}
+
+// this can be called from setState whenever coherence permissions are
+// upgraded when invoked, coherence violations will be checked for the
+// given block
+void
+GPUCoalescer::checkCoherence(Addr addr)
+{
+#ifdef CHECK_COHERENCE
+ m_ruby_system->checkGlobalCoherenceInvariant(addr);
+#endif
+}
+
+void
+GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
+ DPRINTF(RubyStats, "Recorded statistic: %s\n",
+ SequencerRequestType_to_string(requestType));
+}
+
+GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
+ : Event(Progress_Event_Pri), seq(_seq)
+{
+}
+
+
+void
+GPUCoalescer::completeIssue()
+{
+ // newRequests has the cacheline addresses of all the
+ // requests which need to be issued to the memory subsystem
+ // in this cycle
+ int len = newRequests.size();
+ DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
+ for (int i = 0; i < len; ++i) {
+ // Get the requests from reqCoalescer table. Get only the
+ // first request for each cacheline, the remaining requests
+ // can be coalesced with the first request. So, only
+ // one request is issued per cacheline.
+ RequestDesc info = reqCoalescer[newRequests[i]][0];
+ PacketPtr pkt = info.first;
+ DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
+ i, pkt->req->getPaddr());
+ // Insert this request to the read/writeRequestTables. These tables
+ // are used to track aliased requests in makeRequest subroutine
+ bool found = insertRequest(pkt, info.second[PrimaryType]);
+
+ if (found) {
+ panic("GPUCoalescer::makeRequest should never be called if the "
+ "request is already outstanding\n");
+ }
+
+ // Issue request to ruby subsystem
+ issueRequest(pkt, info.second[SecondaryType]);
+ }
+ newRequests.clear();
+
+ // have Kernel End releases been issued this cycle
+ len = newKernelEnds.size();
+ for (int i = 0; i < len; i++) {
+ kernelCallback(newKernelEnds[i]);
+ }
+ newKernelEnds.clear();
+}
+
+void
+GPUCoalescer::IssueEvent::process()
+{
+ seq->completeIssue();
+}
+
+const char *
+GPUCoalescer::IssueEvent::description() const
+{
+ return "Issue coalesced request";
+}
+
+void
+GPUCoalescer::evictionCallback(Addr address)
+{
+ ruby_eviction_callback(address);
+}
+
+void
+GPUCoalescer::kernelCallback(int wavefront_id)
+{
+ assert(kernelEndList.count(wavefront_id));
+
+ ruby_hit_callback(kernelEndList[wavefront_id]);
+
+ kernelEndList.erase(wavefront_id);
+}
+
+void
+GPUCoalescer::atomicCallback(Addr address,
+ MachineType mach,
+ const DataBlock& data)
+{
+ assert(address == makeLineAddress(address));
+
+ DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
+ assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+ RequestTable::iterator i = m_writeRequestTable.find(address);
+ assert(i != m_writeRequestTable.end());
+ GPUCoalescerRequest* srequest = i->second;
+
+ m_writeRequestTable.erase(i);
+ markRemoved();
+
+ assert((srequest->m_type == RubyRequestType_ATOMIC) ||
+ (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
+ (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+
+
+ // Atomics don't write to cache, so there is no MRU update...
+
+ recordMissLatency(srequest, mach,
+ srequest->issue_time, Cycles(0), Cycles(0), true, false);
+
+ PacketPtr pkt = srequest->pkt;
+ Addr request_address = pkt->getAddr();
+ Addr request_line_address = makeLineAddress(pkt->getAddr());
+
+ int len = reqCoalescer[request_line_address].size();
+ std::vector<PacketPtr> mylist;
+ for (int i = 0; i < len; ++i) {
+ PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+ assert(srequest->m_type ==
+ reqCoalescer[request_line_address][i].second[PrimaryType]);
+ request_address = (pkt->getAddr());
+ request_line_address = makeLineAddress(request_address);
+ if (pkt->getPtr<uint8_t>() &&
+ srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
+ /* atomics are done in memory, and return the data *before* the atomic op... */
+ memcpy(pkt->getPtr<uint8_t>(),
+ data.getData(getOffset(request_address),
+ pkt->getSize()),
+ pkt->getSize());
+ } else {
+ DPRINTF(MemoryAccess,
+ "WARNING. Data not transfered from Ruby to M5 for type " \
+ "%s\n",
+ RubyRequestType_to_string(srequest->m_type));
+ }
+
+ // If using the RubyTester, update the RubyTester sender state's
+ // subBlock with the recieved data. The tester will later access
+ // this state.
+ // Note: RubyPort will access it's sender state before the
+ // RubyTester.
+ if (m_usingRubyTester) {
+ RubyPort::SenderState *requestSenderState =
+ safe_cast<RubyPort::SenderState*>(pkt->senderState);
+ RubyTester::SenderState* testerSenderState =
+ safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+ testerSenderState->subBlock.mergeFrom(data);
+ }
+
+ mylist.push_back(pkt);
+ }
+ delete srequest;
+ reqCoalescer.erase(request_line_address);
+ assert(!reqCoalescer.count(request_line_address));
+
+ completeHitCallback(mylist, len);
+}
+
+void
+GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
+{
+ if(myMachID == senderMachID) {
+ CP_TCPLdHits++;
+ } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) {
+ CP_TCPLdTransfers++;
+ } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) {
+ CP_TCCLdHits++;
+ } else {
+ CP_LdMiss++;
+ }
+}
+
+void
+GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
+{
+ if(myMachID == senderMachID) {
+ CP_TCPStHits++;
+ } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) {
+ CP_TCPStTransfers++;
+ } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) {
+ CP_TCCStHits++;
+ } else {
+ CP_StMiss++;
+ }
+}
+
+void
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+{
+ for (int i = 0; i < len; ++i) {
+ RubyPort::SenderState *ss =
+ safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
+ MemSlavePort *port = ss->port;
+ assert(port != NULL);
+
+ mylist[i]->senderState = ss->predecessor;
+ delete ss;
+ port->hitCallback(mylist[i]);
+ trySendRetries();
+ }
+
+ testDrainComplete();
+}
+
+PacketPtr
+GPUCoalescer::mapAddrToPkt(Addr address)
+{
+ RequestTable::iterator i = m_readRequestTable.find(address);
+ assert(i != m_readRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+ return request->pkt;
+}
+
+void
+GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+ MachineType mach,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool success, bool isRegion)
+{
+ RubyRequestType type = srequest->m_type;
+ Cycles issued_time = srequest->issue_time;
+ Cycles completion_time = curCycle();
+ assert(completion_time >= issued_time);
+ Cycles total_lat = completion_time - issued_time;
+
+ // cache stats (valid for RfO protocol only)
+ if (mach == MachineType_TCP) {
+ if (type == RubyRequestType_LD) {
+ GPU_TCPLdHits++;
+ } else {
+ GPU_TCPStHits++;
+ }
+ } else if (mach == MachineType_L1Cache_wCC) {
+ if (type == RubyRequestType_LD) {
+ GPU_TCPLdTransfers++;
+ } else {
+ GPU_TCPStTransfers++;
+ }
+ } else if (mach == MachineType_TCC) {
+ if (type == RubyRequestType_LD) {
+ GPU_TCCLdHits++;
+ } else {
+ GPU_TCCStHits++;
+ }
+ } else {
+ if (type == RubyRequestType_LD) {
+ GPU_LdMiss++;
+ } else {
+ GPU_StMiss++;
+ }
+ }
+
+ // Profile all access latency, even zero latency accesses
+ m_latencyHist.sample(total_lat);
+ m_typeLatencyHist[type]->sample(total_lat);
+
+ // Profile the miss latency for all non-zero demand misses
+ if (total_lat != Cycles(0)) {
+ m_missLatencyHist.sample(total_lat);
+ m_missTypeLatencyHist[type]->sample(total_lat);
+
+ if (mach != MachineType_NUM) {
+ m_missMachLatencyHist[mach]->sample(total_lat);
+ m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
+
+ if ((issued_time <= initialRequestTime) &&
+ (initialRequestTime <= forwardRequestTime) &&
+ (forwardRequestTime <= firstResponseTime) &&
+ (firstResponseTime <= completion_time)) {
+
+ m_IssueToInitialDelayHist[mach]->sample(
+ initialRequestTime - issued_time);
+ m_InitialToForwardDelayHist[mach]->sample(
+ forwardRequestTime - initialRequestTime);
+ m_ForwardToFirstResponseDelayHist[mach]->sample(
+ firstResponseTime - forwardRequestTime);
+ m_FirstResponseToCompletionDelayHist[mach]->sample(
+ completion_time - firstResponseTime);
+ }
+ }
+
+ }
+
+ DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
+ curTick(), m_version, "Coal",
+ success ? "Done" : "SC_Failed", "", "",
+ printAddress(srequest->pkt->getAddr()), total_lat);
+}
+
+void
+GPUCoalescer::regStats()
+{
+ // These statistical variables are not for display.
+ // The profiler will collate these across different
+ // coalescers and display those collated statistics.
+ m_outstandReqHist.init(10);
+ m_latencyHist.init(10);
+ m_missLatencyHist.init(10);
+
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_typeLatencyHist.push_back(new Stats::Histogram());
+ m_typeLatencyHist[i]->init(10);
+
+ m_missTypeLatencyHist.push_back(new Stats::Histogram());
+ m_missTypeLatencyHist[i]->init(10);
+ }
+
+ for (int i = 0; i < MachineType_NUM; i++) {
+ m_missMachLatencyHist.push_back(new Stats::Histogram());
+ m_missMachLatencyHist[i]->init(10);
+
+ m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
+ m_IssueToInitialDelayHist[i]->init(10);
+
+ m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
+ m_InitialToForwardDelayHist[i]->init(10);
+
+ m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
+ m_ForwardToFirstResponseDelayHist[i]->init(10);
+
+ m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
+ m_FirstResponseToCompletionDelayHist[i]->init(10);
+ }
+
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
+
+ for (int j = 0; j < MachineType_NUM; j++) {
+ m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
+ m_missTypeMachLatencyHist[i][j]->init(10);
+ }
+ }
+
+ // GPU cache stats
+ GPU_TCPLdHits
+ .name(name() + ".gpu_tcp_ld_hits")
+ .desc("loads that hit in the TCP")
+ ;
+ GPU_TCPLdTransfers
+ .name(name() + ".gpu_tcp_ld_transfers")
+ .desc("TCP to TCP load transfers")
+ ;
+ GPU_TCCLdHits
+ .name(name() + ".gpu_tcc_ld_hits")
+ .desc("loads that hit in the TCC")
+ ;
+ GPU_LdMiss
+ .name(name() + ".gpu_ld_misses")
+ .desc("loads that miss in the GPU")
+ ;
+
+ GPU_TCPStHits
+ .name(name() + ".gpu_tcp_st_hits")
+ .desc("stores that hit in the TCP")
+ ;
+ GPU_TCPStTransfers
+ .name(name() + ".gpu_tcp_st_transfers")
+ .desc("TCP to TCP store transfers")
+ ;
+ GPU_TCCStHits
+ .name(name() + ".gpu_tcc_st_hits")
+ .desc("stores that hit in the TCC")
+ ;
+ GPU_StMiss
+ .name(name() + ".gpu_st_misses")
+ .desc("stores that miss in the GPU")
+ ;
+
+ // CP cache stats
+ CP_TCPLdHits
+ .name(name() + ".cp_tcp_ld_hits")
+ .desc("loads that hit in the TCP")
+ ;
+ CP_TCPLdTransfers
+ .name(name() + ".cp_tcp_ld_transfers")
+ .desc("TCP to TCP load transfers")
+ ;
+ CP_TCCLdHits
+ .name(name() + ".cp_tcc_ld_hits")
+ .desc("loads that hit in the TCC")
+ ;
+ CP_LdMiss
+ .name(name() + ".cp_ld_misses")
+ .desc("loads that miss in the GPU")
+ ;
+
+ CP_TCPStHits
+ .name(name() + ".cp_tcp_st_hits")
+ .desc("stores that hit in the TCP")
+ ;
+ CP_TCPStTransfers
+ .name(name() + ".cp_tcp_st_transfers")
+ .desc("TCP to TCP store transfers")
+ ;
+ CP_TCCStHits
+ .name(name() + ".cp_tcc_st_hits")
+ .desc("stores that hit in the TCC")
+ ;
+ CP_StMiss
+ .name(name() + ".cp_st_misses")
+ .desc("stores that miss in the GPU")
+ ;
+}
diff --git a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh
new file mode 100644
index 000000000..dbd47059c
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+
+#include <iostream>
+#include <unordered_map>
+
+#include "base/statistics.hh"
+#include "mem/protocol/HSAScope.hh"
+#include "mem/protocol/HSASegment.hh"
+#include "mem/protocol/PrefetchBit.hh"
+#include "mem/protocol/RubyAccessMode.hh"
+#include "mem/protocol/RubyRequestType.hh"
+#include "mem/protocol/SequencerRequestType.hh"
+#include "mem/request.hh"
+#include "mem/ruby/common/Address.hh"
+#include "mem/ruby/common/Consumer.hh"
+#include "mem/ruby/system/RubyPort.hh"
+
+class DataBlock;
+class CacheMsg;
+class MachineID;
+class CacheMemory;
+
+class RubyGPUCoalescerParams;
+
+HSAScope reqScopeToHSAScope(Request* req);
+HSASegment reqSegmentToHSASegment(Request* req);
+
+struct GPUCoalescerRequest
+{
+ PacketPtr pkt;
+ RubyRequestType m_type;
+ Cycles issue_time;
+
+ GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type,
+ Cycles _issue_time)
+ : pkt(_pkt), m_type(_m_type), issue_time(_issue_time)
+ {}
+};
+
+std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj);
+
+class GPUCoalescer : public RubyPort
+{
+ public:
+ typedef RubyGPUCoalescerParams Params;
+ GPUCoalescer(const Params *);
+ ~GPUCoalescer();
+
+ // Public Methods
+ void wakeup(); // Used only for deadlock detection
+
+ void printProgress(std::ostream& out) const;
+ void resetStats();
+ void collateStats();
+ void regStats();
+
+ void writeCallback(Addr address, DataBlock& data);
+
+ void writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data);
+
+ void writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion);
+
+ void writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime);
+
+ void readCallback(Addr address, DataBlock& data);
+
+ void readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data);
+
+ void readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime);
+
+ void readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion);
+ /* atomics need their own callback because the data
+ might be const coming from SLICC */
+ void atomicCallback(Addr address,
+ MachineType mach,
+ const DataBlock& data);
+
+ void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID);
+ void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID);
+
+ // Alternate implementations in VIPER Coalescer
+ virtual RequestStatus makeRequest(PacketPtr pkt);
+
+ int outstandingCount() const { return m_outstanding_count; }
+
+ bool
+ isDeadlockEventScheduled() const
+ {
+ return deadlockCheckEvent.scheduled();
+ }
+
+ void
+ descheduleDeadlockEvent()
+ {
+ deschedule(deadlockCheckEvent);
+ }
+
+ bool empty() const;
+
+ void print(std::ostream& out) const;
+ void checkCoherence(Addr address);
+
+ void markRemoved();
+ void removeRequest(GPUCoalescerRequest* request);
+ void evictionCallback(Addr address);
+ void completeIssue();
+
+ void insertKernel(int wavefront_id, PacketPtr pkt);
+
+ void recordRequestType(SequencerRequestType requestType);
+ Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
+
+ Stats::Histogram& getLatencyHist() { return m_latencyHist; }
+ Stats::Histogram& getTypeLatencyHist(uint32_t t)
+ { return *m_typeLatencyHist[t]; }
+
+ Stats::Histogram& getMissLatencyHist()
+ { return m_missLatencyHist; }
+ Stats::Histogram& getMissTypeLatencyHist(uint32_t t)
+ { return *m_missTypeLatencyHist[t]; }
+
+ Stats::Histogram& getMissMachLatencyHist(uint32_t t) const
+ { return *m_missMachLatencyHist[t]; }
+
+ Stats::Histogram&
+ getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const
+ { return *m_missTypeMachLatencyHist[r][t]; }
+
+ Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const
+ { return *m_IssueToInitialDelayHist[t]; }
+
+ Stats::Histogram&
+ getInitialToForwardDelayHist(const MachineType t) const
+ { return *m_InitialToForwardDelayHist[t]; }
+
+ Stats::Histogram&
+ getForwardRequestToFirstResponseHist(const MachineType t) const
+ { return *m_ForwardToFirstResponseDelayHist[t]; }
+
+ Stats::Histogram&
+ getFirstResponseToCompletionDelayHist(const MachineType t) const
+ { return *m_FirstResponseToCompletionDelayHist[t]; }
+
+ // Changed to protected to enable inheritance by VIPER Coalescer
+ protected:
+ bool tryCacheAccess(Addr addr, RubyRequestType type,
+ Addr pc, RubyAccessMode access_mode,
+ int size, DataBlock*& data_ptr);
+ // Alternate implementations in VIPER Coalescer
+ virtual void issueRequest(PacketPtr pkt, RubyRequestType type);
+
+ void kernelCallback(int wavfront_id);
+
+ void hitCallback(GPUCoalescerRequest* request,
+ MachineType mach,
+ DataBlock& data,
+ bool success,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion);
+ void recordMissLatency(GPUCoalescerRequest* request,
+ MachineType mach,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool success, bool isRegion);
+ void completeHitCallback(std::vector<PacketPtr> & mylist, int len);
+ PacketPtr mapAddrToPkt(Addr address);
+
+
+ RequestStatus getRequestStatus(PacketPtr pkt,
+ RubyRequestType request_type);
+ bool insertRequest(PacketPtr pkt, RubyRequestType request_type);
+
+ bool handleLlsc(Addr address, GPUCoalescerRequest* request);
+
+ // Private copy constructor and assignment operator
+ GPUCoalescer(const GPUCoalescer& obj);
+ GPUCoalescer& operator=(const GPUCoalescer& obj);
+
+ class IssueEvent : public Event
+ {
+ private:
+ GPUCoalescer *seq;
+ public:
+ IssueEvent(GPUCoalescer *_seq);
+ void process();
+ const char *description() const;
+ };
+
+ IssueEvent issueEvent;
+
+
+ // Changed to protected to enable inheritance by VIPER Coalescer
+ protected:
+ int m_max_outstanding_requests;
+ int m_deadlock_threshold;
+
+ CacheMemory* m_dataCache_ptr;
+ CacheMemory* m_instCache_ptr;
+
+ // The cache access latency for this GPU data cache. This is assessed at the
+ // beginning of each access. This should be very similar to the
+ // implementation in Sequencer() as this is very much like a Sequencer
+ Cycles m_data_cache_hit_latency;
+
+ // We need to track both the primary and secondary request types.
+ // The secondary request type comprises a subset of RubyRequestTypes that
+ // are understood by the L1 Controller. A primary request type can be any
+ // RubyRequestType.
+ enum {PrimaryType, SecondaryType};
+ typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc;
+ typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable;
+ CoalescingTable reqCoalescer;
+ std::vector<Addr> newRequests;
+
+ typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable;
+ RequestTable m_writeRequestTable;
+ RequestTable m_readRequestTable;
+ // Global outstanding request count, across all request tables
+ int m_outstanding_count;
+ bool m_deadlock_check_scheduled;
+ std::unordered_map<int, PacketPtr> kernelEndList;
+ std::vector<int> newKernelEnds;
+
+ int m_store_waiting_on_load_cycles;
+ int m_store_waiting_on_store_cycles;
+ int m_load_waiting_on_store_cycles;
+ int m_load_waiting_on_load_cycles;
+
+ bool m_usingNetworkTester;
+
+ class GPUCoalescerWakeupEvent : public Event
+ {
+ private:
+ GPUCoalescer *m_GPUCoalescer_ptr;
+
+ public:
+ GPUCoalescerWakeupEvent(GPUCoalescer *_seq) :
+ m_GPUCoalescer_ptr(_seq) {}
+ void process() { m_GPUCoalescer_ptr->wakeup(); }
+ const char *description() const
+ {
+ return "GPUCoalescer deadlock check";
+ }
+ };
+
+ GPUCoalescerWakeupEvent deadlockCheckEvent;
+ bool assumingRfOCoherence;
+
+ // m5 style stats for TCP hit/miss counts
+ Stats::Scalar GPU_TCPLdHits;
+ Stats::Scalar GPU_TCPLdTransfers;
+ Stats::Scalar GPU_TCCLdHits;
+ Stats::Scalar GPU_LdMiss;
+
+ Stats::Scalar GPU_TCPStHits;
+ Stats::Scalar GPU_TCPStTransfers;
+ Stats::Scalar GPU_TCCStHits;
+ Stats::Scalar GPU_StMiss;
+
+ Stats::Scalar CP_TCPLdHits;
+ Stats::Scalar CP_TCPLdTransfers;
+ Stats::Scalar CP_TCCLdHits;
+ Stats::Scalar CP_LdMiss;
+
+ Stats::Scalar CP_TCPStHits;
+ Stats::Scalar CP_TCPStTransfers;
+ Stats::Scalar CP_TCCStHits;
+ Stats::Scalar CP_StMiss;
+
+ //! Histogram for number of outstanding requests per cycle.
+ Stats::Histogram m_outstandReqHist;
+
+ //! Histogram for holding latency profile of all requests.
+ Stats::Histogram m_latencyHist;
+ std::vector<Stats::Histogram *> m_typeLatencyHist;
+
+ //! Histogram for holding latency profile of all requests that
+ //! miss in the controller connected to this sequencer.
+ Stats::Histogram m_missLatencyHist;
+ std::vector<Stats::Histogram *> m_missTypeLatencyHist;
+
+ //! Histograms for profiling the latencies for requests that
+ //! required external messages.
+ std::vector<Stats::Histogram *> m_missMachLatencyHist;
+ std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist;
+
+ //! Histograms for recording the breakdown of miss latency
+ std::vector<Stats::Histogram *> m_IssueToInitialDelayHist;
+ std::vector<Stats::Histogram *> m_InitialToForwardDelayHist;
+ std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist;
+ std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist;
+};
+
+inline std::ostream&
+operator<<(std::ostream& out, const GPUCoalescer& obj)
+{
+ obj.print(out);
+ out << std::flush;
+ return out;
+}
+
+#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__
+
diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py
new file mode 100644
index 000000000..0c19f875d
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Steve Reinhardt
+# Brad Beckmann
+
+from m5.params import *
+from m5.proxy import *
+from Sequencer import *
+
+class RubyGPUCoalescer(RubySequencer):
+ type = 'RubyGPUCoalescer'
+ cxx_class = 'GPUCoalescer'
+ cxx_header = "mem/ruby/system/GPUCoalescer.hh"
+
+ # max_outstanding_requests = (wave front slots) x (wave front size)
+ max_outstanding_requests = Param.Int(40*64,
+ "max requests (incl. prefetches) outstanding")
+ assume_rfo = Param.Bool(True, "assume protocol implementes Read for "
+ "Ownership coherence");
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 5a5f528bb..bf4002126 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p)
memSlavePort(csprintf("%s-mem-slave-port", name()), this,
p->ruby_system->getAccessBackingStore(), -1,
p->no_retry_on_stall),
- gotAddrRanges(p->port_master_connection_count)
+ gotAddrRanges(p->port_master_connection_count),
+ m_isCPUSequencer(p->is_cpu_sequencer)
{
assert(m_version != -1);
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
index 07e0fde5a..6bd92b654 100644
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -167,6 +167,8 @@ class RubyPort : public MemObject
uint32_t getId() { return m_version; }
DrainState drain() override;
+ bool isCPUSequencer() { return m_isCPUSequencer; }
+
protected:
void trySendRetries();
void ruby_hit_callback(PacketPtr pkt);
@@ -218,6 +220,8 @@ class RubyPort : public MemObject
// that should be called when the Sequencer becomes available after a stall.
//
std::vector<MemSlavePort *> retryList;
+
+ bool m_isCPUSequencer;
};
#endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__
diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc
index 1ecd2e098..e1717e519 100644
--- a/src/mem/ruby/system/RubySystem.cc
+++ b/src/mem/ruby/system/RubySystem.cc
@@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
Sequencer* sequencer_ptr = NULL;
for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
- sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
+ sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer());
if (sequencer_ptr == NULL) {
sequencer_ptr = sequencer_map[cntrl];
}
diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript
index 8c5077362..b67311bca 100644
--- a/src/mem/ruby/system/SConscript
+++ b/src/mem/ruby/system/SConscript
@@ -33,12 +33,22 @@ Import('*')
if env['PROTOCOL'] == 'None':
Return()
+if env['BUILD_GPU']:
+ SimObject('GPUCoalescer.py')
SimObject('RubySystem.py')
SimObject('Sequencer.py')
+SimObject('WeightedLRUReplacementPolicy.py')
+if env['BUILD_GPU']:
+ SimObject('VIPERCoalescer.py')
Source('CacheRecorder.cc')
Source('DMASequencer.cc')
+if env['BUILD_GPU']:
+ Source('GPUCoalescer.cc')
Source('RubyPort.cc')
Source('RubyPortProxy.cc')
Source('RubySystem.cc')
Source('Sequencer.cc')
+if env['BUILD_GPU']:
+ Source('VIPERCoalescer.cc')
+Source('WeightedLRUPolicy.cc')
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 50418c700..c2727b41d 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p)
m_max_outstanding_requests = p->max_outstanding_requests;
m_deadlock_threshold = p->deadlock_threshold;
+ m_coreId = p->coreid; // for tracking the two CorePair sequencers
assert(m_max_outstanding_requests > 0);
assert(m_deadlock_threshold > 0);
assert(m_instCache_ptr != NULL);
@@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
ContextID proc_id = pkt->req->hasContextId() ?
pkt->req->contextId() : InvalidContextID;
+ ContextID core_id = coreId();
+
// If valid, copy the pc to the ruby request
Addr pc = 0;
if (pkt->req->hasPC()) {
@@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
nullptr : pkt->getPtr<uint8_t>(),
pkt->getSize(), pc, secondary_type,
RubyAccessMode_Supervisor, pkt,
- PrefetchBit_No, proc_id);
+ PrefetchBit_No, proc_id, core_id);
DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n",
curTick(), m_version, "Seq", "Begin", "", "",
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index 47af7ea1e..2a2f49587 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -99,6 +99,7 @@ class Sequencer : public RubyPort
void markRemoved();
void evictionCallback(Addr address);
void invalidateSC(Addr address);
+ int coreId() const { return m_coreId; }
void recordRequestType(SequencerRequestType requestType);
Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; }
@@ -198,6 +199,8 @@ class Sequencer : public RubyPort
Stats::Scalar m_load_waiting_on_store;
Stats::Scalar m_load_waiting_on_load;
+ int m_coreId;
+
bool m_usingNetworkTester;
//! Histogram for number of outstanding requests per cycle.
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py
index 7c90eb29c..d6ee0aa2f 100644
--- a/src/mem/ruby/system/Sequencer.py
+++ b/src/mem/ruby/system/Sequencer.py
@@ -32,54 +32,58 @@ from m5.proxy import *
from MemObject import MemObject
class RubyPort(MemObject):
- type = 'RubyPort'
- abstract = True
- cxx_header = "mem/ruby/system/RubyPort.hh"
- version = Param.Int(0, "")
+ type = 'RubyPort'
+ abstract = True
+ cxx_header = "mem/ruby/system/RubyPort.hh"
+ version = Param.Int(0, "")
- slave = VectorSlavePort("CPU slave port")
- master = VectorMasterPort("CPU master port")
- pio_master_port = MasterPort("Ruby mem master port")
- mem_master_port = MasterPort("Ruby mem master port")
- pio_slave_port = SlavePort("Ruby pio slave port")
- mem_slave_port = SlavePort("Ruby memory port")
+ slave = VectorSlavePort("CPU slave port")
+ master = VectorMasterPort("CPU master port")
+ pio_master_port = MasterPort("Ruby mem master port")
+ mem_master_port = MasterPort("Ruby mem master port")
+ pio_slave_port = SlavePort("Ruby pio slave port")
+ mem_slave_port = SlavePort("Ruby memory port")
- using_ruby_tester = Param.Bool(False, "")
- no_retry_on_stall = Param.Bool(False, "")
- ruby_system = Param.RubySystem(Parent.any, "")
- system = Param.System(Parent.any, "system object")
- support_data_reqs = Param.Bool(True, "data cache requests supported")
- support_inst_reqs = Param.Bool(True, "inst cache requests supported")
+ using_ruby_tester = Param.Bool(False, "")
+ no_retry_on_stall = Param.Bool(False, "")
+ ruby_system = Param.RubySystem(Parent.any, "")
+ system = Param.System(Parent.any, "system object")
+ support_data_reqs = Param.Bool(True, "data cache requests supported")
+ support_inst_reqs = Param.Bool(True, "inst cache requests supported")
+ is_cpu_sequencer = Param.Bool(True, "connected to a cpu")
class RubyPortProxy(RubyPort):
- type = 'RubyPortProxy'
- cxx_header = "mem/ruby/system/RubyPortProxy.hh"
+ type = 'RubyPortProxy'
+ cxx_header = "mem/ruby/system/RubyPortProxy.hh"
class RubySequencer(RubyPort):
- type = 'RubySequencer'
- cxx_class = 'Sequencer'
- cxx_header = "mem/ruby/system/Sequencer.hh"
+ type = 'RubySequencer'
+ cxx_class = 'Sequencer'
+ cxx_header = "mem/ruby/system/Sequencer.hh"
- icache = Param.RubyCache("")
- dcache = Param.RubyCache("")
- # Cache latencies currently assessed at the beginning of each access
- # NOTE: Setting these values to a value greater than one will result in
- # O3 CPU pipeline bubbles and negatively impact performance
- # TODO: Latencies should be migrated into each top-level cache controller
- icache_hit_latency = Param.Cycles(1, "Inst cache hit latency")
- dcache_hit_latency = Param.Cycles(1, "Data cache hit latency")
- max_outstanding_requests = Param.Int(16,
- "max requests (incl. prefetches) outstanding")
- deadlock_threshold = Param.Cycles(500000,
- "max outstanding cycles for a request before deadlock/livelock declared")
- using_network_tester = Param.Bool(False, "")
+ icache = Param.RubyCache("")
+ dcache = Param.RubyCache("")
+ # Cache latencies currently assessed at the beginning of each access
+ # NOTE: Setting these values to a value greater than one will result in
+ # O3 CPU pipeline bubbles and negatively impact performance
+ # TODO: Latencies should be migrated into each top-level cache controller
+ icache_hit_latency = Param.Cycles(1, "Inst cache hit latency")
+ dcache_hit_latency = Param.Cycles(1, "Data cache hit latency")
+ max_outstanding_requests = Param.Int(16,
+ "max requests (incl. prefetches) outstanding")
+ deadlock_threshold = Param.Cycles(500000,
+ "max outstanding cycles for a request before deadlock/livelock declared")
+ using_network_tester = Param.Bool(False, "")
+ # id used by protocols that support multiple sequencers per controller
+ # 99 is the dummy default value
+ coreid = Param.Int(99, "CorePair core id")
class DMASequencer(MemObject):
- type = 'DMASequencer'
- cxx_header = "mem/ruby/system/DMASequencer.hh"
+ type = 'DMASequencer'
+ cxx_header = "mem/ruby/system/DMASequencer.hh"
- version = Param.Int(0, "")
- slave = SlavePort("Device slave port")
- using_ruby_tester = Param.Bool(False, "")
- ruby_system = Param.RubySystem(Parent.any, "")
- system = Param.System(Parent.any, "system object")
+ version = Param.Int(0, "")
+ slave = SlavePort("Device slave port")
+ using_ruby_tester = Param.Bool(False, "")
+ ruby_system = Param.RubySystem(Parent.any, "")
+ system = Param.System(Parent.any, "system object")
diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc
new file mode 100644
index 000000000..ca91f2723
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.cc
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/VIPERCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/VIPERCoalescer.hh"
+
+using namespace std;
+
+VIPERCoalescer *
+VIPERCoalescerParams::create()
+{
+ return new VIPERCoalescer(this);
+}
+
+VIPERCoalescer::VIPERCoalescer(const Params *p)
+ : GPUCoalescer(p)
+{
+ m_max_wb_per_cycle=p->max_wb_per_cycle;
+ m_max_inv_per_cycle=p->max_inv_per_cycle;
+ m_outstanding_inv = 0;
+ m_outstanding_wb = 0;
+}
+
+VIPERCoalescer::~VIPERCoalescer()
+{
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If request can be coalesced, this request is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued;
+// If this is the first request to a cacheline, request is added to both
+// newRequests queue and to the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
+RequestStatus
+VIPERCoalescer::makeRequest(PacketPtr pkt)
+{
+ if (m_outstanding_wb | m_outstanding_inv) {
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks and %d Invalidatons\n",
+ m_outstanding_wb, m_outstanding_inv);
+ }
+ // Are we in the middle of a release
+ if ((m_outstanding_wb) > 0) {
+ if (pkt->req->isKernel()) {
+ // Everythign is fine
+ // Barriers and Kernel End scan coalesce
+ // If it is a Kerenl Begin flush the cache
+ if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) {
+ invL1();
+ }
+
+ if (pkt->req->isRelease()) {
+ insertKernel(pkt->req->contextId(), pkt);
+ }
+
+ return RequestStatus_Issued;
+ }
+// return RequestStatus_Aliased;
+ } else if (pkt->req->isKernel() && pkt->req->isRelease()) {
+ // Flush Dirty Data on Kernel End
+ // isKernel + isRelease
+ insertKernel(pkt->req->contextId(), pkt);
+ wbL1();
+ if(m_outstanding_wb == 0) {
+ for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+ newKernelEnds.push_back(it->first);
+ }
+ completeIssue();
+ }
+ return RequestStatus_Issued;
+ }
+ RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt);
+ if (requestStatus!=RequestStatus_Issued) {
+ // Request not isssued
+ // enqueue Retry
+ DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n");
+ return requestStatus;
+ } else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
+ // Invalidate clean Data on Kernel Begin
+ // isKernel + isAcquire
+ invL1();
+ } else if (pkt->req->isAcquire() && pkt->req->isRelease()) {
+ // Deschedule the AtomicAcqRel and
+ // Flush and Invalidate the L1 cache
+ invwbL1();
+ if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
+ DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
+ deschedule(issueEvent);
+ }
+ } else if (pkt->req->isRelease()) {
+ // Deschedule the StoreRel and
+ // Flush the L1 cache
+ wbL1();
+ if (m_outstanding_wb > 0 && issueEvent.scheduled()) {
+ DPRINTF(GPUCoalescer, "issueEvent Descheduled\n");
+ deschedule(issueEvent);
+ }
+ } else if (pkt->req->isAcquire()) {
+ // LoadAcq or AtomicAcq
+ // Invalidate the L1 cache
+ invL1();
+ }
+ // Request was successful
+ if (m_outstanding_wb == 0) {
+ if (!issueEvent.scheduled()) {
+ DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n");
+ schedule(issueEvent, curTick());
+ }
+ }
+ return RequestStatus_Issued;
+}
+
+void
+VIPERCoalescer::wbCallback(Addr addr)
+{
+ m_outstanding_wb--;
+ // if L1 Flush Complete
+ // attemnpt to schedule issueEvent
+ assert(((int) m_outstanding_wb) >= 0);
+ if (m_outstanding_wb == 0) {
+ for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+ newKernelEnds.push_back(it->first);
+ }
+ completeIssue();
+ }
+ trySendRetries();
+}
+
+void
+VIPERCoalescer::invCallback(Addr addr)
+{
+ m_outstanding_inv--;
+ // if L1 Flush Complete
+ // attemnpt to schedule issueEvent
+ // This probably won't happen, since
+ // we dont wait on cache invalidations
+ if (m_outstanding_wb == 0) {
+ for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) {
+ newKernelEnds.push_back(it->first);
+ }
+ completeIssue();
+ }
+ trySendRetries();
+}
+
+/**
+ * Invalidate L1 cache (Acquire)
+ */
+void
+VIPERCoalescer::invL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ DPRINTF(GPUCoalescer,
+ "There are %d Invalidations outstanding before Cache Walk\n",
+ m_outstanding_inv);
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Evict Read-only data
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_inv++;
+ }
+ DPRINTF(GPUCoalescer,
+ "There are %d Invalidatons outstanding after Cache Walk\n",
+ m_outstanding_inv);
+}
+
+/**
+ * Writeback L1 cache (Release)
+ */
+void
+VIPERCoalescer::wbL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks outstanding before Cache Walk\n",
+ m_outstanding_wb);
+ // Walk the cache
+ for (int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Write dirty data back
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_FLUSH, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_wb++;
+ }
+ DPRINTF(GPUCoalescer,
+ "There are %d Writebacks outstanding after Cache Walk\n",
+ m_outstanding_wb);
+}
+
+/**
+ * Invalidate and Writeback L1 cache (Acquire&Release)
+ */
+void
+VIPERCoalescer::invwbL1()
+{
+ int size = m_dataCache_ptr->getNumBlocks();
+ // Walk the cache
+ for(int i = 0; i < size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Evict Read-only data
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_inv++;
+ }
+ // Walk the cache
+ for(int i = 0; i< size; i++) {
+ Addr addr = m_dataCache_ptr->getAddressAtIdx(i);
+ // Write dirty data back
+ std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(
+ clockEdge(), addr, (uint8_t*) 0, 0, 0,
+ RubyRequestType_FLUSH, RubyAccessMode_Supervisor,
+ nullptr);
+ assert(m_mandatory_q_ptr != NULL);
+ m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+ m_outstanding_wb++;
+ }
+}
diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh
new file mode 100644
index 000000000..af6e44e7f
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.hh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+
+#include <iostream>
+
+#include "mem/protocol/PrefetchBit.hh"
+#include "mem/protocol/RubyAccessMode.hh"
+#include "mem/protocol/RubyRequestType.hh"
+#include "mem/ruby/common/Address.hh"
+#include "mem/ruby/common/Consumer.hh"
+#include "mem/ruby/system/GPUCoalescer.hh"
+#include "mem/ruby/system/RubyPort.hh"
+
+class DataBlock;
+class CacheMsg;
+class MachineID;
+class CacheMemory;
+
+class VIPERCoalescerParams;
+
+class VIPERCoalescer : public GPUCoalescer
+{
+ public:
+ typedef VIPERCoalescerParams Params;
+ VIPERCoalescer(const Params *);
+ ~VIPERCoalescer();
+ void wbCallback(Addr address);
+ void invCallback(Addr address);
+ RequestStatus makeRequest(PacketPtr pkt);
+ private:
+ void invL1();
+ void wbL1();
+ void invwbL1();
+ uint64_t m_outstanding_inv;
+ uint64_t m_outstanding_wb;
+ uint64_t m_max_inv_per_cycle;
+ uint64_t m_max_wb_per_cycle;
+};
+#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__
+
diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py
new file mode 100644
index 000000000..05c74386f
--- /dev/null
+++ b/src/mem/ruby/system/VIPERCoalescer.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Steve Reinhardt
+# Brad Beckmann
+
+from m5.params import *
+from m5.proxy import *
+from GPUCoalescer import *
+
+class VIPERCoalescer(RubyGPUCoalescer):
+ type = 'VIPERCoalescer'
+ cxx_class = 'VIPERCoalescer'
+ cxx_header = "mem/ruby/system/VIPERCoalescer.hh"
+ max_inv_per_cycle = Param.Int(32, "max invalidations per cycle")
+ max_wb_per_cycle = Param.Int(32, "max writebacks per cycle")
+ assume_rfo = False
diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc
new file mode 100644
index 000000000..5baa4d9a5
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUPolicy.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Derek Hower
+ */
+
+#include "mem/ruby/system/WeightedLRUPolicy.hh"
+
+WeightedLRUPolicy::WeightedLRUPolicy(const Params* p)
+ : AbstractReplacementPolicy(p), m_cache(p->cache)
+{
+ m_last_occ_ptr = new int*[m_num_sets];
+ for(unsigned i = 0; i < m_num_sets; i++){
+ m_last_occ_ptr[i] = new int[m_assoc];
+ for(unsigned j = 0; j < m_assoc; j++){
+ m_last_occ_ptr[i][j] = 0;
+ }
+ }
+}
+
+WeightedLRUPolicy *
+WeightedLRUReplacementPolicyParams::create()
+{
+ return new WeightedLRUPolicy(this);
+}
+
+WeightedLRUPolicy::~WeightedLRUPolicy()
+{
+ if (m_last_occ_ptr != NULL){
+ for (unsigned i = 0; i < m_num_sets; i++){
+ if (m_last_occ_ptr[i] != NULL){
+ delete[] m_last_occ_ptr[i];
+ }
+ }
+ delete[] m_last_occ_ptr;
+ }
+}
+
+void
+WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time)
+{
+ assert(index >= 0 && index < m_assoc);
+ assert(set >= 0 && set < m_num_sets);
+
+ m_last_ref_ptr[set][index] = time;
+}
+
+void
+WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy)
+{
+ assert(index >= 0 && index < m_assoc);
+ assert(set >= 0 && set < m_num_sets);
+
+ m_last_ref_ptr[set][index] = time;
+ m_last_occ_ptr[set][index] = occupancy;
+}
+
+int64_t
+WeightedLRUPolicy::getVictim(int64_t set) const
+{
+ Tick time, smallest_time;
+ int64_t smallest_index;
+
+ smallest_index = 0;
+ smallest_time = m_last_ref_ptr[set][0];
+ int smallest_weight = m_last_ref_ptr[set][0];
+
+ for (unsigned i = 1; i < m_assoc; i++) {
+
+ int weight = m_last_occ_ptr[set][i];
+ if (weight < smallest_weight) {
+ smallest_weight = weight;
+ smallest_index = i;
+ smallest_time = m_last_ref_ptr[set][i];
+ } else if (weight == smallest_weight) {
+ time = m_last_ref_ptr[set][i];
+ if (time < smallest_time) {
+ smallest_index = i;
+ smallest_time = time;
+ }
+ }
+ }
+ return smallest_index;
+}
diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh
new file mode 100644
index 000000000..3150779b2
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUPolicy.hh
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
+#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__
+
+#include "mem/ruby/structures/AbstractReplacementPolicy.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "params/WeightedLRUReplacementPolicy.hh"
+
+/* Simple true LRU replacement policy */
+
+class WeightedLRUPolicy : public AbstractReplacementPolicy
+{
+ public:
+ typedef WeightedLRUReplacementPolicyParams Params;
+ WeightedLRUPolicy(const Params* p);
+ ~WeightedLRUPolicy();
+
+ void touch(int64_t set, int64_t way, Tick time);
+ void touch(int64_t set, int64_t way, Tick time, int occupancy);
+ int64_t getVictim(int64_t set) const override;
+
+ bool useOccupancy() const { return true; }
+
+ CacheMemory * m_cache;
+ int **m_last_occ_ptr;
+};
+
+#endif // __MEM_RUBY_SYSTEM_WeightedLRUPolicy_HH__
diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py
new file mode 100644
index 000000000..e7de33496
--- /dev/null
+++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# For use for simulation and test purposes only
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# Author: Derek Hower
+#
+
+from m5.params import *
+from m5.proxy import *
+from MemObject import MemObject
+from ReplacementPolicy import ReplacementPolicy
+
+class WeightedLRUReplacementPolicy(ReplacementPolicy):
+ type = "WeightedLRUReplacementPolicy"
+ cxx_class = "WeightedLRUPolicy"
+ cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh"
+ cache = Param.RubyCache("")