Diffstat (limited to 'src/mem/ruby')
25 files changed, 2610 insertions, 62 deletions
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index 16e932432..82a16c9b0 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -124,13 +124,20 @@ MakeInclude('common/Set.hh') MakeInclude('common/WriteMask.hh') MakeInclude('filters/AbstractBloomFilter.hh') MakeInclude('network/MessageBuffer.hh') -MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/CacheMemory.hh') -MakeInclude('system/DMASequencer.hh') MakeInclude('structures/DirectoryMemory.hh') -MakeInclude('structures/WireBuffer.hh') MakeInclude('structures/PerfectCacheMemory.hh') MakeInclude('structures/PersistentTable.hh') -MakeInclude('system/Sequencer.hh') +MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/TBETable.hh') MakeInclude('structures/TimerTable.hh') +MakeInclude('structures/WireBuffer.hh') +MakeInclude('system/DMASequencer.hh') +MakeInclude('system/Sequencer.hh') + +# External types : Group "mem/protocol" : include "header.hh" to the bottom +# of this MakeIncludes if it is referenced as +# <# include "mem/protocol/header.hh"> in any file +# generated_dir = Dir('../protocol') +MakeInclude('system/GPUCoalescer.hh') +MakeInclude('system/VIPERCoalescer.hh') diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc index b3b37e5a6..7d3f20982 100644 --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -269,7 +269,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { m_outstandReqHist.add(seq->getOutstandReqHist()); } @@ -282,7 +282,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { // add all the latencies m_latencyHist.add(seq->getLatencyHist()); diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh index 926556781..cbd068c04 100644 --- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh +++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh @@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry virtual DataBlock& getDataBlk() { panic("getDataBlk() not implemented!"); } + int validBlocks; + virtual int& getNumValidBlocks() + { + return validBlocks; + } + // Functions for locking and unlocking the cache entry. These are required // for supporting atomic memory accesses. 
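// Side note on the validBlocks field introduced just above: AbstractCacheEntry
// itself never initializes or updates it. The reading suggested by the rest of
// this patch (not stated explicitly in it) is that protocol-generated entry
// types maintain the count of valid sub-blocks, and that
// CacheMemory::getReplacementWeight(), added further down, reports it back as
// an occupancy weight. A hypothetical entry type would only need something like:
//
//     struct SectorEntry : public AbstractCacheEntry   // illustrative name only
//     {
//         SectorEntry() { validBlocks = 0; }
//         void markSubBlockValid()   { ++validBlocks; }   // seen via getNumValidBlocks()
//         void markSubBlockInvalid() { --validBlocks; }
//     };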
void setLocked(int context); diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 93fe50c88..458fde5bc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr) } } +bool +AbstractController::isBlocked(Addr addr) +{ + return (m_block_map.count(addr) > 0); +} + BaseMasterPort & AbstractController::getMasterPort(const std::string &if_name, PortID idx) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 383507eed..4488ee3f4 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer // return instance name void blockOnQueue(Addr, MessageBuffer*); void unblock(Addr); + bool isBlocked(Addr); virtual MessageBuffer* getMandatoryQueue() const = 0; virtual MessageBuffer* getMemoryQueue() const = 0; @@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer virtual void regStats(); virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0; - virtual Sequencer* getSequencer() const = 0; + virtual Sequencer* getCPUSequencer() const = 0; //! These functions are used by ruby system to read/write the data blocks //! that exist with in the controller. diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh index 46071335e..cdedc2e14 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh @@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr) return DirectoryMemory::mapAddressToDirectoryVersion(addr); } +inline NodeID +map_Address_to_TCCdirNode(Addr addr) +{ + return DirectoryMemory::mapAddressToDirectoryVersion(addr); +} + // used to determine the home directory // returns a value between 0 and total_directories_within_the_system inline MachineID @@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr) return mach; } +inline MachineID +map_Address_to_RegionDir(Addr addr) +{ + MachineID mach = {MachineType_RegionDir, + map_Address_to_DirectoryNode(addr)}; + return mach; +} + +inline MachineID +map_Address_to_TCCdir(Addr addr) +{ + MachineID mach = + {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)}; + return mach; +} + inline NetDest broadcast(MachineType type) { @@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id) return mach; } +inline MachineID +MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node) +{ + MachineID mach = {type, node}; + return mach; +} + #endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__ diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index a8a3ba949..45fb85d05 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -35,6 +35,7 @@ #include "mem/protocol/AccessPermission.hh" #include "mem/ruby/structures/CacheMemory.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/WeightedLRUPolicy.hh" using namespace std; @@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p) m_start_index_bit = p->start_index_bit; m_is_instruction_only_cache = p->is_icache; m_resource_stalls = p->resourceStalls; + m_block_size = p->block_size; // may be 0 at this point. 
Updated in init() } void CacheMemory::init() { - m_cache_num_sets = (m_cache_size / m_cache_assoc) / - RubySystem::getBlockSizeBytes(); + if (m_block_size == 0) { + m_block_size = RubySystem::getBlockSizeBytes(); + } + m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size; assert(m_cache_num_sets > 1); m_cache_num_set_bits = floorLog2(m_cache_num_sets); assert(m_cache_num_set_bits > 0); - m_cache.resize(m_cache_num_sets); - for (int i = 0; i < m_cache_num_sets; i++) { - m_cache[i].resize(m_cache_assoc); - for (int j = 0; j < m_cache_assoc; j++) { - m_cache[i][j] = NULL; - } - } + m_cache.resize(m_cache_num_sets, + std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr)); } CacheMemory::~CacheMemory() { - if (m_replacementPolicy_ptr != NULL) + if (m_replacementPolicy_ptr) delete m_replacementPolicy_ptr; for (int i = 0; i < m_cache_num_sets; i++) { for (int j = 0; j < m_cache_assoc; j++) { @@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e) } void +CacheMemory::setMRU(Addr address, int occupancy) +{ + int64_t cacheSet = addressToCacheSet(address); + int loc = findTagInSet(cacheSet, address); + + if(loc != -1) { + if (m_replacementPolicy_ptr->useOccupancy()) { + (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))-> + touch(cacheSet, loc, curTick(), occupancy); + } else { + m_replacementPolicy_ptr-> + touch(cacheSet, loc, curTick()); + } + } +} + +int +CacheMemory::getReplacementWeight(int64_t set, int64_t loc) +{ + assert(set < m_cache_num_sets); + assert(loc < m_cache_assoc); + int ret = 0; + if(m_cache[set][loc] != NULL) { + ret = m_cache[set][loc]->getNumValidBlocks(); + assert(ret >= 0); + } + + return ret; +} + +void CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const { uint64_t warmedUpBlocks = 0; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 72805b32b..5b30505d3 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -106,7 +106,8 @@ class CacheMemory : public SimObject // Set this address to most recently used void setMRU(Addr address); - // Set this entry to most recently used + void setMRU(Addr addr, int occupancy); + int getReplacementWeight(int64_t set, int64_t loc); void setMRU(const AbstractCacheEntry *e); // Functions for locking and unlocking cache lines corresponding to the @@ -146,6 +147,7 @@ class CacheMemory : public SimObject Stats::Scalar numDataArrayStalls; int getCacheSize() const { return m_cache_size; } + int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } Addr getAddressAtIdx(int idx) const; @@ -182,6 +184,7 @@ class CacheMemory : public SimObject int m_cache_assoc; int m_start_index_bit; bool m_resource_stalls; + int m_block_size; }; std::ostream& operator<<(std::ostream& out, const CacheMemory& obj); diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 4eb87ac74..9fc4726b0 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -42,6 +42,7 @@ class RubyCache(SimObject): "") start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line"); is_icache = Param.Bool(False, "is instruction only cache"); + block_size = Param.MemorySize("0B", "block size in bytes. 
0 means default RubyBlockSize") dataArrayBanks = Param.Int(1, "Number of banks for the data array") tagArrayBanks = Param.Int(1, "Number of banks for the tag array") diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc new file mode 100644 index 000000000..db279bd3a --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
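A minimal sketch (not part of the patch; assumptions are noted in the comments) of how the two CacheMemory hooks added above fit together: setMRU(addr, occupancy) forwards the occupancy to the replacement policy only when that policy reports useOccupancy() (i.e. a WeightedLRUPolicy), and getReplacementWeight() reports AbstractCacheEntry::getNumValidBlocks() as that occupancy.

#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh"
#include "mem/ruby/structures/CacheMemory.hh"

// "gpuL2" is a hypothetical cache instance; any protocol-side caller looks alike.
void
touchWeighted(CacheMemory *gpuL2, Addr addr)
{
    AbstractCacheEntry *entry = gpuL2->lookup(addr);
    if (entry == nullptr)
        return;                 // line not resident, nothing to touch

    // Re-rank the line using its current occupancy (valid sub-block count) as
    // the weight; with a plain LRU policy the extra argument is ignored.
    gpuL2->setMRU(addr, entry->getNumValidBlocks());
}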
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/GPUCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" +#include "debug/RubyPort.hh" +#include "debug/RubyStats.hh" +#include "gpu-compute/shader.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/RubyGPUCoalescer.hh" + +using namespace std; + +GPUCoalescer * +RubyGPUCoalescerParams::create() +{ + return new GPUCoalescer(this); +} + +HSAScope +reqScopeToHSAScope(Request* req) +{ + HSAScope accessScope = HSAScope_UNSPECIFIED; + if (req->isScoped()) { + if (req->isWavefrontScope()) { + accessScope = HSAScope_WAVEFRONT; + } else if (req->isWorkgroupScope()) { + accessScope = HSAScope_WORKGROUP; + } else if (req->isDeviceScope()) { + accessScope = HSAScope_DEVICE; + } else if (req->isSystemScope()) { + accessScope = HSAScope_SYSTEM; + } else { + fatal("Bad scope type"); + } + } + return accessScope; +} + +HSASegment +reqSegmentToHSASegment(Request* req) +{ + HSASegment accessSegment = HSASegment_GLOBAL; + + if (req->isGlobalSegment()) { + accessSegment = HSASegment_GLOBAL; + } else if (req->isGroupSegment()) { + accessSegment = HSASegment_GROUP; + } else if (req->isPrivateSegment()) { + accessSegment = HSASegment_PRIVATE; + } else if (req->isKernargSegment()) { + accessSegment = HSASegment_KERNARG; + } else if (req->isReadonlySegment()) { + accessSegment = HSASegment_READONLY; + } else if (req->isSpillSegment()) { + accessSegment = HSASegment_SPILL; + } else if (req->isArgSegment()) { + accessSegment = HSASegment_ARG; + } else { + fatal("Bad segment type"); + } + + return accessSegment; +} + +GPUCoalescer::GPUCoalescer(const Params *p) + : RubyPort(p), issueEvent(this), deadlockCheckEvent(this) +{ + m_store_waiting_on_load_cycles = 0; + m_store_waiting_on_store_cycles = 0; + m_load_waiting_on_store_cycles = 0; + m_load_waiting_on_load_cycles = 0; + + m_outstanding_count = 0; + + m_max_outstanding_requests = 0; + m_deadlock_threshold = 0; + m_instCache_ptr = nullptr; + m_dataCache_ptr = nullptr; + + m_instCache_ptr = p->icache; + m_dataCache_ptr = p->dcache; + m_max_outstanding_requests = p->max_outstanding_requests; + m_deadlock_threshold = p->deadlock_threshold; + + assert(m_max_outstanding_requests > 0); + assert(m_deadlock_threshold > 0); + assert(m_instCache_ptr); + assert(m_dataCache_ptr); + + m_data_cache_hit_latency = p->dcache_hit_latency; + + m_usingNetworkTester = p->using_network_tester; + assumingRfOCoherence = p->assume_rfo; +} + +GPUCoalescer::~GPUCoalescer() +{ +} + +void +GPUCoalescer::wakeup() +{ + // Check for deadlock of any of the requests + Cycles current_time = curCycle(); + + // Check across all outstanding requests + int total_outstanding = 0; + + RequestTable::iterator read = m_readRequestTable.begin(); + RequestTable::iterator read_end = m_readRequestTable.end(); + for (; read != read_end; ++read) { + GPUCoalescerRequest* request = read->second; + if (current_time - 
request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_readRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_readRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time)*clockPeriod()); + } + + RequestTable::iterator write = m_writeRequestTable.begin(); + RequestTable::iterator write_end = m_writeRequestTable.end(); + for (; write != write_end; ++write) { + GPUCoalescerRequest* request = write->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_writeRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_writeRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time) * clockPeriod()); + } + + total_outstanding += m_writeRequestTable.size(); + total_outstanding += m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + if (m_outstanding_count > 0) { + // If there are still outstanding requests, keep checking + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } +} + +void +GPUCoalescer::resetStats() +{ + m_latencyHist.reset(); + m_missLatencyHist.reset(); + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist[i]->reset(); + m_missTypeLatencyHist[i]->reset(); + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i][j]->reset(); + } + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist[i]->reset(); + + m_IssueToInitialDelayHist[i]->reset(); + m_InitialToForwardDelayHist[i]->reset(); + m_ForwardToFirstResponseDelayHist[i]->reset(); + m_FirstResponseToCompletionDelayHist[i]->reset(); + } +} + +void +GPUCoalescer::printProgress(ostream& out) const +{ +} + +RequestStatus +GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) +{ + Addr line_addr = makeLineAddress(pkt->getAddr()); + + if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { + return RequestStatus_BufferFull; + } + + if(m_controller->isBlocked(line_addr) && + request_type != RubyRequestType_Locked_RMW_Write) { + return RequestStatus_Aliased; + } + + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + // Check if there is any outstanding read request for the same + // cache line. + if (m_readRequestTable.count(line_addr) > 0) { + m_store_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + + if (m_writeRequestTable.count(line_addr) > 0) { + // There is an outstanding write request for the cache line + m_store_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + } else { + // Check if there is any outstanding write request for the same + // cache line. 
+ if (m_writeRequestTable.count(line_addr) > 0) { + m_load_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + + if (m_readRequestTable.count(line_addr) > 0) { + // There is an outstanding read request for the cache line + m_load_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + } + + return RequestStatus_Ready; + +} + + + +// sets the kernelEndList +void +GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) +{ + // Don't know if this will happen or is possible + // but I just want to be careful and not have it become + // simulator hang in the future + DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); + assert(kernelEndList.count(wavefront_id) == 0); + + kernelEndList[wavefront_id] = pkt; + DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", + kernelEndList.size()); +} + + +// Insert the request on the correct request table. Return true if +// the entry was already present. +bool +GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) +{ + assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || + pkt->req->isLockedRMW() || + !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); + + int total_outstanding M5_VAR_USED = + m_writeRequestTable.size() + m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + // See if we should schedule a deadlock check + if (deadlockCheckEvent.scheduled() == false) { + schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + pair<RequestTable::iterator, bool> r = + m_writeRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting write request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } else { + pair<RequestTable::iterator, bool> r = + m_readRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting read request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } + + m_outstandReqHist.sample(m_outstanding_count); + + total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); + assert(m_outstanding_count == total_outstanding); + + return false; +} + +void +GPUCoalescer::markRemoved() +{ + m_outstanding_count--; + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); +} + +void +GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) +{ + assert(m_outstanding_count == + m_writeRequestTable.size() + 
m_readRequestTable.size()); + + Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); + if ((srequest->m_type == RubyRequestType_ST) || + (srequest->m_type == RubyRequestType_RMW_Read) || + (srequest->m_type == RubyRequestType_RMW_Write) || + (srequest->m_type == RubyRequestType_Load_Linked) || + (srequest->m_type == RubyRequestType_Store_Conditional) || + (srequest->m_type == RubyRequestType_Locked_RMW_Read) || + (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { + m_writeRequestTable.erase(line_addr); + } else { + m_readRequestTable.erase(line_addr); + } + + markRemoved(); +} + +bool +GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) +{ + // + // The success flag indicates whether the LLSC operation was successful. + // LL ops will always succeed, but SC may fail if the cache line is no + // longer locked. + // + bool success = true; + if (request->m_type == RubyRequestType_Store_Conditional) { + if (!m_dataCache_ptr->isLocked(address, m_version)) { + // + // For failed SC requests, indicate the failure to the cpu by + // setting the extra data to zero. + // + request->pkt->req->setExtraData(0); + success = false; + } else { + // + // For successful SC requests, indicate the success to the cpu by + // setting the extra data to one. + // + request->pkt->req->setExtraData(1); + } + // + // Independent of success, all SC operations must clear the lock + // + m_dataCache_ptr->clearLocked(address); + } else if (request->m_type == RubyRequestType_Load_Linked) { + // + // Note: To fully follow Alpha LLSC semantics, should the LL clear any + // previously locked cache lines? + // + m_dataCache_ptr->setLocked(address, m_version); + } else if ((m_dataCache_ptr->isTagPresent(address)) && + (m_dataCache_ptr->isLocked(address, m_version))) { + // + // Normal writes should clear the locked address + // + m_dataCache_ptr->clearLocked(address); + } + return success; +} + +void +GPUCoalescer::writeCallback(Addr address, DataBlock& data) +{ + writeCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + writeCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_ST) || + (request->m_type == RubyRequestType_ATOMIC) || + (request->m_type == RubyRequestType_ATOMIC_RETURN) || + (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request->m_type == RubyRequestType_RMW_Read) || + (request->m_type == RubyRequestType_RMW_Write) || + (request->m_type == RubyRequestType_Load_Linked) || + (request->m_type == RubyRequestType_Store_Conditional) || + (request->m_type == 
RubyRequestType_Locked_RMW_Read) || + (request->m_type == RubyRequestType_Locked_RMW_Write) || + (request->m_type == RubyRequestType_FLUSH)); + + + // + // For Alpha, properly handle LL, SC, and write requests with respect to + // locked cache blocks. + // + // Not valid for Network_test protocl + // + bool success = true; + if(!m_usingNetworkTester) + success = handleLlsc(address, request); + + if (request->m_type == RubyRequestType_Locked_RMW_Read) { + m_controller->blockOnQueue(address, m_mandatory_q_ptr); + } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { + m_controller->unblock(address); + } + + hitCallback(request, mach, data, success, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::readCallback(Addr address, DataBlock& data) +{ + readCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + + readCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + assert(m_readRequestTable.count(makeLineAddress(address))); + + DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_readRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_IFETCH)); + + hitCallback(request, mach, data, true, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(request_address); + + RubyRequestType type = srequest->m_type; + + // Set this cache entry to the most recently used + if (type == RubyRequestType_IFETCH) { + if (m_instCache_ptr->isTagPresent(request_line_address)) + m_instCache_ptr->setMRU(request_line_address); + } else { + if (m_dataCache_ptr->isTagPresent(request_line_address)) + m_dataCache_ptr->setMRU(request_line_address); + } + + recordMissLatency(srequest, mach, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + success, isRegion); + // update the data + // + // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = pkt->getAddr(); + request_line_address = makeLineAddress(pkt->getAddr()); + if (pkt->getPtr<uint8_t>()) { + if ((type == RubyRequestType_LD) || + (type == RubyRequestType_ATOMIC) || + (type == 
RubyRequestType_ATOMIC_RETURN) || + (type == RubyRequestType_IFETCH) || + (type == RubyRequestType_RMW_Read) || + (type == RubyRequestType_Locked_RMW_Read) || + (type == RubyRequestType_Load_Linked)) { + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + data.setData(pkt->getPtr<uint8_t>(), + getOffset(request_address), pkt->getSize()); + } + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. + if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + + + completeHitCallback(mylist, len); +} + +bool +GPUCoalescer::empty() const +{ + return m_writeRequestTable.empty() && m_readRequestTable.empty(); +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +GPUCoalescer::makeRequest(PacketPtr pkt) +{ + // Check for GPU Barrier Kernel End or Kernel Begin + // Leave these to be handled by the child class + // Kernel End/Barrier = isFlush + isRelease + // Kernel Begin = isFlush + isAcquire + if (pkt->req->isKernel()) { + if (pkt->req->isAcquire()){ + // This is a Kernel Begin leave handling to + // virtual xCoalescer::makeRequest + return RequestStatus_Issued; + }else if(pkt->req->isRelease()) { + // This is a Kernel End leave handling to + // virtual xCoalescer::makeRequest + // If we are here then we didn't call + // a virtual version of this function + // so we will also schedule the callback + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } + } + + // If number of outstanding requests greater than the max allowed, + // return RequestStatus_BufferFull. This logic can be extended to + // support proper backpressure. + if (m_outstanding_count >= m_max_outstanding_requests) { + return RequestStatus_BufferFull; + } + + RubyRequestType primary_type = RubyRequestType_NULL; + RubyRequestType secondary_type = RubyRequestType_NULL; + + if (pkt->isLLSC()) { + // + // Alpha LL/SC instructions need to be handled carefully by the cache + // coherence protocol to ensure they follow the proper semantics. 
In + // particular, by identifying the operations as atomic, the protocol + // should understand that migratory sharing optimizations should not + // be performed (i.e. a load between the LL and SC should not steal + // away exclusive permission). + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Store_Conditional; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Load_Linked; + } + secondary_type = RubyRequestType_ATOMIC; + } else if (pkt->req->isLockedRMW()) { + // + // x86 locked instructions are translated to store cache coherence + // requests because these requests should always be treated as read + // exclusive operations and should leverage any migratory sharing + // optimization built into the protocol. + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Locked_RMW_Write; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Locked_RMW_Read; + } + secondary_type = RubyRequestType_ST; + } else if (pkt->isAtomicOp()) { + // + // GPU Atomic Operation + // + primary_type = RubyRequestType_ATOMIC; + secondary_type = RubyRequestType_ATOMIC; + } else { + if (pkt->isRead()) { + if (pkt->req->isInstFetch()) { + primary_type = secondary_type = RubyRequestType_IFETCH; + } else { +#if THE_ISA == X86_ISA + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & + (TheISA::StoreCheck << TheISA::FlagShift); +#else + bool storeCheck = false; +#endif // X86_ISA + if (storeCheck) { + primary_type = RubyRequestType_RMW_Read; + secondary_type = RubyRequestType_ST; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } + } + } else if (pkt->isWrite()) { + // + // Note: M5 packets do not differentiate ST from RMW_Write + // + primary_type = secondary_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. + // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } else { + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; + } + } else { + panic("Unsupported ruby packet type\n"); + } + } + + // Check if there is any pending request to this cache line from + // previous cycles. + // If there is a pending request, return aliased. Since coalescing + // across time is not permitted, aliased requests are not coalesced. + // If a request for this address has already been issued, we must block + RequestStatus status = getRequestStatus(pkt, primary_type); + if (status != RequestStatus_Ready) + return status; + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Check if this request can be coalesced with previous + // requests from this cycle. + if (!reqCoalescer.count(line_addr)) { + // This is the first access to this cache line. 
+ // A new request to the memory subsystem has to be + // made in the next cycle for this cache line, so + // add this line addr to the "newRequests" queue + newRequests.push_back(line_addr); + + // There was a request to this cache line in this cycle, + // let us see if we can coalesce this request with the previous + // requests from this cycle + } else if (primary_type != + reqCoalescer[line_addr][0].second[PrimaryType]) { + // can't coalesce loads, stores and atomics! + return RequestStatus_Aliased; + } else if (pkt->req->isLockedRMW() || + reqCoalescer[line_addr][0].first->req->isLockedRMW()) { + // can't coalesce locked accesses, but can coalesce atomics! + return RequestStatus_Aliased; + } else if (pkt->req->hasContextId() && pkt->req->isRelease() && + pkt->req->contextId() != + reqCoalescer[line_addr][0].first->req->contextId()) { + // can't coalesce releases from different wavefronts + return RequestStatus_Aliased; + } + + // in addition to the packet, we need to save both request types + reqCoalescer[line_addr].push_back( + RequestDesc(pkt, std::vector<RubyRequestType>()) ); + reqCoalescer[line_addr].back().second.push_back(primary_type); + reqCoalescer[line_addr].back().second.push_back(secondary_type); + if (!issueEvent.scheduled()) + schedule(issueEvent, curTick()); + // TODO: issue hardware prefetches here + return RequestStatus_Issued; +} + +void +GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +{ + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + // At the moment setting scopes only counts + // for GPU spill space accesses + // which is pkt->req->isStack() + // this scope is REPLACE since it + // does not need to be flushed at the end + // of a kernel Private and local may need + // to be visible at the end of the kernel + HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); + HSAScope accessScope = reqScopeToHSAScope(pkt->req); + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
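// Illustrative only (addresses and packet names are made up): the state the
// coalescing structures reach after two loads to the same 64-byte line arrive
// in the same cycle, following the makeRequest() rules above.
//
//     reqCoalescer[0x1000] = {
//         { pktA, { RubyRequestType_LD, RubyRequestType_LD } },  // [PrimaryType, SecondaryType]
//         { pktB, { RubyRequestType_LD, RubyRequestType_LD } },
//     };
//     newRequests = { 0x1000 };   // only the first access to the line queues an issue
//
// completeIssue() later builds one RubyRequest for 0x1000 from entry [0], and
// hitCallback()/atomicCallback() walk every entry so each coalesced packet is
// satisfied from the same DataBlock.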
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector<bool> accessMask(blockSize,false); + std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps; + uint32_t tableSize = reqCoalescer[line_addr].size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = reqCoalescer[line_addr][i].first; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if(tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr<uint8_t>(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; + } + } + std::shared_ptr<RubyRequest> msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, + accessScope, accessSegment); + } else { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, + accessScope, accessSegment); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(secondary_type)); + + fatal_if(secondary_type == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + // Send the message to the cache controller + fatal_if(m_data_cache_hit_latency == 0, + "should not have a latency of zero"); + + assert(m_mandatory_q_ptr); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +} + +template <class KEY, class VALUE> +std::ostream & +operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map) +{ + out << "["; + for (auto i = map.begin(); i != map.end(); ++i) + out << " " << i->first << "=" << i->second; + out << " ]"; + + return out; +} + +void +GPUCoalescer::print(ostream& out) const +{ + out << "[GPUCoalescer: " << m_version + << ", outstanding requests: " << m_outstanding_count + << ", read request table: " << m_readRequestTable + << ", write request table: " << m_writeRequestTable + << "]"; +} + +// this can be called from setState whenever coherence permissions are +// upgraded when invoked, coherence violations will be checked for the +// given block +void +GPUCoalescer::checkCoherence(Addr addr) +{ +#ifdef CHECK_COHERENCE + m_ruby_system->checkGlobalCoherenceInvariant(addr); +#endif +} + +void +GPUCoalescer::recordRequestType(SequencerRequestType requestType) { + DPRINTF(RubyStats, "Recorded statistic: %s\n", + SequencerRequestType_to_string(requestType)); +} + +GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq) + : Event(Progress_Event_Pri), seq(_seq) +{ +} + + +void +GPUCoalescer::completeIssue() +{ + // newRequests has the cacheline addresses of all the + // requests which need to be issued to the memory subsystem + // in this cycle + int len = newRequests.size(); + DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); + for (int i = 0; i < len; ++i) { + // Get the requests from reqCoalescer table. 
Get only the + // first request for each cacheline, the remaining requests + // can be coalesced with the first request. So, only + // one request is issued per cacheline. + RequestDesc info = reqCoalescer[newRequests[i]][0]; + PacketPtr pkt = info.first; + DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", + i, pkt->req->getPaddr()); + // Insert this request to the read/writeRequestTables. These tables + // are used to track aliased requests in makeRequest subroutine + bool found = insertRequest(pkt, info.second[PrimaryType]); + + if (found) { + panic("GPUCoalescer::makeRequest should never be called if the " + "request is already outstanding\n"); + } + + // Issue request to ruby subsystem + issueRequest(pkt, info.second[SecondaryType]); + } + newRequests.clear(); + + // have Kernel End releases been issued this cycle + len = newKernelEnds.size(); + for (int i = 0; i < len; i++) { + kernelCallback(newKernelEnds[i]); + } + newKernelEnds.clear(); +} + +void +GPUCoalescer::IssueEvent::process() +{ + seq->completeIssue(); +} + +const char * +GPUCoalescer::IssueEvent::description() const +{ + return "Issue coalesced request"; +} + +void +GPUCoalescer::evictionCallback(Addr address) +{ + ruby_eviction_callback(address); +} + +void +GPUCoalescer::kernelCallback(int wavefront_id) +{ + assert(kernelEndList.count(wavefront_id)); + + ruby_hit_callback(kernelEndList[wavefront_id]); + + kernelEndList.erase(wavefront_id); +} + +void +GPUCoalescer::atomicCallback(Addr address, + MachineType mach, + const DataBlock& data) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* srequest = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((srequest->m_type == RubyRequestType_ATOMIC) || + (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || + (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + + + // Atomics don't write to cache, so there is no MRU update... + + recordMissLatency(srequest, mach, + srequest->issue_time, Cycles(0), Cycles(0), true, false); + + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(pkt->getAddr()); + + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(srequest->m_type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = (pkt->getAddr()); + request_line_address = makeLineAddress(request_address); + if (pkt->getPtr<uint8_t>() && + srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { + /* atomics are done in memory, and return the data *before* the atomic op... */ + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(srequest->m_type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. 
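// For reference, the kernel-end / fence completion path pieced together from
// the code above (no new mechanism assumed, only the existing call chain):
//
//     makeRequest(pkt)                      // pkt->req->isKernel() && isRelease()
//       -> insertKernel(wf_id, pkt)         // kernelEndList[wf_id] = pkt
//       -> newKernelEnds.push_back(wf_id)
//       -> schedule(issueEvent, curTick())
//     completeIssue()
//       -> kernelCallback(wf_id)            // for every id queued this cycle
//            -> ruby_hit_callback(kernelEndList[wf_id])
//            -> kernelEndList.erase(wf_id)
//
// With assume_rfo set, makeRequest() routes memory fences down the same path,
// which its comment flags as a temporary reuse of the kernel-end mechanism.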
+ if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + completeHitCallback(mylist, len); +} + +void +GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPLdHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPLdTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCLdHits++; + } else { + CP_LdMiss++; + } +} + +void +GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPStHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPStTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCStHits++; + } else { + CP_StMiss++; + } +} + +void +GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len) +{ + for (int i = 0; i < len; ++i) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(mylist[i]->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + mylist[i]->senderState = ss->predecessor; + delete ss; + port->hitCallback(mylist[i]); + trySendRetries(); + } + + testDrainComplete(); +} + +PacketPtr +GPUCoalescer::mapAddrToPkt(Addr address) +{ + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + return request->pkt; +} + +void +GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion) +{ + RubyRequestType type = srequest->m_type; + Cycles issued_time = srequest->issue_time; + Cycles completion_time = curCycle(); + assert(completion_time >= issued_time); + Cycles total_lat = completion_time - issued_time; + + // cache stats (valid for RfO protocol only) + if (mach == MachineType_TCP) { + if (type == RubyRequestType_LD) { + GPU_TCPLdHits++; + } else { + GPU_TCPStHits++; + } + } else if (mach == MachineType_L1Cache_wCC) { + if (type == RubyRequestType_LD) { + GPU_TCPLdTransfers++; + } else { + GPU_TCPStTransfers++; + } + } else if (mach == MachineType_TCC) { + if (type == RubyRequestType_LD) { + GPU_TCCLdHits++; + } else { + GPU_TCCStHits++; + } + } else { + if (type == RubyRequestType_LD) { + GPU_LdMiss++; + } else { + GPU_StMiss++; + } + } + + // Profile all access latency, even zero latency accesses + m_latencyHist.sample(total_lat); + m_typeLatencyHist[type]->sample(total_lat); + + // Profile the miss latency for all non-zero demand misses + if (total_lat != Cycles(0)) { + m_missLatencyHist.sample(total_lat); + m_missTypeLatencyHist[type]->sample(total_lat); + + if (mach != MachineType_NUM) { + m_missMachLatencyHist[mach]->sample(total_lat); + m_missTypeMachLatencyHist[type][mach]->sample(total_lat); + + if ((issued_time <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completion_time)) { + + m_IssueToInitialDelayHist[mach]->sample( + initialRequestTime - 
issued_time); + m_InitialToForwardDelayHist[mach]->sample( + forwardRequestTime - initialRequestTime); + m_ForwardToFirstResponseDelayHist[mach]->sample( + firstResponseTime - forwardRequestTime); + m_FirstResponseToCompletionDelayHist[mach]->sample( + completion_time - firstResponseTime); + } + } + + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", + curTick(), m_version, "Coal", + success ? "Done" : "SC_Failed", "", "", + printAddress(srequest->pkt->getAddr()), total_lat); +} + +void +GPUCoalescer::regStats() +{ + // These statistical variables are not for display. + // The profiler will collate these across different + // coalescers and display those collated statistics. + m_outstandReqHist.init(10); + m_latencyHist.init(10); + m_missLatencyHist.init(10); + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist.push_back(new Stats::Histogram()); + m_typeLatencyHist[i]->init(10); + + m_missTypeLatencyHist.push_back(new Stats::Histogram()); + m_missTypeLatencyHist[i]->init(10); + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist.push_back(new Stats::Histogram()); + m_missMachLatencyHist[i]->init(10); + + m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); + m_IssueToInitialDelayHist[i]->init(10); + + m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); + m_InitialToForwardDelayHist[i]->init(10); + + m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); + m_ForwardToFirstResponseDelayHist[i]->init(10); + + m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); + m_FirstResponseToCompletionDelayHist[i]->init(10); + } + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); + + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); + m_missTypeMachLatencyHist[i][j]->init(10); + } + } + + // GPU cache stats + GPU_TCPLdHits + .name(name() + ".gpu_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + GPU_TCPLdTransfers + .name(name() + ".gpu_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + GPU_TCCLdHits + .name(name() + ".gpu_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + GPU_LdMiss + .name(name() + ".gpu_ld_misses") + .desc("loads that miss in the GPU") + ; + + GPU_TCPStHits + .name(name() + ".gpu_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + GPU_TCPStTransfers + .name(name() + ".gpu_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + GPU_TCCStHits + .name(name() + ".gpu_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + GPU_StMiss + .name(name() + ".gpu_st_misses") + .desc("stores that miss in the GPU") + ; + + // CP cache stats + CP_TCPLdHits + .name(name() + ".cp_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + CP_TCPLdTransfers + .name(name() + ".cp_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + CP_TCCLdHits + .name(name() + ".cp_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + CP_LdMiss + .name(name() + ".cp_ld_misses") + .desc("loads that miss in the GPU") + ; + + CP_TCPStHits + .name(name() + ".cp_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + CP_TCPStTransfers + .name(name() + ".cp_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + CP_TCCStHits + .name(name() + ".cp_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + CP_StMiss + .name(name() + ".cp_st_misses") + .desc("stores that miss in the GPU") + ; +} diff --git 
a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh new file mode 100644 index 000000000..dbd47059c --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + +#include <iostream> +#include <unordered_map> + +#include "base/statistics.hh" +#include "mem/protocol/HSAScope.hh" +#include "mem/protocol/HSASegment.hh" +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/protocol/SequencerRequestType.hh" +#include "mem/request.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class RubyGPUCoalescerParams; + +HSAScope reqScopeToHSAScope(Request* req); +HSASegment reqSegmentToHSASegment(Request* req); + +struct GPUCoalescerRequest +{ + PacketPtr pkt; + RubyRequestType m_type; + Cycles issue_time; + + GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, + Cycles _issue_time) + : pkt(_pkt), m_type(_m_type), issue_time(_issue_time) + {} +}; + +std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); + +class GPUCoalescer : public RubyPort +{ + public: + typedef RubyGPUCoalescerParams Params; + GPUCoalescer(const Params *); + ~GPUCoalescer(); + + // Public Methods + void wakeup(); // Used only for deadlock detection + + void printProgress(std::ostream& out) const; + void resetStats(); + void collateStats(); + void regStats(); + + void writeCallback(Addr address, DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + /* atomics need their own callback because the data + might be const coming from SLICC */ + void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); + + void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); + void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); + + // Alternate implementations in VIPER Coalescer + virtual RequestStatus makeRequest(PacketPtr pkt); + + int outstandingCount() const { return m_outstanding_count; } + + bool + isDeadlockEventScheduled() const + { + return deadlockCheckEvent.scheduled(); + } + + void + descheduleDeadlockEvent() + { + deschedule(deadlockCheckEvent); + } + + bool empty() const; + + void print(std::ostream& out) const; + void checkCoherence(Addr address); + + void markRemoved(); + void removeRequest(GPUCoalescerRequest* request); + void evictionCallback(Addr address); + void completeIssue(); + + void insertKernel(int wavefront_id, PacketPtr pkt); + + void recordRequestType(SequencerRequestType requestType); + Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } + + Stats::Histogram& getLatencyHist() 
{ return m_latencyHist; } + Stats::Histogram& getTypeLatencyHist(uint32_t t) + { return *m_typeLatencyHist[t]; } + + Stats::Histogram& getMissLatencyHist() + { return m_missLatencyHist; } + Stats::Histogram& getMissTypeLatencyHist(uint32_t t) + { return *m_missTypeLatencyHist[t]; } + + Stats::Histogram& getMissMachLatencyHist(uint32_t t) const + { return *m_missMachLatencyHist[t]; } + + Stats::Histogram& + getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const + { return *m_missTypeMachLatencyHist[r][t]; } + + Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const + { return *m_IssueToInitialDelayHist[t]; } + + Stats::Histogram& + getInitialToForwardDelayHist(const MachineType t) const + { return *m_InitialToForwardDelayHist[t]; } + + Stats::Histogram& + getForwardRequestToFirstResponseHist(const MachineType t) const + { return *m_ForwardToFirstResponseDelayHist[t]; } + + Stats::Histogram& + getFirstResponseToCompletionDelayHist(const MachineType t) const + { return *m_FirstResponseToCompletionDelayHist[t]; } + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + bool tryCacheAccess(Addr addr, RubyRequestType type, + Addr pc, RubyAccessMode access_mode, + int size, DataBlock*& data_ptr); + // Alternate implementations in VIPER Coalescer + virtual void issueRequest(PacketPtr pkt, RubyRequestType type); + + void kernelCallback(int wavfront_id); + + void hitCallback(GPUCoalescerRequest* request, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + void recordMissLatency(GPUCoalescerRequest* request, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion); + void completeHitCallback(std::vector<PacketPtr> & mylist, int len); + PacketPtr mapAddrToPkt(Addr address); + + + RequestStatus getRequestStatus(PacketPtr pkt, + RubyRequestType request_type); + bool insertRequest(PacketPtr pkt, RubyRequestType request_type); + + bool handleLlsc(Addr address, GPUCoalescerRequest* request); + + // Private copy constructor and assignment operator + GPUCoalescer(const GPUCoalescer& obj); + GPUCoalescer& operator=(const GPUCoalescer& obj); + + class IssueEvent : public Event + { + private: + GPUCoalescer *seq; + public: + IssueEvent(GPUCoalescer *_seq); + void process(); + const char *description() const; + }; + + IssueEvent issueEvent; + + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + int m_max_outstanding_requests; + int m_deadlock_threshold; + + CacheMemory* m_dataCache_ptr; + CacheMemory* m_instCache_ptr; + + // The cache access latency for this GPU data cache. This is assessed at the + // beginning of each access. This should be very similar to the + // implementation in Sequencer() as this is very much like a Sequencer + Cycles m_data_cache_hit_latency; + + // We need to track both the primary and secondary request types. + // The secondary request type comprises a subset of RubyRequestTypes that + // are understood by the L1 Controller. A primary request type can be any + // RubyRequestType. 
+ enum {PrimaryType, SecondaryType}; + typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc; + typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable; + CoalescingTable reqCoalescer; + std::vector<Addr> newRequests; + + typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable; + RequestTable m_writeRequestTable; + RequestTable m_readRequestTable; + // Global outstanding request count, across all request tables + int m_outstanding_count; + bool m_deadlock_check_scheduled; + std::unordered_map<int, PacketPtr> kernelEndList; + std::vector<int> newKernelEnds; + + int m_store_waiting_on_load_cycles; + int m_store_waiting_on_store_cycles; + int m_load_waiting_on_store_cycles; + int m_load_waiting_on_load_cycles; + + bool m_usingNetworkTester; + + class GPUCoalescerWakeupEvent : public Event + { + private: + GPUCoalescer *m_GPUCoalescer_ptr; + + public: + GPUCoalescerWakeupEvent(GPUCoalescer *_seq) : + m_GPUCoalescer_ptr(_seq) {} + void process() { m_GPUCoalescer_ptr->wakeup(); } + const char *description() const + { + return "GPUCoalescer deadlock check"; + } + }; + + GPUCoalescerWakeupEvent deadlockCheckEvent; + bool assumingRfOCoherence; + + // m5 style stats for TCP hit/miss counts + Stats::Scalar GPU_TCPLdHits; + Stats::Scalar GPU_TCPLdTransfers; + Stats::Scalar GPU_TCCLdHits; + Stats::Scalar GPU_LdMiss; + + Stats::Scalar GPU_TCPStHits; + Stats::Scalar GPU_TCPStTransfers; + Stats::Scalar GPU_TCCStHits; + Stats::Scalar GPU_StMiss; + + Stats::Scalar CP_TCPLdHits; + Stats::Scalar CP_TCPLdTransfers; + Stats::Scalar CP_TCCLdHits; + Stats::Scalar CP_LdMiss; + + Stats::Scalar CP_TCPStHits; + Stats::Scalar CP_TCPStTransfers; + Stats::Scalar CP_TCCStHits; + Stats::Scalar CP_StMiss; + + //! Histogram for number of outstanding requests per cycle. + Stats::Histogram m_outstandReqHist; + + //! Histogram for holding latency profile of all requests. + Stats::Histogram m_latencyHist; + std::vector<Stats::Histogram *> m_typeLatencyHist; + + //! Histogram for holding latency profile of all requests that + //! miss in the controller connected to this sequencer. + Stats::Histogram m_missLatencyHist; + std::vector<Stats::Histogram *> m_missTypeLatencyHist; + + //! Histograms for profiling the latencies for requests that + //! required external messages. + std::vector<Stats::Histogram *> m_missMachLatencyHist; + std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist; + + //! Histograms for recording the breakdown of miss latency + std::vector<Stats::Histogram *> m_IssueToInitialDelayHist; + std::vector<Stats::Histogram *> m_InitialToForwardDelayHist; + std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; + std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist; +}; + +inline std::ostream& +operator<<(std::ostream& out, const GPUCoalescer& obj) +{ + obj.print(out); + out << std::flush; + return out; +} + +#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py new file mode 100644 index 000000000..0c19f875d --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from Sequencer import * + +class RubyGPUCoalescer(RubySequencer): + type = 'RubyGPUCoalescer' + cxx_class = 'GPUCoalescer' + cxx_header = "mem/ruby/system/GPUCoalescer.hh" + + # max_outstanding_requests = (wave front slots) x (wave front size) + max_outstanding_requests = Param.Int(40*64, + "max requests (incl. prefetches) outstanding") + assume_rfo = Param.Bool(True, "assume protocol implementes Read for " + "Ownership coherence"); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 5a5f528bb..bf4002126 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p) memSlavePort(csprintf("%s-mem-slave-port", name()), this, p->ruby_system->getAccessBackingStore(), -1, p->no_retry_on_stall), - gotAddrRanges(p->port_master_connection_count) + gotAddrRanges(p->port_master_connection_count), + m_isCPUSequencer(p->is_cpu_sequencer) { assert(m_version != -1); diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 07e0fde5a..6bd92b654 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -167,6 +167,8 @@ class RubyPort : public MemObject uint32_t getId() { return m_version; } DrainState drain() override; + bool isCPUSequencer() { return m_isCPUSequencer; } + protected: void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); @@ -218,6 +220,8 @@ class RubyPort : public MemObject // that should be called when the Sequencer becomes available after a stall. 
// std::vector<MemSlavePort *> retryList; + + bool m_isCPUSequencer; }; #endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index 1ecd2e098..e1717e519 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { - sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); + sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript index 8c5077362..b67311bca 100644 --- a/src/mem/ruby/system/SConscript +++ b/src/mem/ruby/system/SConscript @@ -33,12 +33,22 @@ Import('*') if env['PROTOCOL'] == 'None': Return() +if env['BUILD_GPU']: + SimObject('GPUCoalescer.py') SimObject('RubySystem.py') SimObject('Sequencer.py') +SimObject('WeightedLRUReplacementPolicy.py') +if env['BUILD_GPU']: + SimObject('VIPERCoalescer.py') Source('CacheRecorder.cc') Source('DMASequencer.cc') +if env['BUILD_GPU']: + Source('GPUCoalescer.cc') Source('RubyPort.cc') Source('RubyPortProxy.cc') Source('RubySystem.cc') Source('Sequencer.cc') +if env['BUILD_GPU']: + Source('VIPERCoalescer.cc') +Source('WeightedLRUPolicy.cc') diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 50418c700..c2727b41d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p) m_max_outstanding_requests = p->max_outstanding_requests; m_deadlock_threshold = p->deadlock_threshold; + m_coreId = p->coreid; // for tracking the two CorePair sequencers assert(m_max_outstanding_requests > 0); assert(m_deadlock_threshold > 0); assert(m_instCache_ptr != NULL); @@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) ContextID proc_id = pkt->req->hasContextId() ? pkt->req->contextId() : InvalidContextID; + ContextID core_id = coreId(); + // If valid, copy the pc to the ruby request Addr pc = 0; if (pkt->req->hasPC()) { @@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) nullptr : pkt->getPtr<uint8_t>(), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id); + PrefetchBit_No, proc_id, core_id); DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 47af7ea1e..2a2f49587 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -99,6 +99,7 @@ class Sequencer : public RubyPort void markRemoved(); void evictionCallback(Addr address); void invalidateSC(Addr address); + int coreId() const { return m_coreId; } void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } @@ -198,6 +199,8 @@ class Sequencer : public RubyPort Stats::Scalar m_load_waiting_on_store; Stats::Scalar m_load_waiting_on_load; + int m_coreId; + bool m_usingNetworkTester; //! Histogram for number of outstanding requests per cycle. 
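
The is_cpu_sequencer flag added to RubyPort above is what lets per-controller code tell CPU sequencers apart from GPU coalescers: RubySystem::makeCacheRecorder() now asks each controller for getCPUSequencer() and only records the ones that return non-NULL. The controller side is not shown in this patch, but a generated controller would presumably implement the hook along these lines (the controller class name and the m_sequencer_ptr member are illustrative only, not part of this change):

// Sketch, not from this patch: a possible getCPUSequencer() for a
// SLICC-generated L1 controller. m_sequencer_ptr is assumed to be the
// RubySequencer configured for this controller; a GPU-side controller,
// whose port is a GPUCoalescer rather than a Sequencer, would simply
// return NULL so callers such as RubySystem::makeCacheRecorder() skip it.
Sequencer*
L1Cache_Controller::getCPUSequencer() const
{
    if (m_sequencer_ptr != NULL && m_sequencer_ptr->isCPUSequencer()) {
        return m_sequencer_ptr;
    }
    return NULL;
}

Presumably the GPU coalescers are then configured with is_cpu_sequencer = False; the default added in Sequencer.py below is True, i.e. "connected to a cpu".
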
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 7c90eb29c..d6ee0aa2f 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -32,54 +32,58 @@ from m5.proxy import * from MemObject import MemObject class RubyPort(MemObject): - type = 'RubyPort' - abstract = True - cxx_header = "mem/ruby/system/RubyPort.hh" - version = Param.Int(0, "") + type = 'RubyPort' + abstract = True + cxx_header = "mem/ruby/system/RubyPort.hh" + version = Param.Int(0, "") - slave = VectorSlavePort("CPU slave port") - master = VectorMasterPort("CPU master port") - pio_master_port = MasterPort("Ruby mem master port") - mem_master_port = MasterPort("Ruby mem master port") - pio_slave_port = SlavePort("Ruby pio slave port") - mem_slave_port = SlavePort("Ruby memory port") + slave = VectorSlavePort("CPU slave port") + master = VectorMasterPort("CPU master port") + pio_master_port = MasterPort("Ruby mem master port") + mem_master_port = MasterPort("Ruby mem master port") + pio_slave_port = SlavePort("Ruby pio slave port") + mem_slave_port = SlavePort("Ruby memory port") - using_ruby_tester = Param.Bool(False, "") - no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") - support_data_reqs = Param.Bool(True, "data cache requests supported") - support_inst_reqs = Param.Bool(True, "inst cache requests supported") + using_ruby_tester = Param.Bool(False, "") + no_retry_on_stall = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") + support_data_reqs = Param.Bool(True, "data cache requests supported") + support_inst_reqs = Param.Bool(True, "inst cache requests supported") + is_cpu_sequencer = Param.Bool(True, "connected to a cpu") class RubyPortProxy(RubyPort): - type = 'RubyPortProxy' - cxx_header = "mem/ruby/system/RubyPortProxy.hh" + type = 'RubyPortProxy' + cxx_header = "mem/ruby/system/RubyPortProxy.hh" class RubySequencer(RubyPort): - type = 'RubySequencer' - cxx_class = 'Sequencer' - cxx_header = "mem/ruby/system/Sequencer.hh" + type = 'RubySequencer' + cxx_class = 'Sequencer' + cxx_header = "mem/ruby/system/Sequencer.hh" - icache = Param.RubyCache("") - dcache = Param.RubyCache("") - # Cache latencies currently assessed at the beginning of each access - # NOTE: Setting these values to a value greater than one will result in - # O3 CPU pipeline bubbles and negatively impact performance - # TODO: Latencies should be migrated into each top-level cache controller - icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") - dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") - max_outstanding_requests = Param.Int(16, - "max requests (incl. prefetches) outstanding") - deadlock_threshold = Param.Cycles(500000, - "max outstanding cycles for a request before deadlock/livelock declared") - using_network_tester = Param.Bool(False, "") + icache = Param.RubyCache("") + dcache = Param.RubyCache("") + # Cache latencies currently assessed at the beginning of each access + # NOTE: Setting these values to a value greater than one will result in + # O3 CPU pipeline bubbles and negatively impact performance + # TODO: Latencies should be migrated into each top-level cache controller + icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") + dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") + max_outstanding_requests = Param.Int(16, + "max requests (incl. 
prefetches) outstanding") + deadlock_threshold = Param.Cycles(500000, + "max outstanding cycles for a request before deadlock/livelock declared") + using_network_tester = Param.Bool(False, "") + # id used by protocols that support multiple sequencers per controller + # 99 is the dummy default value + coreid = Param.Int(99, "CorePair core id") class DMASequencer(MemObject): - type = 'DMASequencer' - cxx_header = "mem/ruby/system/DMASequencer.hh" + type = 'DMASequencer' + cxx_header = "mem/ruby/system/DMASequencer.hh" - version = Param.Int(0, "") - slave = SlavePort("Device slave port") - using_ruby_tester = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") + version = Param.Int(0, "") + slave = SlavePort("Device slave port") + using_ruby_tester = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc new file mode 100644 index 000000000..ca91f2723 --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/VIPERCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/VIPERCoalescer.hh" + +using namespace std; + +VIPERCoalescer * +VIPERCoalescerParams::create() +{ + return new VIPERCoalescer(this); +} + +VIPERCoalescer::VIPERCoalescer(const Params *p) + : GPUCoalescer(p) +{ + m_max_wb_per_cycle=p->max_wb_per_cycle; + m_max_inv_per_cycle=p->max_inv_per_cycle; + m_outstanding_inv = 0; + m_outstanding_wb = 0; +} + +VIPERCoalescer::~VIPERCoalescer() +{ +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +VIPERCoalescer::makeRequest(PacketPtr pkt) +{ + if (m_outstanding_wb | m_outstanding_inv) { + DPRINTF(GPUCoalescer, + "There are %d Writebacks and %d Invalidatons\n", + m_outstanding_wb, m_outstanding_inv); + } + // Are we in the middle of a release + if ((m_outstanding_wb) > 0) { + if (pkt->req->isKernel()) { + // Everythign is fine + // Barriers and Kernel End scan coalesce + // If it is a Kerenl Begin flush the cache + if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { + invL1(); + } + + if (pkt->req->isRelease()) { + insertKernel(pkt->req->contextId(), pkt); + } + + return RequestStatus_Issued; + } +// return RequestStatus_Aliased; + } else if (pkt->req->isKernel() && pkt->req->isRelease()) { + // Flush Dirty Data on Kernel End + // isKernel + isRelease + insertKernel(pkt->req->contextId(), pkt); + wbL1(); + if(m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + return RequestStatus_Issued; + } + RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt); + if (requestStatus!=RequestStatus_Issued) { + // Request not isssued + // enqueue Retry + DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n"); + return requestStatus; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + // Invalidate clean Data on Kernel Begin + // isKernel + isAcquire + invL1(); + } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { + // Deschedule the AtomicAcqRel and + // Flush and Invalidate the L1 cache + invwbL1(); + if (m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isRelease()) { + // Deschedule the StoreRel and + // Flush the L1 cache + wbL1(); + if 
(m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isAcquire()) { + // LoadAcq or AtomicAcq + // Invalidate the L1 cache + invL1(); + } + // Request was successful + if (m_outstanding_wb == 0) { + if (!issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); + schedule(issueEvent, curTick()); + } + } + return RequestStatus_Issued; +} + +void +VIPERCoalescer::wbCallback(Addr addr) +{ + m_outstanding_wb--; + // if L1 Flush Complete + // attemnpt to schedule issueEvent + assert(((int) m_outstanding_wb) >= 0); + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +void +VIPERCoalescer::invCallback(Addr addr) +{ + m_outstanding_inv--; + // if L1 Flush Complete + // attemnpt to schedule issueEvent + // This probably won't happen, since + // we dont wait on cache invalidations + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +/** + * Invalidate L1 cache (Acquire) + */ +void +VIPERCoalescer::invL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations outstanding before Cache Walk\n", + m_outstanding_inv); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + DPRINTF(GPUCoalescer, + "There are %d Invalidatons outstanding after Cache Walk\n", + m_outstanding_inv); +} + +/** + * Writeback L1 cache (Release) + */ +void +VIPERCoalescer::wbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding before Cache Walk\n", + m_outstanding_wb); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding after Cache Walk\n", + m_outstanding_wb); +} + +/** + * Invalidate and Writeback L1 cache (Acquire&Release) + */ +void +VIPERCoalescer::invwbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + // Walk the cache + for(int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + // Walk the cache + for(int i = 0; i< size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + 
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } +} diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh new file mode 100644 index 000000000..af6e44e7f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + +#include <iostream> + +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class VIPERCoalescerParams; + +class VIPERCoalescer : public GPUCoalescer +{ + public: + typedef VIPERCoalescerParams Params; + VIPERCoalescer(const Params *); + ~VIPERCoalescer(); + void wbCallback(Addr address); + void invCallback(Addr address); + RequestStatus makeRequest(PacketPtr pkt); + private: + void invL1(); + void wbL1(); + void invwbL1(); + uint64_t m_outstanding_inv; + uint64_t m_outstanding_wb; + uint64_t m_max_inv_per_cycle; + uint64_t m_max_wb_per_cycle; +}; +#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py new file mode 100644 index 000000000..05c74386f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from GPUCoalescer import * + +class VIPERCoalescer(RubyGPUCoalescer): + type = 'VIPERCoalescer' + cxx_class = 'VIPERCoalescer' + cxx_header = "mem/ruby/system/VIPERCoalescer.hh" + max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") + max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") + assume_rfo = False diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc new file mode 100644 index 000000000..5baa4d9a5 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Derek Hower + */ + +#include "mem/ruby/system/WeightedLRUPolicy.hh" + +WeightedLRUPolicy::WeightedLRUPolicy(const Params* p) + : AbstractReplacementPolicy(p), m_cache(p->cache) +{ + m_last_occ_ptr = new int*[m_num_sets]; + for(unsigned i = 0; i < m_num_sets; i++){ + m_last_occ_ptr[i] = new int[m_assoc]; + for(unsigned j = 0; j < m_assoc; j++){ + m_last_occ_ptr[i][j] = 0; + } + } +} + +WeightedLRUPolicy * +WeightedLRUReplacementPolicyParams::create() +{ + return new WeightedLRUPolicy(this); +} + +WeightedLRUPolicy::~WeightedLRUPolicy() +{ + if (m_last_occ_ptr != NULL){ + for (unsigned i = 0; i < m_num_sets; i++){ + if (m_last_occ_ptr[i] != NULL){ + delete[] m_last_occ_ptr[i]; + } + } + delete[] m_last_occ_ptr; + } +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; + m_last_occ_ptr[set][index] = occupancy; +} + +int64_t +WeightedLRUPolicy::getVictim(int64_t set) const +{ + Tick time, smallest_time; + int64_t smallest_index; + + smallest_index = 0; + smallest_time = m_last_ref_ptr[set][0]; + int smallest_weight = m_last_ref_ptr[set][0]; + + for (unsigned i = 1; i < m_assoc; i++) { + + int weight = m_last_occ_ptr[set][i]; + if (weight < smallest_weight) { + smallest_weight = weight; + smallest_index = i; + smallest_time = m_last_ref_ptr[set][i]; + } else if (weight == smallest_weight) { + time = m_last_ref_ptr[set][i]; + if (time < smallest_time) { + smallest_index = i; + smallest_time = time; + } + } + } + return smallest_index; +} diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh new file mode 100644 index 000000000..3150779b2 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.hh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ +#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ + +#include "mem/ruby/structures/AbstractReplacementPolicy.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "params/WeightedLRUReplacementPolicy.hh" + +/* Simple true LRU replacement policy */ + +class WeightedLRUPolicy : public AbstractReplacementPolicy +{ + public: + typedef WeightedLRUReplacementPolicyParams Params; + WeightedLRUPolicy(const Params* p); + ~WeightedLRUPolicy(); + + void touch(int64_t set, int64_t way, Tick time); + void touch(int64_t set, int64_t way, Tick time, int occupancy); + int64_t getVictim(int64_t set) const override; + + bool useOccupancy() const { return true; } + + CacheMemory * m_cache; + int **m_last_occ_ptr; +}; + +#endif // __MEM_RUBY_SYSTEM_WeightedLRUPolicy_HH__ diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py new file mode 100644 index 000000000..e7de33496 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2013-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Derek Hower +# + +from m5.params import * +from m5.proxy import * +from MemObject import MemObject +from ReplacementPolicy import ReplacementPolicy + +class WeightedLRUReplacementPolicy(ReplacementPolicy): + type = "WeightedLRUReplacementPolicy" + cxx_class = "WeightedLRUPolicy" + cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh" + cache = Param.RubyCache("")
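
Taken together, WeightedLRUPolicy evicts the way with the smallest occupancy value (the weight recorded by the four-argument touch() overload) and falls back to plain LRU time only to break ties. Note that getVictim() as shown initializes smallest_weight from m_last_ref_ptr[set][0]; judging from the rest of the loop, the intended source appears to be m_last_occ_ptr[set][0], and the sketch below follows that intent. A self-contained toy illustration of the selection rule, with made-up values:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // One cache set with four ways. occupancy[i] plays the role of
    // m_last_occ_ptr[set][i]; lastRef[i] plays m_last_ref_ptr[set][i].
    std::vector<int> occupancy = {4, 1, 1, 3};
    std::vector<uint64_t> lastRef = {100, 250, 90, 300};

    // Weighted-LRU rule: the smallest occupancy wins; ties go to the
    // least recently referenced way.
    std::size_t victim = 0;
    for (std::size_t i = 1; i < occupancy.size(); ++i) {
        if (occupancy[i] < occupancy[victim] ||
            (occupancy[i] == occupancy[victim] &&
             lastRef[i] < lastRef[victim])) {
            victim = i;
        }
    }
    std::cout << "victim way = " << victim << std::endl;  // prints 2
    return 0;
}

With these inputs way 2 is chosen: ways 1 and 2 tie on the smallest occupancy, and way 2 was referenced least recently.
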