1 files changed, 1397 insertions, 0 deletions
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
new file mode 100644
index 000000000..db279bd3a
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/GPUCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "debug/ProtocolTrace.hh"
+#include "debug/RubyPort.hh"
+#include "debug/RubyStats.hh"
+#include "gpu-compute/shader.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/DataBlock.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/RubyGPUCoalescer.hh"
+
+using namespace std;
+
+GPUCoalescer *
+RubyGPUCoalescerParams::create()
+{
+    return new GPUCoalescer(this);
+}
+
+HSAScope
+reqScopeToHSAScope(Request* req)
+{
+    HSAScope accessScope = HSAScope_UNSPECIFIED;
+    if (req->isScoped()) {
+        if (req->isWavefrontScope()) {
+            accessScope = HSAScope_WAVEFRONT;
+        } else if (req->isWorkgroupScope()) {
+            accessScope = HSAScope_WORKGROUP;
+        } else if (req->isDeviceScope()) {
+            accessScope = HSAScope_DEVICE;
+        } else if (req->isSystemScope()) {
+            accessScope = HSAScope_SYSTEM;
+        } else {
+            fatal("Bad scope type");
+        }
+    }
+    return accessScope;
+}
+
+HSASegment
+reqSegmentToHSASegment(Request* req)
+{
+    HSASegment accessSegment = HSASegment_GLOBAL;
+
+    if (req->isGlobalSegment()) {
+        accessSegment = HSASegment_GLOBAL;
+    } else if (req->isGroupSegment()) {
+        accessSegment = HSASegment_GROUP;
+    } else if (req->isPrivateSegment()) {
+        accessSegment = HSASegment_PRIVATE;
+    } else if (req->isKernargSegment()) {
+        accessSegment = HSASegment_KERNARG;
+    } else if (req->isReadonlySegment()) {
+        accessSegment = HSASegment_READONLY;
+    } else if (req->isSpillSegment()) {
+        accessSegment = HSASegment_SPILL;
+    } else if (req->isArgSegment()) {
+        accessSegment = HSASegment_ARG;
+    } else {
+        fatal("Bad segment type");
+    }
+
+    return accessSegment;
+}
+
+GPUCoalescer::GPUCoalescer(const Params *p)
+    : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
+{
+    m_store_waiting_on_load_cycles = 0;
+    m_store_waiting_on_store_cycles = 0;
+    m_load_waiting_on_store_cycles = 0;
+    m_load_waiting_on_load_cycles = 0;
+
+    m_outstanding_count = 0;
+
+    m_max_outstanding_requests = 0;
+    m_deadlock_threshold = 0;
+    m_instCache_ptr = nullptr;
+    m_dataCache_ptr = nullptr;
+
+    m_instCache_ptr = p->icache;
+    m_dataCache_ptr = p->dcache;
+    m_max_outstanding_requests = p->max_outstanding_requests;
+    m_deadlock_threshold = p->deadlock_threshold;
+
+    assert(m_max_outstanding_requests > 0);
+    assert(m_deadlock_threshold > 0);
+    assert(m_instCache_ptr);
+    assert(m_dataCache_ptr);
+
+    m_data_cache_hit_latency = p->dcache_hit_latency;
+
+    m_usingNetworkTester = p->using_network_tester;
+    assumingRfOCoherence = p->assume_rfo;
+}
+
+GPUCoalescer::~GPUCoalescer()
+{
+}
+
+void
+GPUCoalescer::wakeup()
+{
+    // Check for deadlock of any of the requests
+    Cycles current_time = curCycle();
+
+    // Check across all outstanding requests
+    int total_outstanding = 0;
+
+    RequestTable::iterator read = m_readRequestTable.begin();
+    RequestTable::iterator read_end = m_readRequestTable.end();
+    for (; read != read_end; ++read) {
+        GPUCoalescerRequest* request = read->second;
+        if (current_time - request->issue_time < m_deadlock_threshold)
+            continue;
+
+        panic("Possible Deadlock detected. Aborting!\n"
+             "version: %d request.paddr: 0x%x m_readRequestTable: %d "
+             "current time: %u issue_time: %d difference: %d\n", m_version,
+              request->pkt->getAddr(), m_readRequestTable.size(),
+              current_time * clockPeriod(), request->issue_time * clockPeriod(),
+              (current_time - request->issue_time)*clockPeriod());
+    }
+
+    RequestTable::iterator write = m_writeRequestTable.begin();
+    RequestTable::iterator write_end = m_writeRequestTable.end();
+    for (; write != write_end; ++write) {
+        GPUCoalescerRequest* request = write->second;
+        if (current_time - request->issue_time < m_deadlock_threshold)
+            continue;
+
+        panic("Possible Deadlock detected. Aborting!\n"
+             "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
+             "current time: %u issue_time: %d difference: %d\n", m_version,
+              request->pkt->getAddr(), m_writeRequestTable.size(),
+              current_time * clockPeriod(), request->issue_time * clockPeriod(),
+              (current_time - request->issue_time) * clockPeriod());
+    }
+
+    total_outstanding += m_writeRequestTable.size();
+    total_outstanding += m_readRequestTable.size();
+
+    assert(m_outstanding_count == total_outstanding);
+
+    if (m_outstanding_count > 0) {
+        // If there are still outstanding requests, keep checking
+        schedule(deadlockCheckEvent,
+                 m_deadlock_threshold * clockPeriod() +
+                 curTick());
+    }
+}
+
+void
+GPUCoalescer::resetStats()
+{
+    m_latencyHist.reset();
+    m_missLatencyHist.reset();
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_typeLatencyHist[i]->reset();
+        m_missTypeLatencyHist[i]->reset();
+        for (int j = 0; j < MachineType_NUM; j++) {
+            m_missTypeMachLatencyHist[i][j]->reset();
+        }
+    }
+
+    for (int i = 0; i < MachineType_NUM; i++) {
+        m_missMachLatencyHist[i]->reset();
+
+        m_IssueToInitialDelayHist[i]->reset();
+        m_InitialToForwardDelayHist[i]->reset();
+        m_ForwardToFirstResponseDelayHist[i]->reset();
+        m_FirstResponseToCompletionDelayHist[i]->reset();
+    }
+}
+
+void
+GPUCoalescer::printProgress(ostream& out) const
+{
+}
+
+RequestStatus
+GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
+{
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+
+    if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
+        return RequestStatus_BufferFull;
+    }
+
+    if(m_controller->isBlocked(line_addr) &&
+       request_type != RubyRequestType_Locked_RMW_Write) {
+        return RequestStatus_Aliased;
+    }
+
+    if ((request_type == RubyRequestType_ST) ||
+        (request_type == RubyRequestType_ATOMIC) ||
+        (request_type == RubyRequestType_ATOMIC_RETURN) ||
+        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+        (request_type == RubyRequestType_RMW_Read) ||
+        (request_type == RubyRequestType_RMW_Write) ||
+        (request_type == RubyRequestType_Load_Linked) ||
+        (request_type == RubyRequestType_Store_Conditional) ||
+        (request_type == RubyRequestType_Locked_RMW_Read) ||
+        (request_type == RubyRequestType_Locked_RMW_Write) ||
+        (request_type == RubyRequestType_FLUSH)) {
+
+        // Check if there is any outstanding read request for the same
+        // cache line.
+        if (m_readRequestTable.count(line_addr) > 0) {
+            m_store_waiting_on_load_cycles++;
+            return RequestStatus_Aliased;
+        }
+
+        if (m_writeRequestTable.count(line_addr) > 0) {
+          // There is an outstanding write request for the cache line
+          m_store_waiting_on_store_cycles++;
+          return RequestStatus_Aliased;
+        }
+    } else {
+        // Check if there is any outstanding write request for the same
+        // cache line.
+        if (m_writeRequestTable.count(line_addr) > 0) {
+            m_load_waiting_on_store_cycles++;
+            return RequestStatus_Aliased;
+        }
+
+        if (m_readRequestTable.count(line_addr) > 0) {
+            // There is an outstanding read request for the cache line
+            m_load_waiting_on_load_cycles++;
+            return RequestStatus_Aliased;
+        }
+    }
+
+    return RequestStatus_Ready;
+
+}
+
+
+
+// sets the kernelEndList
+void
+GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
+{
+    // Don't know if this will happen or is possible
+    // but I just want to be careful and not have it become
+    // simulator hang in the future
+    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
+    assert(kernelEndList.count(wavefront_id) == 0);
+
+    kernelEndList[wavefront_id] = pkt;
+    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
+            kernelEndList.size());
+}
+
+
+// Insert the request on the correct request table.  Return true if
+// the entry was already present.
+bool
+GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
+{
+    assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
+           pkt->req->isLockedRMW() ||
+           !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
+
+    int total_outstanding M5_VAR_USED =
+        m_writeRequestTable.size() + m_readRequestTable.size();
+
+    assert(m_outstanding_count == total_outstanding);
+
+    // See if we should schedule a deadlock check
+    if (deadlockCheckEvent.scheduled() == false) {
+        schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
+    }
+
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+    if ((request_type == RubyRequestType_ST) ||
+        (request_type == RubyRequestType_ATOMIC) ||
+        (request_type == RubyRequestType_ATOMIC_RETURN) ||
+        (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+        (request_type == RubyRequestType_RMW_Read) ||
+        (request_type == RubyRequestType_RMW_Write) ||
+        (request_type == RubyRequestType_Load_Linked) ||
+        (request_type == RubyRequestType_Store_Conditional) ||
+        (request_type == RubyRequestType_Locked_RMW_Read) ||
+        (request_type == RubyRequestType_Locked_RMW_Write) ||
+        (request_type == RubyRequestType_FLUSH)) {
+
+        pair<RequestTable::iterator, bool> r =
+          m_writeRequestTable.insert(RequestTable::value_type(line_addr,
+                                       (GPUCoalescerRequest*) NULL));
+        if (r.second) {
+            RequestTable::iterator i = r.first;
+            i->second = new GPUCoalescerRequest(pkt, request_type,
+                                                curCycle());
+            DPRINTF(GPUCoalescer,
+                    "Inserting write request for paddr %#x for type %d\n",
+                    pkt->req->getPaddr(), i->second->m_type);
+            m_outstanding_count++;
+        } else {
+            return true;
+        }
+    } else {
+        pair<RequestTable::iterator, bool> r =
+            m_readRequestTable.insert(RequestTable::value_type(line_addr,
+                                        (GPUCoalescerRequest*) NULL));
+
+        if (r.second) {
+            RequestTable::iterator i = r.first;
+            i->second = new GPUCoalescerRequest(pkt, request_type,
+                                             curCycle());
+            DPRINTF(GPUCoalescer,
+                    "Inserting read request for paddr %#x for type %d\n",
+                    pkt->req->getPaddr(), i->second->m_type);
+            m_outstanding_count++;
+        } else {
+            return true;
+        }
+    }
+
+    m_outstandReqHist.sample(m_outstanding_count);
+
+    total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
+    assert(m_outstanding_count == total_outstanding);
+
+    return false;
+}
+
+void
+GPUCoalescer::markRemoved()
+{
+    m_outstanding_count--;
+    assert(m_outstanding_count ==
+           m_writeRequestTable.size() + m_readRequestTable.size());
+}
+
+void
+GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
+{
+    assert(m_outstanding_count ==
+           m_writeRequestTable.size() + m_readRequestTable.size());
+
+    Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
+    if ((srequest->m_type == RubyRequestType_ST) ||
+        (srequest->m_type == RubyRequestType_RMW_Read) ||
+        (srequest->m_type == RubyRequestType_RMW_Write) ||
+        (srequest->m_type == RubyRequestType_Load_Linked) ||
+        (srequest->m_type == RubyRequestType_Store_Conditional) ||
+        (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
+        (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
+        m_writeRequestTable.erase(line_addr);
+    } else {
+        m_readRequestTable.erase(line_addr);
+    }
+
+    markRemoved();
+}
+
+bool
+GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
+{
+    //
+    // The success flag indicates whether the LLSC operation was successful.
+    // LL ops will always succeed, but SC may fail if the cache line is no
+    // longer locked.
+    //
+    bool success = true;
+    if (request->m_type == RubyRequestType_Store_Conditional) {
+        if (!m_dataCache_ptr->isLocked(address, m_version)) {
+            //
+            // For failed SC requests, indicate the failure to the cpu by
+            // setting the extra data to zero.
+            //
+            request->pkt->req->setExtraData(0);
+            success = false;
+        } else {
+            //
+            // For successful SC requests, indicate the success to the cpu by
+            // setting the extra data to one.
+            //
+            request->pkt->req->setExtraData(1);
+        }
+        //
+        // Independent of success, all SC operations must clear the lock
+        //
+        m_dataCache_ptr->clearLocked(address);
+    } else if (request->m_type == RubyRequestType_Load_Linked) {
+        //
+        // Note: To fully follow Alpha LLSC semantics, should the LL clear any
+        // previously locked cache lines?
+        //
+        m_dataCache_ptr->setLocked(address, m_version);
+    } else if ((m_dataCache_ptr->isTagPresent(address)) &&
+               (m_dataCache_ptr->isLocked(address, m_version))) {
+        //
+        // Normal writes should clear the locked address
+        //
+        m_dataCache_ptr->clearLocked(address);
+    }
+    return success;
+}
+
+void
+GPUCoalescer::writeCallback(Addr address, DataBlock& data)
+{
+    writeCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+                         MachineType mach,
+                         DataBlock& data)
+{
+    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+                         MachineType mach,
+                         DataBlock& data,
+                         Cycles initialRequestTime,
+                         Cycles forwardRequestTime,
+                         Cycles firstResponseTime)
+{
+    writeCallback(address, mach, data,
+                  initialRequestTime, forwardRequestTime, firstResponseTime,
+                  false);
+}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+                         MachineType mach,
+                         DataBlock& data,
+                         Cycles initialRequestTime,
+                         Cycles forwardRequestTime,
+                         Cycles firstResponseTime,
+                         bool isRegion)
+{
+    assert(address == makeLineAddress(address));
+
+    DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
+    assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+    RequestTable::iterator i = m_writeRequestTable.find(address);
+    assert(i != m_writeRequestTable.end());
+    GPUCoalescerRequest* request = i->second;
+
+    m_writeRequestTable.erase(i);
+    markRemoved();
+
+    assert((request->m_type == RubyRequestType_ST) ||
+           (request->m_type == RubyRequestType_ATOMIC) ||
+           (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
+           (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+           (request->m_type == RubyRequestType_RMW_Read) ||
+           (request->m_type == RubyRequestType_RMW_Write) ||
+           (request->m_type == RubyRequestType_Load_Linked) ||
+           (request->m_type == RubyRequestType_Store_Conditional) ||
+           (request->m_type == RubyRequestType_Locked_RMW_Read) ||
+           (request->m_type == RubyRequestType_Locked_RMW_Write) ||
+           (request->m_type == RubyRequestType_FLUSH));
+
+
+    //
+    // For Alpha, properly handle LL, SC, and write requests with respect to
+    // locked cache blocks.
+    //
+    // Not valid for Network_test protocl
+    //
+    bool success = true;
+    if(!m_usingNetworkTester)
+        success = handleLlsc(address, request);
+
+    if (request->m_type == RubyRequestType_Locked_RMW_Read) {
+        m_controller->blockOnQueue(address, m_mandatory_q_ptr);
+    } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
+        m_controller->unblock(address);
+    }
+
+    hitCallback(request, mach, data, success,
+                request->issue_time, forwardRequestTime, firstResponseTime,
+                isRegion);
+}
+
+void
+GPUCoalescer::readCallback(Addr address, DataBlock& data)
+{
+    readCallback(address, MachineType_NULL, data);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+                        MachineType mach,
+                        DataBlock& data)
+{
+    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+                        MachineType mach,
+                        DataBlock& data,
+                        Cycles initialRequestTime,
+                        Cycles forwardRequestTime,
+                        Cycles firstResponseTime)
+{
+
+    readCallback(address, mach, data,
+                 initialRequestTime, forwardRequestTime, firstResponseTime,
+                 false);
+}
+
+void
+GPUCoalescer::readCallback(Addr address,
+                        MachineType mach,
+                        DataBlock& data,
+                        Cycles initialRequestTime,
+                        Cycles forwardRequestTime,
+                        Cycles firstResponseTime,
+                        bool isRegion)
+{
+    assert(address == makeLineAddress(address));
+    assert(m_readRequestTable.count(makeLineAddress(address)));
+
+    DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
+    RequestTable::iterator i = m_readRequestTable.find(address);
+    assert(i != m_readRequestTable.end());
+    GPUCoalescerRequest* request = i->second;
+
+    m_readRequestTable.erase(i);
+    markRemoved();
+
+    assert((request->m_type == RubyRequestType_LD) ||
+           (request->m_type == RubyRequestType_IFETCH));
+
+    hitCallback(request, mach, data, true,
+                request->issue_time, forwardRequestTime, firstResponseTime,
+                isRegion);
+}
+
+void
+GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
+                       MachineType mach,
+                       DataBlock& data,
+                       bool success,
+                       Cycles initialRequestTime,
+                       Cycles forwardRequestTime,
+                       Cycles firstResponseTime,
+                       bool isRegion)
+{
+    PacketPtr pkt = srequest->pkt;
+    Addr request_address = pkt->getAddr();
+    Addr request_line_address = makeLineAddress(request_address);
+
+    RubyRequestType type = srequest->m_type;
+
+    // Set this cache entry to the most recently used
+    if (type == RubyRequestType_IFETCH) {
+        if (m_instCache_ptr->isTagPresent(request_line_address))
+            m_instCache_ptr->setMRU(request_line_address);
+    } else {
+        if (m_dataCache_ptr->isTagPresent(request_line_address))
+            m_dataCache_ptr->setMRU(request_line_address);
+    }
+
+    recordMissLatency(srequest, mach,
+                      initialRequestTime,
+                      forwardRequestTime,
+                      firstResponseTime,
+                      success, isRegion);
+    // update the data
+    //
+    // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER
+    int len = reqCoalescer[request_line_address].size();
+    std::vector<PacketPtr> mylist;
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+        assert(type ==
+               reqCoalescer[request_line_address][i].second[PrimaryType]);
+        request_address = pkt->getAddr();
+        request_line_address = makeLineAddress(pkt->getAddr());
+        if (pkt->getPtr<uint8_t>()) {
+            if ((type == RubyRequestType_LD) ||
+                (type == RubyRequestType_ATOMIC) ||
+                (type == RubyRequestType_ATOMIC_RETURN) ||
+                (type == RubyRequestType_IFETCH) ||
+                (type == RubyRequestType_RMW_Read) ||
+                (type == RubyRequestType_Locked_RMW_Read) ||
+                (type == RubyRequestType_Load_Linked)) {
+                memcpy(pkt->getPtr<uint8_t>(),
+                       data.getData(getOffset(request_address),
+                                    pkt->getSize()),
+                       pkt->getSize());
+            } else {
+                data.setData(pkt->getPtr<uint8_t>(),
+                             getOffset(request_address), pkt->getSize());
+            }
+        } else {
+            DPRINTF(MemoryAccess,
+                    "WARNING.  Data not transfered from Ruby to M5 for type " \
+                    "%s\n",
+                    RubyRequestType_to_string(type));
+        }
+
+        // If using the RubyTester, update the RubyTester sender state's
+        // subBlock with the recieved data.  The tester will later access
+        // this state.
+        // Note: RubyPort will access it's sender state before the
+        // RubyTester.
+        if (m_usingRubyTester) {
+            RubyPort::SenderState *requestSenderState =
+                safe_cast<RubyPort::SenderState*>(pkt->senderState);
+            RubyTester::SenderState* testerSenderState =
+                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+            testerSenderState->subBlock.mergeFrom(data);
+        }
+
+        mylist.push_back(pkt);
+    }
+    delete srequest;
+    reqCoalescer.erase(request_line_address);
+    assert(!reqCoalescer.count(request_line_address));
+
+
+
+    completeHitCallback(mylist, len);
+}
+
+bool
+GPUCoalescer::empty() const
+{
+    return m_writeRequestTable.empty() && m_readRequestTable.empty();
+}
+
+// Analyzes the packet to see if this request can be coalesced.
+// If request can be coalesced, this request is added to the reqCoalescer table
+// and makeRequest returns RequestStatus_Issued;
+// If this is the first request to a cacheline, request is added to both
+// newRequests queue and to the reqCoalescer table; makeRequest
+// returns RequestStatus_Issued.
+// If there is a pending request to this cacheline and this request
+// can't be coalesced, RequestStatus_Aliased is returned and
+// the packet needs to be reissued.
+RequestStatus
+GPUCoalescer::makeRequest(PacketPtr pkt)
+{
+    // Check for GPU Barrier Kernel End or Kernel Begin
+    // Leave these to be handled by the child class
+    // Kernel End/Barrier = isFlush + isRelease
+    // Kernel Begin = isFlush + isAcquire
+    if (pkt->req->isKernel()) {
+        if (pkt->req->isAcquire()){
+            // This is a Kernel Begin leave handling to
+            // virtual xCoalescer::makeRequest
+            return RequestStatus_Issued;
+        }else if(pkt->req->isRelease()) {
+            // This is a Kernel End leave handling to
+            // virtual xCoalescer::makeRequest
+            // If we are here then we didn't call
+            // a virtual version of this function
+            // so we will also schedule the callback
+            int wf_id = 0;
+            if (pkt->req->hasContextId()) {
+                wf_id = pkt->req->contextId();
+            }
+            insertKernel(wf_id, pkt);
+            newKernelEnds.push_back(wf_id);
+            if (!issueEvent.scheduled()) {
+                schedule(issueEvent, curTick());
+            }
+            return RequestStatus_Issued;
+        }
+    }
+
+    // If number of outstanding requests greater than the max allowed,
+    // return RequestStatus_BufferFull. This logic can be extended to
+    // support proper backpressure.
+    if (m_outstanding_count >= m_max_outstanding_requests) {
+        return RequestStatus_BufferFull;
+    }
+
+    RubyRequestType primary_type = RubyRequestType_NULL;
+    RubyRequestType secondary_type = RubyRequestType_NULL;
+
+    if (pkt->isLLSC()) {
+        //
+        // Alpha LL/SC instructions need to be handled carefully by the cache
+        // coherence protocol to ensure they follow the proper semantics. In
+        // particular, by identifying the operations as atomic, the protocol
+        // should understand that migratory sharing optimizations should not
+        // be performed (i.e. a load between the LL and SC should not steal
+        // away exclusive permission).
+        //
+        if (pkt->isWrite()) {
+            primary_type = RubyRequestType_Store_Conditional;
+        } else {
+            assert(pkt->isRead());
+            primary_type = RubyRequestType_Load_Linked;
+        }
+        secondary_type = RubyRequestType_ATOMIC;
+    } else if (pkt->req->isLockedRMW()) {
+        //
+        // x86 locked instructions are translated to store cache coherence
+        // requests because these requests should always be treated as read
+        // exclusive operations and should leverage any migratory sharing
+        // optimization built into the protocol.
+        //
+        if (pkt->isWrite()) {
+            primary_type = RubyRequestType_Locked_RMW_Write;
+        } else {
+            assert(pkt->isRead());
+            primary_type = RubyRequestType_Locked_RMW_Read;
+        }
+        secondary_type = RubyRequestType_ST;
+    } else if (pkt->isAtomicOp()) {
+        //
+        // GPU Atomic Operation
+        //
+        primary_type = RubyRequestType_ATOMIC;
+        secondary_type = RubyRequestType_ATOMIC;
+    } else {
+        if (pkt->isRead()) {
+            if (pkt->req->isInstFetch()) {
+                primary_type = secondary_type = RubyRequestType_IFETCH;
+            } else {
+#if THE_ISA == X86_ISA
+                uint32_t flags = pkt->req->getFlags();
+                bool storeCheck = flags &
+                        (TheISA::StoreCheck << TheISA::FlagShift);
+#else
+                bool storeCheck = false;
+#endif // X86_ISA
+                if (storeCheck) {
+                    primary_type = RubyRequestType_RMW_Read;
+                    secondary_type = RubyRequestType_ST;
+                } else {
+                    primary_type = secondary_type = RubyRequestType_LD;
+                }
+            }
+        } else if (pkt->isWrite()) {
+            //
+            // Note: M5 packets do not differentiate ST from RMW_Write
+            //
+            primary_type = secondary_type = RubyRequestType_ST;
+        } else if (pkt->isFlush()) {
+            primary_type = secondary_type = RubyRequestType_FLUSH;
+        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
+            if (assumingRfOCoherence) {
+                // If we reached here, this request must be a memFence
+                // and the protocol implements RfO, the coalescer can
+                // assume sequentially consistency and schedule the callback
+                // immediately.
+                // Currently the code implements fence callbacks
+                // by reusing the mechanism for kernel completions.
+                // This should be fixed.
+                int wf_id = 0;
+                if (pkt->req->hasContextId()) {
+                    wf_id = pkt->req->contextId();
+                }
+                insertKernel(wf_id, pkt);
+                newKernelEnds.push_back(wf_id);
+                if (!issueEvent.scheduled()) {
+                    schedule(issueEvent, curTick());
+                }
+                return RequestStatus_Issued;
+            } else {
+                // If not RfO, return issued here and let the child coalescer
+                // take care of it.
+                return RequestStatus_Issued;
+            }
+        } else {
+            panic("Unsupported ruby packet type\n");
+        }
+    }
+
+    // Check if there is any pending request to this cache line from
+    // previous cycles.
+    // If there is a pending request, return aliased. Since coalescing
+    // across time is not permitted, aliased requests are not coalesced.
+    // If a request for this address has already been issued, we must block
+    RequestStatus status = getRequestStatus(pkt, primary_type);
+    if (status != RequestStatus_Ready)
+        return status;
+
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+
+    // Check if this request can be coalesced with previous
+    // requests from this cycle.
+    if (!reqCoalescer.count(line_addr)) {
+        // This is the first access to this cache line.
+        // A new request to the memory subsystem has to be
+        // made in the next cycle for this cache line, so
+        // add this line addr to the "newRequests" queue
+        newRequests.push_back(line_addr);
+
+    // There was a request to this cache line in this cycle,
+    // let us see if we can coalesce this request with the previous
+    // requests from this cycle
+    } else if (primary_type !=
+               reqCoalescer[line_addr][0].second[PrimaryType]) {
+        // can't coalesce loads, stores and atomics!
+        return RequestStatus_Aliased;
+    } else if (pkt->req->isLockedRMW() ||
+               reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
+        // can't coalesce locked accesses, but can coalesce atomics!
+        return RequestStatus_Aliased;
+    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
+               pkt->req->contextId() !=
+               reqCoalescer[line_addr][0].first->req->contextId()) {
+        // can't coalesce releases from different wavefronts
+        return RequestStatus_Aliased;
+    }
+
+    // in addition to the packet, we need to save both request types
+    reqCoalescer[line_addr].push_back(
+            RequestDesc(pkt, std::vector<RubyRequestType>()) );
+    reqCoalescer[line_addr].back().second.push_back(primary_type);
+    reqCoalescer[line_addr].back().second.push_back(secondary_type);
+    if (!issueEvent.scheduled())
+        schedule(issueEvent, curTick());
+    // TODO: issue hardware prefetches here
+    return RequestStatus_Issued;
+}
+
+void
+GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
+{
+
+    int proc_id = -1;
+    if (pkt != NULL && pkt->req->hasContextId()) {
+        proc_id = pkt->req->contextId();
+    }
+
+    // If valid, copy the pc to the ruby request
+    Addr pc = 0;
+    if (pkt->req->hasPC()) {
+        pc = pkt->req->getPC();
+    }
+
+    // At the moment setting scopes only counts
+    // for GPU spill space accesses
+    // which is pkt->req->isStack()
+    // this scope is REPLACE since it
+    // does not need to be flushed at the end
+    // of a kernel Private and local may need
+    // to be visible at the end of the kernel
+    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
+    HSAScope accessScope = reqScopeToHSAScope(pkt->req);
+
+    Addr line_addr = makeLineAddress(pkt->getAddr());
+
+    // Creating WriteMask that records written bytes
+    // and atomic operations. This enables partial writes
+    // and partial reads of those writes
+    DataBlock dataBlock;
+    dataBlock.clear();
+    uint32_t blockSize = RubySystem::getBlockSizeBytes();
+    std::vector<bool> accessMask(blockSize,false);
+    std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
+    uint32_t tableSize = reqCoalescer[line_addr].size();
+    for (int i = 0; i < tableSize; i++) {
+        PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
+        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
+        uint32_t tmpSize = tmpPkt->getSize();
+        if (tmpPkt->isAtomicOp()) {
+            std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
+                                                        tmpPkt->getAtomicOp());
+            atomicOps.push_back(tmpAtomicOp);
+        } else if(tmpPkt->isWrite()) {
+            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
+                              tmpOffset, tmpSize);
+        }
+        for (int j = 0; j < tmpSize; j++) {
+            accessMask[tmpOffset + j] = true;
+        }
+    }
+    std::shared_ptr<RubyRequest> msg;
+    if (pkt->isAtomicOp()) {
+        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+                              pkt->getPtr<uint8_t>(),
+                              pkt->getSize(), pc, secondary_type,
+                              RubyAccessMode_Supervisor, pkt,
+                              PrefetchBit_No, proc_id, 100,
+                              blockSize, accessMask,
+                              dataBlock, atomicOps,
+                              accessScope, accessSegment);
+    } else {
+        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
+                              pkt->getPtr<uint8_t>(),
+                              pkt->getSize(), pc, secondary_type,
+                              RubyAccessMode_Supervisor, pkt,
+                              PrefetchBit_No, proc_id, 100,
+                              blockSize, accessMask,
+                              dataBlock,
+                              accessScope, accessSegment);
+    }
+    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
+             curTick(), m_version, "Coal", "Begin", "", "",
+             printAddress(msg->getPhysicalAddress()),
+             RubyRequestType_to_string(secondary_type));
+
+    fatal_if(secondary_type == RubyRequestType_IFETCH,
+             "there should not be any I-Fetch requests in the GPU Coalescer");
+
+    // Send the message to the cache controller
+    fatal_if(m_data_cache_hit_latency == 0,
+             "should not have a latency of zero");
+
+    assert(m_mandatory_q_ptr);
+    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
+}
+
+template <class KEY, class VALUE>
+std::ostream &
+operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
+{
+    out << "[";
+    for (auto i = map.begin(); i != map.end(); ++i)
+        out << " " << i->first << "=" << i->second;
+    out << " ]";
+
+    return out;
+}
+
+void
+GPUCoalescer::print(ostream& out) const
+{
+    out << "[GPUCoalescer: " << m_version
+        << ", outstanding requests: " << m_outstanding_count
+        << ", read request table: " << m_readRequestTable
+        << ", write request table: " << m_writeRequestTable
+        << "]";
+}
+
+// this can be called from setState whenever coherence permissions are
+// upgraded when invoked, coherence violations will be checked for the
+// given block
+void
+GPUCoalescer::checkCoherence(Addr addr)
+{
+#ifdef CHECK_COHERENCE
+    m_ruby_system->checkGlobalCoherenceInvariant(addr);
+#endif
+}
+
+void
+GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
+    DPRINTF(RubyStats, "Recorded statistic: %s\n",
+            SequencerRequestType_to_string(requestType));
+}
+
+GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
+    : Event(Progress_Event_Pri), seq(_seq)
+{
+}
+
+
+void
+GPUCoalescer::completeIssue()
+{
+    // newRequests has the cacheline addresses of all the
+    // requests which need to be issued to the memory subsystem
+    // in this cycle
+    int len = newRequests.size();
+    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
+    for (int i = 0; i < len; ++i) {
+        // Get the requests from reqCoalescer table. Get only the
+        // first request for each cacheline, the remaining requests
+        // can be coalesced with the first request. So, only
+        // one request is issued per cacheline.
+        RequestDesc info = reqCoalescer[newRequests[i]][0];
+        PacketPtr pkt = info.first;
+        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
+                i, pkt->req->getPaddr());
+        // Insert this request to the read/writeRequestTables. These tables
+        // are used to track aliased requests in makeRequest subroutine
+        bool found = insertRequest(pkt, info.second[PrimaryType]);
+
+        if (found) {
+            panic("GPUCoalescer::makeRequest should never be called if the "
+                  "request is already outstanding\n");
+        }
+
+        // Issue request to ruby subsystem
+        issueRequest(pkt, info.second[SecondaryType]);
+    }
+    newRequests.clear();
+
+    // have Kernel End releases been issued this cycle
+    len = newKernelEnds.size();
+    for (int i = 0; i < len; i++) {
+        kernelCallback(newKernelEnds[i]);
+    }
+    newKernelEnds.clear();
+}
+
+void
+GPUCoalescer::IssueEvent::process()
+{
+    seq->completeIssue();
+}
+
+const char *
+GPUCoalescer::IssueEvent::description() const
+{
+    return "Issue coalesced request";
+}
+
+void
+GPUCoalescer::evictionCallback(Addr address)
+{
+    ruby_eviction_callback(address);
+}
+
+void
+GPUCoalescer::kernelCallback(int wavefront_id)
+{
+    assert(kernelEndList.count(wavefront_id));
+
+    ruby_hit_callback(kernelEndList[wavefront_id]);
+
+    kernelEndList.erase(wavefront_id);
+}
+
+void
+GPUCoalescer::atomicCallback(Addr address,
+                             MachineType mach,
+                             const DataBlock& data)
+{
+    assert(address == makeLineAddress(address));
+
+    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
+    assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+    RequestTable::iterator i = m_writeRequestTable.find(address);
+    assert(i != m_writeRequestTable.end());
+    GPUCoalescerRequest* srequest = i->second;
+
+    m_writeRequestTable.erase(i);
+    markRemoved();
+
+    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
+           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
+           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+
+
+    // Atomics don't write to cache, so there is no MRU update...
+
+    recordMissLatency(srequest, mach,
+                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
+
+    PacketPtr pkt = srequest->pkt;
+    Addr request_address = pkt->getAddr();
+    Addr request_line_address = makeLineAddress(pkt->getAddr());
+
+    int len = reqCoalescer[request_line_address].size();
+    std::vector<PacketPtr> mylist;
+    for (int i = 0; i < len; ++i) {
+        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+        assert(srequest->m_type ==
+               reqCoalescer[request_line_address][i].second[PrimaryType]);
+        request_address = (pkt->getAddr());
+        request_line_address = makeLineAddress(request_address);
+        if (pkt->getPtr<uint8_t>() &&
+            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
+            /* atomics are done in memory, and return the data *before* the atomic op... */
+            memcpy(pkt->getPtr<uint8_t>(),
+                   data.getData(getOffset(request_address),
+                                pkt->getSize()),
+                   pkt->getSize());
+        } else {
+            DPRINTF(MemoryAccess,
+                    "WARNING.  Data not transfered from Ruby to M5 for type " \
+                    "%s\n",
+                    RubyRequestType_to_string(srequest->m_type));
+        }
+
+        // If using the RubyTester, update the RubyTester sender state's
+        // subBlock with the recieved data.  The tester will later access
+        // this state.
+        // Note: RubyPort will access it's sender state before the
+        // RubyTester.
+        if (m_usingRubyTester) {
+            RubyPort::SenderState *requestSenderState =
+                safe_cast<RubyPort::SenderState*>(pkt->senderState);
+            RubyTester::SenderState* testerSenderState =
+                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+            testerSenderState->subBlock.mergeFrom(data);
+        }
+
+        mylist.push_back(pkt);
+    }
+    delete srequest;
+    reqCoalescer.erase(request_line_address);
+    assert(!reqCoalescer.count(request_line_address));
+
+    completeHitCallback(mylist, len);
+}
+
+void
+GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if(myMachID == senderMachID) {
+        CP_TCPLdHits++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) {
+        CP_TCPLdTransfers++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) {
+        CP_TCCLdHits++;
+    } else {
+        CP_LdMiss++;
+    }
+}
+
+void
+GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if(myMachID == senderMachID) {
+        CP_TCPStHits++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) {
+        CP_TCPStTransfers++;
+    } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) {
+        CP_TCCStHits++;
+    } else {
+        CP_StMiss++;
+    }
+}
+
+void
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+{
+    for (int i = 0; i < len; ++i) {
+        RubyPort::SenderState *ss =
+            safe_cast<RubyPort::SenderState *>(mylist[i]->senderState);
+        MemSlavePort *port = ss->port;
+        assert(port != NULL);
+
+        mylist[i]->senderState = ss->predecessor;
+        delete ss;
+        port->hitCallback(mylist[i]);
+        trySendRetries();
+    }
+
+    testDrainComplete();
+}
+
+PacketPtr
+GPUCoalescer::mapAddrToPkt(Addr address)
+{
+    RequestTable::iterator i = m_readRequestTable.find(address);
+    assert(i != m_readRequestTable.end());
+    GPUCoalescerRequest* request = i->second;
+    return request->pkt;
+}
+
+void
+GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+                                MachineType mach,
+                                Cycles initialRequestTime,
+                                Cycles forwardRequestTime,
+                                Cycles firstResponseTime,
+                                bool success, bool isRegion)
+{
+    RubyRequestType type = srequest->m_type;
+    Cycles issued_time = srequest->issue_time;
+    Cycles completion_time = curCycle();
+    assert(completion_time >= issued_time);
+    Cycles total_lat = completion_time - issued_time;
+
+    // cache stats (valid for RfO protocol only)
+    if (mach == MachineType_TCP) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCPLdHits++;
+        } else {
+            GPU_TCPStHits++;
+        }
+    } else if (mach == MachineType_L1Cache_wCC) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCPLdTransfers++;
+        } else {
+            GPU_TCPStTransfers++;
+        }
+    } else if (mach == MachineType_TCC) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCCLdHits++;
+        } else {
+            GPU_TCCStHits++;
+        }
+    } else  {
+        if (type == RubyRequestType_LD) {
+            GPU_LdMiss++;
+        } else {
+            GPU_StMiss++;
+        }
+    }
+
+    // Profile all access latency, even zero latency accesses
+    m_latencyHist.sample(total_lat);
+    m_typeLatencyHist[type]->sample(total_lat);
+
+    // Profile the miss latency for all non-zero demand misses
+    if (total_lat != Cycles(0)) {
+        m_missLatencyHist.sample(total_lat);
+        m_missTypeLatencyHist[type]->sample(total_lat);
+
+        if (mach != MachineType_NUM) {
+            m_missMachLatencyHist[mach]->sample(total_lat);
+            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
+
+            if ((issued_time <= initialRequestTime) &&
+                (initialRequestTime <= forwardRequestTime) &&
+                (forwardRequestTime <= firstResponseTime) &&
+                (firstResponseTime <= completion_time)) {
+
+                m_IssueToInitialDelayHist[mach]->sample(
+                    initialRequestTime - issued_time);
+                m_InitialToForwardDelayHist[mach]->sample(
+                    forwardRequestTime - initialRequestTime);
+                m_ForwardToFirstResponseDelayHist[mach]->sample(
+                    firstResponseTime - forwardRequestTime);
+                m_FirstResponseToCompletionDelayHist[mach]->sample(
+                    completion_time - firstResponseTime);
+            }
+        }
+
+    }
+
+    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
+             curTick(), m_version, "Coal",
+             success ? "Done" : "SC_Failed", "", "",
+             printAddress(srequest->pkt->getAddr()), total_lat);
+}
+
+void
+GPUCoalescer::regStats()
+{
+    // These statistical variables are not for display.
+    // The profiler will collate these across different
+    // coalescers and display those collated statistics.
+    m_outstandReqHist.init(10);
+    m_latencyHist.init(10);
+    m_missLatencyHist.init(10);
+
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_typeLatencyHist.push_back(new Stats::Histogram());
+        m_typeLatencyHist[i]->init(10);
+
+        m_missTypeLatencyHist.push_back(new Stats::Histogram());
+        m_missTypeLatencyHist[i]->init(10);
+    }
+
+    for (int i = 0; i < MachineType_NUM; i++) {
+        m_missMachLatencyHist.push_back(new Stats::Histogram());
+        m_missMachLatencyHist[i]->init(10);
+
+        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
+        m_IssueToInitialDelayHist[i]->init(10);
+
+        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
+        m_InitialToForwardDelayHist[i]->init(10);
+
+        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
+        m_ForwardToFirstResponseDelayHist[i]->init(10);
+
+        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
+        m_FirstResponseToCompletionDelayHist[i]->init(10);
+    }
+
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
+
+        for (int j = 0; j < MachineType_NUM; j++) {
+            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
+            m_missTypeMachLatencyHist[i][j]->init(10);
+        }
+    }
+
+    // GPU cache stats
+    GPU_TCPLdHits
+        .name(name() + ".gpu_tcp_ld_hits")
+        .desc("loads that hit in the TCP")
+        ;
+    GPU_TCPLdTransfers
+        .name(name() + ".gpu_tcp_ld_transfers")
+        .desc("TCP to TCP load transfers")
+        ;
+    GPU_TCCLdHits
+        .name(name() + ".gpu_tcc_ld_hits")
+        .desc("loads that hit in the TCC")
+        ;
+    GPU_LdMiss
+        .name(name() + ".gpu_ld_misses")
+        .desc("loads that miss in the GPU")
+        ;
+
+    GPU_TCPStHits
+        .name(name() + ".gpu_tcp_st_hits")
+        .desc("stores that hit in the TCP")
+        ;
+    GPU_TCPStTransfers
+        .name(name() + ".gpu_tcp_st_transfers")
+        .desc("TCP to TCP store transfers")
+        ;
+    GPU_TCCStHits
+        .name(name() + ".gpu_tcc_st_hits")
+        .desc("stores that hit in the TCC")
+        ;
+    GPU_StMiss
+        .name(name() + ".gpu_st_misses")
+        .desc("stores that miss in the GPU")
+        ;
+
+    // CP cache stats
+    CP_TCPLdHits
+        .name(name() + ".cp_tcp_ld_hits")
+        .desc("loads that hit in the TCP")
+        ;
+    CP_TCPLdTransfers
+        .name(name() + ".cp_tcp_ld_transfers")
+        .desc("TCP to TCP load transfers")
+        ;
+    CP_TCCLdHits
+        .name(name() + ".cp_tcc_ld_hits")
+        .desc("loads that hit in the TCC")
+        ;
+    CP_LdMiss
+        .name(name() + ".cp_ld_misses")
+        .desc("loads that miss in the GPU")
+        ;
+
+    CP_TCPStHits
+        .name(name() + ".cp_tcp_st_hits")
+        .desc("stores that hit in the TCP")
+        ;
+    CP_TCPStTransfers
+        .name(name() + ".cp_tcp_st_transfers")
+        .desc("TCP to TCP store transfers")
+        ;
+    CP_TCCStHits
+        .name(name() + ".cp_tcc_st_hits")
+        .desc("stores that hit in the TCC")
+        ;
+    CP_StMiss
+        .name(name() + ".cp_st_misses")
+        .desc("stores that miss in the GPU")
+        ;
+}