summaryrefslogtreecommitdiff
path: root/src/mem/ruby/system/GPUCoalescer.cc
diff options
context:
space:
mode:
authorTony Gutierrez <anthony.gutierrez@amd.com>2016-01-19 14:28:22 -0500
committerTony Gutierrez <anthony.gutierrez@amd.com>2016-01-19 14:28:22 -0500
commit1a7d3f9fcb76a68540dd948f91413533a383bfde (patch)
tree867510a147cd095f19499d26b7c02d27de4cae9d /src/mem/ruby/system/GPUCoalescer.cc
parent28e353e0403ea379d244a418e8dc8ee0b48187cf (diff)
downloadgem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/mem/ruby/system/GPUCoalescer.cc')
-rw-r--r--src/mem/ruby/system/GPUCoalescer.cc1397
1 files changed, 1397 insertions, 0 deletions
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
new file mode 100644
index 000000000..db279bd3a
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Sooraj Puthoor
+ */
+
+#include "base/misc.hh"
+#include "base/str.hh"
+#include "config/the_isa.hh"
+
+#if THE_ISA == X86_ISA
+#include "arch/x86/insts/microldstop.hh"
+
+#endif // X86_ISA
+#include "mem/ruby/system/GPUCoalescer.hh"
+
+#include "cpu/testers/rubytest/RubyTester.hh"
+#include "debug/GPUCoalescer.hh"
+#include "debug/MemoryAccess.hh"
+#include "debug/ProtocolTrace.hh"
+#include "debug/RubyPort.hh"
+#include "debug/RubyStats.hh"
+#include "gpu-compute/shader.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/common/DataBlock.hh"
+#include "mem/ruby/common/SubBlock.hh"
+#include "mem/ruby/network/MessageBuffer.hh"
+#include "mem/ruby/profiler/Profiler.hh"
+#include "mem/ruby/slicc_interface/AbstractController.hh"
+#include "mem/ruby/slicc_interface/RubyRequest.hh"
+#include "mem/ruby/structures/CacheMemory.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "params/RubyGPUCoalescer.hh"
+
+using namespace std;
+
+GPUCoalescer *
+RubyGPUCoalescerParams::create()
+{
+ return new GPUCoalescer(this);
+}
+
+HSAScope
+reqScopeToHSAScope(Request* req)
+{
+ HSAScope accessScope = HSAScope_UNSPECIFIED;
+ if (req->isScoped()) {
+ if (req->isWavefrontScope()) {
+ accessScope = HSAScope_WAVEFRONT;
+ } else if (req->isWorkgroupScope()) {
+ accessScope = HSAScope_WORKGROUP;
+ } else if (req->isDeviceScope()) {
+ accessScope = HSAScope_DEVICE;
+ } else if (req->isSystemScope()) {
+ accessScope = HSAScope_SYSTEM;
+ } else {
+ fatal("Bad scope type");
+ }
+ }
+ return accessScope;
+}
+
+HSASegment
+reqSegmentToHSASegment(Request* req)
+{
+ HSASegment accessSegment = HSASegment_GLOBAL;
+
+ if (req->isGlobalSegment()) {
+ accessSegment = HSASegment_GLOBAL;
+ } else if (req->isGroupSegment()) {
+ accessSegment = HSASegment_GROUP;
+ } else if (req->isPrivateSegment()) {
+ accessSegment = HSASegment_PRIVATE;
+ } else if (req->isKernargSegment()) {
+ accessSegment = HSASegment_KERNARG;
+ } else if (req->isReadonlySegment()) {
+ accessSegment = HSASegment_READONLY;
+ } else if (req->isSpillSegment()) {
+ accessSegment = HSASegment_SPILL;
+ } else if (req->isArgSegment()) {
+ accessSegment = HSASegment_ARG;
+ } else {
+ fatal("Bad segment type");
+ }
+
+ return accessSegment;
+}
+
+GPUCoalescer::GPUCoalescer(const Params *p)
+ : RubyPort(p), issueEvent(this), deadlockCheckEvent(this)
+{
+ m_store_waiting_on_load_cycles = 0;
+ m_store_waiting_on_store_cycles = 0;
+ m_load_waiting_on_store_cycles = 0;
+ m_load_waiting_on_load_cycles = 0;
+
+ m_outstanding_count = 0;
+
+ m_max_outstanding_requests = 0;
+ m_deadlock_threshold = 0;
+ m_instCache_ptr = nullptr;
+ m_dataCache_ptr = nullptr;
+
+ m_instCache_ptr = p->icache;
+ m_dataCache_ptr = p->dcache;
+ m_max_outstanding_requests = p->max_outstanding_requests;
+ m_deadlock_threshold = p->deadlock_threshold;
+
+ assert(m_max_outstanding_requests > 0);
+ assert(m_deadlock_threshold > 0);
+ assert(m_instCache_ptr);
+ assert(m_dataCache_ptr);
+
+ m_data_cache_hit_latency = p->dcache_hit_latency;
+
+ m_usingNetworkTester = p->using_network_tester;
+ assumingRfOCoherence = p->assume_rfo;
+}
+
+GPUCoalescer::~GPUCoalescer()
+{
+}
+
+void
+GPUCoalescer::wakeup()
+{
+ // Check for deadlock of any of the requests
+ Cycles current_time = curCycle();
+
+ // Check across all outstanding requests
+ int total_outstanding = 0;
+
+ RequestTable::iterator read = m_readRequestTable.begin();
+ RequestTable::iterator read_end = m_readRequestTable.end();
+ for (; read != read_end; ++read) {
+ GPUCoalescerRequest* request = read->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_readRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_readRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time)*clockPeriod());
+ }
+
+ RequestTable::iterator write = m_writeRequestTable.begin();
+ RequestTable::iterator write_end = m_writeRequestTable.end();
+ for (; write != write_end; ++write) {
+ GPUCoalescerRequest* request = write->second;
+ if (current_time - request->issue_time < m_deadlock_threshold)
+ continue;
+
+ panic("Possible Deadlock detected. Aborting!\n"
+ "version: %d request.paddr: 0x%x m_writeRequestTable: %d "
+ "current time: %u issue_time: %d difference: %d\n", m_version,
+ request->pkt->getAddr(), m_writeRequestTable.size(),
+ current_time * clockPeriod(), request->issue_time * clockPeriod(),
+ (current_time - request->issue_time) * clockPeriod());
+ }
+
+ total_outstanding += m_writeRequestTable.size();
+ total_outstanding += m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ if (m_outstanding_count > 0) {
+ // If there are still outstanding requests, keep checking
+ schedule(deadlockCheckEvent,
+ m_deadlock_threshold * clockPeriod() +
+ curTick());
+ }
+}
+
+void
+GPUCoalescer::resetStats()
+{
+ m_latencyHist.reset();
+ m_missLatencyHist.reset();
+ for (int i = 0; i < RubyRequestType_NUM; i++) {
+ m_typeLatencyHist[i]->reset();
+ m_missTypeLatencyHist[i]->reset();
+ for (int j = 0; j < MachineType_NUM; j++) {
+ m_missTypeMachLatencyHist[i][j]->reset();
+ }
+ }
+
+ for (int i = 0; i < MachineType_NUM; i++) {
+ m_missMachLatencyHist[i]->reset();
+
+ m_IssueToInitialDelayHist[i]->reset();
+ m_InitialToForwardDelayHist[i]->reset();
+ m_ForwardToFirstResponseDelayHist[i]->reset();
+ m_FirstResponseToCompletionDelayHist[i]->reset();
+ }
+}
+
void
GPUCoalescer::printProgress(ostream& out) const
{
    // Intentionally empty: the GPU coalescer does not report incremental
    // progress here.  Deadlock detection is handled by wakeup() instead.
}
+
+RequestStatus
+GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type)
+{
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+
+ if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) {
+ return RequestStatus_BufferFull;
+ }
+
+ if(m_controller->isBlocked(line_addr) &&
+ request_type != RubyRequestType_Locked_RMW_Write) {
+ return RequestStatus_Aliased;
+ }
+
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ // Check if there is any outstanding read request for the same
+ // cache line.
+ if (m_readRequestTable.count(line_addr) > 0) {
+ m_store_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ // There is an outstanding write request for the cache line
+ m_store_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+ } else {
+ // Check if there is any outstanding write request for the same
+ // cache line.
+ if (m_writeRequestTable.count(line_addr) > 0) {
+ m_load_waiting_on_store_cycles++;
+ return RequestStatus_Aliased;
+ }
+
+ if (m_readRequestTable.count(line_addr) > 0) {
+ // There is an outstanding read request for the cache line
+ m_load_waiting_on_load_cycles++;
+ return RequestStatus_Aliased;
+ }
+ }
+
+ return RequestStatus_Ready;
+
+}
+
+
+
+// sets the kernelEndList
+void
+GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
+{
+ // Don't know if this will happen or is possible
+ // but I just want to be careful and not have it become
+ // simulator hang in the future
+ DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
+ assert(kernelEndList.count(wavefront_id) == 0);
+
+ kernelEndList[wavefront_id] = pkt;
+ DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
+ kernelEndList.size());
+}
+
+
+// Insert the request on the correct request table. Return true if
+// the entry was already present.
+bool
+GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
+{
+ assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready ||
+ pkt->req->isLockedRMW() ||
+ !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge()));
+
+ int total_outstanding M5_VAR_USED =
+ m_writeRequestTable.size() + m_readRequestTable.size();
+
+ assert(m_outstanding_count == total_outstanding);
+
+ // See if we should schedule a deadlock check
+ if (deadlockCheckEvent.scheduled() == false) {
+ schedule(deadlockCheckEvent, m_deadlock_threshold + curTick());
+ }
+
+ Addr line_addr = makeLineAddress(pkt->getAddr());
+ if ((request_type == RubyRequestType_ST) ||
+ (request_type == RubyRequestType_ATOMIC) ||
+ (request_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request_type == RubyRequestType_RMW_Read) ||
+ (request_type == RubyRequestType_RMW_Write) ||
+ (request_type == RubyRequestType_Load_Linked) ||
+ (request_type == RubyRequestType_Store_Conditional) ||
+ (request_type == RubyRequestType_Locked_RMW_Read) ||
+ (request_type == RubyRequestType_Locked_RMW_Write) ||
+ (request_type == RubyRequestType_FLUSH)) {
+
+ pair<RequestTable::iterator, bool> r =
+ m_writeRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting write request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ } else {
+ pair<RequestTable::iterator, bool> r =
+ m_readRequestTable.insert(RequestTable::value_type(line_addr,
+ (GPUCoalescerRequest*) NULL));
+
+ if (r.second) {
+ RequestTable::iterator i = r.first;
+ i->second = new GPUCoalescerRequest(pkt, request_type,
+ curCycle());
+ DPRINTF(GPUCoalescer,
+ "Inserting read request for paddr %#x for type %d\n",
+ pkt->req->getPaddr(), i->second->m_type);
+ m_outstanding_count++;
+ } else {
+ return true;
+ }
+ }
+
+ m_outstandReqHist.sample(m_outstanding_count);
+
+ total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size();
+ assert(m_outstanding_count == total_outstanding);
+
+ return false;
+}
+
+void
+GPUCoalescer::markRemoved()
+{
+ m_outstanding_count--;
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+}
+
+void
+GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest)
+{
+ assert(m_outstanding_count ==
+ m_writeRequestTable.size() + m_readRequestTable.size());
+
+ Addr line_addr = makeLineAddress(srequest->pkt->getAddr());
+ if ((srequest->m_type == RubyRequestType_ST) ||
+ (srequest->m_type == RubyRequestType_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_RMW_Write) ||
+ (srequest->m_type == RubyRequestType_Load_Linked) ||
+ (srequest->m_type == RubyRequestType_Store_Conditional) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (srequest->m_type == RubyRequestType_Locked_RMW_Write)) {
+ m_writeRequestTable.erase(line_addr);
+ } else {
+ m_readRequestTable.erase(line_addr);
+ }
+
+ markRemoved();
+}
+
+bool
+GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request)
+{
+ //
+ // The success flag indicates whether the LLSC operation was successful.
+ // LL ops will always succeed, but SC may fail if the cache line is no
+ // longer locked.
+ //
+ bool success = true;
+ if (request->m_type == RubyRequestType_Store_Conditional) {
+ if (!m_dataCache_ptr->isLocked(address, m_version)) {
+ //
+ // For failed SC requests, indicate the failure to the cpu by
+ // setting the extra data to zero.
+ //
+ request->pkt->req->setExtraData(0);
+ success = false;
+ } else {
+ //
+ // For successful SC requests, indicate the success to the cpu by
+ // setting the extra data to one.
+ //
+ request->pkt->req->setExtraData(1);
+ }
+ //
+ // Independent of success, all SC operations must clear the lock
+ //
+ m_dataCache_ptr->clearLocked(address);
+ } else if (request->m_type == RubyRequestType_Load_Linked) {
+ //
+ // Note: To fully follow Alpha LLSC semantics, should the LL clear any
+ // previously locked cache lines?
+ //
+ m_dataCache_ptr->setLocked(address, m_version);
+ } else if ((m_dataCache_ptr->isTagPresent(address)) &&
+ (m_dataCache_ptr->isLocked(address, m_version))) {
+ //
+ // Normal writes should clear the locked address
+ //
+ m_dataCache_ptr->clearLocked(address);
+ }
+ return success;
+}
+
// Convenience overload: write completion with no machine-type or timing
// information; forwards with MachineType_NULL.
void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}
+
// Convenience overload: write completion with no timing breakdown;
// forwards with zeroed Cycles values.
void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}
+
// Convenience overload: write completion that is not a region access;
// forwards with isRegion = false.
void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}
+
+void
+GPUCoalescer::writeCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+
+ DPRINTF(GPUCoalescer, "write callback for address %#x\n", address);
+ assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+ RequestTable::iterator i = m_writeRequestTable.find(address);
+ assert(i != m_writeRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_writeRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_ST) ||
+ (request->m_type == RubyRequestType_ATOMIC) ||
+ (request->m_type == RubyRequestType_ATOMIC_RETURN) ||
+ (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) ||
+ (request->m_type == RubyRequestType_RMW_Read) ||
+ (request->m_type == RubyRequestType_RMW_Write) ||
+ (request->m_type == RubyRequestType_Load_Linked) ||
+ (request->m_type == RubyRequestType_Store_Conditional) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Read) ||
+ (request->m_type == RubyRequestType_Locked_RMW_Write) ||
+ (request->m_type == RubyRequestType_FLUSH));
+
+
+ //
+ // For Alpha, properly handle LL, SC, and write requests with respect to
+ // locked cache blocks.
+ //
+ // Not valid for Network_test protocl
+ //
+ bool success = true;
+ if(!m_usingNetworkTester)
+ success = handleLlsc(address, request);
+
+ if (request->m_type == RubyRequestType_Locked_RMW_Read) {
+ m_controller->blockOnQueue(address, m_mandatory_q_ptr);
+ } else if (request->m_type == RubyRequestType_Locked_RMW_Write) {
+ m_controller->unblock(address);
+ }
+
+ hitCallback(request, mach, data, success,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
// Convenience overload: read completion with no machine-type or timing
// information; forwards with MachineType_NULL.
void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}
+
// Convenience overload: read completion with no timing breakdown;
// forwards with zeroed Cycles values.
void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}
+
// Convenience overload: read completion that is not a region access;
// forwards with isRegion = false.
void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{

    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}
+
+void
+GPUCoalescer::readCallback(Addr address,
+ MachineType mach,
+ DataBlock& data,
+ Cycles initialRequestTime,
+ Cycles forwardRequestTime,
+ Cycles firstResponseTime,
+ bool isRegion)
+{
+ assert(address == makeLineAddress(address));
+ assert(m_readRequestTable.count(makeLineAddress(address)));
+
+ DPRINTF(GPUCoalescer, "read callback for address %#x\n", address);
+ RequestTable::iterator i = m_readRequestTable.find(address);
+ assert(i != m_readRequestTable.end());
+ GPUCoalescerRequest* request = i->second;
+
+ m_readRequestTable.erase(i);
+ markRemoved();
+
+ assert((request->m_type == RubyRequestType_LD) ||
+ (request->m_type == RubyRequestType_IFETCH));
+
+ hitCallback(request, mach, data, true,
+ request->issue_time, forwardRequestTime, firstResponseTime,
+ isRegion);
+}
+
// Distribute a completed line's data to every packet coalesced onto it:
// update cache MRU state, record latency stats, copy data between the
// Ruby DataBlock and each M5 packet, then complete all packets.
void
GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = srequest->pkt;
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = srequest->m_type;

    // Set this cache entry to the most recently used
    if (type == RubyRequestType_IFETCH) {
        if (m_instCache_ptr->isTagPresent(request_line_address))
            m_instCache_ptr->setMRU(request_line_address);
    } else {
        if (m_dataCache_ptr->isTagPresent(request_line_address))
            m_dataCache_ptr->setMRU(request_line_address);
    }

    recordMissLatency(srequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // Update the data for EVERY request coalesced onto this line, not
    // just the one that triggered the callback.
    int len = reqCoalescer[request_line_address].size();
    std::vector<PacketPtr> mylist;
    for (int i = 0; i < len; ++i) {
        // NOTE: this inner 'pkt' deliberately shadows the outer one; each
        // iteration works on one coalesced packet.
        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
        // All coalesced requests must share the triggering request's
        // primary type.
        assert(type ==
               reqCoalescer[request_line_address][i].second[PrimaryType]);
        // request_line_address is recomputed per packet; since all
        // coalesced packets target the same line it keeps its value,
        // which the erase below relies on.
        request_address = pkt->getAddr();
        request_line_address = makeLineAddress(pkt->getAddr());
        if (pkt->getPtr<uint8_t>()) {
            // Read-like types copy line data out to the packet; all other
            // (write-like) types copy packet data into the line.
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                memcpy(pkt->getPtr<uint8_t>(),
                       data.getData(getOffset(request_address),
                                    pkt->getSize()),
                       pkt->getSize());
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING.  Data not transfered from Ruby to M5 for type " \
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the recieved data.  The tester will later access
        // this state.
        // Note: RubyPort will access it's sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }

        mylist.push_back(pkt);
    }
    // The coalescer entry and the tracking request are now fully
    // consumed; drop them before completing the packets.
    delete srequest;
    reqCoalescer.erase(request_line_address);
    assert(!reqCoalescer.count(request_line_address));



    completeHitCallback(mylist, len);
}
+
+bool
+GPUCoalescer::empty() const
+{
+ return m_writeRequestTable.empty() && m_readRequestTable.empty();
+}
+
// Analyzes the packet to see if this request can be coalesced.
// If request can be coalesced, this request is added to the reqCoalescer table
// and makeRequest returns RequestStatus_Issued;
// If this is the first request to a cacheline, request is added to both
// newRequests queue and to the reqCoalescer table; makeRequest
// returns RequestStatus_Issued.
// If there is a pending request to this cacheline and this request
// can't be coalesced, RequestStatus_Aliased is returned and
// the packet needs to be reissued.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()){
            // This is a Kernel Begin leave handling to
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        }else if(pkt->req->isRelease()) {
            // This is a Kernel End leave handling to
            // virtual xCoalescer::makeRequest
            // If we are here then we didn't call
            // a virtual version of this function
            // so we will also schedule the callback
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    // If number of outstanding requests greater than the max allowed,
    // return RequestStatus_BufferFull. This logic can be extended to
    // support proper backpressure.
    if (m_outstanding_count >= m_max_outstanding_requests) {
        return RequestStatus_BufferFull;
    }

    // Classify the packet into a primary type (used for alias checks and
    // table insertion) and a secondary type (sent to the protocol).
    RubyRequestType primary_type = RubyRequestType_NULL;
    RubyRequestType secondary_type = RubyRequestType_NULL;

    if (pkt->isLLSC()) {
        //
        // Alpha LL/SC instructions need to be handled carefully by the cache
        // coherence protocol to ensure they follow the proper semantics. In
        // particular, by identifying the operations as atomic, the protocol
        // should understand that migratory sharing optimizations should not
        // be performed (i.e. a load between the LL and SC should not steal
        // away exclusive permission).
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Store_Conditional;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Load_Linked;
        }
        secondary_type = RubyRequestType_ATOMIC;
    } else if (pkt->req->isLockedRMW()) {
        //
        // x86 locked instructions are translated to store cache coherence
        // requests because these requests should always be treated as read
        // exclusive operations and should leverage any migratory sharing
        // optimization built into the protocol.
        //
        if (pkt->isWrite()) {
            primary_type = RubyRequestType_Locked_RMW_Write;
        } else {
            assert(pkt->isRead());
            primary_type = RubyRequestType_Locked_RMW_Read;
        }
        secondary_type = RubyRequestType_ST;
    } else if (pkt->isAtomicOp()) {
        //
        // GPU Atomic Operation
        //
        primary_type = RubyRequestType_ATOMIC;
        secondary_type = RubyRequestType_ATOMIC;
    } else {
        if (pkt->isRead()) {
            if (pkt->req->isInstFetch()) {
                primary_type = secondary_type = RubyRequestType_IFETCH;
            } else {
#if THE_ISA == X86_ISA
                // x86 store-check loads are issued as RMW reads so the
                // protocol grants exclusive permission up front.
                uint32_t flags = pkt->req->getFlags();
                bool storeCheck = flags &
                        (TheISA::StoreCheck << TheISA::FlagShift);
#else
                bool storeCheck = false;
#endif // X86_ISA
                if (storeCheck) {
                    primary_type = RubyRequestType_RMW_Read;
                    secondary_type = RubyRequestType_ST;
                } else {
                    primary_type = secondary_type = RubyRequestType_LD;
                }
            }
        } else if (pkt->isWrite()) {
            //
            // Note: M5 packets do not differentiate ST from RMW_Write
            //
            primary_type = secondary_type = RubyRequestType_ST;
        } else if (pkt->isFlush()) {
            primary_type = secondary_type = RubyRequestType_FLUSH;
        } else if (pkt->req->isRelease() || pkt->req->isAcquire()) {
            if (assumingRfOCoherence) {
                // If we reached here, this request must be a memFence
                // and the protocol implements RfO, the coalescer can
                // assume sequentially consistency and schedule the callback
                // immediately.
                // Currently the code implements fence callbacks
                // by reusing the mechanism for kernel completions.
                // This should be fixed.
                int wf_id = 0;
                if (pkt->req->hasContextId()) {
                    wf_id = pkt->req->contextId();
                }
                insertKernel(wf_id, pkt);
                newKernelEnds.push_back(wf_id);
                if (!issueEvent.scheduled()) {
                    schedule(issueEvent, curTick());
                }
                return RequestStatus_Issued;
            } else {
                // If not RfO, return issued here and let the child coalescer
                // take care of it.
                return RequestStatus_Issued;
            }
        } else {
            panic("Unsupported ruby packet type\n");
        }
    }

    // Check if there is any pending request to this cache line from
    // previous cycles.
    // If there is a pending request, return aliased. Since coalescing
    // across time is not permitted, aliased requests are not coalesced.
    // If a request for this address has already been issued, we must block
    RequestStatus status = getRequestStatus(pkt, primary_type);
    if (status != RequestStatus_Ready)
        return status;

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Check if this request can be coalesced with previous
    // requests from this cycle.
    if (!reqCoalescer.count(line_addr)) {
        // This is the first access to this cache line.
        // A new request to the memory subsystem has to be
        // made in the next cycle for this cache line, so
        // add this line addr to the "newRequests" queue
        newRequests.push_back(line_addr);

        // There was a request to this cache line in this cycle,
        // let us see if we can coalesce this request with the previous
        // requests from this cycle
    } else if (primary_type !=
               reqCoalescer[line_addr][0].second[PrimaryType]) {
        // can't coalesce loads, stores and atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->isLockedRMW() ||
               reqCoalescer[line_addr][0].first->req->isLockedRMW()) {
        // can't coalesce locked accesses, but can coalesce atomics!
        return RequestStatus_Aliased;
    } else if (pkt->req->hasContextId() && pkt->req->isRelease() &&
               pkt->req->contextId() !=
               reqCoalescer[line_addr][0].first->req->contextId()) {
        // can't coalesce releases from different wavefronts
        return RequestStatus_Aliased;
    }

    // in addition to the packet, we need to save both request types
    reqCoalescer[line_addr].push_back(
            RequestDesc(pkt, std::vector<RubyRequestType>()) );
    reqCoalescer[line_addr].back().second.push_back(primary_type);
    reqCoalescer[line_addr].back().second.push_back(secondary_type);
    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}
+
// Build a single RubyRequest covering all accesses coalesced onto the
// packet's cache line (write data, byte access mask, atomic ops) and
// enqueue it on the mandatory queue with the data-cache hit latency.
void
GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
{

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment setting scopes only counts
    // for GPU spill space accesses
    // which is pkt->req->isStack()
    // this scope is REPLACE since it
    // does not need to be flushed at the end
    // of a kernel Private and local may need
    // to be visible at the end of the kernel
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize,false);
    std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = reqCoalescer[line_addr].size();
    // Walk every coalesced access on this line: accumulate write data or
    // atomic functors, and mark the touched bytes in the access mask.
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = reqCoalescer[line_addr][i].first;
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                        tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if(tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    // Atomic requests carry the functor list; all others omit it.
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock, atomicOps,
                              accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                              pkt->getPtr<uint8_t>(),
                              pkt->getSize(), pc, secondary_type,
                              RubyAccessMode_Supervisor, pkt,
                              PrefetchBit_No, proc_id, 100,
                              blockSize, accessMask,
                              dataBlock,
                              accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(secondary_type));

    fatal_if(secondary_type == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    // Send the message to the cache controller
    fatal_if(m_data_cache_hit_latency == 0,
             "should not have a latency of zero");

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency);
}
+
// Stream an unordered_map as "[ k=v k=v ... ]" (used by print() for the
// request tables).
template <class KEY, class VALUE>
std::ostream &
operator<<(std::ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (const auto &entry : map)
        out << " " << entry.first << "=" << entry.second;
    out << " ]";

    return out;
}
+
+void
+GPUCoalescer::print(ostream& out) const
+{
+ out << "[GPUCoalescer: " << m_version
+ << ", outstanding requests: " << m_outstanding_count
+ << ", read request table: " << m_readRequestTable
+ << ", write request table: " << m_writeRequestTable
+ << "]";
+}
+
// This can be called from setState whenever coherence permissions are
// upgraded; when invoked, coherence violations will be checked for the
// given block.  Compiles to a no-op unless CHECK_COHERENCE is defined.
void
GPUCoalescer::checkCoherence(Addr addr)
{
#ifdef CHECK_COHERENCE
    m_ruby_system->checkGlobalCoherenceInvariant(addr);
#endif
}
+
+// Record a sequencer-level request-type statistic; emits only a RubyStats
+// debug trace line (the profiler collates the actual counts elsewhere).
+void
+GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
+    DPRINTF(RubyStats, "Recorded statistic: %s\n",
+            SequencerRequestType_to_string(requestType));
+}
+
+// Event that triggers completeIssue() on the owning coalescer.
+// NOTE(review): scheduled at Progress_Event_Pri — presumably so issue runs
+// after same-tick request insertion; confirm against the event queue docs.
+GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq)
+    : Event(Progress_Event_Pri), seq(_seq)
+{
+}
+
+
+// Issue one Ruby request per coalesced cache line collected this cycle,
+// then deliver any pending kernel-end callbacks.  Called from IssueEvent.
+void
+GPUCoalescer::completeIssue()
+{
+    // newRequests has the cacheline addresses of all the
+    // requests which need to be issued to the memory subsystem
+    // in this cycle
+    int len = newRequests.size();
+    DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len);
+    for (int i = 0; i < len; ++i) {
+        // Get the requests from reqCoalescer table. Get only the
+        // first request for each cacheline, the remaining requests
+        // can be coalesced with the first request. So, only
+        // one request is issued per cacheline.
+        // Use a const reference: copying the RequestDesc duplicated the
+        // pair's request-type vector on every iteration for no benefit.
+        const RequestDesc &info = reqCoalescer[newRequests[i]][0];
+        PacketPtr pkt = info.first;
+        // Read both request types up front so the code below does not
+        // depend on the reqCoalescer entry remaining untouched.
+        RubyRequestType primary_type = info.second[PrimaryType];
+        RubyRequestType secondary_type = info.second[SecondaryType];
+        DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n",
+                i, pkt->req->getPaddr());
+        // Insert this request to the read/writeRequestTables. These tables
+        // are used to track aliased requests in makeRequest subroutine
+        bool found = insertRequest(pkt, primary_type);
+
+        if (found) {
+            panic("GPUCoalescer::makeRequest should never be called if the "
+                  "request is already outstanding\n");
+        }
+
+        // Issue request to ruby subsystem
+        issueRequest(pkt, secondary_type);
+    }
+    newRequests.clear();
+
+    // have Kernel End releases been issued this cycle
+    len = newKernelEnds.size();
+    for (int i = 0; i < len; i++) {
+        kernelCallback(newKernelEnds[i]);
+    }
+    newKernelEnds.clear();
+}
+
+// Fire the issue event: forward to the owning coalescer's completeIssue().
+void
+GPUCoalescer::IssueEvent::process()
+{
+    seq->completeIssue();
+}
+
+// Human-readable event name for event-queue tracing.
+const char *
+GPUCoalescer::IssueEvent::description() const
+{
+    return "Issue coalesced request";
+}
+
+// Notify the RubyPort machinery that the line at 'address' was evicted
+// from the cache attached to this coalescer.
+void
+GPUCoalescer::evictionCallback(Addr address)
+{
+    ruby_eviction_callback(address);
+}
+
+// Complete a kernel-end release for the given wavefront: hand the stored
+// packet back through the hit-callback path and drop the table entry.
+void
+GPUCoalescer::kernelCallback(int wavefront_id)
+{
+    // The wavefront must have registered a kernel-end packet earlier.
+    assert(kernelEndList.count(wavefront_id));
+
+    ruby_hit_callback(kernelEndList[wavefront_id]);
+
+    kernelEndList.erase(wavefront_id);
+}
+
+// Completion callback for an atomic operation on the line at 'address'.
+// Retires the write-table entry, copies the returned memory image into
+// every coalesced packet for the line, and hands the packets back to the
+// requestors via completeHitCallback().
+void
+GPUCoalescer::atomicCallback(Addr address,
+                             MachineType mach,
+                             const DataBlock& data)
+{
+    // Callbacks are keyed by line address.
+    assert(address == makeLineAddress(address));
+
+    DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address);
+    assert(m_writeRequestTable.count(makeLineAddress(address)));
+
+    RequestTable::iterator i = m_writeRequestTable.find(address);
+    assert(i != m_writeRequestTable.end());
+    GPUCoalescerRequest* srequest = i->second;
+
+    // Remove from the outstanding-write table before completing so the
+    // outstanding count stays consistent.
+    m_writeRequestTable.erase(i);
+    markRemoved();
+
+    assert((srequest->m_type == RubyRequestType_ATOMIC) ||
+           (srequest->m_type == RubyRequestType_ATOMIC_RETURN) ||
+           (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN));
+
+
+    // Atomics don't write to cache, so there is no MRU update...
+
+    recordMissLatency(srequest, mach,
+                      srequest->issue_time, Cycles(0), Cycles(0), true, false);
+
+    PacketPtr pkt = srequest->pkt;
+    Addr request_address = pkt->getAddr();
+    Addr request_line_address = makeLineAddress(pkt->getAddr());
+
+    int len = reqCoalescer[request_line_address].size();
+    std::vector<PacketPtr> mylist;
+    // Walk every packet coalesced onto this line and return data to each.
+    for (int i = 0; i < len; ++i) {
+        // NOTE(review): this 'pkt' (and loop index 'i') shadows the outer
+        // 'pkt'/iterator 'i' declared above — confusing but harmless here.
+        PacketPtr pkt = reqCoalescer[request_line_address][i].first;
+        assert(srequest->m_type ==
+               reqCoalescer[request_line_address][i].second[PrimaryType]);
+        // NOTE(review): request_line_address is recomputed per packet and
+        // then used for the erase below — assumes all coalesced packets
+        // map to the same line; verify against the coalescing logic.
+        request_address = (pkt->getAddr());
+        request_line_address = makeLineAddress(request_address);
+        if (pkt->getPtr<uint8_t>() &&
+            srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) {
+            /* atomics are done in memory, and return the data *before* the atomic op... */
+            memcpy(pkt->getPtr<uint8_t>(),
+                   data.getData(getOffset(request_address),
+                                pkt->getSize()),
+                   pkt->getSize());
+        } else {
+            DPRINTF(MemoryAccess,
+                    "WARNING.  Data not transfered from Ruby to M5 for type " \
+                    "%s\n",
+                    RubyRequestType_to_string(srequest->m_type));
+        }
+
+        // If using the RubyTester, update the RubyTester sender state's
+        // subBlock with the recieved data.  The tester will later access
+        // this state.
+        // Note: RubyPort will access it's sender state before the
+        // RubyTester.
+        if (m_usingRubyTester) {
+            RubyPort::SenderState *requestSenderState =
+                safe_cast<RubyPort::SenderState*>(pkt->senderState);
+            RubyTester::SenderState* testerSenderState =
+                safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor);
+            testerSenderState->subBlock.mergeFrom(data);
+        }
+
+        mylist.push_back(pkt);
+    }
+    delete srequest;
+    // The whole coalesced line is now complete; drop its entry.
+    reqCoalescer.erase(request_line_address);
+    assert(!reqCoalescer.count(request_line_address));
+
+    completeHitCallback(mylist, len);
+}
+
+// Classify a command-processor load completion by where the data came
+// from and bump the matching CP load statistic.
+void
+GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if (myMachID == senderMachID) {
+        // Serviced by our own machine: a local TCP hit.
+        CP_TCPLdHits++;
+        return;
+    }
+
+    switch (machineIDToMachineType(senderMachID)) {
+      case MachineType_TCP:
+        CP_TCPLdTransfers++;
+        break;
+      case MachineType_TCC:
+        CP_TCCLdHits++;
+        break;
+      default:
+        CP_LdMiss++;
+        break;
+    }
+}
+
+// Classify a command-processor store completion by where it was serviced
+// and bump the matching CP store statistic.
+void
+GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
+{
+    if (myMachID == senderMachID) {
+        // Serviced by our own machine: a local TCP hit.
+        CP_TCPStHits++;
+        return;
+    }
+
+    switch (machineIDToMachineType(senderMachID)) {
+      case MachineType_TCP:
+        CP_TCPStTransfers++;
+        break;
+      case MachineType_TCC:
+        CP_TCCStHits++;
+        break;
+      default:
+        CP_StMiss++;
+        break;
+    }
+}
+
+// Hand each completed packet back to the port it arrived on.  Restores
+// the packet's previous sender state, frees ours, and after the loop
+// checks whether a pending drain can now finish.
+void
+GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len)
+{
+    for (int idx = 0; idx < len; ++idx) {
+        PacketPtr pkt = mylist[idx];
+        RubyPort::SenderState *ss =
+            safe_cast<RubyPort::SenderState *>(pkt->senderState);
+        MemSlavePort *port = ss->port;
+        assert(port != NULL);
+
+        // Pop our sender state off the packet before returning it.
+        pkt->senderState = ss->predecessor;
+        delete ss;
+        port->hitCallback(pkt);
+        // A completed request may free resources a blocked port needs.
+        trySendRetries();
+    }
+
+    testDrainComplete();
+}
+
+// Return the gem5 packet behind the outstanding read request for
+// 'address'.  An entry for the address must exist.
+PacketPtr
+GPUCoalescer::mapAddrToPkt(Addr address)
+{
+    auto iter = m_readRequestTable.find(address);
+    assert(iter != m_readRequestTable.end());
+    return iter->second->pkt;
+}
+
+// Account latency statistics for a completed request: classify the hit
+// location (RfO protocol only), sample the overall and per-type latency
+// histograms, and — for real misses — the per-machine breakdown plus the
+// four issue-to-completion delay components.  Also emits a ProtocolTrace
+// completion line.
+void
+GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest,
+                                MachineType mach,
+                                Cycles initialRequestTime,
+                                Cycles forwardRequestTime,
+                                Cycles firstResponseTime,
+                                bool success, bool isRegion)
+{
+    RubyRequestType type = srequest->m_type;
+    Cycles issued_time = srequest->issue_time;
+    Cycles completion_time = curCycle();
+    assert(completion_time >= issued_time);
+    Cycles total_lat = completion_time - issued_time;
+
+    // cache stats (valid for RfO protocol only)
+    // Non-LD types (stores, atomics, ...) all fall into the St buckets.
+    if (mach == MachineType_TCP) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCPLdHits++;
+        } else {
+            GPU_TCPStHits++;
+        }
+    } else if (mach == MachineType_L1Cache_wCC) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCPLdTransfers++;
+        } else {
+            GPU_TCPStTransfers++;
+        }
+    } else if (mach == MachineType_TCC) {
+        if (type == RubyRequestType_LD) {
+            GPU_TCCLdHits++;
+        } else {
+            GPU_TCCStHits++;
+        }
+    } else {
+        if (type == RubyRequestType_LD) {
+            GPU_LdMiss++;
+        } else {
+            GPU_StMiss++;
+        }
+    }
+
+    // Profile all access latency, even zero latency accesses
+    m_latencyHist.sample(total_lat);
+    m_typeLatencyHist[type]->sample(total_lat);
+
+    // Profile the miss latency for all non-zero demand misses
+    if (total_lat != Cycles(0)) {
+        m_missLatencyHist.sample(total_lat);
+        m_missTypeLatencyHist[type]->sample(total_lat);
+
+        if (mach != MachineType_NUM) {
+            m_missMachLatencyHist[mach]->sample(total_lat);
+            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);
+
+            // Only sample the delay components when the four timestamps
+            // are monotonically ordered; otherwise the subtractions below
+            // would underflow (Cycles is unsigned).
+            if ((issued_time <= initialRequestTime) &&
+                (initialRequestTime <= forwardRequestTime) &&
+                (forwardRequestTime <= firstResponseTime) &&
+                (firstResponseTime <= completion_time)) {
+
+                m_IssueToInitialDelayHist[mach]->sample(
+                    initialRequestTime - issued_time);
+                m_InitialToForwardDelayHist[mach]->sample(
+                    forwardRequestTime - initialRequestTime);
+                m_ForwardToFirstResponseDelayHist[mach]->sample(
+                    firstResponseTime - forwardRequestTime);
+                m_FirstResponseToCompletionDelayHist[mach]->sample(
+                    completion_time - firstResponseTime);
+            }
+        }
+
+    }
+    // NOTE(review): the isRegion parameter is unused in this function.
+
+    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
+             curTick(), m_version, "Coal",
+             success ? "Done" : "SC_Failed", "", "",
+             printAddress(srequest->pkt->getAddr()), total_lat);
+}
+
+// Register this coalescer's statistics: initialize the latency histograms
+// (overall, per request type, per machine type, and per type-x-machine),
+// then name and describe the GPU- and CP-side hit/transfer/miss scalars.
+void
+GPUCoalescer::regStats()
+{
+    // These statistical variables are not for display.
+    // The profiler will collate these across different
+    // coalescers and display those collated statistics.
+    m_outstandReqHist.init(10);
+    m_latencyHist.init(10);
+    m_missLatencyHist.init(10);
+
+    // Per-request-type histograms, indexed by RubyRequestType.
+    // NOTE(review): these heap-allocated Histograms are never freed here;
+    // presumably they live for the simulation's lifetime.
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_typeLatencyHist.push_back(new Stats::Histogram());
+        m_typeLatencyHist[i]->init(10);
+
+        m_missTypeLatencyHist.push_back(new Stats::Histogram());
+        m_missTypeLatencyHist[i]->init(10);
+    }
+
+    // Per-machine-type histograms, indexed by MachineType.
+    for (int i = 0; i < MachineType_NUM; i++) {
+        m_missMachLatencyHist.push_back(new Stats::Histogram());
+        m_missMachLatencyHist[i]->init(10);
+
+        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
+        m_IssueToInitialDelayHist[i]->init(10);
+
+        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
+        m_InitialToForwardDelayHist[i]->init(10);
+
+        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
+        m_ForwardToFirstResponseDelayHist[i]->init(10);
+
+        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
+        m_FirstResponseToCompletionDelayHist[i]->init(10);
+    }
+
+    // Request-type x machine-type matrix of miss-latency histograms.
+    for (int i = 0; i < RubyRequestType_NUM; i++) {
+        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());
+
+        for (int j = 0; j < MachineType_NUM; j++) {
+            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
+            m_missTypeMachLatencyHist[i][j]->init(10);
+        }
+    }
+
+    // GPU cache stats
+    GPU_TCPLdHits
+        .name(name() + ".gpu_tcp_ld_hits")
+        .desc("loads that hit in the TCP")
+        ;
+    GPU_TCPLdTransfers
+        .name(name() + ".gpu_tcp_ld_transfers")
+        .desc("TCP to TCP load transfers")
+        ;
+    GPU_TCCLdHits
+        .name(name() + ".gpu_tcc_ld_hits")
+        .desc("loads that hit in the TCC")
+        ;
+    GPU_LdMiss
+        .name(name() + ".gpu_ld_misses")
+        .desc("loads that miss in the GPU")
+        ;
+
+    GPU_TCPStHits
+        .name(name() + ".gpu_tcp_st_hits")
+        .desc("stores that hit in the TCP")
+        ;
+    GPU_TCPStTransfers
+        .name(name() + ".gpu_tcp_st_transfers")
+        .desc("TCP to TCP store transfers")
+        ;
+    GPU_TCCStHits
+        .name(name() + ".gpu_tcc_st_hits")
+        .desc("stores that hit in the TCC")
+        ;
+    GPU_StMiss
+        .name(name() + ".gpu_st_misses")
+        .desc("stores that miss in the GPU")
+        ;
+
+    // CP cache stats
+    CP_TCPLdHits
+        .name(name() + ".cp_tcp_ld_hits")
+        .desc("loads that hit in the TCP")
+        ;
+    CP_TCPLdTransfers
+        .name(name() + ".cp_tcp_ld_transfers")
+        .desc("TCP to TCP load transfers")
+        ;
+    CP_TCCLdHits
+        .name(name() + ".cp_tcc_ld_hits")
+        .desc("loads that hit in the TCC")
+        ;
+    CP_LdMiss
+        .name(name() + ".cp_ld_misses")
+        .desc("loads that miss in the GPU")
+        ;
+
+    CP_TCPStHits
+        .name(name() + ".cp_tcp_st_hits")
+        .desc("stores that hit in the TCP")
+        ;
+    CP_TCPStTransfers
+        .name(name() + ".cp_tcp_st_transfers")
+        .desc("TCP to TCP store transfers")
+        ;
+    CP_TCCStHits
+        .name(name() + ".cp_tcc_st_hits")
+        .desc("stores that hit in the TCC")
+        ;
+    CP_StMiss
+        .name(name() + ".cp_st_misses")
+        .desc("stores that miss in the GPU")
+        ;
+}