| author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
|---|---|---|
| committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| commit | 1a7d3f9fcb76a68540dd948f91413533a383bfde (patch) | |
| tree | 867510a147cd095f19499d26b7c02d27de4cae9d /src/mem/ruby/system/GPUCoalescer.cc | |
| parent | 28e353e0403ea379d244a418e8dc8ee0b48187cf (diff) | |
| download | gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz | |
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/mem/ruby/system/GPUCoalescer.cc')

-rw-r--r-- | src/mem/ruby/system/GPUCoalescer.cc | 1397

1 file changed, 1397 insertions, 0 deletions
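For orientation before the full listing below: the core idea in this file is that memory requests arriving in the same cycle are grouped by cache-line address in reqCoalescer, only the first access to each line is pushed onto newRequests and later issued to Ruby, and the remaining accesses ride along and are satisfied together in hitCallback(). The standalone sketch that follows illustrates just that bookkeeping; ToyCoalescer, ToyRequest, and kLineBytes are hypothetical names rather than gem5 classes, and the real coalescer additionally records primary/secondary RubyRequestType per entry, rejects requests aliased against earlier cycles, and refuses to merge loads, stores, atomics, and locked accesses with one another.

```cpp
// Illustrative only: a minimal, standalone approximation of the per-cycle
// coalescing bookkeeping in GPUCoalescer::makeRequest()/completeIssue().
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

using Addr = uint64_t;

struct ToyRequest { Addr addr; };          // hypothetical stand-in for PacketPtr

constexpr Addr kLineBytes = 64;            // assumed cache-line size
inline Addr lineAddr(Addr a) { return a & ~(kLineBytes - 1); }

class ToyCoalescer {
  public:
    // Accept a request in the current "cycle"; returns true if it piggybacks
    // on an earlier request to the same line (i.e., it was coalesced).
    bool add(const ToyRequest &r) {
        auto &bucket = table_[lineAddr(r.addr)];
        bool coalesced = !bucket.empty();
        if (!coalesced)
            newLines_.push_back(lineAddr(r.addr));  // first touch: issue later
        bucket.push_back(r);
        return coalesced;
    }

    // End of cycle: one memory request per distinct line address.
    void issueAll() {
        for (Addr line : newLines_)
            std::cout << "issue line 0x" << std::hex << line << std::dec
                      << " covering " << table_[line].size() << " accesses\n";
        newLines_.clear();
        table_.clear();
    }

  private:
    std::unordered_map<Addr, std::vector<ToyRequest>> table_;  // ~reqCoalescer
    std::vector<Addr> newLines_;                               // ~newRequests
};

int main() {
    ToyCoalescer c;
    c.add({0x1000});
    c.add({0x1010});   // same 64 B line as 0x1000, so it is coalesced
    c.add({0x2000});
    c.issueAll();      // issues two line requests, not three
}
```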
diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc
new file mode 100644
index 000000000..db279bd3a
--- /dev/null
+++ b/src/mem/ruby/system/GPUCoalescer.cc
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/GPUCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" +#include "debug/RubyPort.hh" +#include "debug/RubyStats.hh" +#include "gpu-compute/shader.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/RubyGPUCoalescer.hh" + +using namespace std; + +GPUCoalescer * +RubyGPUCoalescerParams::create() +{ + return new GPUCoalescer(this); +} + +HSAScope +reqScopeToHSAScope(Request* req) +{ + HSAScope accessScope = HSAScope_UNSPECIFIED; + if (req->isScoped()) { + if (req->isWavefrontScope()) { + accessScope = HSAScope_WAVEFRONT; + } else if (req->isWorkgroupScope()) { + accessScope = HSAScope_WORKGROUP; + } else if (req->isDeviceScope()) { + accessScope = HSAScope_DEVICE; + } else if (req->isSystemScope()) { + accessScope = HSAScope_SYSTEM; + } else { + fatal("Bad scope type"); + } + } + return accessScope; +} + +HSASegment +reqSegmentToHSASegment(Request* req) +{ + HSASegment accessSegment = HSASegment_GLOBAL; + + if (req->isGlobalSegment()) { + accessSegment = HSASegment_GLOBAL; + } else if (req->isGroupSegment()) { + accessSegment = HSASegment_GROUP; + } else if (req->isPrivateSegment()) { + accessSegment = HSASegment_PRIVATE; + } else if (req->isKernargSegment()) { + accessSegment = HSASegment_KERNARG; + } else if (req->isReadonlySegment()) { + accessSegment = HSASegment_READONLY; + } else if (req->isSpillSegment()) { + accessSegment = HSASegment_SPILL; + } else if (req->isArgSegment()) { + accessSegment = HSASegment_ARG; + } else { + fatal("Bad segment type"); + } + + return accessSegment; +} + +GPUCoalescer::GPUCoalescer(const Params *p) + : RubyPort(p), issueEvent(this), deadlockCheckEvent(this) +{ + m_store_waiting_on_load_cycles = 0; + m_store_waiting_on_store_cycles = 0; + m_load_waiting_on_store_cycles = 0; + m_load_waiting_on_load_cycles = 0; + + m_outstanding_count = 0; + + m_max_outstanding_requests = 0; + m_deadlock_threshold = 0; + m_instCache_ptr = nullptr; + m_dataCache_ptr = nullptr; + + m_instCache_ptr = p->icache; + m_dataCache_ptr = p->dcache; + m_max_outstanding_requests = p->max_outstanding_requests; + m_deadlock_threshold = p->deadlock_threshold; + + assert(m_max_outstanding_requests > 0); + assert(m_deadlock_threshold > 0); + assert(m_instCache_ptr); + assert(m_dataCache_ptr); + + m_data_cache_hit_latency = p->dcache_hit_latency; + + m_usingNetworkTester = p->using_network_tester; + assumingRfOCoherence = p->assume_rfo; +} + +GPUCoalescer::~GPUCoalescer() +{ +} + +void +GPUCoalescer::wakeup() +{ + // Check for deadlock of any of the requests + Cycles current_time = curCycle(); + + // Check across all outstanding requests + int total_outstanding = 0; + + RequestTable::iterator read = m_readRequestTable.begin(); + RequestTable::iterator read_end = m_readRequestTable.end(); + for (; read != read_end; ++read) { + GPUCoalescerRequest* request = read->second; + if (current_time - 
request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_readRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_readRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time)*clockPeriod()); + } + + RequestTable::iterator write = m_writeRequestTable.begin(); + RequestTable::iterator write_end = m_writeRequestTable.end(); + for (; write != write_end; ++write) { + GPUCoalescerRequest* request = write->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_writeRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_writeRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time) * clockPeriod()); + } + + total_outstanding += m_writeRequestTable.size(); + total_outstanding += m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + if (m_outstanding_count > 0) { + // If there are still outstanding requests, keep checking + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } +} + +void +GPUCoalescer::resetStats() +{ + m_latencyHist.reset(); + m_missLatencyHist.reset(); + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist[i]->reset(); + m_missTypeLatencyHist[i]->reset(); + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i][j]->reset(); + } + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist[i]->reset(); + + m_IssueToInitialDelayHist[i]->reset(); + m_InitialToForwardDelayHist[i]->reset(); + m_ForwardToFirstResponseDelayHist[i]->reset(); + m_FirstResponseToCompletionDelayHist[i]->reset(); + } +} + +void +GPUCoalescer::printProgress(ostream& out) const +{ +} + +RequestStatus +GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) +{ + Addr line_addr = makeLineAddress(pkt->getAddr()); + + if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { + return RequestStatus_BufferFull; + } + + if(m_controller->isBlocked(line_addr) && + request_type != RubyRequestType_Locked_RMW_Write) { + return RequestStatus_Aliased; + } + + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + // Check if there is any outstanding read request for the same + // cache line. + if (m_readRequestTable.count(line_addr) > 0) { + m_store_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + + if (m_writeRequestTable.count(line_addr) > 0) { + // There is an outstanding write request for the cache line + m_store_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + } else { + // Check if there is any outstanding write request for the same + // cache line. 
+ if (m_writeRequestTable.count(line_addr) > 0) { + m_load_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + + if (m_readRequestTable.count(line_addr) > 0) { + // There is an outstanding read request for the cache line + m_load_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + } + + return RequestStatus_Ready; + +} + + + +// sets the kernelEndList +void +GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) +{ + // Don't know if this will happen or is possible + // but I just want to be careful and not have it become + // simulator hang in the future + DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); + assert(kernelEndList.count(wavefront_id) == 0); + + kernelEndList[wavefront_id] = pkt; + DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", + kernelEndList.size()); +} + + +// Insert the request on the correct request table. Return true if +// the entry was already present. +bool +GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) +{ + assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || + pkt->req->isLockedRMW() || + !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); + + int total_outstanding M5_VAR_USED = + m_writeRequestTable.size() + m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + // See if we should schedule a deadlock check + if (deadlockCheckEvent.scheduled() == false) { + schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + pair<RequestTable::iterator, bool> r = + m_writeRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting write request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } else { + pair<RequestTable::iterator, bool> r = + m_readRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting read request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } + + m_outstandReqHist.sample(m_outstanding_count); + + total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); + assert(m_outstanding_count == total_outstanding); + + return false; +} + +void +GPUCoalescer::markRemoved() +{ + m_outstanding_count--; + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); +} + +void +GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) +{ + assert(m_outstanding_count == + m_writeRequestTable.size() + 
m_readRequestTable.size()); + + Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); + if ((srequest->m_type == RubyRequestType_ST) || + (srequest->m_type == RubyRequestType_RMW_Read) || + (srequest->m_type == RubyRequestType_RMW_Write) || + (srequest->m_type == RubyRequestType_Load_Linked) || + (srequest->m_type == RubyRequestType_Store_Conditional) || + (srequest->m_type == RubyRequestType_Locked_RMW_Read) || + (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { + m_writeRequestTable.erase(line_addr); + } else { + m_readRequestTable.erase(line_addr); + } + + markRemoved(); +} + +bool +GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) +{ + // + // The success flag indicates whether the LLSC operation was successful. + // LL ops will always succeed, but SC may fail if the cache line is no + // longer locked. + // + bool success = true; + if (request->m_type == RubyRequestType_Store_Conditional) { + if (!m_dataCache_ptr->isLocked(address, m_version)) { + // + // For failed SC requests, indicate the failure to the cpu by + // setting the extra data to zero. + // + request->pkt->req->setExtraData(0); + success = false; + } else { + // + // For successful SC requests, indicate the success to the cpu by + // setting the extra data to one. + // + request->pkt->req->setExtraData(1); + } + // + // Independent of success, all SC operations must clear the lock + // + m_dataCache_ptr->clearLocked(address); + } else if (request->m_type == RubyRequestType_Load_Linked) { + // + // Note: To fully follow Alpha LLSC semantics, should the LL clear any + // previously locked cache lines? + // + m_dataCache_ptr->setLocked(address, m_version); + } else if ((m_dataCache_ptr->isTagPresent(address)) && + (m_dataCache_ptr->isLocked(address, m_version))) { + // + // Normal writes should clear the locked address + // + m_dataCache_ptr->clearLocked(address); + } + return success; +} + +void +GPUCoalescer::writeCallback(Addr address, DataBlock& data) +{ + writeCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + writeCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_ST) || + (request->m_type == RubyRequestType_ATOMIC) || + (request->m_type == RubyRequestType_ATOMIC_RETURN) || + (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request->m_type == RubyRequestType_RMW_Read) || + (request->m_type == RubyRequestType_RMW_Write) || + (request->m_type == RubyRequestType_Load_Linked) || + (request->m_type == RubyRequestType_Store_Conditional) || + (request->m_type == 
RubyRequestType_Locked_RMW_Read) || + (request->m_type == RubyRequestType_Locked_RMW_Write) || + (request->m_type == RubyRequestType_FLUSH)); + + + // + // For Alpha, properly handle LL, SC, and write requests with respect to + // locked cache blocks. + // + // Not valid for Network_test protocl + // + bool success = true; + if(!m_usingNetworkTester) + success = handleLlsc(address, request); + + if (request->m_type == RubyRequestType_Locked_RMW_Read) { + m_controller->blockOnQueue(address, m_mandatory_q_ptr); + } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { + m_controller->unblock(address); + } + + hitCallback(request, mach, data, success, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::readCallback(Addr address, DataBlock& data) +{ + readCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + + readCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + assert(m_readRequestTable.count(makeLineAddress(address))); + + DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_readRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_IFETCH)); + + hitCallback(request, mach, data, true, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(request_address); + + RubyRequestType type = srequest->m_type; + + // Set this cache entry to the most recently used + if (type == RubyRequestType_IFETCH) { + if (m_instCache_ptr->isTagPresent(request_line_address)) + m_instCache_ptr->setMRU(request_line_address); + } else { + if (m_dataCache_ptr->isTagPresent(request_line_address)) + m_dataCache_ptr->setMRU(request_line_address); + } + + recordMissLatency(srequest, mach, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + success, isRegion); + // update the data + // + // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = pkt->getAddr(); + request_line_address = makeLineAddress(pkt->getAddr()); + if (pkt->getPtr<uint8_t>()) { + if ((type == RubyRequestType_LD) || + (type == RubyRequestType_ATOMIC) || + (type == 
RubyRequestType_ATOMIC_RETURN) || + (type == RubyRequestType_IFETCH) || + (type == RubyRequestType_RMW_Read) || + (type == RubyRequestType_Locked_RMW_Read) || + (type == RubyRequestType_Load_Linked)) { + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + data.setData(pkt->getPtr<uint8_t>(), + getOffset(request_address), pkt->getSize()); + } + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. + if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + + + completeHitCallback(mylist, len); +} + +bool +GPUCoalescer::empty() const +{ + return m_writeRequestTable.empty() && m_readRequestTable.empty(); +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +GPUCoalescer::makeRequest(PacketPtr pkt) +{ + // Check for GPU Barrier Kernel End or Kernel Begin + // Leave these to be handled by the child class + // Kernel End/Barrier = isFlush + isRelease + // Kernel Begin = isFlush + isAcquire + if (pkt->req->isKernel()) { + if (pkt->req->isAcquire()){ + // This is a Kernel Begin leave handling to + // virtual xCoalescer::makeRequest + return RequestStatus_Issued; + }else if(pkt->req->isRelease()) { + // This is a Kernel End leave handling to + // virtual xCoalescer::makeRequest + // If we are here then we didn't call + // a virtual version of this function + // so we will also schedule the callback + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } + } + + // If number of outstanding requests greater than the max allowed, + // return RequestStatus_BufferFull. This logic can be extended to + // support proper backpressure. + if (m_outstanding_count >= m_max_outstanding_requests) { + return RequestStatus_BufferFull; + } + + RubyRequestType primary_type = RubyRequestType_NULL; + RubyRequestType secondary_type = RubyRequestType_NULL; + + if (pkt->isLLSC()) { + // + // Alpha LL/SC instructions need to be handled carefully by the cache + // coherence protocol to ensure they follow the proper semantics. 
In + // particular, by identifying the operations as atomic, the protocol + // should understand that migratory sharing optimizations should not + // be performed (i.e. a load between the LL and SC should not steal + // away exclusive permission). + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Store_Conditional; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Load_Linked; + } + secondary_type = RubyRequestType_ATOMIC; + } else if (pkt->req->isLockedRMW()) { + // + // x86 locked instructions are translated to store cache coherence + // requests because these requests should always be treated as read + // exclusive operations and should leverage any migratory sharing + // optimization built into the protocol. + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Locked_RMW_Write; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Locked_RMW_Read; + } + secondary_type = RubyRequestType_ST; + } else if (pkt->isAtomicOp()) { + // + // GPU Atomic Operation + // + primary_type = RubyRequestType_ATOMIC; + secondary_type = RubyRequestType_ATOMIC; + } else { + if (pkt->isRead()) { + if (pkt->req->isInstFetch()) { + primary_type = secondary_type = RubyRequestType_IFETCH; + } else { +#if THE_ISA == X86_ISA + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & + (TheISA::StoreCheck << TheISA::FlagShift); +#else + bool storeCheck = false; +#endif // X86_ISA + if (storeCheck) { + primary_type = RubyRequestType_RMW_Read; + secondary_type = RubyRequestType_ST; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } + } + } else if (pkt->isWrite()) { + // + // Note: M5 packets do not differentiate ST from RMW_Write + // + primary_type = secondary_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. + // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } else { + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; + } + } else { + panic("Unsupported ruby packet type\n"); + } + } + + // Check if there is any pending request to this cache line from + // previous cycles. + // If there is a pending request, return aliased. Since coalescing + // across time is not permitted, aliased requests are not coalesced. + // If a request for this address has already been issued, we must block + RequestStatus status = getRequestStatus(pkt, primary_type); + if (status != RequestStatus_Ready) + return status; + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Check if this request can be coalesced with previous + // requests from this cycle. + if (!reqCoalescer.count(line_addr)) { + // This is the first access to this cache line. 
+ // A new request to the memory subsystem has to be + // made in the next cycle for this cache line, so + // add this line addr to the "newRequests" queue + newRequests.push_back(line_addr); + + // There was a request to this cache line in this cycle, + // let us see if we can coalesce this request with the previous + // requests from this cycle + } else if (primary_type != + reqCoalescer[line_addr][0].second[PrimaryType]) { + // can't coalesce loads, stores and atomics! + return RequestStatus_Aliased; + } else if (pkt->req->isLockedRMW() || + reqCoalescer[line_addr][0].first->req->isLockedRMW()) { + // can't coalesce locked accesses, but can coalesce atomics! + return RequestStatus_Aliased; + } else if (pkt->req->hasContextId() && pkt->req->isRelease() && + pkt->req->contextId() != + reqCoalescer[line_addr][0].first->req->contextId()) { + // can't coalesce releases from different wavefronts + return RequestStatus_Aliased; + } + + // in addition to the packet, we need to save both request types + reqCoalescer[line_addr].push_back( + RequestDesc(pkt, std::vector<RubyRequestType>()) ); + reqCoalescer[line_addr].back().second.push_back(primary_type); + reqCoalescer[line_addr].back().second.push_back(secondary_type); + if (!issueEvent.scheduled()) + schedule(issueEvent, curTick()); + // TODO: issue hardware prefetches here + return RequestStatus_Issued; +} + +void +GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +{ + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + // At the moment setting scopes only counts + // for GPU spill space accesses + // which is pkt->req->isStack() + // this scope is REPLACE since it + // does not need to be flushed at the end + // of a kernel Private and local may need + // to be visible at the end of the kernel + HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); + HSAScope accessScope = reqScopeToHSAScope(pkt->req); + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector<bool> accessMask(blockSize,false); + std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps; + uint32_t tableSize = reqCoalescer[line_addr].size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = reqCoalescer[line_addr][i].first; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if(tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr<uint8_t>(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; + } + } + std::shared_ptr<RubyRequest> msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, + accessScope, accessSegment); + } else { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, + accessScope, accessSegment); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(secondary_type)); + + fatal_if(secondary_type == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + // Send the message to the cache controller + fatal_if(m_data_cache_hit_latency == 0, + "should not have a latency of zero"); + + assert(m_mandatory_q_ptr); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +} + +template <class KEY, class VALUE> +std::ostream & +operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map) +{ + out << "["; + for (auto i = map.begin(); i != map.end(); ++i) + out << " " << i->first << "=" << i->second; + out << " ]"; + + return out; +} + +void +GPUCoalescer::print(ostream& out) const +{ + out << "[GPUCoalescer: " << m_version + << ", outstanding requests: " << m_outstanding_count + << ", read request table: " << m_readRequestTable + << ", write request table: " << m_writeRequestTable + << "]"; +} + +// this can be called from setState whenever coherence permissions are +// upgraded when invoked, coherence violations will be checked for the +// given block +void +GPUCoalescer::checkCoherence(Addr addr) +{ +#ifdef CHECK_COHERENCE + m_ruby_system->checkGlobalCoherenceInvariant(addr); +#endif +} + +void +GPUCoalescer::recordRequestType(SequencerRequestType requestType) { + DPRINTF(RubyStats, "Recorded statistic: %s\n", + SequencerRequestType_to_string(requestType)); +} + +GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq) + : Event(Progress_Event_Pri), seq(_seq) +{ +} + + +void +GPUCoalescer::completeIssue() +{ + // newRequests has the cacheline addresses of all the + // requests which need to be issued to the memory subsystem + // in this cycle + int len = newRequests.size(); + DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); + for (int i = 0; i < len; ++i) { + // Get the requests from reqCoalescer table. 
Get only the + // first request for each cacheline, the remaining requests + // can be coalesced with the first request. So, only + // one request is issued per cacheline. + RequestDesc info = reqCoalescer[newRequests[i]][0]; + PacketPtr pkt = info.first; + DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", + i, pkt->req->getPaddr()); + // Insert this request to the read/writeRequestTables. These tables + // are used to track aliased requests in makeRequest subroutine + bool found = insertRequest(pkt, info.second[PrimaryType]); + + if (found) { + panic("GPUCoalescer::makeRequest should never be called if the " + "request is already outstanding\n"); + } + + // Issue request to ruby subsystem + issueRequest(pkt, info.second[SecondaryType]); + } + newRequests.clear(); + + // have Kernel End releases been issued this cycle + len = newKernelEnds.size(); + for (int i = 0; i < len; i++) { + kernelCallback(newKernelEnds[i]); + } + newKernelEnds.clear(); +} + +void +GPUCoalescer::IssueEvent::process() +{ + seq->completeIssue(); +} + +const char * +GPUCoalescer::IssueEvent::description() const +{ + return "Issue coalesced request"; +} + +void +GPUCoalescer::evictionCallback(Addr address) +{ + ruby_eviction_callback(address); +} + +void +GPUCoalescer::kernelCallback(int wavefront_id) +{ + assert(kernelEndList.count(wavefront_id)); + + ruby_hit_callback(kernelEndList[wavefront_id]); + + kernelEndList.erase(wavefront_id); +} + +void +GPUCoalescer::atomicCallback(Addr address, + MachineType mach, + const DataBlock& data) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* srequest = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((srequest->m_type == RubyRequestType_ATOMIC) || + (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || + (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + + + // Atomics don't write to cache, so there is no MRU update... + + recordMissLatency(srequest, mach, + srequest->issue_time, Cycles(0), Cycles(0), true, false); + + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(pkt->getAddr()); + + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(srequest->m_type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = (pkt->getAddr()); + request_line_address = makeLineAddress(request_address); + if (pkt->getPtr<uint8_t>() && + srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { + /* atomics are done in memory, and return the data *before* the atomic op... */ + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(srequest->m_type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. 
+ if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + completeHitCallback(mylist, len); +} + +void +GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPLdHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPLdTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCLdHits++; + } else { + CP_LdMiss++; + } +} + +void +GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPStHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPStTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCStHits++; + } else { + CP_StMiss++; + } +} + +void +GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len) +{ + for (int i = 0; i < len; ++i) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(mylist[i]->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + mylist[i]->senderState = ss->predecessor; + delete ss; + port->hitCallback(mylist[i]); + trySendRetries(); + } + + testDrainComplete(); +} + +PacketPtr +GPUCoalescer::mapAddrToPkt(Addr address) +{ + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + return request->pkt; +} + +void +GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion) +{ + RubyRequestType type = srequest->m_type; + Cycles issued_time = srequest->issue_time; + Cycles completion_time = curCycle(); + assert(completion_time >= issued_time); + Cycles total_lat = completion_time - issued_time; + + // cache stats (valid for RfO protocol only) + if (mach == MachineType_TCP) { + if (type == RubyRequestType_LD) { + GPU_TCPLdHits++; + } else { + GPU_TCPStHits++; + } + } else if (mach == MachineType_L1Cache_wCC) { + if (type == RubyRequestType_LD) { + GPU_TCPLdTransfers++; + } else { + GPU_TCPStTransfers++; + } + } else if (mach == MachineType_TCC) { + if (type == RubyRequestType_LD) { + GPU_TCCLdHits++; + } else { + GPU_TCCStHits++; + } + } else { + if (type == RubyRequestType_LD) { + GPU_LdMiss++; + } else { + GPU_StMiss++; + } + } + + // Profile all access latency, even zero latency accesses + m_latencyHist.sample(total_lat); + m_typeLatencyHist[type]->sample(total_lat); + + // Profile the miss latency for all non-zero demand misses + if (total_lat != Cycles(0)) { + m_missLatencyHist.sample(total_lat); + m_missTypeLatencyHist[type]->sample(total_lat); + + if (mach != MachineType_NUM) { + m_missMachLatencyHist[mach]->sample(total_lat); + m_missTypeMachLatencyHist[type][mach]->sample(total_lat); + + if ((issued_time <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completion_time)) { + + m_IssueToInitialDelayHist[mach]->sample( + initialRequestTime - 
issued_time); + m_InitialToForwardDelayHist[mach]->sample( + forwardRequestTime - initialRequestTime); + m_ForwardToFirstResponseDelayHist[mach]->sample( + firstResponseTime - forwardRequestTime); + m_FirstResponseToCompletionDelayHist[mach]->sample( + completion_time - firstResponseTime); + } + } + + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", + curTick(), m_version, "Coal", + success ? "Done" : "SC_Failed", "", "", + printAddress(srequest->pkt->getAddr()), total_lat); +} + +void +GPUCoalescer::regStats() +{ + // These statistical variables are not for display. + // The profiler will collate these across different + // coalescers and display those collated statistics. + m_outstandReqHist.init(10); + m_latencyHist.init(10); + m_missLatencyHist.init(10); + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist.push_back(new Stats::Histogram()); + m_typeLatencyHist[i]->init(10); + + m_missTypeLatencyHist.push_back(new Stats::Histogram()); + m_missTypeLatencyHist[i]->init(10); + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist.push_back(new Stats::Histogram()); + m_missMachLatencyHist[i]->init(10); + + m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); + m_IssueToInitialDelayHist[i]->init(10); + + m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); + m_InitialToForwardDelayHist[i]->init(10); + + m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); + m_ForwardToFirstResponseDelayHist[i]->init(10); + + m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); + m_FirstResponseToCompletionDelayHist[i]->init(10); + } + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); + + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); + m_missTypeMachLatencyHist[i][j]->init(10); + } + } + + // GPU cache stats + GPU_TCPLdHits + .name(name() + ".gpu_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + GPU_TCPLdTransfers + .name(name() + ".gpu_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + GPU_TCCLdHits + .name(name() + ".gpu_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + GPU_LdMiss + .name(name() + ".gpu_ld_misses") + .desc("loads that miss in the GPU") + ; + + GPU_TCPStHits + .name(name() + ".gpu_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + GPU_TCPStTransfers + .name(name() + ".gpu_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + GPU_TCCStHits + .name(name() + ".gpu_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + GPU_StMiss + .name(name() + ".gpu_st_misses") + .desc("stores that miss in the GPU") + ; + + // CP cache stats + CP_TCPLdHits + .name(name() + ".cp_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + CP_TCPLdTransfers + .name(name() + ".cp_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + CP_TCCLdHits + .name(name() + ".cp_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + CP_LdMiss + .name(name() + ".cp_ld_misses") + .desc("loads that miss in the GPU") + ; + + CP_TCPStHits + .name(name() + ".cp_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + CP_TCPStTransfers + .name(name() + ".cp_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + CP_TCCStHits + .name(name() + ".cp_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + CP_StMiss + .name(name() + ".cp_st_misses") + .desc("stores that miss in the GPU") + ; +} |
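One detail worth calling out from issueRequest() above: before a coalesced line request is enqueued, the coalescer builds a byte-granular access mask and a line-sized data image from every packet it merged, which, as the in-code comment puts it, "enables partial writes and partial reads of those writes". The sketch below reproduces only that masking step; ToyWrite and kLineBytes are hypothetical stand-ins for the real packet offsets and RubySystem::getBlockSizeBytes(), and atomic packets, which contribute (offset, AtomicOpFunctor*) pairs instead of data, are omitted.

```cpp
// Illustrative only: a standalone approximation of the accessMask/DataBlock
// construction in GPUCoalescer::issueRequest(). Not gem5 code.
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

constexpr std::size_t kLineBytes = 64;     // assumed Ruby block size

struct ToyWrite {
    std::size_t offset;                    // byte offset within the cache line
    std::vector<uint8_t> bytes;            // data carried by one coalesced packet
};

int main() {
    std::array<uint8_t, kLineBytes> line{};    // ~DataBlock for the line
    std::array<bool, kLineBytes> mask{};       // ~accessMask (written bytes)

    std::vector<ToyWrite> coalesced = {
        {0,  {0xAA, 0xBB, 0xCC, 0xDD}},
        {16, {0x11, 0x22}},
    };

    for (const auto &w : coalesced) {
        // copy each packet's payload into the line image...
        std::memcpy(line.data() + w.offset, w.bytes.data(), w.bytes.size());
        // ...and mark exactly the bytes it touched
        for (std::size_t j = 0; j < w.bytes.size(); ++j)
            mask[w.offset + j] = true;
    }

    std::size_t covered = 0;
    for (bool b : mask) covered += b;
    std::cout << covered << " of " << kLineBytes << " bytes written\n";
}
```

In the real path, the mask and data image travel inside the single RubyRequest that is handed to m_mandatory_q_ptr->enqueue(), so the protocol can apply just the bytes each coalesced access actually wrote rather than the whole line.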