Diffstat (limited to 'src/mem/ruby')
25 files changed, 2610 insertions, 62 deletions
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index 16e932432..82a16c9b0 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -124,13 +124,20 @@ MakeInclude('common/Set.hh') MakeInclude('common/WriteMask.hh') MakeInclude('filters/AbstractBloomFilter.hh') MakeInclude('network/MessageBuffer.hh') -MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/CacheMemory.hh') -MakeInclude('system/DMASequencer.hh') MakeInclude('structures/DirectoryMemory.hh') -MakeInclude('structures/WireBuffer.hh') MakeInclude('structures/PerfectCacheMemory.hh') MakeInclude('structures/PersistentTable.hh') -MakeInclude('system/Sequencer.hh') +MakeInclude('structures/Prefetcher.hh') MakeInclude('structures/TBETable.hh') MakeInclude('structures/TimerTable.hh') +MakeInclude('structures/WireBuffer.hh') +MakeInclude('system/DMASequencer.hh') +MakeInclude('system/Sequencer.hh') + +# External types : Group "mem/protocol" : include "header.hh" to the bottom +# of this MakeIncludes if it is referenced as +# <# include "mem/protocol/header.hh"> in any file +# generated_dir = Dir('../protocol') +MakeInclude('system/GPUCoalescer.hh') +MakeInclude('system/VIPERCoalescer.hh') diff --git a/src/mem/ruby/profiler/Profiler.cc b/src/mem/ruby/profiler/Profiler.cc index b3b37e5a6..7d3f20982 100644 --- a/src/mem/ruby/profiler/Profiler.cc +++ b/src/mem/ruby/profiler/Profiler.cc @@ -269,7 +269,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { m_outstandReqHist.add(seq->getOutstandReqHist()); } @@ -282,7 +282,7 @@ Profiler::collateStats() it != m_ruby_system->m_abstract_controls[i].end(); ++it) { AbstractController *ctr = (*it).second; - Sequencer *seq = ctr->getSequencer(); + Sequencer *seq = ctr->getCPUSequencer(); if (seq != NULL) { // add all the latencies m_latencyHist.add(seq->getLatencyHist()); diff --git a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh index 926556781..cbd068c04 100644 --- a/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh +++ b/src/mem/ruby/slicc_interface/AbstractCacheEntry.hh @@ -56,6 +56,12 @@ class AbstractCacheEntry : public AbstractEntry virtual DataBlock& getDataBlk() { panic("getDataBlk() not implemented!"); } + int validBlocks; + virtual int& getNumValidBlocks() + { + return validBlocks; + } + // Functions for locking and unlocking the cache entry. These are required // for supporting atomic memory accesses. 
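// Side note on the validBlocks field introduced just above: AbstractCacheEntry
// itself never initializes or updates it. The reading suggested by the rest of
// this patch (not stated explicitly in it) is that protocol-generated entry
// types maintain the count of valid sub-blocks, and that
// CacheMemory::getReplacementWeight(), added further down, reports it back as
// an occupancy weight. A hypothetical entry type would only need something like:
//
//     struct SectorEntry : public AbstractCacheEntry   // illustrative name only
//     {
//         SectorEntry() { validBlocks = 0; }
//         void markSubBlockValid()   { ++validBlocks; }   // seen via getNumValidBlocks()
//         void markSubBlockInvalid() { --validBlocks; }
//     };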
void setLocked(int context); diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 93fe50c88..458fde5bc 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -200,6 +200,12 @@ AbstractController::unblock(Addr addr) } } +bool +AbstractController::isBlocked(Addr addr) +{ + return (m_block_map.count(addr) > 0); +} + BaseMasterPort & AbstractController::getMasterPort(const std::string &if_name, PortID idx) diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 383507eed..4488ee3f4 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -73,6 +73,7 @@ class AbstractController : public MemObject, public Consumer // return instance name void blockOnQueue(Addr, MessageBuffer*); void unblock(Addr); + bool isBlocked(Addr); virtual MessageBuffer* getMandatoryQueue() const = 0; virtual MessageBuffer* getMemoryQueue() const = 0; @@ -84,7 +85,7 @@ class AbstractController : public MemObject, public Consumer virtual void regStats(); virtual void recordCacheTrace(int cntrl, CacheRecorder* tr) = 0; - virtual Sequencer* getSequencer() const = 0; + virtual Sequencer* getCPUSequencer() const = 0; //! These functions are used by ruby system to read/write the data blocks //! that exist with in the controller. diff --git a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh index 46071335e..cdedc2e14 100644 --- a/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh +++ b/src/mem/ruby/slicc_interface/RubySlicc_ComponentMapping.hh @@ -43,6 +43,12 @@ map_Address_to_DirectoryNode(Addr addr) return DirectoryMemory::mapAddressToDirectoryVersion(addr); } +inline NodeID +map_Address_to_TCCdirNode(Addr addr) +{ + return DirectoryMemory::mapAddressToDirectoryVersion(addr); +} + // used to determine the home directory // returns a value between 0 and total_directories_within_the_system inline MachineID @@ -53,6 +59,22 @@ map_Address_to_Directory(Addr addr) return mach; } +inline MachineID +map_Address_to_RegionDir(Addr addr) +{ + MachineID mach = {MachineType_RegionDir, + map_Address_to_DirectoryNode(addr)}; + return mach; +} + +inline MachineID +map_Address_to_TCCdir(Addr addr) +{ + MachineID mach = + {MachineType_TCCdir, map_Address_to_TCCdirNode(addr)}; + return mach; +} + inline NetDest broadcast(MachineType type) { @@ -102,4 +124,11 @@ createMachineID(MachineType type, NodeID id) return mach; } +inline MachineID +MachineTypeAndNodeIDToMachineID(MachineType type, NodeID node) +{ + MachineID mach = {type, node}; + return mach; +} + #endif // __MEM_RUBY_SLICC_INTERFACE_COMPONENTMAPPINGS_HH__ diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index a8a3ba949..45fb85d05 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -35,6 +35,7 @@ #include "mem/protocol/AccessPermission.hh" #include "mem/ruby/structures/CacheMemory.hh" #include "mem/ruby/system/RubySystem.hh" +#include "mem/ruby/system/WeightedLRUPolicy.hh" using namespace std; @@ -66,29 +67,27 @@ CacheMemory::CacheMemory(const Params *p) m_start_index_bit = p->start_index_bit; m_is_instruction_only_cache = p->is_icache; m_resource_stalls = p->resourceStalls; + m_block_size = p->block_size; // may be 0 at this point. 
Updated in init() } void CacheMemory::init() { - m_cache_num_sets = (m_cache_size / m_cache_assoc) / - RubySystem::getBlockSizeBytes(); + if (m_block_size == 0) { + m_block_size = RubySystem::getBlockSizeBytes(); + } + m_cache_num_sets = (m_cache_size / m_cache_assoc) / m_block_size; assert(m_cache_num_sets > 1); m_cache_num_set_bits = floorLog2(m_cache_num_sets); assert(m_cache_num_set_bits > 0); - m_cache.resize(m_cache_num_sets); - for (int i = 0; i < m_cache_num_sets; i++) { - m_cache[i].resize(m_cache_assoc); - for (int j = 0; j < m_cache_assoc; j++) { - m_cache[i][j] = NULL; - } - } + m_cache.resize(m_cache_num_sets, + std::vector<AbstractCacheEntry*>(m_cache_assoc, nullptr)); } CacheMemory::~CacheMemory() { - if (m_replacementPolicy_ptr != NULL) + if (m_replacementPolicy_ptr) delete m_replacementPolicy_ptr; for (int i = 0; i < m_cache_num_sets; i++) { for (int j = 0; j < m_cache_assoc; j++) { @@ -359,6 +358,37 @@ CacheMemory::setMRU(const AbstractCacheEntry *e) } void +CacheMemory::setMRU(Addr address, int occupancy) +{ + int64_t cacheSet = addressToCacheSet(address); + int loc = findTagInSet(cacheSet, address); + + if(loc != -1) { + if (m_replacementPolicy_ptr->useOccupancy()) { + (static_cast<WeightedLRUPolicy*>(m_replacementPolicy_ptr))-> + touch(cacheSet, loc, curTick(), occupancy); + } else { + m_replacementPolicy_ptr-> + touch(cacheSet, loc, curTick()); + } + } +} + +int +CacheMemory::getReplacementWeight(int64_t set, int64_t loc) +{ + assert(set < m_cache_num_sets); + assert(loc < m_cache_assoc); + int ret = 0; + if(m_cache[set][loc] != NULL) { + ret = m_cache[set][loc]->getNumValidBlocks(); + assert(ret >= 0); + } + + return ret; +} + +void CacheMemory::recordCacheContents(int cntrl, CacheRecorder* tr) const { uint64_t warmedUpBlocks = 0; diff --git a/src/mem/ruby/structures/CacheMemory.hh b/src/mem/ruby/structures/CacheMemory.hh index 72805b32b..5b30505d3 100644 --- a/src/mem/ruby/structures/CacheMemory.hh +++ b/src/mem/ruby/structures/CacheMemory.hh @@ -106,7 +106,8 @@ class CacheMemory : public SimObject // Set this address to most recently used void setMRU(Addr address); - // Set this entry to most recently used + void setMRU(Addr addr, int occupancy); + int getReplacementWeight(int64_t set, int64_t loc); void setMRU(const AbstractCacheEntry *e); // Functions for locking and unlocking cache lines corresponding to the @@ -146,6 +147,7 @@ class CacheMemory : public SimObject Stats::Scalar numDataArrayStalls; int getCacheSize() const { return m_cache_size; } + int getCacheAssoc() const { return m_cache_assoc; } int getNumBlocks() const { return m_cache_num_sets * m_cache_assoc; } Addr getAddressAtIdx(int idx) const; @@ -182,6 +184,7 @@ class CacheMemory : public SimObject int m_cache_assoc; int m_start_index_bit; bool m_resource_stalls; + int m_block_size; }; std::ostream& operator<<(std::ostream& out, const CacheMemory& obj); diff --git a/src/mem/ruby/structures/RubyCache.py b/src/mem/ruby/structures/RubyCache.py index 4eb87ac74..9fc4726b0 100644 --- a/src/mem/ruby/structures/RubyCache.py +++ b/src/mem/ruby/structures/RubyCache.py @@ -42,6 +42,7 @@ class RubyCache(SimObject): "") start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line"); is_icache = Param.Bool(False, "is instruction only cache"); + block_size = Param.MemorySize("0B", "block size in bytes. 
0 means default RubyBlockSize") dataArrayBanks = Param.Int(1, "Number of banks for the data array") tagArrayBanks = Param.Int(1, "Number of banks for the tag array") diff --git a/src/mem/ruby/system/GPUCoalescer.cc b/src/mem/ruby/system/GPUCoalescer.cc new file mode 100644 index 000000000..db279bd3a --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.cc @@ -0,0 +1,1397 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
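A minimal sketch (not part of the patch; assumptions are noted in the comments) of how the two CacheMemory hooks added above fit together: setMRU(addr, occupancy) forwards the occupancy to the replacement policy only when that policy reports useOccupancy() (i.e. a WeightedLRUPolicy), and getReplacementWeight() reports AbstractCacheEntry::getNumValidBlocks() as that occupancy.

#include "mem/ruby/slicc_interface/AbstractCacheEntry.hh"
#include "mem/ruby/structures/CacheMemory.hh"

// "gpuL2" is a hypothetical cache instance; any protocol-side caller looks alike.
void
touchWeighted(CacheMemory *gpuL2, Addr addr)
{
    AbstractCacheEntry *entry = gpuL2->lookup(addr);
    if (entry == nullptr)
        return;                 // line not resident, nothing to touch

    // Re-rank the line using its current occupancy (valid sub-block count) as
    // the weight; with a plain LRU policy the extra argument is ignored.
    gpuL2->setMRU(addr, entry->getNumValidBlocks());
}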
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/GPUCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "debug/ProtocolTrace.hh" +#include "debug/RubyPort.hh" +#include "debug/RubyStats.hh" +#include "gpu-compute/shader.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/DataBlock.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/RubyGPUCoalescer.hh" + +using namespace std; + +GPUCoalescer * +RubyGPUCoalescerParams::create() +{ + return new GPUCoalescer(this); +} + +HSAScope +reqScopeToHSAScope(Request* req) +{ + HSAScope accessScope = HSAScope_UNSPECIFIED; + if (req->isScoped()) { + if (req->isWavefrontScope()) { + accessScope = HSAScope_WAVEFRONT; + } else if (req->isWorkgroupScope()) { + accessScope = HSAScope_WORKGROUP; + } else if (req->isDeviceScope()) { + accessScope = HSAScope_DEVICE; + } else if (req->isSystemScope()) { + accessScope = HSAScope_SYSTEM; + } else { + fatal("Bad scope type"); + } + } + return accessScope; +} + +HSASegment +reqSegmentToHSASegment(Request* req) +{ + HSASegment accessSegment = HSASegment_GLOBAL; + + if (req->isGlobalSegment()) { + accessSegment = HSASegment_GLOBAL; + } else if (req->isGroupSegment()) { + accessSegment = HSASegment_GROUP; + } else if (req->isPrivateSegment()) { + accessSegment = HSASegment_PRIVATE; + } else if (req->isKernargSegment()) { + accessSegment = HSASegment_KERNARG; + } else if (req->isReadonlySegment()) { + accessSegment = HSASegment_READONLY; + } else if (req->isSpillSegment()) { + accessSegment = HSASegment_SPILL; + } else if (req->isArgSegment()) { + accessSegment = HSASegment_ARG; + } else { + fatal("Bad segment type"); + } + + return accessSegment; +} + +GPUCoalescer::GPUCoalescer(const Params *p) + : RubyPort(p), issueEvent(this), deadlockCheckEvent(this) +{ + m_store_waiting_on_load_cycles = 0; + m_store_waiting_on_store_cycles = 0; + m_load_waiting_on_store_cycles = 0; + m_load_waiting_on_load_cycles = 0; + + m_outstanding_count = 0; + + m_max_outstanding_requests = 0; + m_deadlock_threshold = 0; + m_instCache_ptr = nullptr; + m_dataCache_ptr = nullptr; + + m_instCache_ptr = p->icache; + m_dataCache_ptr = p->dcache; + m_max_outstanding_requests = p->max_outstanding_requests; + m_deadlock_threshold = p->deadlock_threshold; + + assert(m_max_outstanding_requests > 0); + assert(m_deadlock_threshold > 0); + assert(m_instCache_ptr); + assert(m_dataCache_ptr); + + m_data_cache_hit_latency = p->dcache_hit_latency; + + m_usingNetworkTester = p->using_network_tester; + assumingRfOCoherence = p->assume_rfo; +} + +GPUCoalescer::~GPUCoalescer() +{ +} + +void +GPUCoalescer::wakeup() +{ + // Check for deadlock of any of the requests + Cycles current_time = curCycle(); + + // Check across all outstanding requests + int total_outstanding = 0; + + RequestTable::iterator read = m_readRequestTable.begin(); + RequestTable::iterator read_end = m_readRequestTable.end(); + for (; read != read_end; ++read) { + GPUCoalescerRequest* request = read->second; + if (current_time - 
request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_readRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_readRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time)*clockPeriod()); + } + + RequestTable::iterator write = m_writeRequestTable.begin(); + RequestTable::iterator write_end = m_writeRequestTable.end(); + for (; write != write_end; ++write) { + GPUCoalescerRequest* request = write->second; + if (current_time - request->issue_time < m_deadlock_threshold) + continue; + + panic("Possible Deadlock detected. Aborting!\n" + "version: %d request.paddr: 0x%x m_writeRequestTable: %d " + "current time: %u issue_time: %d difference: %d\n", m_version, + request->pkt->getAddr(), m_writeRequestTable.size(), + current_time * clockPeriod(), request->issue_time * clockPeriod(), + (current_time - request->issue_time) * clockPeriod()); + } + + total_outstanding += m_writeRequestTable.size(); + total_outstanding += m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + if (m_outstanding_count > 0) { + // If there are still outstanding requests, keep checking + schedule(deadlockCheckEvent, + m_deadlock_threshold * clockPeriod() + + curTick()); + } +} + +void +GPUCoalescer::resetStats() +{ + m_latencyHist.reset(); + m_missLatencyHist.reset(); + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist[i]->reset(); + m_missTypeLatencyHist[i]->reset(); + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i][j]->reset(); + } + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist[i]->reset(); + + m_IssueToInitialDelayHist[i]->reset(); + m_InitialToForwardDelayHist[i]->reset(); + m_ForwardToFirstResponseDelayHist[i]->reset(); + m_FirstResponseToCompletionDelayHist[i]->reset(); + } +} + +void +GPUCoalescer::printProgress(ostream& out) const +{ +} + +RequestStatus +GPUCoalescer::getRequestStatus(PacketPtr pkt, RubyRequestType request_type) +{ + Addr line_addr = makeLineAddress(pkt->getAddr()); + + if (!m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())) { + return RequestStatus_BufferFull; + } + + if(m_controller->isBlocked(line_addr) && + request_type != RubyRequestType_Locked_RMW_Write) { + return RequestStatus_Aliased; + } + + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + // Check if there is any outstanding read request for the same + // cache line. + if (m_readRequestTable.count(line_addr) > 0) { + m_store_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + + if (m_writeRequestTable.count(line_addr) > 0) { + // There is an outstanding write request for the cache line + m_store_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + } else { + // Check if there is any outstanding write request for the same + // cache line. 
+ if (m_writeRequestTable.count(line_addr) > 0) { + m_load_waiting_on_store_cycles++; + return RequestStatus_Aliased; + } + + if (m_readRequestTable.count(line_addr) > 0) { + // There is an outstanding read request for the cache line + m_load_waiting_on_load_cycles++; + return RequestStatus_Aliased; + } + } + + return RequestStatus_Ready; + +} + + + +// sets the kernelEndList +void +GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt) +{ + // Don't know if this will happen or is possible + // but I just want to be careful and not have it become + // simulator hang in the future + DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id); + assert(kernelEndList.count(wavefront_id) == 0); + + kernelEndList[wavefront_id] = pkt; + DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n", + kernelEndList.size()); +} + + +// Insert the request on the correct request table. Return true if +// the entry was already present. +bool +GPUCoalescer::insertRequest(PacketPtr pkt, RubyRequestType request_type) +{ + assert(getRequestStatus(pkt, request_type) == RequestStatus_Ready || + pkt->req->isLockedRMW() || + !m_mandatory_q_ptr->areNSlotsAvailable(1, clockEdge())); + + int total_outstanding M5_VAR_USED = + m_writeRequestTable.size() + m_readRequestTable.size(); + + assert(m_outstanding_count == total_outstanding); + + // See if we should schedule a deadlock check + if (deadlockCheckEvent.scheduled() == false) { + schedule(deadlockCheckEvent, m_deadlock_threshold + curTick()); + } + + Addr line_addr = makeLineAddress(pkt->getAddr()); + if ((request_type == RubyRequestType_ST) || + (request_type == RubyRequestType_ATOMIC) || + (request_type == RubyRequestType_ATOMIC_RETURN) || + (request_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request_type == RubyRequestType_RMW_Read) || + (request_type == RubyRequestType_RMW_Write) || + (request_type == RubyRequestType_Load_Linked) || + (request_type == RubyRequestType_Store_Conditional) || + (request_type == RubyRequestType_Locked_RMW_Read) || + (request_type == RubyRequestType_Locked_RMW_Write) || + (request_type == RubyRequestType_FLUSH)) { + + pair<RequestTable::iterator, bool> r = + m_writeRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting write request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } else { + pair<RequestTable::iterator, bool> r = + m_readRequestTable.insert(RequestTable::value_type(line_addr, + (GPUCoalescerRequest*) NULL)); + + if (r.second) { + RequestTable::iterator i = r.first; + i->second = new GPUCoalescerRequest(pkt, request_type, + curCycle()); + DPRINTF(GPUCoalescer, + "Inserting read request for paddr %#x for type %d\n", + pkt->req->getPaddr(), i->second->m_type); + m_outstanding_count++; + } else { + return true; + } + } + + m_outstandReqHist.sample(m_outstanding_count); + + total_outstanding = m_writeRequestTable.size() + m_readRequestTable.size(); + assert(m_outstanding_count == total_outstanding); + + return false; +} + +void +GPUCoalescer::markRemoved() +{ + m_outstanding_count--; + assert(m_outstanding_count == + m_writeRequestTable.size() + m_readRequestTable.size()); +} + +void +GPUCoalescer::removeRequest(GPUCoalescerRequest* srequest) +{ + assert(m_outstanding_count == + m_writeRequestTable.size() + 
m_readRequestTable.size()); + + Addr line_addr = makeLineAddress(srequest->pkt->getAddr()); + if ((srequest->m_type == RubyRequestType_ST) || + (srequest->m_type == RubyRequestType_RMW_Read) || + (srequest->m_type == RubyRequestType_RMW_Write) || + (srequest->m_type == RubyRequestType_Load_Linked) || + (srequest->m_type == RubyRequestType_Store_Conditional) || + (srequest->m_type == RubyRequestType_Locked_RMW_Read) || + (srequest->m_type == RubyRequestType_Locked_RMW_Write)) { + m_writeRequestTable.erase(line_addr); + } else { + m_readRequestTable.erase(line_addr); + } + + markRemoved(); +} + +bool +GPUCoalescer::handleLlsc(Addr address, GPUCoalescerRequest* request) +{ + // + // The success flag indicates whether the LLSC operation was successful. + // LL ops will always succeed, but SC may fail if the cache line is no + // longer locked. + // + bool success = true; + if (request->m_type == RubyRequestType_Store_Conditional) { + if (!m_dataCache_ptr->isLocked(address, m_version)) { + // + // For failed SC requests, indicate the failure to the cpu by + // setting the extra data to zero. + // + request->pkt->req->setExtraData(0); + success = false; + } else { + // + // For successful SC requests, indicate the success to the cpu by + // setting the extra data to one. + // + request->pkt->req->setExtraData(1); + } + // + // Independent of success, all SC operations must clear the lock + // + m_dataCache_ptr->clearLocked(address); + } else if (request->m_type == RubyRequestType_Load_Linked) { + // + // Note: To fully follow Alpha LLSC semantics, should the LL clear any + // previously locked cache lines? + // + m_dataCache_ptr->setLocked(address, m_version); + } else if ((m_dataCache_ptr->isTagPresent(address)) && + (m_dataCache_ptr->isLocked(address, m_version))) { + // + // Normal writes should clear the locked address + // + m_dataCache_ptr->clearLocked(address); + } + return success; +} + +void +GPUCoalescer::writeCallback(Addr address, DataBlock& data) +{ + writeCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + writeCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "write callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_ST) || + (request->m_type == RubyRequestType_ATOMIC) || + (request->m_type == RubyRequestType_ATOMIC_RETURN) || + (request->m_type == RubyRequestType_ATOMIC_NO_RETURN) || + (request->m_type == RubyRequestType_RMW_Read) || + (request->m_type == RubyRequestType_RMW_Write) || + (request->m_type == RubyRequestType_Load_Linked) || + (request->m_type == RubyRequestType_Store_Conditional) || + (request->m_type == 
RubyRequestType_Locked_RMW_Read) || + (request->m_type == RubyRequestType_Locked_RMW_Write) || + (request->m_type == RubyRequestType_FLUSH)); + + + // + // For Alpha, properly handle LL, SC, and write requests with respect to + // locked cache blocks. + // + // Not valid for Network_test protocl + // + bool success = true; + if(!m_usingNetworkTester) + success = handleLlsc(address, request); + + if (request->m_type == RubyRequestType_Locked_RMW_Read) { + m_controller->blockOnQueue(address, m_mandatory_q_ptr); + } else if (request->m_type == RubyRequestType_Locked_RMW_Write) { + m_controller->unblock(address); + } + + hitCallback(request, mach, data, success, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::readCallback(Addr address, DataBlock& data) +{ + readCallback(address, MachineType_NULL, data); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data) +{ + readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0)); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime) +{ + + readCallback(address, mach, data, + initialRequestTime, forwardRequestTime, firstResponseTime, + false); +} + +void +GPUCoalescer::readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + assert(address == makeLineAddress(address)); + assert(m_readRequestTable.count(makeLineAddress(address))); + + DPRINTF(GPUCoalescer, "read callback for address %#x\n", address); + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + + m_readRequestTable.erase(i); + markRemoved(); + + assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_IFETCH)); + + hitCallback(request, mach, data, true, + request->issue_time, forwardRequestTime, firstResponseTime, + isRegion); +} + +void +GPUCoalescer::hitCallback(GPUCoalescerRequest* srequest, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion) +{ + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(request_address); + + RubyRequestType type = srequest->m_type; + + // Set this cache entry to the most recently used + if (type == RubyRequestType_IFETCH) { + if (m_instCache_ptr->isTagPresent(request_line_address)) + m_instCache_ptr->setMRU(request_line_address); + } else { + if (m_dataCache_ptr->isTagPresent(request_line_address)) + m_dataCache_ptr->setMRU(request_line_address); + } + + recordMissLatency(srequest, mach, + initialRequestTime, + forwardRequestTime, + firstResponseTime, + success, isRegion); + // update the data + // + // MUST AD DOING THIS FOR EACH REQUEST IN COALESCER + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = pkt->getAddr(); + request_line_address = makeLineAddress(pkt->getAddr()); + if (pkt->getPtr<uint8_t>()) { + if ((type == RubyRequestType_LD) || + (type == RubyRequestType_ATOMIC) || + (type == 
RubyRequestType_ATOMIC_RETURN) || + (type == RubyRequestType_IFETCH) || + (type == RubyRequestType_RMW_Read) || + (type == RubyRequestType_Locked_RMW_Read) || + (type == RubyRequestType_Load_Linked)) { + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + data.setData(pkt->getPtr<uint8_t>(), + getOffset(request_address), pkt->getSize()); + } + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. + if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + + + completeHitCallback(mylist, len); +} + +bool +GPUCoalescer::empty() const +{ + return m_writeRequestTable.empty() && m_readRequestTable.empty(); +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +GPUCoalescer::makeRequest(PacketPtr pkt) +{ + // Check for GPU Barrier Kernel End or Kernel Begin + // Leave these to be handled by the child class + // Kernel End/Barrier = isFlush + isRelease + // Kernel Begin = isFlush + isAcquire + if (pkt->req->isKernel()) { + if (pkt->req->isAcquire()){ + // This is a Kernel Begin leave handling to + // virtual xCoalescer::makeRequest + return RequestStatus_Issued; + }else if(pkt->req->isRelease()) { + // This is a Kernel End leave handling to + // virtual xCoalescer::makeRequest + // If we are here then we didn't call + // a virtual version of this function + // so we will also schedule the callback + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } + } + + // If number of outstanding requests greater than the max allowed, + // return RequestStatus_BufferFull. This logic can be extended to + // support proper backpressure. + if (m_outstanding_count >= m_max_outstanding_requests) { + return RequestStatus_BufferFull; + } + + RubyRequestType primary_type = RubyRequestType_NULL; + RubyRequestType secondary_type = RubyRequestType_NULL; + + if (pkt->isLLSC()) { + // + // Alpha LL/SC instructions need to be handled carefully by the cache + // coherence protocol to ensure they follow the proper semantics. 
In + // particular, by identifying the operations as atomic, the protocol + // should understand that migratory sharing optimizations should not + // be performed (i.e. a load between the LL and SC should not steal + // away exclusive permission). + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Store_Conditional; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Load_Linked; + } + secondary_type = RubyRequestType_ATOMIC; + } else if (pkt->req->isLockedRMW()) { + // + // x86 locked instructions are translated to store cache coherence + // requests because these requests should always be treated as read + // exclusive operations and should leverage any migratory sharing + // optimization built into the protocol. + // + if (pkt->isWrite()) { + primary_type = RubyRequestType_Locked_RMW_Write; + } else { + assert(pkt->isRead()); + primary_type = RubyRequestType_Locked_RMW_Read; + } + secondary_type = RubyRequestType_ST; + } else if (pkt->isAtomicOp()) { + // + // GPU Atomic Operation + // + primary_type = RubyRequestType_ATOMIC; + secondary_type = RubyRequestType_ATOMIC; + } else { + if (pkt->isRead()) { + if (pkt->req->isInstFetch()) { + primary_type = secondary_type = RubyRequestType_IFETCH; + } else { +#if THE_ISA == X86_ISA + uint32_t flags = pkt->req->getFlags(); + bool storeCheck = flags & + (TheISA::StoreCheck << TheISA::FlagShift); +#else + bool storeCheck = false; +#endif // X86_ISA + if (storeCheck) { + primary_type = RubyRequestType_RMW_Read; + secondary_type = RubyRequestType_ST; + } else { + primary_type = secondary_type = RubyRequestType_LD; + } + } + } else if (pkt->isWrite()) { + // + // Note: M5 packets do not differentiate ST from RMW_Write + // + primary_type = secondary_type = RubyRequestType_ST; + } else if (pkt->isFlush()) { + primary_type = secondary_type = RubyRequestType_FLUSH; + } else if (pkt->req->isRelease() || pkt->req->isAcquire()) { + if (assumingRfOCoherence) { + // If we reached here, this request must be a memFence + // and the protocol implements RfO, the coalescer can + // assume sequentially consistency and schedule the callback + // immediately. + // Currently the code implements fence callbacks + // by reusing the mechanism for kernel completions. + // This should be fixed. + int wf_id = 0; + if (pkt->req->hasContextId()) { + wf_id = pkt->req->contextId(); + } + insertKernel(wf_id, pkt); + newKernelEnds.push_back(wf_id); + if (!issueEvent.scheduled()) { + schedule(issueEvent, curTick()); + } + return RequestStatus_Issued; + } else { + // If not RfO, return issued here and let the child coalescer + // take care of it. + return RequestStatus_Issued; + } + } else { + panic("Unsupported ruby packet type\n"); + } + } + + // Check if there is any pending request to this cache line from + // previous cycles. + // If there is a pending request, return aliased. Since coalescing + // across time is not permitted, aliased requests are not coalesced. + // If a request for this address has already been issued, we must block + RequestStatus status = getRequestStatus(pkt, primary_type); + if (status != RequestStatus_Ready) + return status; + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Check if this request can be coalesced with previous + // requests from this cycle. + if (!reqCoalescer.count(line_addr)) { + // This is the first access to this cache line. 
+ // A new request to the memory subsystem has to be + // made in the next cycle for this cache line, so + // add this line addr to the "newRequests" queue + newRequests.push_back(line_addr); + + // There was a request to this cache line in this cycle, + // let us see if we can coalesce this request with the previous + // requests from this cycle + } else if (primary_type != + reqCoalescer[line_addr][0].second[PrimaryType]) { + // can't coalesce loads, stores and atomics! + return RequestStatus_Aliased; + } else if (pkt->req->isLockedRMW() || + reqCoalescer[line_addr][0].first->req->isLockedRMW()) { + // can't coalesce locked accesses, but can coalesce atomics! + return RequestStatus_Aliased; + } else if (pkt->req->hasContextId() && pkt->req->isRelease() && + pkt->req->contextId() != + reqCoalescer[line_addr][0].first->req->contextId()) { + // can't coalesce releases from different wavefronts + return RequestStatus_Aliased; + } + + // in addition to the packet, we need to save both request types + reqCoalescer[line_addr].push_back( + RequestDesc(pkt, std::vector<RubyRequestType>()) ); + reqCoalescer[line_addr].back().second.push_back(primary_type); + reqCoalescer[line_addr].back().second.push_back(secondary_type); + if (!issueEvent.scheduled()) + schedule(issueEvent, curTick()); + // TODO: issue hardware prefetches here + return RequestStatus_Issued; +} + +void +GPUCoalescer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) +{ + + int proc_id = -1; + if (pkt != NULL && pkt->req->hasContextId()) { + proc_id = pkt->req->contextId(); + } + + // If valid, copy the pc to the ruby request + Addr pc = 0; + if (pkt->req->hasPC()) { + pc = pkt->req->getPC(); + } + + // At the moment setting scopes only counts + // for GPU spill space accesses + // which is pkt->req->isStack() + // this scope is REPLACE since it + // does not need to be flushed at the end + // of a kernel Private and local may need + // to be visible at the end of the kernel + HSASegment accessSegment = reqSegmentToHSASegment(pkt->req); + HSAScope accessScope = reqScopeToHSAScope(pkt->req); + + Addr line_addr = makeLineAddress(pkt->getAddr()); + + // Creating WriteMask that records written bytes + // and atomic operations. 
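// Illustrative only (addresses and packet names are made up): the state the
// coalescing structures reach after two loads to the same 64-byte line arrive
// in the same cycle, following the makeRequest() rules above.
//
//     reqCoalescer[0x1000] = {
//         { pktA, { RubyRequestType_LD, RubyRequestType_LD } },  // [PrimaryType, SecondaryType]
//         { pktB, { RubyRequestType_LD, RubyRequestType_LD } },
//     };
//     newRequests = { 0x1000 };   // only the first access to the line queues an issue
//
// completeIssue() later builds one RubyRequest for 0x1000 from entry [0], and
// hitCallback()/atomicCallback() walk every entry so each coalesced packet is
// satisfied from the same DataBlock.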
This enables partial writes + // and partial reads of those writes + DataBlock dataBlock; + dataBlock.clear(); + uint32_t blockSize = RubySystem::getBlockSizeBytes(); + std::vector<bool> accessMask(blockSize,false); + std::vector< std::pair<int,AtomicOpFunctor*> > atomicOps; + uint32_t tableSize = reqCoalescer[line_addr].size(); + for (int i = 0; i < tableSize; i++) { + PacketPtr tmpPkt = reqCoalescer[line_addr][i].first; + uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr; + uint32_t tmpSize = tmpPkt->getSize(); + if (tmpPkt->isAtomicOp()) { + std::pair<int,AtomicOpFunctor *> tmpAtomicOp(tmpOffset, + tmpPkt->getAtomicOp()); + atomicOps.push_back(tmpAtomicOp); + } else if(tmpPkt->isWrite()) { + dataBlock.setData(tmpPkt->getPtr<uint8_t>(), + tmpOffset, tmpSize); + } + for (int j = 0; j < tmpSize; j++) { + accessMask[tmpOffset + j] = true; + } + } + std::shared_ptr<RubyRequest> msg; + if (pkt->isAtomicOp()) { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, atomicOps, + accessScope, accessSegment); + } else { + msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), + pkt->getPtr<uint8_t>(), + pkt->getSize(), pc, secondary_type, + RubyAccessMode_Supervisor, pkt, + PrefetchBit_No, proc_id, 100, + blockSize, accessMask, + dataBlock, + accessScope, accessSegment); + } + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n", + curTick(), m_version, "Coal", "Begin", "", "", + printAddress(msg->getPhysicalAddress()), + RubyRequestType_to_string(secondary_type)); + + fatal_if(secondary_type == RubyRequestType_IFETCH, + "there should not be any I-Fetch requests in the GPU Coalescer"); + + // Send the message to the cache controller + fatal_if(m_data_cache_hit_latency == 0, + "should not have a latency of zero"); + + assert(m_mandatory_q_ptr); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); +} + +template <class KEY, class VALUE> +std::ostream & +operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map) +{ + out << "["; + for (auto i = map.begin(); i != map.end(); ++i) + out << " " << i->first << "=" << i->second; + out << " ]"; + + return out; +} + +void +GPUCoalescer::print(ostream& out) const +{ + out << "[GPUCoalescer: " << m_version + << ", outstanding requests: " << m_outstanding_count + << ", read request table: " << m_readRequestTable + << ", write request table: " << m_writeRequestTable + << "]"; +} + +// this can be called from setState whenever coherence permissions are +// upgraded when invoked, coherence violations will be checked for the +// given block +void +GPUCoalescer::checkCoherence(Addr addr) +{ +#ifdef CHECK_COHERENCE + m_ruby_system->checkGlobalCoherenceInvariant(addr); +#endif +} + +void +GPUCoalescer::recordRequestType(SequencerRequestType requestType) { + DPRINTF(RubyStats, "Recorded statistic: %s\n", + SequencerRequestType_to_string(requestType)); +} + +GPUCoalescer::IssueEvent::IssueEvent(GPUCoalescer* _seq) + : Event(Progress_Event_Pri), seq(_seq) +{ +} + + +void +GPUCoalescer::completeIssue() +{ + // newRequests has the cacheline addresses of all the + // requests which need to be issued to the memory subsystem + // in this cycle + int len = newRequests.size(); + DPRINTF(GPUCoalescer, "Completing issue for %d new requests.\n", len); + for (int i = 0; i < len; ++i) { + // Get the requests from reqCoalescer table. 
Get only the + // first request for each cacheline, the remaining requests + // can be coalesced with the first request. So, only + // one request is issued per cacheline. + RequestDesc info = reqCoalescer[newRequests[i]][0]; + PacketPtr pkt = info.first; + DPRINTF(GPUCoalescer, "Completing for newReq %d: paddr %#x\n", + i, pkt->req->getPaddr()); + // Insert this request to the read/writeRequestTables. These tables + // are used to track aliased requests in makeRequest subroutine + bool found = insertRequest(pkt, info.second[PrimaryType]); + + if (found) { + panic("GPUCoalescer::makeRequest should never be called if the " + "request is already outstanding\n"); + } + + // Issue request to ruby subsystem + issueRequest(pkt, info.second[SecondaryType]); + } + newRequests.clear(); + + // have Kernel End releases been issued this cycle + len = newKernelEnds.size(); + for (int i = 0; i < len; i++) { + kernelCallback(newKernelEnds[i]); + } + newKernelEnds.clear(); +} + +void +GPUCoalescer::IssueEvent::process() +{ + seq->completeIssue(); +} + +const char * +GPUCoalescer::IssueEvent::description() const +{ + return "Issue coalesced request"; +} + +void +GPUCoalescer::evictionCallback(Addr address) +{ + ruby_eviction_callback(address); +} + +void +GPUCoalescer::kernelCallback(int wavefront_id) +{ + assert(kernelEndList.count(wavefront_id)); + + ruby_hit_callback(kernelEndList[wavefront_id]); + + kernelEndList.erase(wavefront_id); +} + +void +GPUCoalescer::atomicCallback(Addr address, + MachineType mach, + const DataBlock& data) +{ + assert(address == makeLineAddress(address)); + + DPRINTF(GPUCoalescer, "atomic callback for address %#x\n", address); + assert(m_writeRequestTable.count(makeLineAddress(address))); + + RequestTable::iterator i = m_writeRequestTable.find(address); + assert(i != m_writeRequestTable.end()); + GPUCoalescerRequest* srequest = i->second; + + m_writeRequestTable.erase(i); + markRemoved(); + + assert((srequest->m_type == RubyRequestType_ATOMIC) || + (srequest->m_type == RubyRequestType_ATOMIC_RETURN) || + (srequest->m_type == RubyRequestType_ATOMIC_NO_RETURN)); + + + // Atomics don't write to cache, so there is no MRU update... + + recordMissLatency(srequest, mach, + srequest->issue_time, Cycles(0), Cycles(0), true, false); + + PacketPtr pkt = srequest->pkt; + Addr request_address = pkt->getAddr(); + Addr request_line_address = makeLineAddress(pkt->getAddr()); + + int len = reqCoalescer[request_line_address].size(); + std::vector<PacketPtr> mylist; + for (int i = 0; i < len; ++i) { + PacketPtr pkt = reqCoalescer[request_line_address][i].first; + assert(srequest->m_type == + reqCoalescer[request_line_address][i].second[PrimaryType]); + request_address = (pkt->getAddr()); + request_line_address = makeLineAddress(request_address); + if (pkt->getPtr<uint8_t>() && + srequest->m_type != RubyRequestType_ATOMIC_NO_RETURN) { + /* atomics are done in memory, and return the data *before* the atomic op... */ + memcpy(pkt->getPtr<uint8_t>(), + data.getData(getOffset(request_address), + pkt->getSize()), + pkt->getSize()); + } else { + DPRINTF(MemoryAccess, + "WARNING. Data not transfered from Ruby to M5 for type " \ + "%s\n", + RubyRequestType_to_string(srequest->m_type)); + } + + // If using the RubyTester, update the RubyTester sender state's + // subBlock with the recieved data. The tester will later access + // this state. + // Note: RubyPort will access it's sender state before the + // RubyTester. 
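// For reference, the kernel-end / fence completion path pieced together from
// the code above (no new mechanism assumed, only the existing call chain):
//
//     makeRequest(pkt)                      // pkt->req->isKernel() && isRelease()
//       -> insertKernel(wf_id, pkt)         // kernelEndList[wf_id] = pkt
//       -> newKernelEnds.push_back(wf_id)
//       -> schedule(issueEvent, curTick())
//     completeIssue()
//       -> kernelCallback(wf_id)            // for every id queued this cycle
//            -> ruby_hit_callback(kernelEndList[wf_id])
//            -> kernelEndList.erase(wf_id)
//
// With assume_rfo set, makeRequest() routes memory fences down the same path,
// which its comment flags as a temporary reuse of the kernel-end mechanism.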
+ if (m_usingRubyTester) { + RubyPort::SenderState *requestSenderState = + safe_cast<RubyPort::SenderState*>(pkt->senderState); + RubyTester::SenderState* testerSenderState = + safe_cast<RubyTester::SenderState*>(requestSenderState->predecessor); + testerSenderState->subBlock.mergeFrom(data); + } + + mylist.push_back(pkt); + } + delete srequest; + reqCoalescer.erase(request_line_address); + assert(!reqCoalescer.count(request_line_address)); + + completeHitCallback(mylist, len); +} + +void +GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPLdHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPLdTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCLdHits++; + } else { + CP_LdMiss++; + } +} + +void +GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID) +{ + if(myMachID == senderMachID) { + CP_TCPStHits++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCP) { + CP_TCPStTransfers++; + } else if(machineIDToMachineType(senderMachID) == MachineType_TCC) { + CP_TCCStHits++; + } else { + CP_StMiss++; + } +} + +void +GPUCoalescer::completeHitCallback(std::vector<PacketPtr> & mylist, int len) +{ + for (int i = 0; i < len; ++i) { + RubyPort::SenderState *ss = + safe_cast<RubyPort::SenderState *>(mylist[i]->senderState); + MemSlavePort *port = ss->port; + assert(port != NULL); + + mylist[i]->senderState = ss->predecessor; + delete ss; + port->hitCallback(mylist[i]); + trySendRetries(); + } + + testDrainComplete(); +} + +PacketPtr +GPUCoalescer::mapAddrToPkt(Addr address) +{ + RequestTable::iterator i = m_readRequestTable.find(address); + assert(i != m_readRequestTable.end()); + GPUCoalescerRequest* request = i->second; + return request->pkt; +} + +void +GPUCoalescer::recordMissLatency(GPUCoalescerRequest* srequest, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion) +{ + RubyRequestType type = srequest->m_type; + Cycles issued_time = srequest->issue_time; + Cycles completion_time = curCycle(); + assert(completion_time >= issued_time); + Cycles total_lat = completion_time - issued_time; + + // cache stats (valid for RfO protocol only) + if (mach == MachineType_TCP) { + if (type == RubyRequestType_LD) { + GPU_TCPLdHits++; + } else { + GPU_TCPStHits++; + } + } else if (mach == MachineType_L1Cache_wCC) { + if (type == RubyRequestType_LD) { + GPU_TCPLdTransfers++; + } else { + GPU_TCPStTransfers++; + } + } else if (mach == MachineType_TCC) { + if (type == RubyRequestType_LD) { + GPU_TCCLdHits++; + } else { + GPU_TCCStHits++; + } + } else { + if (type == RubyRequestType_LD) { + GPU_LdMiss++; + } else { + GPU_StMiss++; + } + } + + // Profile all access latency, even zero latency accesses + m_latencyHist.sample(total_lat); + m_typeLatencyHist[type]->sample(total_lat); + + // Profile the miss latency for all non-zero demand misses + if (total_lat != Cycles(0)) { + m_missLatencyHist.sample(total_lat); + m_missTypeLatencyHist[type]->sample(total_lat); + + if (mach != MachineType_NUM) { + m_missMachLatencyHist[mach]->sample(total_lat); + m_missTypeMachLatencyHist[type][mach]->sample(total_lat); + + if ((issued_time <= initialRequestTime) && + (initialRequestTime <= forwardRequestTime) && + (forwardRequestTime <= firstResponseTime) && + (firstResponseTime <= completion_time)) { + + m_IssueToInitialDelayHist[mach]->sample( + initialRequestTime - 
issued_time); + m_InitialToForwardDelayHist[mach]->sample( + forwardRequestTime - initialRequestTime); + m_ForwardToFirstResponseDelayHist[mach]->sample( + firstResponseTime - forwardRequestTime); + m_FirstResponseToCompletionDelayHist[mach]->sample( + completion_time - firstResponseTime); + } + } + + } + + DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n", + curTick(), m_version, "Coal", + success ? "Done" : "SC_Failed", "", "", + printAddress(srequest->pkt->getAddr()), total_lat); +} + +void +GPUCoalescer::regStats() +{ + // These statistical variables are not for display. + // The profiler will collate these across different + // coalescers and display those collated statistics. + m_outstandReqHist.init(10); + m_latencyHist.init(10); + m_missLatencyHist.init(10); + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_typeLatencyHist.push_back(new Stats::Histogram()); + m_typeLatencyHist[i]->init(10); + + m_missTypeLatencyHist.push_back(new Stats::Histogram()); + m_missTypeLatencyHist[i]->init(10); + } + + for (int i = 0; i < MachineType_NUM; i++) { + m_missMachLatencyHist.push_back(new Stats::Histogram()); + m_missMachLatencyHist[i]->init(10); + + m_IssueToInitialDelayHist.push_back(new Stats::Histogram()); + m_IssueToInitialDelayHist[i]->init(10); + + m_InitialToForwardDelayHist.push_back(new Stats::Histogram()); + m_InitialToForwardDelayHist[i]->init(10); + + m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram()); + m_ForwardToFirstResponseDelayHist[i]->init(10); + + m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram()); + m_FirstResponseToCompletionDelayHist[i]->init(10); + } + + for (int i = 0; i < RubyRequestType_NUM; i++) { + m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>()); + + for (int j = 0; j < MachineType_NUM; j++) { + m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram()); + m_missTypeMachLatencyHist[i][j]->init(10); + } + } + + // GPU cache stats + GPU_TCPLdHits + .name(name() + ".gpu_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + GPU_TCPLdTransfers + .name(name() + ".gpu_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + GPU_TCCLdHits + .name(name() + ".gpu_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + GPU_LdMiss + .name(name() + ".gpu_ld_misses") + .desc("loads that miss in the GPU") + ; + + GPU_TCPStHits + .name(name() + ".gpu_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + GPU_TCPStTransfers + .name(name() + ".gpu_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + GPU_TCCStHits + .name(name() + ".gpu_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + GPU_StMiss + .name(name() + ".gpu_st_misses") + .desc("stores that miss in the GPU") + ; + + // CP cache stats + CP_TCPLdHits + .name(name() + ".cp_tcp_ld_hits") + .desc("loads that hit in the TCP") + ; + CP_TCPLdTransfers + .name(name() + ".cp_tcp_ld_transfers") + .desc("TCP to TCP load transfers") + ; + CP_TCCLdHits + .name(name() + ".cp_tcc_ld_hits") + .desc("loads that hit in the TCC") + ; + CP_LdMiss + .name(name() + ".cp_ld_misses") + .desc("loads that miss in the GPU") + ; + + CP_TCPStHits + .name(name() + ".cp_tcp_st_hits") + .desc("stores that hit in the TCP") + ; + CP_TCPStTransfers + .name(name() + ".cp_tcp_st_transfers") + .desc("TCP to TCP store transfers") + ; + CP_TCCStHits + .name(name() + ".cp_tcc_st_hits") + .desc("stores that hit in the TCC") + ; + CP_StMiss + .name(name() + ".cp_st_misses") + .desc("stores that miss in the GPU") + ; +} diff --git 
a/src/mem/ruby/system/GPUCoalescer.hh b/src/mem/ruby/system/GPUCoalescer.hh new file mode 100644 index 000000000..dbd47059c --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.hh @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + +#include <iostream> +#include <unordered_map> + +#include "base/statistics.hh" +#include "mem/protocol/HSAScope.hh" +#include "mem/protocol/HSASegment.hh" +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/protocol/SequencerRequestType.hh" +#include "mem/request.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class RubyGPUCoalescerParams; + +HSAScope reqScopeToHSAScope(Request* req); +HSASegment reqSegmentToHSASegment(Request* req); + +struct GPUCoalescerRequest +{ + PacketPtr pkt; + RubyRequestType m_type; + Cycles issue_time; + + GPUCoalescerRequest(PacketPtr _pkt, RubyRequestType _m_type, + Cycles _issue_time) + : pkt(_pkt), m_type(_m_type), issue_time(_issue_time) + {} +}; + +std::ostream& operator<<(std::ostream& out, const GPUCoalescerRequest& obj); + +class GPUCoalescer : public RubyPort +{ + public: + typedef RubyGPUCoalescerParams Params; + GPUCoalescer(const Params *); + ~GPUCoalescer(); + + // Public Methods + void wakeup(); // Used only for deadlock detection + + void printProgress(std::ostream& out) const; + void resetStats(); + void collateStats(); + void regStats(); + + void writeCallback(Addr address, DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + + void writeCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime); + + void readCallback(Addr address, + MachineType mach, + DataBlock& data, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + /* atomics need their own callback because the data + might be const coming from SLICC */ + void atomicCallback(Addr address, + MachineType mach, + const DataBlock& data); + + void recordCPReadCallBack(MachineID myMachID, MachineID senderMachID); + void recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID); + + // Alternate implementations in VIPER Coalescer + virtual RequestStatus makeRequest(PacketPtr pkt); + + int outstandingCount() const { return m_outstanding_count; } + + bool + isDeadlockEventScheduled() const + { + return deadlockCheckEvent.scheduled(); + } + + void + descheduleDeadlockEvent() + { + deschedule(deadlockCheckEvent); + } + + bool empty() const; + + void print(std::ostream& out) const; + void checkCoherence(Addr address); + + void markRemoved(); + void removeRequest(GPUCoalescerRequest* request); + void evictionCallback(Addr address); + void completeIssue(); + + void insertKernel(int wavefront_id, PacketPtr pkt); + + void recordRequestType(SequencerRequestType requestType); + Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } + + Stats::Histogram& getLatencyHist() 
{ return m_latencyHist; } + Stats::Histogram& getTypeLatencyHist(uint32_t t) + { return *m_typeLatencyHist[t]; } + + Stats::Histogram& getMissLatencyHist() + { return m_missLatencyHist; } + Stats::Histogram& getMissTypeLatencyHist(uint32_t t) + { return *m_missTypeLatencyHist[t]; } + + Stats::Histogram& getMissMachLatencyHist(uint32_t t) const + { return *m_missMachLatencyHist[t]; } + + Stats::Histogram& + getMissTypeMachLatencyHist(uint32_t r, uint32_t t) const + { return *m_missTypeMachLatencyHist[r][t]; } + + Stats::Histogram& getIssueToInitialDelayHist(uint32_t t) const + { return *m_IssueToInitialDelayHist[t]; } + + Stats::Histogram& + getInitialToForwardDelayHist(const MachineType t) const + { return *m_InitialToForwardDelayHist[t]; } + + Stats::Histogram& + getForwardRequestToFirstResponseHist(const MachineType t) const + { return *m_ForwardToFirstResponseDelayHist[t]; } + + Stats::Histogram& + getFirstResponseToCompletionDelayHist(const MachineType t) const + { return *m_FirstResponseToCompletionDelayHist[t]; } + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + bool tryCacheAccess(Addr addr, RubyRequestType type, + Addr pc, RubyAccessMode access_mode, + int size, DataBlock*& data_ptr); + // Alternate implementations in VIPER Coalescer + virtual void issueRequest(PacketPtr pkt, RubyRequestType type); + + void kernelCallback(int wavfront_id); + + void hitCallback(GPUCoalescerRequest* request, + MachineType mach, + DataBlock& data, + bool success, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool isRegion); + void recordMissLatency(GPUCoalescerRequest* request, + MachineType mach, + Cycles initialRequestTime, + Cycles forwardRequestTime, + Cycles firstResponseTime, + bool success, bool isRegion); + void completeHitCallback(std::vector<PacketPtr> & mylist, int len); + PacketPtr mapAddrToPkt(Addr address); + + + RequestStatus getRequestStatus(PacketPtr pkt, + RubyRequestType request_type); + bool insertRequest(PacketPtr pkt, RubyRequestType request_type); + + bool handleLlsc(Addr address, GPUCoalescerRequest* request); + + // Private copy constructor and assignment operator + GPUCoalescer(const GPUCoalescer& obj); + GPUCoalescer& operator=(const GPUCoalescer& obj); + + class IssueEvent : public Event + { + private: + GPUCoalescer *seq; + public: + IssueEvent(GPUCoalescer *_seq); + void process(); + const char *description() const; + }; + + IssueEvent issueEvent; + + + // Changed to protected to enable inheritance by VIPER Coalescer + protected: + int m_max_outstanding_requests; + int m_deadlock_threshold; + + CacheMemory* m_dataCache_ptr; + CacheMemory* m_instCache_ptr; + + // The cache access latency for this GPU data cache. This is assessed at the + // beginning of each access. This should be very similar to the + // implementation in Sequencer() as this is very much like a Sequencer + Cycles m_data_cache_hit_latency; + + // We need to track both the primary and secondary request types. + // The secondary request type comprises a subset of RubyRequestTypes that + // are understood by the L1 Controller. A primary request type can be any + // RubyRequestType. 
+ enum {PrimaryType, SecondaryType}; + typedef std::pair<PacketPtr, std::vector<RubyRequestType> > RequestDesc; + typedef std::unordered_map<Addr, std::vector<RequestDesc> > CoalescingTable; + CoalescingTable reqCoalescer; + std::vector<Addr> newRequests; + + typedef std::unordered_map<Addr, GPUCoalescerRequest*> RequestTable; + RequestTable m_writeRequestTable; + RequestTable m_readRequestTable; + // Global outstanding request count, across all request tables + int m_outstanding_count; + bool m_deadlock_check_scheduled; + std::unordered_map<int, PacketPtr> kernelEndList; + std::vector<int> newKernelEnds; + + int m_store_waiting_on_load_cycles; + int m_store_waiting_on_store_cycles; + int m_load_waiting_on_store_cycles; + int m_load_waiting_on_load_cycles; + + bool m_usingNetworkTester; + + class GPUCoalescerWakeupEvent : public Event + { + private: + GPUCoalescer *m_GPUCoalescer_ptr; + + public: + GPUCoalescerWakeupEvent(GPUCoalescer *_seq) : + m_GPUCoalescer_ptr(_seq) {} + void process() { m_GPUCoalescer_ptr->wakeup(); } + const char *description() const + { + return "GPUCoalescer deadlock check"; + } + }; + + GPUCoalescerWakeupEvent deadlockCheckEvent; + bool assumingRfOCoherence; + + // m5 style stats for TCP hit/miss counts + Stats::Scalar GPU_TCPLdHits; + Stats::Scalar GPU_TCPLdTransfers; + Stats::Scalar GPU_TCCLdHits; + Stats::Scalar GPU_LdMiss; + + Stats::Scalar GPU_TCPStHits; + Stats::Scalar GPU_TCPStTransfers; + Stats::Scalar GPU_TCCStHits; + Stats::Scalar GPU_StMiss; + + Stats::Scalar CP_TCPLdHits; + Stats::Scalar CP_TCPLdTransfers; + Stats::Scalar CP_TCCLdHits; + Stats::Scalar CP_LdMiss; + + Stats::Scalar CP_TCPStHits; + Stats::Scalar CP_TCPStTransfers; + Stats::Scalar CP_TCCStHits; + Stats::Scalar CP_StMiss; + + //! Histogram for number of outstanding requests per cycle. + Stats::Histogram m_outstandReqHist; + + //! Histogram for holding latency profile of all requests. + Stats::Histogram m_latencyHist; + std::vector<Stats::Histogram *> m_typeLatencyHist; + + //! Histogram for holding latency profile of all requests that + //! miss in the controller connected to this sequencer. + Stats::Histogram m_missLatencyHist; + std::vector<Stats::Histogram *> m_missTypeLatencyHist; + + //! Histograms for profiling the latencies for requests that + //! required external messages. + std::vector<Stats::Histogram *> m_missMachLatencyHist; + std::vector< std::vector<Stats::Histogram *> > m_missTypeMachLatencyHist; + + //! Histograms for recording the breakdown of miss latency + std::vector<Stats::Histogram *> m_IssueToInitialDelayHist; + std::vector<Stats::Histogram *> m_InitialToForwardDelayHist; + std::vector<Stats::Histogram *> m_ForwardToFirstResponseDelayHist; + std::vector<Stats::Histogram *> m_FirstResponseToCompletionDelayHist; +}; + +inline std::ostream& +operator<<(std::ostream& out, const GPUCoalescer& obj) +{ + obj.print(out); + out << std::flush; + return out; +} + +#endif // __MEM_RUBY_SYSTEM_GPU_COALESCER_HH__ + diff --git a/src/mem/ruby/system/GPUCoalescer.py b/src/mem/ruby/system/GPUCoalescer.py new file mode 100644 index 000000000..0c19f875d --- /dev/null +++ b/src/mem/ruby/system/GPUCoalescer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from Sequencer import * + +class RubyGPUCoalescer(RubySequencer): + type = 'RubyGPUCoalescer' + cxx_class = 'GPUCoalescer' + cxx_header = "mem/ruby/system/GPUCoalescer.hh" + + # max_outstanding_requests = (wave front slots) x (wave front size) + max_outstanding_requests = Param.Int(40*64, + "max requests (incl. prefetches) outstanding") + assume_rfo = Param.Bool(True, "assume protocol implementes Read for " + "Ownership coherence"); diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 5a5f528bb..bf4002126 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -60,7 +60,8 @@ RubyPort::RubyPort(const Params *p) memSlavePort(csprintf("%s-mem-slave-port", name()), this, p->ruby_system->getAccessBackingStore(), -1, p->no_retry_on_stall), - gotAddrRanges(p->port_master_connection_count) + gotAddrRanges(p->port_master_connection_count), + m_isCPUSequencer(p->is_cpu_sequencer) { assert(m_version != -1); diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 07e0fde5a..6bd92b654 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -167,6 +167,8 @@ class RubyPort : public MemObject uint32_t getId() { return m_version; } DrainState drain() override; + bool isCPUSequencer() { return m_isCPUSequencer; } + protected: void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); @@ -218,6 +220,8 @@ class RubyPort : public MemObject // that should be called when the Sequencer becomes available after a stall. 
// std::vector<MemSlavePort *> retryList; + + bool m_isCPUSequencer; }; #endif // __MEM_RUBY_SYSTEM_RUBYPORT_HH__ diff --git a/src/mem/ruby/system/RubySystem.cc b/src/mem/ruby/system/RubySystem.cc index 1ecd2e098..e1717e519 100644 --- a/src/mem/ruby/system/RubySystem.cc +++ b/src/mem/ruby/system/RubySystem.cc @@ -107,7 +107,7 @@ RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace, Sequencer* sequencer_ptr = NULL; for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) { - sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer()); + sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getCPUSequencer()); if (sequencer_ptr == NULL) { sequencer_ptr = sequencer_map[cntrl]; } diff --git a/src/mem/ruby/system/SConscript b/src/mem/ruby/system/SConscript index 8c5077362..b67311bca 100644 --- a/src/mem/ruby/system/SConscript +++ b/src/mem/ruby/system/SConscript @@ -33,12 +33,22 @@ Import('*') if env['PROTOCOL'] == 'None': Return() +if env['BUILD_GPU']: + SimObject('GPUCoalescer.py') SimObject('RubySystem.py') SimObject('Sequencer.py') +SimObject('WeightedLRUReplacementPolicy.py') +if env['BUILD_GPU']: + SimObject('VIPERCoalescer.py') Source('CacheRecorder.cc') Source('DMASequencer.cc') +if env['BUILD_GPU']: + Source('GPUCoalescer.cc') Source('RubyPort.cc') Source('RubyPortProxy.cc') Source('RubySystem.cc') Source('Sequencer.cc') +if env['BUILD_GPU']: + Source('VIPERCoalescer.cc') +Source('WeightedLRUPolicy.cc') diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 50418c700..c2727b41d 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -63,6 +63,7 @@ Sequencer::Sequencer(const Params *p) m_max_outstanding_requests = p->max_outstanding_requests; m_deadlock_threshold = p->deadlock_threshold; + m_coreId = p->coreid; // for tracking the two CorePair sequencers assert(m_max_outstanding_requests > 0); assert(m_deadlock_threshold > 0); assert(m_instCache_ptr != NULL); @@ -593,6 +594,8 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) ContextID proc_id = pkt->req->hasContextId() ? pkt->req->contextId() : InvalidContextID; + ContextID core_id = coreId(); + // If valid, copy the pc to the ruby request Addr pc = 0; if (pkt->req->hasPC()) { @@ -607,7 +610,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) nullptr : pkt->getPtr<uint8_t>(), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt, - PrefetchBit_No, proc_id); + PrefetchBit_No, proc_id, core_id); DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %#x %s\n", curTick(), m_version, "Seq", "Begin", "", "", diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index 47af7ea1e..2a2f49587 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -99,6 +99,7 @@ class Sequencer : public RubyPort void markRemoved(); void evictionCallback(Addr address); void invalidateSC(Addr address); + int coreId() const { return m_coreId; } void recordRequestType(SequencerRequestType requestType); Stats::Histogram& getOutstandReqHist() { return m_outstandReqHist; } @@ -198,6 +199,8 @@ class Sequencer : public RubyPort Stats::Scalar m_load_waiting_on_store; Stats::Scalar m_load_waiting_on_load; + int m_coreId; + bool m_usingNetworkTester; //! Histogram for number of outstanding requests per cycle. 
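
The is_cpu_sequencer flag added to RubyPort above is what lets per-controller code tell CPU sequencers apart from GPU coalescers: RubySystem::makeCacheRecorder() now asks each controller for getCPUSequencer() and only records the ones that return non-NULL. The controller side is not shown in this patch, but a generated controller would presumably implement the hook along these lines (the controller class name and the m_sequencer_ptr member are illustrative only, not part of this change):

// Sketch, not from this patch: a possible getCPUSequencer() for a
// SLICC-generated L1 controller. m_sequencer_ptr is assumed to be the
// RubySequencer configured for this controller; a GPU-side controller,
// whose port is a GPUCoalescer rather than a Sequencer, would simply
// return NULL so callers such as RubySystem::makeCacheRecorder() skip it.
Sequencer*
L1Cache_Controller::getCPUSequencer() const
{
    if (m_sequencer_ptr != NULL && m_sequencer_ptr->isCPUSequencer()) {
        return m_sequencer_ptr;
    }
    return NULL;
}

Presumably the GPU coalescers are then configured with is_cpu_sequencer = False; the default added in Sequencer.py below is True, i.e. "connected to a cpu".
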
diff --git a/src/mem/ruby/system/Sequencer.py b/src/mem/ruby/system/Sequencer.py index 7c90eb29c..d6ee0aa2f 100644 --- a/src/mem/ruby/system/Sequencer.py +++ b/src/mem/ruby/system/Sequencer.py @@ -32,54 +32,58 @@ from m5.proxy import * from MemObject import MemObject class RubyPort(MemObject): - type = 'RubyPort' - abstract = True - cxx_header = "mem/ruby/system/RubyPort.hh" - version = Param.Int(0, "") + type = 'RubyPort' + abstract = True + cxx_header = "mem/ruby/system/RubyPort.hh" + version = Param.Int(0, "") - slave = VectorSlavePort("CPU slave port") - master = VectorMasterPort("CPU master port") - pio_master_port = MasterPort("Ruby mem master port") - mem_master_port = MasterPort("Ruby mem master port") - pio_slave_port = SlavePort("Ruby pio slave port") - mem_slave_port = SlavePort("Ruby memory port") + slave = VectorSlavePort("CPU slave port") + master = VectorMasterPort("CPU master port") + pio_master_port = MasterPort("Ruby mem master port") + mem_master_port = MasterPort("Ruby mem master port") + pio_slave_port = SlavePort("Ruby pio slave port") + mem_slave_port = SlavePort("Ruby memory port") - using_ruby_tester = Param.Bool(False, "") - no_retry_on_stall = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") - support_data_reqs = Param.Bool(True, "data cache requests supported") - support_inst_reqs = Param.Bool(True, "inst cache requests supported") + using_ruby_tester = Param.Bool(False, "") + no_retry_on_stall = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") + support_data_reqs = Param.Bool(True, "data cache requests supported") + support_inst_reqs = Param.Bool(True, "inst cache requests supported") + is_cpu_sequencer = Param.Bool(True, "connected to a cpu") class RubyPortProxy(RubyPort): - type = 'RubyPortProxy' - cxx_header = "mem/ruby/system/RubyPortProxy.hh" + type = 'RubyPortProxy' + cxx_header = "mem/ruby/system/RubyPortProxy.hh" class RubySequencer(RubyPort): - type = 'RubySequencer' - cxx_class = 'Sequencer' - cxx_header = "mem/ruby/system/Sequencer.hh" + type = 'RubySequencer' + cxx_class = 'Sequencer' + cxx_header = "mem/ruby/system/Sequencer.hh" - icache = Param.RubyCache("") - dcache = Param.RubyCache("") - # Cache latencies currently assessed at the beginning of each access - # NOTE: Setting these values to a value greater than one will result in - # O3 CPU pipeline bubbles and negatively impact performance - # TODO: Latencies should be migrated into each top-level cache controller - icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") - dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") - max_outstanding_requests = Param.Int(16, - "max requests (incl. prefetches) outstanding") - deadlock_threshold = Param.Cycles(500000, - "max outstanding cycles for a request before deadlock/livelock declared") - using_network_tester = Param.Bool(False, "") + icache = Param.RubyCache("") + dcache = Param.RubyCache("") + # Cache latencies currently assessed at the beginning of each access + # NOTE: Setting these values to a value greater than one will result in + # O3 CPU pipeline bubbles and negatively impact performance + # TODO: Latencies should be migrated into each top-level cache controller + icache_hit_latency = Param.Cycles(1, "Inst cache hit latency") + dcache_hit_latency = Param.Cycles(1, "Data cache hit latency") + max_outstanding_requests = Param.Int(16, + "max requests (incl. 
prefetches) outstanding") + deadlock_threshold = Param.Cycles(500000, + "max outstanding cycles for a request before deadlock/livelock declared") + using_network_tester = Param.Bool(False, "") + # id used by protocols that support multiple sequencers per controller + # 99 is the dummy default value + coreid = Param.Int(99, "CorePair core id") class DMASequencer(MemObject): - type = 'DMASequencer' - cxx_header = "mem/ruby/system/DMASequencer.hh" + type = 'DMASequencer' + cxx_header = "mem/ruby/system/DMASequencer.hh" - version = Param.Int(0, "") - slave = SlavePort("Device slave port") - using_ruby_tester = Param.Bool(False, "") - ruby_system = Param.RubySystem(Parent.any, "") - system = Param.System(Parent.any, "system object") + version = Param.Int(0, "") + slave = SlavePort("Device slave port") + using_ruby_tester = Param.Bool(False, "") + ruby_system = Param.RubySystem(Parent.any, "") + system = Param.System(Parent.any, "system object") diff --git a/src/mem/ruby/system/VIPERCoalescer.cc b/src/mem/ruby/system/VIPERCoalescer.cc new file mode 100644 index 000000000..ca91f2723 --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.cc @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Sooraj Puthoor + */ + +#include "base/misc.hh" +#include "base/str.hh" +#include "config/the_isa.hh" + +#if THE_ISA == X86_ISA +#include "arch/x86/insts/microldstop.hh" + +#endif // X86_ISA +#include "mem/ruby/system/VIPERCoalescer.hh" + +#include "cpu/testers/rubytest/RubyTester.hh" +#include "debug/GPUCoalescer.hh" +#include "debug/MemoryAccess.hh" +#include "mem/packet.hh" +#include "mem/ruby/common/SubBlock.hh" +#include "mem/ruby/network/MessageBuffer.hh" +#include "mem/ruby/profiler/Profiler.hh" +#include "mem/ruby/slicc_interface/AbstractController.hh" +#include "mem/ruby/slicc_interface/RubyRequest.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubySystem.hh" +#include "params/VIPERCoalescer.hh" + +using namespace std; + +VIPERCoalescer * +VIPERCoalescerParams::create() +{ + return new VIPERCoalescer(this); +} + +VIPERCoalescer::VIPERCoalescer(const Params *p) + : GPUCoalescer(p) +{ + m_max_wb_per_cycle=p->max_wb_per_cycle; + m_max_inv_per_cycle=p->max_inv_per_cycle; + m_outstanding_inv = 0; + m_outstanding_wb = 0; +} + +VIPERCoalescer::~VIPERCoalescer() +{ +} + +// Analyzes the packet to see if this request can be coalesced. +// If request can be coalesced, this request is added to the reqCoalescer table +// and makeRequest returns RequestStatus_Issued; +// If this is the first request to a cacheline, request is added to both +// newRequests queue and to the reqCoalescer table; makeRequest +// returns RequestStatus_Issued. +// If there is a pending request to this cacheline and this request +// can't be coalesced, RequestStatus_Aliased is returned and +// the packet needs to be reissued. +RequestStatus +VIPERCoalescer::makeRequest(PacketPtr pkt) +{ + if (m_outstanding_wb | m_outstanding_inv) { + DPRINTF(GPUCoalescer, + "There are %d Writebacks and %d Invalidatons\n", + m_outstanding_wb, m_outstanding_inv); + } + // Are we in the middle of a release + if ((m_outstanding_wb) > 0) { + if (pkt->req->isKernel()) { + // Everythign is fine + // Barriers and Kernel End scan coalesce + // If it is a Kerenl Begin flush the cache + if (pkt->req->isAcquire() && (m_outstanding_inv == 0)) { + invL1(); + } + + if (pkt->req->isRelease()) { + insertKernel(pkt->req->contextId(), pkt); + } + + return RequestStatus_Issued; + } +// return RequestStatus_Aliased; + } else if (pkt->req->isKernel() && pkt->req->isRelease()) { + // Flush Dirty Data on Kernel End + // isKernel + isRelease + insertKernel(pkt->req->contextId(), pkt); + wbL1(); + if(m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + return RequestStatus_Issued; + } + RequestStatus requestStatus = GPUCoalescer::makeRequest(pkt); + if (requestStatus!=RequestStatus_Issued) { + // Request not isssued + // enqueue Retry + DPRINTF(GPUCoalescer, "Request not issued by GPUCoaleser\n"); + return requestStatus; + } else if (pkt->req->isKernel() && pkt->req->isAcquire()) { + // Invalidate clean Data on Kernel Begin + // isKernel + isAcquire + invL1(); + } else if (pkt->req->isAcquire() && pkt->req->isRelease()) { + // Deschedule the AtomicAcqRel and + // Flush and Invalidate the L1 cache + invwbL1(); + if (m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isRelease()) { + // Deschedule the StoreRel and + // Flush the L1 cache + wbL1(); + if 
(m_outstanding_wb > 0 && issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Descheduled\n"); + deschedule(issueEvent); + } + } else if (pkt->req->isAcquire()) { + // LoadAcq or AtomicAcq + // Invalidate the L1 cache + invL1(); + } + // Request was successful + if (m_outstanding_wb == 0) { + if (!issueEvent.scheduled()) { + DPRINTF(GPUCoalescer, "issueEvent Rescheduled\n"); + schedule(issueEvent, curTick()); + } + } + return RequestStatus_Issued; +} + +void +VIPERCoalescer::wbCallback(Addr addr) +{ + m_outstanding_wb--; + // if L1 Flush Complete + // attemnpt to schedule issueEvent + assert(((int) m_outstanding_wb) >= 0); + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +void +VIPERCoalescer::invCallback(Addr addr) +{ + m_outstanding_inv--; + // if L1 Flush Complete + // attemnpt to schedule issueEvent + // This probably won't happen, since + // we dont wait on cache invalidations + if (m_outstanding_wb == 0) { + for (auto it = kernelEndList.begin(); it != kernelEndList.end(); it++) { + newKernelEnds.push_back(it->first); + } + completeIssue(); + } + trySendRetries(); +} + +/** + * Invalidate L1 cache (Acquire) + */ +void +VIPERCoalescer::invL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Invalidations outstanding before Cache Walk\n", + m_outstanding_inv); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + DPRINTF(GPUCoalescer, + "There are %d Invalidatons outstanding after Cache Walk\n", + m_outstanding_inv); +} + +/** + * Writeback L1 cache (Release) + */ +void +VIPERCoalescer::wbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding before Cache Walk\n", + m_outstanding_wb); + // Walk the cache + for (int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } + DPRINTF(GPUCoalescer, + "There are %d Writebacks outstanding after Cache Walk\n", + m_outstanding_wb); +} + +/** + * Invalidate and Writeback L1 cache (Acquire&Release) + */ +void +VIPERCoalescer::invwbL1() +{ + int size = m_dataCache_ptr->getNumBlocks(); + // Walk the cache + for(int i = 0; i < size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Evict Read-only data + std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_REPLACEMENT, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_inv++; + } + // Walk the cache + for(int i = 0; i< size; i++) { + Addr addr = m_dataCache_ptr->getAddressAtIdx(i); + // Write dirty data back + 
std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>( + clockEdge(), addr, (uint8_t*) 0, 0, 0, + RubyRequestType_FLUSH, RubyAccessMode_Supervisor, + nullptr); + assert(m_mandatory_q_ptr != NULL); + m_mandatory_q_ptr->enqueue(msg, clockEdge(), m_data_cache_hit_latency); + m_outstanding_wb++; + } +} diff --git a/src/mem/ruby/system/VIPERCoalescer.hh b/src/mem/ruby/system/VIPERCoalescer.hh new file mode 100644 index 000000000..af6e44e7f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.hh @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ +#define __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + +#include <iostream> + +#include "mem/protocol/PrefetchBit.hh" +#include "mem/protocol/RubyAccessMode.hh" +#include "mem/protocol/RubyRequestType.hh" +#include "mem/ruby/common/Address.hh" +#include "mem/ruby/common/Consumer.hh" +#include "mem/ruby/system/GPUCoalescer.hh" +#include "mem/ruby/system/RubyPort.hh" + +class DataBlock; +class CacheMsg; +class MachineID; +class CacheMemory; + +class VIPERCoalescerParams; + +class VIPERCoalescer : public GPUCoalescer +{ + public: + typedef VIPERCoalescerParams Params; + VIPERCoalescer(const Params *); + ~VIPERCoalescer(); + void wbCallback(Addr address); + void invCallback(Addr address); + RequestStatus makeRequest(PacketPtr pkt); + private: + void invL1(); + void wbL1(); + void invwbL1(); + uint64_t m_outstanding_inv; + uint64_t m_outstanding_wb; + uint64_t m_max_inv_per_cycle; + uint64_t m_max_wb_per_cycle; +}; +#endif // __MEM_RUBY_SYSTEM_VI_COALESCER_HH__ + diff --git a/src/mem/ruby/system/VIPERCoalescer.py b/src/mem/ruby/system/VIPERCoalescer.py new file mode 100644 index 000000000..05c74386f --- /dev/null +++ b/src/mem/ruby/system/VIPERCoalescer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Steve Reinhardt +# Brad Beckmann + +from m5.params import * +from m5.proxy import * +from GPUCoalescer import * + +class VIPERCoalescer(RubyGPUCoalescer): + type = 'VIPERCoalescer' + cxx_class = 'VIPERCoalescer' + cxx_header = "mem/ruby/system/VIPERCoalescer.hh" + max_inv_per_cycle = Param.Int(32, "max invalidations per cycle") + max_wb_per_cycle = Param.Int(32, "max writebacks per cycle") + assume_rfo = False diff --git a/src/mem/ruby/system/WeightedLRUPolicy.cc b/src/mem/ruby/system/WeightedLRUPolicy.cc new file mode 100644 index 000000000..5baa4d9a5 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.cc @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Derek Hower + */ + +#include "mem/ruby/system/WeightedLRUPolicy.hh" + +WeightedLRUPolicy::WeightedLRUPolicy(const Params* p) + : AbstractReplacementPolicy(p), m_cache(p->cache) +{ + m_last_occ_ptr = new int*[m_num_sets]; + for(unsigned i = 0; i < m_num_sets; i++){ + m_last_occ_ptr[i] = new int[m_assoc]; + for(unsigned j = 0; j < m_assoc; j++){ + m_last_occ_ptr[i][j] = 0; + } + } +} + +WeightedLRUPolicy * +WeightedLRUReplacementPolicyParams::create() +{ + return new WeightedLRUPolicy(this); +} + +WeightedLRUPolicy::~WeightedLRUPolicy() +{ + if (m_last_occ_ptr != NULL){ + for (unsigned i = 0; i < m_num_sets; i++){ + if (m_last_occ_ptr[i] != NULL){ + delete[] m_last_occ_ptr[i]; + } + } + delete[] m_last_occ_ptr; + } +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; +} + +void +WeightedLRUPolicy::touch(int64_t set, int64_t index, Tick time, int occupancy) +{ + assert(index >= 0 && index < m_assoc); + assert(set >= 0 && set < m_num_sets); + + m_last_ref_ptr[set][index] = time; + m_last_occ_ptr[set][index] = occupancy; +} + +int64_t +WeightedLRUPolicy::getVictim(int64_t set) const +{ + Tick time, smallest_time; + int64_t smallest_index; + + smallest_index = 0; + smallest_time = m_last_ref_ptr[set][0]; + int smallest_weight = m_last_ref_ptr[set][0]; + + for (unsigned i = 1; i < m_assoc; i++) { + + int weight = m_last_occ_ptr[set][i]; + if (weight < smallest_weight) { + smallest_weight = weight; + smallest_index = i; + smallest_time = m_last_ref_ptr[set][i]; + } else if (weight == smallest_weight) { + time = m_last_ref_ptr[set][i]; + if (time < smallest_time) { + smallest_index = i; + smallest_time = time; + } + } + } + return smallest_index; +} diff --git a/src/mem/ruby/system/WeightedLRUPolicy.hh b/src/mem/ruby/system/WeightedLRUPolicy.hh new file mode 100644 index 000000000..3150779b2 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUPolicy.hh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2013-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Sooraj Puthoor + */ + +#ifndef __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ +#define __MEM_RUBY_SYSTEM_WEIGHTEDLRUPOLICY_HH__ + +#include "mem/ruby/structures/AbstractReplacementPolicy.hh" +#include "mem/ruby/structures/CacheMemory.hh" +#include "params/WeightedLRUReplacementPolicy.hh" + +/* Simple true LRU replacement policy */ + +class WeightedLRUPolicy : public AbstractReplacementPolicy +{ + public: + typedef WeightedLRUReplacementPolicyParams Params; + WeightedLRUPolicy(const Params* p); + ~WeightedLRUPolicy(); + + void touch(int64_t set, int64_t way, Tick time); + void touch(int64_t set, int64_t way, Tick time, int occupancy); + int64_t getVictim(int64_t set) const override; + + bool useOccupancy() const { return true; } + + CacheMemory * m_cache; + int **m_last_occ_ptr; +}; + +#endif // __MEM_RUBY_SYSTEM_WeightedLRUPolicy_HH__ diff --git a/src/mem/ruby/system/WeightedLRUReplacementPolicy.py b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py new file mode 100644 index 000000000..e7de33496 --- /dev/null +++ b/src/mem/ruby/system/WeightedLRUReplacementPolicy.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2013-2015 Advanced Micro Devices, Inc. +# All rights reserved. +# +# For use for simulation and test purposes only +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+# +# Author: Derek Hower +# + +from m5.params import * +from m5.proxy import * +from MemObject import MemObject +from ReplacementPolicy import ReplacementPolicy + +class WeightedLRUReplacementPolicy(ReplacementPolicy): + type = "WeightedLRUReplacementPolicy" + cxx_class = "WeightedLRUPolicy" + cxx_header = "mem/ruby/system/WeightedLRUPolicy.hh" + cache = Param.RubyCache("")
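
Taken together, WeightedLRUPolicy evicts the way with the smallest occupancy value (the weight recorded by the four-argument touch() overload) and falls back to plain LRU time only to break ties. Note that getVictim() as shown initializes smallest_weight from m_last_ref_ptr[set][0]; judging from the rest of the loop, the intended source appears to be m_last_occ_ptr[set][0], and the sketch below follows that intent. A self-contained toy illustration of the selection rule, with made-up values:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // One cache set with four ways. occupancy[i] plays the role of
    // m_last_occ_ptr[set][i]; lastRef[i] plays m_last_ref_ptr[set][i].
    std::vector<int> occupancy = {4, 1, 1, 3};
    std::vector<uint64_t> lastRef = {100, 250, 90, 300};

    // Weighted-LRU rule: the smallest occupancy wins; ties go to the
    // least recently referenced way.
    std::size_t victim = 0;
    for (std::size_t i = 1; i < occupancy.size(); ++i) {
        if (occupancy[i] < occupancy[victim] ||
            (occupancy[i] == occupancy[victim] &&
             lastRef[i] < lastRef[victim])) {
            victim = i;
        }
    }
    std::cout << "victim way = " << victim << std::endl;  // prints 2
    return 0;
}

With these inputs way 2 is chosen: ways 1 and 2 tie on the smallest occupancy, and way 2 was referenced least recently.
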