diff options
Diffstat (limited to 'src/gpu-compute/tlb_coalescer.cc')
-rw-r--r-- | src/gpu-compute/tlb_coalescer.cc | 583 |
1 files changed, 583 insertions, 0 deletions
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc new file mode 100644 index 000000000..835d7b740 --- /dev/null +++ b/src/gpu-compute/tlb_coalescer.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2011-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: Lisa Hsu + */ + +#include "gpu-compute/tlb_coalescer.hh" + +#include <cstring> + +#include "debug/GPUTLB.hh" + +TLBCoalescer::TLBCoalescer(const Params *p) : MemObject(p), + clock(p->clk_domain->clockPeriod()), TLBProbesPerCycle(p->probesPerCycle), + coalescingWindow(p->coalescingWindow), + disableCoalescing(p->disableCoalescing), probeTLBEvent(this), + cleanupEvent(this) +{ + // create the slave ports based on the number of connected ports + for (size_t i = 0; i < p->port_slave_connection_count; ++i) { + cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } + + // create the master ports based on the number of connected ports + for (size_t i = 0; i < p->port_master_connection_count; ++i) { + memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i), + this, i)); + } +} + +BaseSlavePort& +TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx) +{ + if (if_name == "slave") { + if (idx >= static_cast<PortID>(cpuSidePort.size())) { + panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx); + } + + return *cpuSidePort[idx]; + } else { + panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name); + } +} + +BaseMasterPort& +TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx) +{ + if (if_name == "master") { + if (idx >= static_cast<PortID>(memSidePort.size())) { + panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx); + } + + return *memSidePort[idx]; + } else { + panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name); + } +} + +/* + * This method returns true if the <incoming_pkt> + * can be coalesced with <coalesced_pkt> and false otherwise. + * A given set of rules is checked. + * The rules can potentially be modified based on the TLB level. + */ +bool +TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt) +{ + if (disableCoalescing) + return false; + + TheISA::GpuTLB::TranslationState *incoming_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState); + + TheISA::GpuTLB::TranslationState *coalesced_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState); + + // Rule 1: Coalesce requests only if they + // fall within the same virtual page + Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(), + TheISA::PageBytes); + + Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(), + TheISA::PageBytes); + + if (incoming_virt_page_addr != coalesced_virt_page_addr) + return false; + + //* Rule 2: Coalesce requests only if they + // share a TLB Mode, i.e. they are both read + // or write requests. + BaseTLB::Mode incoming_mode = incoming_state->tlbMode; + BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode; + + if (incoming_mode != coalesced_mode) + return false; + + // when we can coalesce a packet update the reqCnt + // that is the number of packets represented by + // this coalesced packet + if (!incoming_state->prefetch) + coalesced_state->reqCnt.back() += incoming_state->reqCnt.back(); + + return true; +} + +/* + * We need to update the physical addresses of all the translation requests + * that were coalesced into the one that just returned. + */ +void +TLBCoalescer::updatePhysAddresses(PacketPtr pkt) +{ + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + + DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n", + issuedTranslationsTable[virt_page_addr].size(), virt_page_addr); + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry; + assert(tlb_entry); + Addr first_entry_vaddr = tlb_entry->vaddr; + Addr first_entry_paddr = tlb_entry->paddr; + int page_size = tlb_entry->size(); + bool uncacheable = tlb_entry->uncacheable; + int first_hit_level = sender_state->hitLevel; + bool valid = tlb_entry->valid; + + // Get the physical page address of the translated request + // Using the page_size specified in the TLBEntry allows us + // to support different page sizes. + Addr phys_page_paddr = pkt->req->getPaddr(); + phys_page_paddr &= ~(page_size - 1); + + for (int i = 0; i < issuedTranslationsTable[virt_page_addr].size(); ++i) { + PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i]; + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>( + local_pkt->senderState); + + // we are sending the packet back, so pop the reqCnt associated + // with this level in the TLB hiearchy + if (!sender_state->prefetch) + sender_state->reqCnt.pop_back(); + + /* + * Only the first packet from this coalesced request has been + * translated. Grab the translated phys. page addr and update the + * physical addresses of the remaining packets with the appropriate + * page offsets. + */ + if (i) { + Addr paddr = phys_page_paddr; + paddr |= (local_pkt->req->getVaddr() & (page_size - 1)); + local_pkt->req->setPaddr(paddr); + + if (uncacheable) + local_pkt->req->setFlags(Request::UNCACHEABLE); + + // update senderState->tlbEntry, so we can insert + // the correct TLBEentry in the TLBs above. + sender_state->tlbEntry = + new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr, + valid); + + // update the hitLevel for all uncoalesced reqs + // so that each packet knows where it hit + // (used for statistics in the CUs) + sender_state->hitLevel = first_hit_level; + } + + SlavePort *return_port = sender_state->ports.back(); + sender_state->ports.pop_back(); + + // Translation is done - Convert to a response pkt if necessary and + // send the translation back + if (local_pkt->isRequest()) { + local_pkt->makeTimingResponse(); + } + + return_port->sendTimingResp(local_pkt); + } + + // schedule clean up for end of this cycle + // This is a maximum priority event and must be on + // the same cycle as GPUTLB cleanup event to prevent + // race conditions with an IssueProbeEvent caused by + // MemSidePort::recvReqRetry + cleanupQueue.push(virt_page_addr); + + if (!cleanupEvent.scheduled()) + schedule(cleanupEvent, curTick()); +} + +// Receive translation requests, create a coalesced request, +// and send them to the TLB (TLBProbesPerCycle) +bool +TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt) +{ + // first packet of a coalesced request + PacketPtr first_packet = nullptr; + // true if we are able to do coalescing + bool didCoalesce = false; + // number of coalesced reqs for a given window + int coalescedReq_cnt = 0; + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + // push back the port to remember the path back + sender_state->ports.push_back(this); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) { + // if reqCnt is empty then this packet does not represent + // multiple uncoalesced reqs(pkts) but just a single pkt. + // If it does though then the reqCnt for each level in the + // hierarchy accumulates the total number of reqs this packet + // represents + int req_cnt = 1; + + if (!sender_state->reqCnt.empty()) + req_cnt = sender_state->reqCnt.back(); + + sender_state->reqCnt.push_back(req_cnt); + + // update statistics + coalescer->uncoalescedAccesses++; + req_cnt = sender_state->reqCnt.back(); + DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt); + coalescer->queuingCycles -= (curTick() * req_cnt); + coalescer->localqueuingCycles -= curTick(); + } + + // FIXME if you want to coalesce not based on the issueTime + // of the packets (i.e., from the compute unit's perspective) + // but based on when they reached this coalescer then + // remove the following if statement and use curTick() or + // coalescingWindow for the tick_index. + if (!sender_state->issueTime) + sender_state->issueTime = curTick(); + + // The tick index is used as a key to the coalescerFIFO hashmap. + // It is shared by all candidates that fall within the + // given coalescingWindow. + int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow; + + if (coalescer->coalescerFIFO.count(tick_index)) { + coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size(); + } + + // see if we can coalesce the incoming pkt with another + // coalesced request with the same tick_index + for (int i = 0; i < coalescedReq_cnt; ++i) { + first_packet = coalescer->coalescerFIFO[tick_index][i][0]; + + if (coalescer->canCoalesce(pkt, first_packet)) { + coalescer->coalescerFIFO[tick_index][i].push_back(pkt); + + DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n", + i, tick_index, + coalescer->coalescerFIFO[tick_index][i].size()); + + didCoalesce = true; + break; + } + } + + // if this is the first request for this tick_index + // or we did not manage to coalesce, update stats + // and make necessary allocations. + if (!coalescedReq_cnt || !didCoalesce) { + if (update_stats) + coalescer->coalescedAccesses++; + + std::vector<PacketPtr> new_array; + new_array.push_back(pkt); + coalescer->coalescerFIFO[tick_index].push_back(new_array); + + DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after " + "push\n", tick_index, + coalescer->coalescerFIFO[tick_index].size()); + } + + //schedule probeTLBEvent next cycle to send the + //coalesced requests to the TLB + if (!coalescer->probeTLBEvent.scheduled()) { + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); + } + + return true; +} + +void +TLBCoalescer::CpuSidePort::recvReqRetry() +{ + assert(false); +} + +void +TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt) +{ + + TheISA::GpuTLB::TranslationState *sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState); + + bool update_stats = !sender_state->prefetch; + + if (update_stats) + coalescer->uncoalescedAccesses++; + + // If there is a pending timing request for this virtual address + // print a warning message. This is a temporary caveat of + // the current simulator where atomic and timing requests can + // coexist. FIXME remove this check/warning in the future. + Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes); + int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (map_count) { + DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing " + "req. pending\n", virt_page_addr); + } + + coalescer->memSidePort[0]->sendFunctional(pkt); +} + +AddrRangeList +TLBCoalescer::CpuSidePort::getAddrRanges() const +{ + // currently not checked by the master + AddrRangeList ranges; + + return ranges; +} + +bool +TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt) +{ + // a translation completed and returned + coalescer->updatePhysAddresses(pkt); + + return true; +} + +void +TLBCoalescer::MemSidePort::recvReqRetry() +{ + //we've receeived a retry. Schedule a probeTLBEvent + if (!coalescer->probeTLBEvent.scheduled()) + coalescer->schedule(coalescer->probeTLBEvent, + curTick() + coalescer->ticks(1)); +} + +void +TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n"); +} + +TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer) + : Event(CPU_Tick_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::IssueProbeEvent::description() const +{ + return "Probe the TLB below"; +} + +/* + * Here we scan the coalescer FIFO and issue the max + * number of permitted probes to the TLB below. We + * permit bypassing of coalesced requests for the same + * tick_index. + * + * We do not access the next tick_index unless we've + * drained the previous one. The coalesced requests + * that are successfully sent are moved to the + * issuedTranslationsTable table (the table which keeps + * track of the outstanding reqs) + */ +void +TLBCoalescer::IssueProbeEvent::process() +{ + // number of TLB probes sent so far + int sent_probes = 0; + // rejected denotes a blocking event + bool rejected = false; + + // It is set to true either when the recvTiming of the TLB below + // returns false or when there is another outstanding request for the + // same virt. page. + + DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n"); + + for (auto iter = coalescer->coalescerFIFO.begin(); + iter != coalescer->coalescerFIFO.end() && !rejected; ) { + int coalescedReq_cnt = iter->second.size(); + int i = 0; + int vector_index = 0; + + DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n", + coalescedReq_cnt, iter->first); + + while (i < coalescedReq_cnt) { + ++i; + PacketPtr first_packet = iter->second[vector_index][0]; + + // compute virtual page address for this request + Addr virt_page_addr = roundDown(first_packet->req->getVaddr(), + TheISA::PageBytes); + + // is there another outstanding request for the same page addr? + int pending_reqs = + coalescer->issuedTranslationsTable.count(virt_page_addr); + + if (pending_reqs) { + DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for " + "page %#x\n", virt_page_addr); + + ++vector_index; + rejected = true; + + continue; + } + + // send the coalesced request for virt_page_addr + if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) { + DPRINTF(GPUTLB, "Failed to send TLB request for page %#x", + virt_page_addr); + + // No need for a retries queue since we are already buffering + // the coalesced request in coalescerFIFO. + rejected = true; + ++vector_index; + } else { + TheISA::GpuTLB::TranslationState *tmp_sender_state = + safe_cast<TheISA::GpuTLB::TranslationState*> + (first_packet->senderState); + + bool update_stats = !tmp_sender_state->prefetch; + + if (update_stats) { + // req_cnt is total number of packets represented + // by the one we just sent counting all the way from + // the top of TLB hiearchy (i.e., from the CU) + int req_cnt = tmp_sender_state->reqCnt.back(); + coalescer->queuingCycles += (curTick() * req_cnt); + + DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n", + coalescer->name(), req_cnt); + + // pkt_cnt is number of packets we coalesced into the one + // we just sent but only at this coalescer level + int pkt_cnt = iter->second[vector_index].size(); + coalescer->localqueuingCycles += (curTick() * pkt_cnt); + } + + DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x", + virt_page_addr); + + //copy coalescedReq to issuedTranslationsTable + coalescer->issuedTranslationsTable[virt_page_addr] + = iter->second[vector_index]; + + //erase the entry of this coalesced req + iter->second.erase(iter->second.begin() + vector_index); + + if (iter->second.empty()) + assert(i == coalescedReq_cnt); + + sent_probes++; + if (sent_probes == coalescer->TLBProbesPerCycle) + return; + } + } + + //if there are no more coalesced reqs for this tick_index + //erase the hash_map with the first iterator + if (iter->second.empty()) { + coalescer->coalescerFIFO.erase(iter++); + } else { + ++iter; + } + } +} + +TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer) + : Event(Maximum_Pri), coalescer(_coalescer) +{ +} + +const char* +TLBCoalescer::CleanupEvent::description() const +{ + return "Cleanup issuedTranslationsTable hashmap"; +} + +void +TLBCoalescer::CleanupEvent::process() +{ + while (!coalescer->cleanupQueue.empty()) { + Addr cleanup_addr = coalescer->cleanupQueue.front(); + coalescer->cleanupQueue.pop(); + coalescer->issuedTranslationsTable.erase(cleanup_addr); + + DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n", + cleanup_addr); + } +} + +void +TLBCoalescer::regStats() +{ + uncoalescedAccesses + .name(name() + ".uncoalesced_accesses") + .desc("Number of uncoalesced TLB accesses") + ; + + coalescedAccesses + .name(name() + ".coalesced_accesses") + .desc("Number of coalesced TLB accesses") + ; + + queuingCycles + .name(name() + ".queuing_cycles") + .desc("Number of cycles spent in queue") + ; + + localqueuingCycles + .name(name() + ".local_queuing_cycles") + .desc("Number of cycles spent in queue for all incoming reqs") + ; + + localLatency + .name(name() + ".local_latency") + .desc("Avg. latency over all incoming pkts") + ; + + localLatency = localqueuingCycles / uncoalescedAccesses; +} + + +TLBCoalescer* +TLBCoalescerParams::create() +{ + return new TLBCoalescer(this); +} + |