/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Lisa Hsu
 */

#include "gpu-compute/tlb_coalescer.hh"

#include <cstring>

#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

/*
 * Build the coalescer from its parameters: latch the clock period, the
 * per-cycle probe budget, the coalescing window and the disable switch,
 * bind the two internal events to their handlers, and create one
 * CpuSidePort / MemSidePort per connected slave / master port.
 */
TLBCoalescer::TLBCoalescer(const Params *p)
    : MemObject(p),
      clock(p->clk_domain->clockPeriod()),
      TLBProbesPerCycle(p->probesPerCycle),
      coalescingWindow(p->coalescingWindow),
      disableCoalescing(p->disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri)
{
    // create the slave ports based on the number of connected ports
    for (size_t i = 0; i < p->port_slave_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the master ports based on the number of connected ports
    for (size_t i = 0; i < p->port_master_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }
}

/*
 * Return the CPU-side (slave) port with index idx.
 * Panics if if_name is not "slave" or if idx is out of range.
 */
BaseSlavePort&
TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
{
    if (if_name == "slave") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else {
        panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
    }
}

/*
 * Return the memory-side (master) port with index idx.
 * Panics if if_name is not "master" or if idx is out of range.
 */
BaseMasterPort&
TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "master") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A given set of rules is checked.
 * The rules can potentially be modified based on the TLB level.
 *
 * Side effect: on success (and unless the incoming packet is a prefetch)
 * the request count of <coalesced_pkt> is increased by the request count
 * of <incoming_pkt>, so this is not a pure predicate.
 */
bool
TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
{
    if (disableCoalescing)
        return false;

    TheISA::GpuTLB::TranslationState *incoming_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(
            incoming_pkt->senderState);

    TheISA::GpuTLB::TranslationState *coalesced_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(
            coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             TheISA::PageBytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              TheISA::PageBytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e. they are both read
    // or write requests.
    BaseTLB::Mode incoming_mode = incoming_state->tlbMode;
    BaseTLB::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // when we can coalesce a packet update the reqCnt
    // that is the number of packets represented by
    // this coalesced packet
    if (!incoming_state->prefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 *
 * pkt is the translated (first) packet of a coalesced request. Every packet
 * stored in issuedTranslationsTable for the same virtual page is completed
 * from it, turned into a response if needed, and sent back through the port
 * it arrived on. The table entry itself is removed later by cleanupEvent.
 */
void
TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

    // Look the entry up once. As in the per-iteration operator[] lookups
    // this replaces, a missing key yields an empty (default-inserted) entry.
    auto &coalesced_pkts = issuedTranslationsTable[virt_page_addr];

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            coalesced_pkts.size(), virt_page_addr);

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    TheISA::TlbEntry *tlb_entry = sender_state->tlbEntry;
    assert(tlb_entry);
    Addr first_entry_vaddr = tlb_entry->vaddr;
    Addr first_entry_paddr = tlb_entry->paddr;
    int page_size = tlb_entry->size();
    bool uncacheable = tlb_entry->uncacheable;
    int first_hit_level = sender_state->hitLevel;

    // Do the mask arithmetic in the address type rather than in int.
    const Addr page_offset_mask = static_cast<Addr>(page_size) - 1;

    // Get the physical page address of the translated request
    // Using the page_size specified in the TLBEntry allows us
    // to support different page sizes.
    Addr phys_page_paddr = pkt->req->getPaddr();
    phys_page_paddr &= ~page_offset_mask;

    for (size_t i = 0; i < coalesced_pkts.size(); ++i) {
        PacketPtr local_pkt = coalesced_pkts[i];
        TheISA::GpuTLB::TranslationState *local_state =
            safe_cast<TheISA::GpuTLB::TranslationState*>(
                local_pkt->senderState);

        // we are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy
        if (!local_state->prefetch)
            local_state->reqCnt.pop_back();

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = phys_page_paddr;
            paddr |= (local_pkt->req->getVaddr() & page_offset_mask);
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // update senderState->tlbEntry, so we can insert
            // the correct TLBEntry in the TLBs above.
            auto p = local_state->tc->getProcessPtr();
            local_state->tlbEntry =
                new TheISA::TlbEntry(p->pid(), first_entry_vaddr,
                                     first_entry_paddr, false, false);

            // update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs)
            local_state->hitLevel = first_hit_level;
        }

        SlavePort *return_port = local_state->ports.back();
        local_state->ports.pop_back();

        // Translation is done - Convert to a response pkt if necessary and
        // send the translation back
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // schedule clean up for end of this cycle
    // This is a maximum priority event and must be on
    // the same cycle as GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}

// Receive translation requests, create a coalesced request,
// and send them to the TLB (TLBProbesPerCycle)
//
// The packet is either appended to an existing coalesced request that
// shares its tick_index (see canCoalesce) or starts a new one. Always
// returns true: the packet is buffered in coalescerFIFO, never rejected.
bool
TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    // push back the port to remember the path back
    sender_state->ports.push_back(this);

    bool update_stats = !sender_state->prefetch;

    if (update_stats) {
        // if reqCnt is empty then this packet does not represent
        // multiple uncoalesced reqs(pkts) but just a single pkt.
        // If it does though then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        // The arrival tick is subtracted here and the issue tick is added
        // in processProbeTLBEvent, so the stats accumulate time in queue.
        coalescer->queuingCycles -= (curTick() * req_cnt);
        coalescer->localqueuingCycles -= curTick();
    }

    // FIXME if you want to coalesce not based on the issueTime
    // of the packets (i.e., from the compute unit's perspective)
    // but based on when they reached this coalescer then
    // remove the following if statement and use curTick() or
    // coalescingWindow for the tick_index.
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    int64_t tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // see if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i][0];

        if (coalescer->canCoalesce(pkt, first_packet)) {
            coalescer->coalescerFIFO[tick_index][i].push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].size());

            didCoalesce = true;
            break;
        }
    }

    // if this is the first request for this tick_index
    // or we did not manage to coalesce, update stats
    // and make necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(new_array);

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    //schedule probeTLBEvent next cycle to send the
    //coalesced requests to the TLB
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->ticks(1));
    }

    return true;
}

// recvTimingReq never returns false, so a retry on the CPU side is
// treated as a fatal inconsistency.
void
TLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

// Functional accesses are not coalesced: count the access (unless it is a
// prefetch) and forward the packet through memory-side port 0.
void
TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->prefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    // If there is a pending timing request for this virtual address
    // print a warning message. This is a temporary caveat of
    // the current simulator where atomic and timing requests can
    // coexist. FIXME remove this check/warning in the future.
    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

// Returns an empty range list.
AddrRangeList
TLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the master
    AddrRangeList ranges;

    return ranges;
}

// A response from the TLB below: fan the translation out to every packet
// that was coalesced with pkt. Always accepts the response.
bool
TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    // a translation completed and returned
    coalescer->updatePhysAddresses(pkt);

    return true;
}

// The TLB below can accept requests again; try to issue on the next cycle.
void
TLBCoalescer::MemSidePort::recvReqRetry()
{
    //we've received a retry. Schedule a probeTLBEvent
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->ticks(1));
}

void
TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable table (the table which keeps
 * track of the outstanding reqs)
 */
void
TLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;
    // rejected denotes a blocking event
    bool rejected = false;

    // It is set to true either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered TLBCoalescer %s\n", __func__);

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end() && !rejected; ) {
        int coalescedReq_cnt = iter->second.size();
        // i counts the requests examined; vector_index is the position of
        // the request under examination. They diverge because a sent
        // request is erased, shifting its successors down by one.
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index][0];

            // compute virtual page address for this request
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            TheISA::PageBytes);

            // is there another outstanding request for the same page addr?
            int pending_reqs =
                issuedTranslationsTable.count(virt_page_addr);

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                rejected = true;

                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already buffering
                // the coalesced request in coalescerFIFO.
                rejected = true;
                ++vector_index;
            } else {
                TheISA::GpuTLB::TranslationState *tmp_sender_state =
                    safe_cast<TheISA::GpuTLB::TranslationState*>
                    (first_packet->senderState);

                bool update_stats = !tmp_sender_state->prefetch;

                if (update_stats) {
                    // req_cnt is total number of packets represented
                    // by the one we just sent counting all the way from
                    // the top of TLB hierarchy (i.e., from the CU)
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    queuingCycles += (curTick() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is number of packets we coalesced into the one
                    // we just sent but only at this coalescer level
                    int pkt_cnt = iter->second[vector_index].size();
                    localqueuingCycles += (curTick() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page %#x\n",
                        virt_page_addr);

                //copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index];

                //erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;
                // NOTE(review): this return neither erases a now-empty
                // tick_index entry nor reschedules probeTLBEvent; the only
                // schedule() calls in this file are in recvTimingReq and
                // recvReqRetry. Confirm that is intended.
                if (sent_probes == TLBProbesPerCycle)
                    return;
            }
        }

        //if there are no more coalesced reqs for this tick_index
        //erase the hash_map with the first iterator
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}

/*
 * Drain cleanupQueue, erasing from issuedTranslationsTable the entry of
 * every virtual page queued by updatePhysAddresses.
 */
void
TLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

/*
 * Register the coalescer statistics and define localLatency as
 * localqueuingCycles divided by uncoalescedAccesses.
 */
void
TLBCoalescer::regStats()
{
    MemObject::regStats();

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    localLatency = localqueuingCycles / uncoalescedAccesses;
}

// Parameter-object factory for the coalescer.
TLBCoalescer*
TLBCoalescerParams::create()
{
    return new TLBCoalescer(this);
}