1 files changed, 583 insertions, 0 deletions
diff --git a/src/gpu-compute/tlb_coalescer.cc b/src/gpu-compute/tlb_coalescer.cc
new file mode 100644
index 000000000..835d7b740
--- /dev/null
+++ b/src/gpu-compute/tlb_coalescer.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#include "gpu-compute/tlb_coalescer.hh"
+
+#include <cstring>
+
+#include "debug/GPUTLB.hh"
+
+TLBCoalescer::TLBCoalescer(const Params *p)
+    : MemObject(p), clock(p->clk_domain->clockPeriod()),
+      TLBProbesPerCycle(p->probesPerCycle),
+      coalescingWindow(p->coalescingWindow),
+      disableCoalescing(p->disableCoalescing), probeTLBEvent(this),
+      cleanupEvent(this)
+{
+    // one port object per connection: first the slave side, then the master
+    for (size_t n = 0; n < p->port_slave_connection_count; ++n) {
+        const auto port_name = csprintf("%s-port%d", name(), n);
+        cpuSidePort.push_back(new CpuSidePort(port_name, this, n));
+    }
+
+    for (size_t n = 0; n < p->port_master_connection_count; ++n) {
+        const auto port_name = csprintf("%s-port%d", name(), n);
+        memSidePort.push_back(new MemSidePort(port_name, this, n));
+    }
+}
+
+BaseSlavePort&
+TLBCoalescer::getSlavePort(const std::string &if_name, PortID idx)
+{
+    // Return CPU-side port <idx> of the "slave" vector; panic otherwise.
+    if (if_name != "slave")
+        panic("TLBCoalescer::getSlavePort: unknown port %s\n", if_name);
+
+    // PortID is signed in gem5 (InvalidPortID is negative), so ids below
+    // zero must be rejected too or they would index before the vector.
+    if (idx < 0 || idx >= static_cast<PortID>(cpuSidePort.size()))
+        panic("TLBCoalescer::getSlavePort: unknown index %d\n", idx);
+    return *cpuSidePort[idx];
+}
+
+BaseMasterPort&
+TLBCoalescer::getMasterPort(const std::string &if_name, PortID idx)
+{
+    // Return memory-side port <idx> of the "master" vector; panic otherwise.
+    if (if_name != "master")
+        panic("TLBCoalescer::getMasterPort: unknown port %s\n", if_name);
+
+    // PortID is signed in gem5 (InvalidPortID is negative), so ids below
+    // zero must be rejected too or they would index before the vector.
+    if (idx < 0 || idx >= static_cast<PortID>(memSidePort.size()))
+        panic("TLBCoalescer::getMasterPort: unknown index %d\n", idx);
+    return *memSidePort[idx];
+}
+
+/*
+ * Decide whether <incoming_pkt> may join the coalesced request whose
+ * first packet is <coalesced_pkt>. Returns true if it can, false
+ * otherwise. Note the side effect: on success the reqCnt of
+ * <coalesced_pkt> absorbs the reqCnt of a non-prefetch <incoming_pkt>.
+ * The rules can potentially be modified based on the TLB level.
+ */
+bool
+TLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt)
+{
+    if (disableCoalescing)
+        return false;
+
+    TheISA::GpuTLB::TranslationState *incoming_state =
+      safe_cast<TheISA::GpuTLB::TranslationState*>(incoming_pkt->senderState);
+
+    TheISA::GpuTLB::TranslationState *coalesced_state =
+     safe_cast<TheISA::GpuTLB::TranslationState*>(coalesced_pkt->senderState);
+
+    // Rule 1: Coalesce requests only if they
+    // fall within the same virtual page
+    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
+                                             TheISA::PageBytes);
+    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
+                                              TheISA::PageBytes);
+
+    if (incoming_virt_page_addr != coalesced_virt_page_addr)
+        return false;
+
+    // Rule 2: Coalesce requests only if they share a TLB Mode,
+    // i.e. they are both read or write requests.
+    if (incoming_state->tlbMode != coalesced_state->tlbMode)
+        return false;
+
+    // The packets coalesce: fold the number of packets represented by
+    // the incoming one into the coalesced packet's reqCnt. Prefetches
+    // never push a reqCnt entry in recvTimingReq, so a prefetch at the
+    // head of the coalesced request can leave its reqCnt empty; back()
+    // on an empty vector is undefined behavior, hence the extra checks.
+    if (!incoming_state->prefetch && !incoming_state->reqCnt.empty() &&
+        !coalesced_state->reqCnt.empty()) {
+        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();
+    }
+
+    return true;
+}
+
+/*
+ * We need to update the physical addresses of all the translation requests
+ * that were coalesced into the one that just returned, then send each of
+ * them back through the port it arrived on.
+ */
+void
+TLBCoalescer::updatePhysAddresses(PacketPtr pkt)
+{
+    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+
+    // every packet that was coalesced into the request for this page
+    auto &coalesced_pkts = issuedTranslationsTable[virt_page_addr];
+
+    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
+            coalesced_pkts.size(), virt_page_addr);
+
+    TheISA::GpuTLB::TranslationState *sender_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    TheISA::GpuTlbEntry *tlb_entry = sender_state->tlbEntry;
+    assert(tlb_entry);
+    Addr first_entry_vaddr = tlb_entry->vaddr;
+    Addr first_entry_paddr = tlb_entry->paddr;
+    int page_size = tlb_entry->size();
+    bool uncacheable = tlb_entry->uncacheable;
+    int first_hit_level = sender_state->hitLevel;
+    bool valid = tlb_entry->valid;
+
+    // Get the physical page address of the translated request
+    // Using the page_size specified in the TLBEntry allows us
+    // to support different page sizes.
+    Addr phys_page_paddr = pkt->req->getPaddr();
+    phys_page_paddr &= ~(page_size - 1);
+
+    for (size_t i = 0; i < coalesced_pkts.size(); ++i) {
+        PacketPtr local_pkt = coalesced_pkts[i];
+        auto *local_state = safe_cast<TheISA::GpuTLB::TranslationState*>(
+                local_pkt->senderState);
+
+        // we are sending the packet back, so pop the reqCnt associated
+        // with this level in the TLB hierarchy
+        if (!local_state->prefetch)
+            local_state->reqCnt.pop_back();
+
+        /*
+         * Only the first packet from this coalesced request has been
+         * translated. Grab the translated phys. page addr and update the
+         * physical addresses of the remaining packets with the appropriate
+         * page offsets.
+         */
+        if (i > 0) {
+            Addr paddr = phys_page_paddr;
+            paddr |= (local_pkt->req->getVaddr() & (page_size - 1));
+            local_pkt->req->setPaddr(paddr);
+
+            if (uncacheable)
+                local_pkt->req->setFlags(Request::UNCACHEABLE);
+
+            // update senderState->tlbEntry, so we can insert
+            // the correct TLBEntry in the TLBs above.
+            local_state->tlbEntry =
+                new TheISA::GpuTlbEntry(0, first_entry_vaddr, first_entry_paddr,
+                                        valid);
+
+            // propagate the hitLevel so that every uncoalesced req knows
+            // where it hit (used for statistics in the CUs)
+            local_state->hitLevel = first_hit_level;
+        }
+
+        SlavePort *return_port = local_state->ports.back();
+        local_state->ports.pop_back();
+
+        // Translation is done - Convert to a response pkt if necessary and
+        // send the translation back
+        if (local_pkt->isRequest())
+            local_pkt->makeTimingResponse();
+
+        return_port->sendTimingResp(local_pkt);
+    }
+
+    // schedule clean up for end of this cycle. This is a maximum
+    // priority event and must be on the same cycle as GPUTLB cleanup
+    // event to prevent race conditions with an IssueProbeEvent caused
+    // by MemSidePort::recvReqRetry
+    cleanupQueue.push(virt_page_addr);
+
+    if (!cleanupEvent.scheduled())
+        schedule(cleanupEvent, curTick());
+}
+
+/*
+ * Receive a translation request from the CPU side.
+ *
+ * The request is merged into an already queued coalesced request of the
+ * same coalescing window when canCoalesce() allows it; otherwise it
+ * starts a new coalesced request. In both cases a probeTLBEvent is
+ * scheduled for the next cycle (unless one is pending) so that the
+ * coalesced requests get sent to the TLB (TLBProbesPerCycle).
+ *
+ * Always returns true, i.e. the request is never refused.
+ */
+bool
+TLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
+{
+    TheISA::GpuTLB::TranslationState *sender_state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    // push back the port to remember the path back
+    sender_state->ports.push_back(this);
+
+    // prefetches are left out of all the statistics
+    const bool update_stats = !sender_state->prefetch;
+
+    if (update_stats) {
+        // An empty reqCnt means this packet stands for a single pkt
+        // rather than for several uncoalesced reqs(pkts). Otherwise
+        // the count is carried over, so that the reqCnt of each level
+        // in the hierarchy accumulates the total number of reqs that
+        // this packet represents.
+        const int req_cnt =
+            sender_state->reqCnt.empty() ? 1 : sender_state->reqCnt.back();
+        sender_state->reqCnt.push_back(req_cnt);
+
+        // update statistics
+        coalescer->uncoalescedAccesses++;
+        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
+        coalescer->queuingCycles -= (curTick() * req_cnt);
+        coalescer->localqueuingCycles -= curTick();
+    }
+
+    // FIXME if you want to coalesce not based on the issueTime
+    // of the packets (i.e., from the compute unit's perspective)
+    // but based on when they reached this coalescer then
+    // remove the following if statement and use curTick() or
+    // coalescingWindow for the tick_index.
+    if (!sender_state->issueTime)
+       sender_state->issueTime = curTick();
+
+    // All candidates whose issue time falls within the same
+    // coalescingWindow share one tick index, which is the key
+    // into the coalescerFIFO map.
+    const int64_t tick_index =
+        sender_state->issueTime / coalescer->coalescingWindow;
+
+    // Coalesced requests already queued for this window. Indexing the
+    // map creates an empty list the first time a window is seen.
+    auto &window_reqs = coalescer->coalescerFIFO[tick_index];
+    const int queued_cnt = window_reqs.size();
+
+    // set once the incoming pkt has found a coalesced request to join
+    bool merged = false;
+
+    // see if the incoming pkt can join one of the queued requests
+    for (int idx = 0; idx < queued_cnt && !merged; ++idx) {
+        auto &candidate = window_reqs[idx];
+
+        // the first packet stands for the whole coalesced request
+        if (!coalescer->canCoalesce(pkt, candidate[0]))
+            continue;
+
+        candidate.push_back(pkt);
+        merged = true;
+
+        DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
+                idx, tick_index,
+                candidate.size());
+    }
+
+    // Nothing to merge with (possibly because this is the first request
+    // of its window): update stats and open a fresh coalesced request
+    // holding just this pkt.
+    if (!merged) {
+        if (update_stats)
+            coalescer->coalescedAccesses++;
+
+        window_reqs.push_back(std::vector<PacketPtr>(1, pkt));
+
+        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
+                "push\n", tick_index, window_reqs.size());
+    }
+
+    // schedule probeTLBEvent next cycle to send the
+    // coalesced requests to the TLB
+    if (!coalescer->probeTLBEvent.scheduled()) {
+        coalescer->schedule(coalescer->probeTLBEvent,
+                curTick() + coalescer->ticks(1));
+    }
+
+    return true;
+}
+
+void
+TLBCoalescer::CpuSidePort::recvReqRetry()
+{
+    assert(false); // not expected to be called; a no-op if NDEBUG is set
+}
+
+void
+TLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
+{
+    // functional path: no coalescing, the pkt is forwarded as is
+    auto *state =
+        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+    // prefetches do not count towards the statistics
+    if (!state->prefetch)
+        coalescer->uncoalescedAccesses++;
+
+    // If there is a pending timing request for this virtual address
+    // print a warning message. This is a temporary caveat of
+    // the current simulator where atomic and timing requests can
+    // coexist. FIXME remove this check/warning in the future.
+    const Addr virt_page_addr =
+        roundDown(pkt->req->getVaddr(), TheISA::PageBytes);
+
+    if (coalescer->issuedTranslationsTable.count(virt_page_addr) != 0) {
+        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
+                "req. pending\n", virt_page_addr);
+    }
+
+    // hand the access to the first memory-side port
+    coalescer->memSidePort[0]->sendFunctional(pkt);
+}
+
+AddrRangeList
+TLBCoalescer::CpuSidePort::getAddrRanges() const
+{
+    // The ranges are currently not checked by the master,
+    // so an empty (default-constructed) list is all that
+    // gets reported.
+    return AddrRangeList();
+}
+
+bool
+TLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
+{
+    // a translation completed and returned: let the coalescer handle
+    // the packets coalesced with it; the response is always accepted
+    coalescer->updatePhysAddresses(pkt);
+    return true;
+}
+
+void
+TLBCoalescer::MemSidePort::recvReqRetry()
+{
+    // a retry was received: probe the TLB again next cycle unless a
+    // probeTLBEvent is already pending
+    auto &probe_event = coalescer->probeTLBEvent;
+    if (!probe_event.scheduled())
+        coalescer->schedule(probe_event, curTick() + coalescer->ticks(1));
+}
+
+void
+TLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
+{   // unsupported on the memory side: pkt is ignored, fatal() is raised
+    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
+}
+
+TLBCoalescer::IssueProbeEvent::IssueProbeEvent(TLBCoalescer * _coalescer)
+    : Event(CPU_Tick_Pri), coalescer(_coalescer)
+{   // created with CPU_Tick_Pri; keeps a pointer to the owning coalescer
+}
+
+const char*
+TLBCoalescer::IssueProbeEvent::description() const
+{   // fixed, human-readable label for this event
+    return "Probe the TLB below";
+}
+
+/*
+ * Here we scan the coalescer FIFO and issue the max number of permitted
+ * probes (TLBProbesPerCycle) to the TLB below. We permit bypassing of
+ * coalesced requests for the same tick_index, but we do not access the
+ * next tick_index unless we've drained the previous one. The coalesced
+ * requests that are successfully sent are moved to the
+ * issuedTranslationsTable table (the table which keeps track of the
+ * outstanding reqs).
+ *
+ * NOTE(review): this method never reschedules itself, so requests left
+ * queued wait for the next recvTimingReq/recvReqRetry - confirm intended.
+ */
+void
+TLBCoalescer::IssueProbeEvent::process()
+{
+    // number of TLB probes sent so far
+    int sent_probes = 0;
+
+    // rejected denotes a blocking event. It is set to true either when
+    // the recvTiming of the TLB below returns false or when there is
+    // another outstanding request for the same virt. page. Once set, no
+    // further tick_index is visited by this invocation.
+    bool rejected = false;
+
+    DPRINTF(GPUTLB, "triggered TLBCoalescer IssueProbeEvent\n");
+
+    for (auto iter = coalescer->coalescerFIFO.begin();
+         iter != coalescer->coalescerFIFO.end() && !rejected; ) {
+        int coalescedReq_cnt = iter->second.size();
+        int i = 0;
+        int vector_index = 0;
+
+        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
+               coalescedReq_cnt, iter->first);
+
+        while (i < coalescedReq_cnt) {
+            ++i;
+            PacketPtr first_packet = iter->second[vector_index][0];
+
+            // compute virtual page address for this request
+            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
+                    TheISA::PageBytes);
+
+            // is there another outstanding request for the same page addr?
+            int pending_reqs =
+                coalescer->issuedTranslationsTable.count(virt_page_addr);
+
+            if (pending_reqs) {
+                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
+                        "page %#x\n", virt_page_addr);
+
+                ++vector_index;
+                rejected = true;
+
+                continue;
+            }
+
+            // send the coalesced request for virt_page_addr
+            if (!coalescer->memSidePort[0]->sendTimingReq(first_packet)) {
+                DPRINTF(GPUTLB, "Failed to send TLB request for page %#x\n",
+                       virt_page_addr);
+
+                // No need for a retries queue since we are already buffering
+                // the coalesced request in coalescerFIFO.
+                rejected = true;
+                ++vector_index;
+            } else {
+                TheISA::GpuTLB::TranslationState *tmp_sender_state =
+                    safe_cast<TheISA::GpuTLB::TranslationState*>
+                    (first_packet->senderState);
+
+                bool update_stats = !tmp_sender_state->prefetch;
+
+                if (update_stats) {
+                    // req_cnt is total number of packets represented
+                    // by the one we just sent counting all the way from
+                    // the top of TLB hierarchy (i.e., from the CU)
+                    int req_cnt = tmp_sender_state->reqCnt.back();
+                    coalescer->queuingCycles += (curTick() * req_cnt);
+
+                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
+                            coalescer->name(), req_cnt);
+
+                    // pkt_cnt is number of packets we coalesced into the one
+                    // we just sent but only at this coalescer level
+                    int pkt_cnt = iter->second[vector_index].size();
+                    coalescer->localqueuingCycles += (curTick() * pkt_cnt);
+                }
+
+                DPRINTF(GPUTLB, "Successfully sent TLB request for page "
+                        "%#x\n", virt_page_addr);
+
+                //copy coalescedReq to issuedTranslationsTable
+                coalescer->issuedTranslationsTable[virt_page_addr]
+                    = iter->second[vector_index];
+
+                //erase the entry of this coalesced req
+                iter->second.erase(iter->second.begin() + vector_index);
+
+                // an emptied list implies every entry has been visited
+                assert(!iter->second.empty() || i == coalescedReq_cnt);
+
+                sent_probes++;
+                if (sent_probes == coalescer->TLBProbesPerCycle)
+                   return;
+            }
+        }
+
+        //if there are no more coalesced reqs for this tick_index
+        //erase the hash_map with the first iterator
+        if (iter->second.empty()) {
+            coalescer->coalescerFIFO.erase(iter++);
+        } else {
+            ++iter;
+        }
+    }
+}
+
+TLBCoalescer::CleanupEvent::CleanupEvent(TLBCoalescer* _coalescer)
+    : Event(Maximum_Pri), coalescer(_coalescer)
+{   // created with Maximum_Pri; keeps a pointer to the owning coalescer
+}
+
+const char*
+TLBCoalescer::CleanupEvent::description() const
+{   // fixed, human-readable label for this event
+    return "Cleanup issuedTranslationsTable hashmap";
+}
+
+void
+TLBCoalescer::CleanupEvent::process()
+{
+    // drop the table entry of every page address waiting in cleanupQueue
+    auto &pending = coalescer->cleanupQueue;
+    for (; !pending.empty(); pending.pop()) {
+        const Addr page_addr = pending.front();
+        coalescer->issuedTranslationsTable.erase(page_addr);
+        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
+                page_addr);
+    }
+}
+
+void
+TLBCoalescer::regStats()
+{   // give every statistic of the coalescer its name and description
+    uncoalescedAccesses  // bumped once per non-prefetch pkt received
+        .name(name() + ".uncoalesced_accesses")
+        .desc("Number of uncoalesced TLB accesses")
+        ;
+
+    coalescedAccesses  // bumped when a non-prefetch pkt opens a new request
+        .name(name() + ".coalesced_accesses")
+        .desc("Number of coalesced TLB accesses")
+        ;
+
+    queuingCycles  // accumulated from curTick() values, weighted by reqCnt
+        .name(name() + ".queuing_cycles")
+        .desc("Number of cycles spent in queue")
+        ;
+
+    localqueuingCycles  // accumulated from curTick() values at this level
+        .name(name() + ".local_queuing_cycles")
+        .desc("Number of cycles spent in queue for all incoming reqs")
+        ;
+
+    localLatency  // derived stat; its formula is assigned below
+        .name(name() + ".local_latency")
+        .desc("Avg. latency over all incoming pkts")
+        ;
+    // local queuing time averaged over the uncoalesced accesses
+    localLatency = localqueuingCycles / uncoalescedAccesses;
+}
+
+
+TLBCoalescer*
+TLBCoalescerParams::create()
+{   // builds a TLBCoalescer from this parameter object
+    return new TLBCoalescer(this);
+}
+