path: root/src/gpu-compute/gpu_tlb.hh
Diffstat (limited to 'src/gpu-compute/gpu_tlb.hh')
-rw-r--r--  src/gpu-compute/gpu_tlb.hh  465
1 file changed, 465 insertions, 0 deletions
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __GPU_TLB_HH__
+#define __GPU_TLB_HH__
+
+#include <deque>
+#include <fstream>
+#include <list>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/callback.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/X86GPUTLB.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+namespace X86ISA
+{
+ class GpuTlbEntry : public TlbEntry
+ {
+ public:
+ GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
+ : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
+
+ GpuTlbEntry() : TlbEntry() { }
+
+ bool valid;
+ };
+
+ class GpuTLB : public MemObject
+ {
+ protected:
+ friend class Walker;
+
+ typedef std::list<GpuTlbEntry*> EntryList;
+
+ uint32_t configAddress;
+
+ // TLB clock: inherits the shader's clock period, expressed as a
+ // number of ticks of curTick() (a.k.a. the global simulation clock).
+ // The assignment of the TLB clock from the shader clock is done in
+ // the Python config files.
+ int clock;
+
+ public:
+ // Clock-related functions; they map between simulation ticks and
+ // this object's cycles.
+ Tick frequency() const { return SimClock::Frequency / clock; }
+
+ Tick
+ ticks(int numCycles) const
+ {
+ return (Tick)clock * numCycles;
+ }
+
+ Tick curCycle() const { return curTick() / clock; }
+ Tick tickToCycles(Tick val) const { return val / clock;}
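+
+ // Worked example (illustrative only): with clock = 500 ticks per
+ // cycle and gem5's default 1 THz tick rate (SimClock::Frequency
+ // == 1e12), frequency() == 1e12 / 500 == 2e9 (2 GHz),
+ // ticks(4) == 500 * 4 == 2000, and tickToCycles(2000) == 4.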
+
+ typedef X86GPUTLBParams Params;
+ GpuTLB(const Params *p);
+ ~GpuTLB();
+
+ typedef enum BaseTLB::Mode Mode;
+
+ class Translation
+ {
+ public:
+ virtual ~Translation() { }
+
+ /**
+ * Signal that the translation has been delayed due to a hw page
+ * table walk.
+ */
+ virtual void markDelayed() = 0;
+
+ /**
+ * The memory for this object may be dynamically allocated, and it
+ * may be responsible for cleaning itself up, which will happen in
+ * this function. Once it's called, the object is no longer valid.
+ */
+ virtual void finish(Fault fault, RequestPtr req, ThreadContext *tc,
+ Mode mode) = 0;
+ };
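+
+ /**
+ * Illustrative sketch (hypothetical, not part of this header): a
+ * client of the TLB might implement Translation along these lines.
+ *
+ * class DemoTranslation : public GpuTLB::Translation
+ * {
+ * bool delayed = false;
+ * public:
+ * void markDelayed() override { delayed = true; }
+ * void finish(Fault fault, RequestPtr req, ThreadContext *tc,
+ * GpuTLB::Mode mode) override
+ * {
+ * // consume the completed translation, then clean up;
+ * // the object is invalid once finish() returns
+ * delete this;
+ * }
+ * };
+ */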
+
+ void dumpAll();
+ GpuTlbEntry *lookup(Addr va, bool update_lru=true);
+ void setConfigAddress(uint32_t addr);
+
+ protected:
+ EntryList::iterator lookupIt(Addr va, bool update_lru=true);
+ Walker *walker;
+
+ public:
+ Walker *getWalker();
+ void invalidateAll();
+ void invalidateNonGlobal();
+ void demapPage(Addr va, uint64_t asn);
+
+ protected:
+ int size;
+ int assoc;
+ int numSets;
+
+ /**
+ * true if this is a fully-associative TLB
+ */
+ bool FA;
+ Addr setMask;
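+
+ /*
+ * A sketch of the set-index computation these fields imply (an
+ * assumption, not code from this header): with a power-of-two
+ * number of sets, setMask == numSets - 1 and
+ *
+ * int set = (va >> PageShift) & setMask;
+ *
+ * selects the set for virtual address va; FA implies numSets == 1.
+ */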
+
+ /**
+ * Allocation Policy: true if we always allocate on a hit, false
+ * otherwise. Default is true.
+ */
+ bool allocationPolicy;
+
+ /**
+ * if true, then this is not the last level TLB
+ */
+ bool hasMemSidePort;
+
+ /**
+ * Print out accessDistance stats. One stat file
+ * per TLB.
+ */
+ bool accessDistance;
+
+ GpuTlbEntry *tlb;
+
+ /*
+ * The free list is per-set. As long as we have not reached
+ * the full capacity of the given set, grab an entry from
+ * its freeList.
+ */
+ std::vector<EntryList> freeList;
+
+ /**
+ * An entryList per set is the equivalent of an LRU stack;
+ * it's used to guide replacement decisions. The head of the list
+ * contains the MRU TLB entry of the given set. If the freeList
+ * for this set is empty, the last element of the list
+ * is evicted (i.e., dropped on the floor).
+ */
+ std::vector<EntryList> entryList;
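+
+ /*
+ * A sketch of the replacement discipline described above
+ * (illustrative only; the actual logic lives in the .cc file):
+ *
+ * // on a hit, move the entry to the head (MRU) position
+ * entryList[set].remove(entry);
+ * entryList[set].push_front(entry);
+ *
+ * // on a fill with an empty freeList, evict the tail (LRU) entry
+ * GpuTlbEntry *victim = entryList[set].back();
+ * entryList[set].pop_back();
+ */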
+
+ Fault translateInt(RequestPtr req, ThreadContext *tc);
+
+ Fault translate(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode, bool &delayedResponse,
+ bool timing, int &latency);
+
+ public:
+ // latencies for a TLB hit, miss and page fault
+ int hitLatency;
+ int missLatency1;
+ int missLatency2;
+
+ // local_stats are as seen from the TLB
+ // without taking into account coalescing
+ Stats::Scalar localNumTLBAccesses;
+ Stats::Scalar localNumTLBHits;
+ Stats::Scalar localNumTLBMisses;
+ Stats::Formula localTLBMissRate;
+
+ // global_stats are as seen from the
+ // CU's perspective taking into account
+ // all coalesced requests.
+ Stats::Scalar globalNumTLBAccesses;
+ Stats::Scalar globalNumTLBHits;
+ Stats::Scalar globalNumTLBMisses;
+ Stats::Formula globalTLBMissRate;
+
+ // from the CU perspective (global)
+ Stats::Scalar accessCycles;
+ // from the CU perspective (global)
+ Stats::Scalar pageTableCycles;
+ Stats::Scalar numUniquePages;
+ // from the perspective of this TLB
+ Stats::Scalar localCycles;
+ // from the perspective of this TLB
+ Stats::Formula localLatency;
+ // I take the avg. per page and then
+ // the avg. over all pages.
+ Stats::Scalar avgReuseDistance;
+
+ void regStats();
+ void updatePageFootprint(Addr virt_page_addr);
+ void printAccessPattern();
+
+
+ Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+ int &latency);
+
+ void translateTiming(RequestPtr req, ThreadContext *tc,
+ Translation *translation, Mode mode,
+ int &latency);
+
+ Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
+ Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
+
+ GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
+
+ // Checkpointing
+ virtual void serialize(CheckpointOut& cp) const;
+ virtual void unserialize(CheckpointIn& cp);
+ void issueTranslation();
+ enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
+ bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
+
+ void handleTranslationReturn(Addr addr, tlbOutcome outcome,
+ PacketPtr pkt);
+
+ void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
+
+ void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+ GpuTlbEntry *tlb_entry, Mode mode);
+
+ void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
+ Addr phys_page_addr);
+
+ void issueTLBLookup(PacketPtr pkt);
+
+ // CpuSidePort is the TLB Port closer to the CPU/CU side
+ class CpuSidePort : public SlavePort
+ {
+ public:
+ CpuSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+ PortID _index)
+ : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+ protected:
+ GpuTLB *tlb;
+ int index;
+
+ virtual bool recvTimingReq(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt);
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ virtual void recvRespRetry() { assert(false); }
+ virtual AddrRangeList getAddrRanges() const;
+ };
+
+ /**
+ * MemSidePort is the TLB Port closer to the memory side
+ * If this is a last level TLB then this port will not be connected.
+ *
+ * Future action item: if we ever do real page walks, then this port
+ * should be connected to a RubyPort.
+ */
+ class MemSidePort : public MasterPort
+ {
+ public:
+ MemSidePort(const std::string &_name, GpuTLB * gpu_TLB,
+ PortID _index)
+ : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+ std::deque<PacketPtr> retries;
+
+ protected:
+ GpuTLB *tlb;
+ int index;
+
+ virtual bool recvTimingResp(PacketPtr pkt);
+ virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+ virtual void recvFunctional(PacketPtr pkt) { }
+ virtual void recvRangeChange() { }
+ virtual void recvReqRetry();
+ };
+
+ // TLB ports on the cpu Side
+ std::vector<CpuSidePort*> cpuSidePort;
+ // TLB ports on the memory side
+ std::vector<MemSidePort*> memSidePort;
+
+ BaseMasterPort &getMasterPort(const std::string &if_name,
+ PortID idx=InvalidPortID);
+
+ BaseSlavePort &getSlavePort(const std::string &if_name,
+ PortID idx=InvalidPortID);
+
+ /**
+ * TLB TranslationState: this is currently something of a
+ * bastardization of the usual SenderState usage. Normally the
+ * receiver of a packet is not supposed to look at the contents of
+ * senderState; you only look at what you pushed on yourself, pop
+ * it off, and send it back.
+ *
+ * However, since there is state that we want to pass to the TLBs
+ * using the send/recv Timing/Functional/etc. APIs, which don't allow
+ * for new arguments, we need a common TLB senderState to pass
+ * between TLBs, both "forwards" and "backwards."
+ *
+ * So, basically, the rule is that any packet received by a TLB port
+ * (cpuside OR memside) must be safely castable to a TranslationState.
+ */
+
+ struct TranslationState : public Packet::SenderState
+ {
+ // TLB mode, read or write
+ Mode tlbMode;
+ // Thread context associated with this req
+ ThreadContext *tc;
+
+ /*
+ * TLB entry to be populated and passed back to fill in
+ * previous TLBs. Equivalent to the data cache concept of
+ * "data return."
+ */
+ GpuTlbEntry *tlbEntry;
+ // Is this a TLB prefetch request?
+ bool prefetch;
+ // When was the req for this translation issued
+ uint64_t issueTime;
+ // Remember where this came from
+ std::vector<SlavePort*> ports;
+
+ // keep track of #uncoalesced reqs per packet per TLB level;
+ // reqCnt per level >= reqCnt higher level
+ std::vector<int> reqCnt;
+ // TLB level this packet hit in; 0 if it hit in the page table
+ int hitLevel;
+ Packet::SenderState *saved;
+
+ TranslationState(Mode tlb_mode, ThreadContext *_tc,
+ bool _prefetch=false,
+ Packet::SenderState *_saved=nullptr)
+ : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
+ prefetch(_prefetch), issueTime(0),
+ hitLevel(0), saved(_saved) { }
+ };
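+
+ /*
+ * Illustrative usage of the rule stated above (a sketch, not code
+ * from this header). The sender attaches the state before issuing
+ * the packet; any TLB port that receives it casts senderState back:
+ *
+ * // sender side
+ * pkt->senderState = new TranslationState(BaseTLB::Read, tc);
+ *
+ * // receiver side (cpuside OR memside port)
+ * TranslationState *state =
+ * safe_cast<TranslationState*>(pkt->senderState);
+ */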
+
+ // maximum number of permitted coalesced requests per cycle
+ int maxCoalescedReqs;
+
+ // Current number of outstanding coalesced requests.
+ // Should be <= maxCoalescedReqs
+ int outstandingReqs;
+
+ /**
+ * A TLBEvent is scheduled after the TLB lookup and helps us take
+ * the appropriate action (e.g., update the TLB on a hit, send the
+ * request to the lower-level TLB on a miss, or start a page walk
+ * if this was the last-level TLB).
+ */
+ void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+ PacketPtr pkt);
+
+ class TLBEvent : public Event
+ {
+ private:
+ GpuTLB *tlb;
+ Addr virtPageAddr;
+ /**
+ * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
+ */
+ tlbOutcome outcome;
+ PacketPtr pkt;
+
+ public:
+ TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
+ PacketPtr _pkt);
+
+ void process();
+ const char *description() const;
+
+ // updateOutcome updates the tlbOutcome of a TLBEvent
+ void updateOutcome(tlbOutcome _outcome);
+ Addr getTLBEventVaddr();
+ };
+
+ std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
+
+ // this FIFO queue keeps track of the virt. page addresses
+ // that are pending cleanup
+ std::queue<Addr> cleanupQueue;
+
+ // the cleanupEvent is scheduled after a TLBEvent triggers in order to
+ // free memory and do the required clean-up
+ void cleanup();
+
+ EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
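+
+ /*
+ * A sketch of the cleanup step described above (illustrative only;
+ * the actual logic lives in the .cc file):
+ *
+ * while (!cleanupQueue.empty()) {
+ * Addr cleanup_addr = cleanupQueue.front();
+ * cleanupQueue.pop();
+ * delete translationReturnEvent[cleanup_addr];
+ * translationReturnEvent.erase(cleanup_addr);
+ * }
+ */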
+
+ /**
+ * This hash map uses the virtual page address as its key and
+ * keeps track of the total number of accesses per page.
+ */
+
+ struct AccessInfo
+ {
+ unsigned int lastTimeAccessed; // last access to this page
+ unsigned int accessesPerPage;
+ // need to divide it by accessesPerPage at the end
+ unsigned int totalReuseDistance;
+
+ /**
+ * The field below will help us compute the access distance,
+ * that is the number of (coalesced) TLB accesses that
+ * happened in between each access to this page
+ *
+ * localTLBAccesses[x] is the value of localNumTLBAccesses
+ * when the page <Addr> was accessed for the <x>th time
+ */
+ std::vector<unsigned int> localTLBAccesses;
+ unsigned int sumDistance;
+ unsigned int meanDistance;
+ };
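+
+ /*
+ * A sketch of the access-distance bookkeeping implied above
+ * (illustrative only; assumes at least two accesses to the page):
+ *
+ * // distance between the x-th and (x-1)-th access to this page
+ * info.sumDistance += info.localTLBAccesses[x]
+ * - info.localTLBAccesses[x - 1];
+ * // averaged over the re-accesses to this page
+ * info.meanDistance =
+ * info.sumDistance / (info.localTLBAccesses.size() - 1);
+ */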
+
+ typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
+ AccessPatternTable TLBFootprint;
+
+ // Called at the end of simulation to dump page access stats.
+ void exitCallback();
+
+ EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
+ };
+}
+
+#endif // __GPU_TLB_HH__