| field | value | date |
|---|---|---|
| author | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| committer | Tony Gutierrez <anthony.gutierrez@amd.com> | 2016-01-19 14:28:22 -0500 |
| commit | 1a7d3f9fcb76a68540dd948f91413533a383bfde (patch) | |
| tree | 867510a147cd095f19499d26b7c02d27de4cae9d /src/gpu-compute/gpu_tlb.hh | |
| parent | 28e353e0403ea379d244a418e8dc8ee0b48187cf (diff) | |
| download | gem5-1a7d3f9fcb76a68540dd948f91413533a383bfde.tar.xz | |
gpu-compute: AMD's baseline GPU model
Diffstat (limited to 'src/gpu-compute/gpu_tlb.hh')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | src/gpu-compute/gpu_tlb.hh | 465 |

1 file changed, 465 insertions, 0 deletions
```diff
diff --git a/src/gpu-compute/gpu_tlb.hh b/src/gpu-compute/gpu_tlb.hh
new file mode 100644
index 000000000..3549c598b
--- /dev/null
+++ b/src/gpu-compute/gpu_tlb.hh
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __GPU_TLB_HH__
+#define __GPU_TLB_HH__
+
+#include <fstream>
+#include <list>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "arch/generic/tlb.hh"
+#include "arch/x86/pagetable.hh"
+#include "arch/x86/pagetable_walker.hh"
+#include "arch/x86/regs/segment.hh"
+#include "base/callback.hh"
+#include "base/misc.hh"
+#include "base/statistics.hh"
+#include "gpu-compute/compute_unit.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+#include "mem/request.hh"
+#include "params/X86GPUTLB.hh"
+#include "sim/sim_object.hh"
+
+class BaseTLB;
+class Packet;
+class ThreadContext;
+
+namespace X86ISA
+{
+    class GpuTlbEntry : public TlbEntry
+    {
+      public:
+        GpuTlbEntry(Addr asn, Addr _vaddr, Addr _paddr, bool _valid)
+          : TlbEntry(asn, _vaddr, _paddr, false, false), valid(_valid) { }
+
+        GpuTlbEntry() : TlbEntry() { }
+
+        bool valid;
+    };
+
+    class GpuTLB : public MemObject
+    {
+      protected:
+        friend class Walker;
+
+        typedef std::list<GpuTlbEntry*> EntryList;
+
+        uint32_t configAddress;
+
+        // TLB clock: will inherit clock from shader's clock period in terms
+        // of number of ticks of curTime (aka global simulation clock).
+        // The assignment of TLB clock from shader clock is done in the
+        // python config files.
+        int clock;
+
+      public:
+        // clock related functions; maps to-and-from simulation ticks and
+        // object clocks.
+        Tick frequency() const { return SimClock::Frequency / clock; }
+
+        Tick
+        ticks(int numCycles) const
+        {
+            return (Tick)clock * numCycles;
+        }
+
+        Tick curCycle() const { return curTick() / clock; }
+        Tick tickToCycles(Tick val) const { return val / clock; }
+
+        typedef X86GPUTLBParams Params;
+        GpuTLB(const Params *p);
+        ~GpuTLB();
+
+        typedef enum BaseTLB::Mode Mode;
+
+        class Translation
+        {
+          public:
+            virtual ~Translation() { }
+
+            /**
+             * Signal that the translation has been delayed due to a hw page
+             * table walk.
+             */
+            virtual void markDelayed() = 0;
+
+            /**
+             * The memory for this object may be dynamically allocated, and
+             * it may be responsible for cleaning itself up, which will
+             * happen in this function. Once it's called, the object is no
+             * longer valid.
+             */
+            virtual void finish(Fault fault, RequestPtr req,
+                                ThreadContext *tc, Mode mode) = 0;
+        };
+
+        void dumpAll();
+        GpuTlbEntry *lookup(Addr va, bool update_lru=true);
+        void setConfigAddress(uint32_t addr);
+
+      protected:
+        EntryList::iterator lookupIt(Addr va, bool update_lru=true);
+        Walker *walker;
+
+      public:
+        Walker *getWalker();
+        void invalidateAll();
+        void invalidateNonGlobal();
+        void demapPage(Addr va, uint64_t asn);
+
+      protected:
+        int size;
+        int assoc;
+        int numSets;
+
+        /**
+         * true if this is a fully-associative TLB
+         */
+        bool FA;
+        Addr setMask;
+
+        /**
+         * Allocation Policy: true if we always allocate on a hit, false
+         * otherwise. Default is true.
+         */
+        bool allocationPolicy;
+
+        /**
+         * if true, then this is not the last level TLB
+         */
+        bool hasMemSidePort;
+
+        /**
+         * Print out accessDistance stats. One stat file
+         * per TLB.
+         */
+        bool accessDistance;
+
+        GpuTlbEntry *tlb;
+
+        /*
+         * It's a per-set list. As long as we have not reached
+         * the full capacity of the given set, grab an entry from
+         * the freeList.
+         */
+        std::vector<EntryList> freeList;
+
+        /**
+         * An entryList per set is the equivalent of an LRU stack;
+         * it's used to guide replacement decisions. The head of the list
+         * contains the MRU TLB entry of the given set. If the freeList
+         * for this set is empty, the last element of the list
+         * is evicted (i.e., dropped on the floor).
+         */
+        std::vector<EntryList> entryList;
+
+        Fault translateInt(RequestPtr req, ThreadContext *tc);
+
+        Fault translate(RequestPtr req, ThreadContext *tc,
+                        Translation *translation, Mode mode,
+                        bool &delayedResponse, bool timing, int &latency);
+
+      public:
+        // latencies for a TLB hit, miss and page fault
+        int hitLatency;
+        int missLatency1;
+        int missLatency2;
+
+        // local_stats are as seen from the TLB
+        // without taking into account coalescing
+        Stats::Scalar localNumTLBAccesses;
+        Stats::Scalar localNumTLBHits;
+        Stats::Scalar localNumTLBMisses;
+        Stats::Formula localTLBMissRate;
+
+        // global_stats are as seen from the
+        // CU's perspective taking into account
+        // all coalesced requests.
+        Stats::Scalar globalNumTLBAccesses;
+        Stats::Scalar globalNumTLBHits;
+        Stats::Scalar globalNumTLBMisses;
+        Stats::Formula globalTLBMissRate;
+
+        // from the CU perspective (global)
+        Stats::Scalar accessCycles;
+        // from the CU perspective (global)
+        Stats::Scalar pageTableCycles;
+        Stats::Scalar numUniquePages;
+        // from the perspective of this TLB
+        Stats::Scalar localCycles;
+        // from the perspective of this TLB
+        Stats::Formula localLatency;
+        // I take the avg. per page and then
+        // the avg. over all pages.
+        Stats::Scalar avgReuseDistance;
+
+        void regStats();
+        void updatePageFootprint(Addr virt_page_addr);
+        void printAccessPattern();
+
+        Fault translateAtomic(RequestPtr req, ThreadContext *tc, Mode mode,
+                              int &latency);
+
+        void translateTiming(RequestPtr req, ThreadContext *tc,
+                             Translation *translation, Mode mode,
+                             int &latency);
+
+        Tick doMmuRegRead(ThreadContext *tc, Packet *pkt);
+        Tick doMmuRegWrite(ThreadContext *tc, Packet *pkt);
+
+        GpuTlbEntry *insert(Addr vpn, GpuTlbEntry &entry);
+
+        // Checkpointing
+        virtual void serialize(CheckpointOut &cp) const;
+        virtual void unserialize(CheckpointIn &cp);
+        void issueTranslation();
+        enum tlbOutcome {TLB_HIT, TLB_MISS, PAGE_WALK, MISS_RETURN};
+        bool tlbLookup(RequestPtr req, ThreadContext *tc, bool update_stats);
+
+        void handleTranslationReturn(Addr addr, tlbOutcome outcome,
+                                     PacketPtr pkt);
+
+        void handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome outcome);
+
+        void pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
+                                    GpuTlbEntry *tlb_entry, Mode mode);
+
+        void updatePhysAddresses(Addr virt_page_addr, GpuTlbEntry *tlb_entry,
+                                 Addr phys_page_addr);
+
+        void issueTLBLookup(PacketPtr pkt);
+
+        // CpuSidePort is the TLB Port closer to the CPU/CU side
+        class CpuSidePort : public SlavePort
+        {
+          public:
+            CpuSidePort(const std::string &_name, GpuTLB *gpu_TLB,
+                        PortID _index)
+              : SlavePort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+          protected:
+            GpuTLB *tlb;
+            int index;
+
+            virtual bool recvTimingReq(PacketPtr pkt);
+            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+            virtual void recvFunctional(PacketPtr pkt);
+            virtual void recvRangeChange() { }
+            virtual void recvReqRetry();
+            virtual void recvRespRetry() { assert(false); }
+            virtual AddrRangeList getAddrRanges() const;
+        };
+
+        /**
+         * MemSidePort is the TLB Port closer to the memory side.
+         * If this is a last level TLB, then this port will not be connected.
+         *
+         * Future action item: if we ever do real page walks, then this port
+         * should be connected to a RubyPort.
+         */
+        class MemSidePort : public MasterPort
+        {
+          public:
+            MemSidePort(const std::string &_name, GpuTLB *gpu_TLB,
+                        PortID _index)
+              : MasterPort(_name, gpu_TLB), tlb(gpu_TLB), index(_index) { }
+
+            std::deque<PacketPtr> retries;
+
+          protected:
+            GpuTLB *tlb;
+            int index;
+
+            virtual bool recvTimingResp(PacketPtr pkt);
+            virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+            virtual void recvFunctional(PacketPtr pkt) { }
+            virtual void recvRangeChange() { }
+            virtual void recvReqRetry();
+        };
+
+        // TLB ports on the cpu side
+        std::vector<CpuSidePort*> cpuSidePort;
+        // TLB ports on the memory side
+        std::vector<MemSidePort*> memSidePort;
+
+        BaseMasterPort &getMasterPort(const std::string &if_name,
+                                      PortID idx=InvalidPortID);
+
+        BaseSlavePort &getSlavePort(const std::string &if_name,
+                                    PortID idx=InvalidPortID);
+
+        /**
+         * TLB TranslationState: this is currently somewhat of a
+         * bastardization of the usage of SenderState. The receiver of a
+         * packet is not usually supposed to need to look at the contents
+         * of the senderState; you're really only supposed to look at what
+         * you pushed on, pop it off, and send it back.
+         *
+         * However, since there is state that we want to pass to the TLBs
+         * using the send/recv Timing/Functional/etc. APIs, which don't
+         * allow for new arguments, we need a common TLB senderState to
+         * pass between TLBs, both "forwards" and "backwards."
+         *
+         * So, basically, the rule is that any packet received by a TLB
+         * port (cpuside OR memside) must be safely castable to a
+         * TranslationState.
+         */
+        struct TranslationState : public Packet::SenderState
+        {
+            // TLB mode, read or write
+            Mode tlbMode;
+            // Thread context associated with this req
+            ThreadContext *tc;
+
+            /*
+             * TLB entry to be populated and passed back and filled in
+             * previous TLBs. Equivalent to the data cache concept of
+             * "data return."
+             */
+            GpuTlbEntry *tlbEntry;
+            // Is this a TLB prefetch request?
+            bool prefetch;
+            // When was the req for this translation issued
+            uint64_t issueTime;
+            // Remember where this came from
+            std::vector<SlavePort*> ports;
+
+            // keep track of #uncoalesced reqs per packet per TLB level;
+            // reqCnt per level >= reqCnt higher level
+            std::vector<int> reqCnt;
+            // TLB level this packet hit in; 0 if it hit in the page table
+            int hitLevel;
+            Packet::SenderState *saved;
+
+            TranslationState(Mode tlb_mode, ThreadContext *_tc,
+                             bool _prefetch=false,
+                             Packet::SenderState *_saved=nullptr)
+              : tlbMode(tlb_mode), tc(_tc), tlbEntry(nullptr),
+                prefetch(_prefetch), issueTime(0),
+                hitLevel(0), saved(_saved) { }
+        };
+
+        // maximum number of permitted coalesced requests per cycle
+        int maxCoalescedReqs;
+
+        // Current number of outstanding coalesced requests.
+        // Should be <= maxCoalescedReqs
+        int outstandingReqs;
+
+        /**
+         * A TLBEvent is scheduled after the TLB lookup and helps us take
+         * the appropriate actions:
+         * (e.g., update TLB on a hit,
+         * send request to lower level TLB on a miss,
+         * or start a page walk if this was the last-level TLB).
+         */
+        void translationReturn(Addr virtPageAddr, tlbOutcome outcome,
+                               PacketPtr pkt);
+
+        class TLBEvent : public Event
+        {
+          private:
+            GpuTLB *tlb;
+            Addr virtPageAddr;
+            /**
+             * outcome can be TLB_HIT, TLB_MISS, or PAGE_WALK
+             */
+            tlbOutcome outcome;
+            PacketPtr pkt;
+
+          public:
+            TLBEvent(GpuTLB *_tlb, Addr _addr, tlbOutcome outcome,
+                     PacketPtr _pkt);
+
+            void process();
+            const char *description() const;
+
+            // updateOutcome updates the tlbOutcome of a TLBEvent
+            void updateOutcome(tlbOutcome _outcome);
+            Addr getTLBEventVaddr();
+        };
+
+        std::unordered_map<Addr, TLBEvent*> translationReturnEvent;
+
+        // this FIFO queue keeps track of the virt. page addresses
+        // that are pending cleanup
+        std::queue<Addr> cleanupQueue;
+
+        // the cleanupEvent is scheduled after a TLBEvent triggers in order
+        // to free memory and do the required clean-up
+        void cleanup();
+
+        EventWrapper<GpuTLB, &GpuTLB::cleanup> cleanupEvent;
+
+        /**
+         * This hash map will use the virtual page address as a key
+         * and will keep track of total number of accesses per page
+         */
+        struct AccessInfo
+        {
+            unsigned int lastTimeAccessed; // last access to this page
+            unsigned int accessesPerPage;
+            // need to divide it by accessesPerPage at the end
+            unsigned int totalReuseDistance;
+
+            /**
+             * The field below will help us compute the access distance,
+             * that is, the number of (coalesced) TLB accesses that
+             * happened in between each access to this page
+             *
+             * localTLBAccesses[x] is the value of localTLBNumAccesses
+             * when the page <Addr> was accessed for the <x>th time
+             */
+            std::vector<unsigned int> localTLBAccesses;
+            unsigned int sumDistance;
+            unsigned int meanDistance;
+        };
+
+        typedef std::unordered_map<Addr, AccessInfo> AccessPatternTable;
+        AccessPatternTable TLBFootprint;
+
+        // Called at the end of simulation to dump page access stats.
+        void exitCallback();
+
+        EventWrapper<GpuTLB, &GpuTLB::exitCallback> exitEvent;
+    };
+}
+
+#endif // __GPU_TLB_HH__
```
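A note on the clock helpers near the top of the class: `frequency()`, `ticks()`, `curCycle()`, and `tickToCycles()` convert between TLB cycles and global simulation ticks via the `clock` member, which holds the TLB's clock period in ticks. The following is a minimal, self-contained sketch of the same arithmetic; `clockPeriod` and the 500-tick value are illustrative assumptions, not values taken from the model.

```cpp
#include <cstdint>
#include <iostream>

using Tick = uint64_t;

// Hypothetical stand-in for the TLB's 'clock' member: the clock period in
// simulation ticks per TLB cycle. With gem5's usual 1 ps tick, 500
// ticks/cycle corresponds to a 2 GHz TLB clock.
static const Tick clockPeriod = 500;

// Mirrors GpuTLB::ticks(): convert a cycle count into simulation ticks.
Tick ticks(int numCycles) { return clockPeriod * numCycles; }

// Mirrors GpuTLB::tickToCycles(): convert ticks back into whole cycles.
Tick tickToCycles(Tick val) { return val / clockPeriod; }

int main()
{
    Tick t = ticks(4); // 4 TLB cycles -> 2000 ticks
    std::cout << t << " ticks = " << tickToCycles(t) << " cycles\n";
}
```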
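The `freeList`/`entryList` comments describe a per-set LRU stack: fills consume `freeList` slots until the set is full, after which the tail of `entryList` is evicted, and a hit splices the entry to the head (the MRU position). Below is a minimal sketch of that policy using simplified stand-in types; `Entry`, `TlbSet`, `touch`, and `allocate` are hypothetical names for illustration, not the model's API.

```cpp
#include <cstdint>
#include <iostream>
#include <iterator>
#include <list>

// GpuTlbEntry reduced to a virtual page tag for illustration.
struct Entry { uint64_t vpn; };
using EntryList = std::list<Entry*>;

// Per-set structures: entryList's head is MRU, its tail is LRU;
// freeList holds the set's not-yet-allocated slots.
struct TlbSet {
    EntryList freeList;
    EntryList entryList;
};

// On a hit with update_lru, splice the entry to the MRU position.
void touch(TlbSet &set, EntryList::iterator it)
{
    set.entryList.splice(set.entryList.begin(), set.entryList, it);
}

// On a fill: take a slot from freeList if one remains, otherwise evict
// the LRU entry at the back of entryList ("dropped on the floor").
Entry *allocate(TlbSet &set, uint64_t vpn)
{
    Entry *e;
    if (!set.freeList.empty()) {
        e = set.freeList.front();
        set.freeList.pop_front();
    } else {
        e = set.entryList.back();
        set.entryList.pop_back();
    }
    e->vpn = vpn;
    set.entryList.push_front(e); // new entry becomes MRU
    return e;
}

int main()
{
    // A 2-way set whose slots all start on the freeList.
    Entry slots[2];
    TlbSet set;
    set.freeList.push_back(&slots[0]);
    set.freeList.push_back(&slots[1]);

    allocate(set, 0x1000);                       // fill from freeList
    allocate(set, 0x2000);                       // fill; 0x2000 is MRU
    touch(set, std::prev(set.entryList.end()));  // re-touch 0x1000 -> MRU
    allocate(set, 0x3000);                       // set full: evicts 0x2000

    for (Entry *e : set.entryList)
        std::cout << std::hex << e->vpn << "\n"; // prints 3000, then 1000
}
```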
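The `TranslationState` comment leans on gem5's SenderState convention: the sender pushes its state onto the packet before forwarding it, and the TLB port on the other end casts it back, updates it, and eventually pops it on the return path. The sketch below mimics that push/cast/pop flow with cut-down stand-ins for `Packet` and `SenderState`; the real gem5 classes carry considerably more machinery.

```cpp
#include <cassert>

// Minimal stand-ins for gem5's SenderState chaining.
struct SenderState {
    SenderState *predecessor = nullptr;
    virtual ~SenderState() = default;
};

struct Packet {
    SenderState *senderState = nullptr;

    void pushSenderState(SenderState *s) {
        s->predecessor = senderState; // remember what was there before
        senderState = s;
    }
    SenderState *popSenderState() {
        SenderState *s = senderState;
        senderState = s->predecessor;
        return s;
    }
};

// A cut-down TranslationState: only the fields needed for the example.
struct TranslationState : SenderState {
    bool prefetch;
    int hitLevel;
    explicit TranslationState(bool p) : prefetch(p), hitLevel(0) { }
};

int main()
{
    Packet pkt;
    // Sender side: attach translation state before sending the packet.
    pkt.pushSenderState(new TranslationState(false));

    // Receiver side (a TLB port): every packet received must be safely
    // castable to a TranslationState, per the header's rule.
    auto *state = dynamic_cast<TranslationState*>(pkt.senderState);
    assert(state);
    state->hitLevel = 1; // e.g., record a hit in the L1 TLB

    // On the way back, pop the state off and reclaim it.
    delete pkt.popSenderState();
}
```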
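Finally, the `AccessInfo` bookkeeping: `localTLBAccesses[x]` records the TLB's access counter at the page's x-th access, so the differences between consecutive elements give the access distances for that page, which `sumDistance` and `meanDistance` summarize. A hypothetical reconstruction of that computation follows; the header declares only the fields, so the exact formula here is an assumption.

```cpp
#include <iostream>
#include <vector>

// Hypothetical derivation of a page's mean access distance from
// AccessInfo::localTLBAccesses: element x holds the value of the TLB's
// access counter at the page's x-th access, so the gaps between
// consecutive elements are the distances between touches of the page.
unsigned meanAccessDistance(const std::vector<unsigned> &localTLBAccesses)
{
    if (localTLBAccesses.size() < 2)
        return 0;

    unsigned sumDistance = 0;
    for (size_t x = 1; x < localTLBAccesses.size(); ++x)
        sumDistance += localTLBAccesses[x] - localTLBAccesses[x - 1];

    // divide by the number of gaps to get the per-page mean
    return sumDistance / (localTLBAccesses.size() - 1);
}

int main()
{
    // Page touched at global access counts 3, 10, 30: distances 7 and 20.
    std::vector<unsigned> accesses{3, 10, 30};
    std::cout << meanAccessDistance(accesses) << "\n"; // prints 13
}
```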