1 files changed, 767 insertions, 0 deletions
diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh
new file mode 100644
index 000000000..f47c27a0a
--- /dev/null
+++ b/src/gpu-compute/compute_unit.hh
@@ -0,0 +1,767 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: John Kalamatianos, Anthony Gutierrez
+ */
+
+#ifndef __COMPUTE_UNIT_HH__
+#define __COMPUTE_UNIT_HH__
+
+#include <deque>
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "base/callback.hh"
+#include "base/statistics.hh"
+#include "base/types.hh"
+#include "enums/PrefetchType.hh"
+#include "gpu-compute/exec_stage.hh"
+#include "gpu-compute/fetch_stage.hh"
+#include "gpu-compute/global_memory_pipeline.hh"
+#include "gpu-compute/local_memory_pipeline.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/schedule_stage.hh"
+#include "gpu-compute/scoreboard_check_stage.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
+
+static const int MAX_REGS_FOR_NON_VEC_MEM_INST = 1;
+static const int MAX_WIDTH_FOR_MEM_INST = 32;
+
+class NDRange;
+class Shader;
+class VectorRegisterFile;
+
+struct ComputeUnitParams;
+
+enum EXEC_POLICY
+{
+    OLDEST = 0,
+    RR
+};
+
+// List of execution units
+enum EXEC_UNIT
+{
+    SIMD0 = 0,
+    SIMD1,
+    SIMD2,
+    SIMD3,
+    GLBMEM_PIPE,
+    LDSMEM_PIPE,
+    NUM_UNITS
+};
+
+enum TLB_CACHE
+{
+    TLB_MISS_CACHE_MISS = 0,
+    TLB_MISS_CACHE_HIT,
+    TLB_HIT_CACHE_MISS,
+    TLB_HIT_CACHE_HIT
+};
+
+class ComputeUnit : public MemObject
+{
+  public:
+    FetchStage fetchStage;
+    ScoreboardCheckStage scoreboardCheckStage;
+    ScheduleStage scheduleStage;
+    ExecStage execStage;
+    GlobalMemPipeline globalMemoryPipe;
+    LocalMemPipeline localMemoryPipe;
+
+    // Buffers used to communicate between various pipeline stages
+
+    // List of waves which are ready to be scheduled.
+    // Each execution resource has a ready list. readyList is
+    // used to communicate between scoreboardCheck stage and
+    // schedule stage
+    // TODO: make enum to index readyList
+    std::vector<std::vector<Wavefront*>> readyList;
+
+    // Stores the status of waves. A READY implies the
+    // wave is ready to be scheduled this cycle and
+    // is already present in the readyList. waveStatusList is
+    // used to communicate between scoreboardCheck stage and
+    // schedule stage
+    // TODO: convert std::pair to a class to increase readability
+    std::vector<std::vector<std::pair<Wavefront*, WAVE_STATUS>>> waveStatusList;
+
+    // List of waves which will be dispatched to
+    // each execution resource. A FILLED implies
+    // dispatch list is non-empty and
+    // execution unit has something to execute
+    // this cycle. Currently, the dispatch list of
+    // an execution resource can hold only one wave because
+    // an execution resource can execute only one wave in a cycle.
+    // dispatchList is used to communicate between schedule
+    // and exec stage
+    // TODO: convert std::pair to a class to increase readability
+    std::vector<std::pair<Wavefront*, DISPATCH_STATUS>> dispatchList;
+
+    int rrNextMemID; // used by RR WF exec policy to cycle through WF's
+    int rrNextALUWp;
+    typedef ComputeUnitParams Params;
+    std::vector<std::vector<Wavefront*>> wfList;
+    int cu_id;
+
+    // array of vector register files, one per SIMD
+    std::vector<VectorRegisterFile*> vrf;
+    // Number of vector ALU units (SIMDs) in CU
+    int numSIMDs;
+    // number of pipe stages for bypassing data to next dependent single
+    // precision vector instruction inside the vector ALU pipeline
+    int spBypassPipeLength;
+    // number of pipe stages for bypassing data to next dependent double
+    // precision vector instruction inside the vector ALU pipeline
+    int dpBypassPipeLength;
+    // number of cycles per issue period
+    int issuePeriod;
+
+    // Number of global and local memory execution resources in CU
+    int numGlbMemUnits;
+    int numLocMemUnits;
+    // tracks the last cycle a vector instruction was executed on a SIMD
+    std::vector<uint64_t> lastExecCycle;
+
+    // true if we allow a separate TLB per lane
+    bool perLaneTLB;
+    // if 0, TLB prefetching is off.
+    int prefetchDepth;
+    // if fixed-stride prefetching, this is the stride.
+    int prefetchStride;
+
+    class LastVaddrWave
+    {
+      public:
+        Addr vaddrs[VSZ];
+        Addr& operator[](int idx) {
+            return vaddrs[idx];
+        }
+
+        LastVaddrWave() {
+            for (int i = 0; i < VSZ; ++i)
+                vaddrs[i] = 0;
+        }
+    };
+
+    LastVaddrWave lastVaddrCU;
+    std::vector<LastVaddrWave> lastVaddrPhase;
+    std::vector<std::vector<std::vector<Addr>>> lastVaddrWF;
+    Enums::PrefetchType prefetchType;
+    EXEC_POLICY exec_policy;
+
+    bool xact_cas_mode;
+    bool debugSegFault;
+    bool functionalTLB;
+    bool localMemBarrier;
+
+    /*
+     * for Counting page accesses
+     *
+     * cuExitCallback inherits from Callback. When you register a callback
+     * function as an exit callback, it will get added to an exit callback
+     * queue, such that on simulation exit, all callbacks in the callback
+     * queue will have their process() function called.
+     */
+    bool countPages;
+
+    Shader *shader;
+    uint32_t barrier_id;
+    // vector of Vector ALU (MACC) pipelines
+    std::vector<WaitClass> aluPipe;
+    // minimum issue period per SIMD unit (in cycles)
+    std::vector<WaitClass> wfWait;
+
+    // Resource control for Vector Register File->Global Memory pipe buses
+    std::vector<WaitClass> vrfToGlobalMemPipeBus;
+    // Resource control for Vector Register File->Local Memory pipe buses
+    std::vector<WaitClass> vrfToLocalMemPipeBus;
+    int nextGlbMemBus;
+    int nextLocMemBus;
+    // Resource control for global memory to VRF data/address bus
+    WaitClass glbMemToVrfBus;
+    // Resource control for local memory to VRF data/address bus
+    WaitClass locMemToVrfBus;
+
+    uint32_t vrfToCoalescerBusWidth; // VRF->Coalescer data bus width in bytes
+    uint32_t coalescerToVrfBusWidth; // Coalescer->VRF data bus width in bytes
+    uint32_t numCyclesPerStoreTransfer;  // number of cycles per vector store
+    uint32_t numCyclesPerLoadTransfer;  // number of cycles per vector load
+
+    Tick req_tick_latency;
+    Tick resp_tick_latency;
+
+    // number of vector registers being reserved for each SIMD unit
+    std::vector<int> vectorRegsReserved;
+    // number of vector registers per SIMD unit
+    uint32_t numVecRegsPerSimd;
+    // Support for scheduling VGPR status update events
+    std::vector<std::pair<uint32_t, uint32_t> > regIdxVec;
+    std::vector<uint64_t> timestampVec;
+    std::vector<uint8_t>  statusVec;
+
+    void
+    registerEvent(uint32_t simdId,
+                  uint32_t regIdx,
+                  uint32_t operandSize,
+                  uint64_t when,
+                  uint8_t newStatus) {
+        regIdxVec.push_back(std::make_pair(simdId, regIdx));
+        timestampVec.push_back(when);
+        statusVec.push_back(newStatus);
+        if (operandSize > 4) {
+            regIdxVec.push_back(std::make_pair(simdId,
+                                               ((regIdx + 1) %
+                                                numVecRegsPerSimd)));
+            timestampVec.push_back(when);
+            statusVec.push_back(newStatus);
+        }
+    }
+
+    void updateEvents();
+
+    // this hash map will keep track of page divergence
+    // per memory instruction per wavefront. The hash map
+    // is cleared in GPUDynInst::updateStats() in gpu_dyn_inst.cc.
+    std::map<Addr, int> pagesTouched;
+
+    ComputeUnit(const Params *p);
+    ~ComputeUnit();
+    int spBypassLength() { return spBypassPipeLength; };
+    int dpBypassLength() { return dpBypassPipeLength; };
+    int storeBusLength() { return numCyclesPerStoreTransfer; };
+    int loadBusLength() { return numCyclesPerLoadTransfer; };
+    int wfSize() const { return wavefrontSize; };
+
+    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+    void exec();
+    void initiateFetch(Wavefront *wavefront);
+    void fetch(PacketPtr pkt, Wavefront *wavefront);
+    void FillKernelState(Wavefront *w, NDRange *ndr);
+
+    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+                 int trueWgSizeTotal);
+
+    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
+                             int trueWgSize[], int trueWgSizeTotal,
+                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);
+
+    void StartWorkgroup(NDRange *ndr);
+    int ReadyWorkgroup(NDRange *ndr);
+
+    bool isVecAlu(int unitId) { return unitId >= SIMD0 && unitId <= SIMD3; }
+    bool isGlbMem(int unitId) { return unitId == GLBMEM_PIPE; }
+    bool isShrMem(int unitId) { return unitId == LDSMEM_PIPE; }
+    int GlbMemUnitId() { return GLBMEM_PIPE; }
+    int ShrMemUnitId() { return LDSMEM_PIPE; }
+    int nextGlbRdBus() { return (++nextGlbMemBus) % numGlbMemUnits; }
+    int nextLocRdBus() { return (++nextLocMemBus) % numLocMemUnits; }
+    /* This function cycles through all the wavefronts in all the phases to see
+     * if all of the wavefronts which should be associated with one barrier
+     * (denoted with _barrier_id), are all at the same barrier in the program
+     * (denoted by bcnt). When the number at the barrier matches bslots, then
+     * return true.
+     */
+    int AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots);
+    bool cedeSIMD(int simdId, int wfSlotId);
+
+    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr gpuDynInst);
+    virtual void init();
+    void sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt);
+    void injectGlobalMemFence(GPUDynInstPtr gpuDynInst,
+                              bool kernelLaunch=true,
+                              RequestPtr req=nullptr);
+    void handleMemPacket(PacketPtr pkt, int memport_index);
+    bool processTimingPacket(PacketPtr pkt);
+    void processFetchReturn(PacketPtr pkt);
+    void updatePageDivergenceDist(Addr addr);
+
+    MasterID masterId() { return _masterId; }
+
+    bool isDone() const;
+    bool isSimdDone(uint32_t) const;
+
+  protected:
+    MasterID _masterId;
+
+    LdsState &lds;
+
+  public:
+    // the following stats compute the avg. TLB accesslatency per
+    // uncoalesced request (only for data)
+    Stats::Scalar tlbRequests;
+    Stats::Scalar tlbCycles;
+    Stats::Formula tlbLatency;
+    // hitsPerTLBLevel[x] are the hits in Level x TLB. x = 0 is the page table.
+    Stats::Vector hitsPerTLBLevel;
+
+    Stats::Scalar ldsBankAccesses;
+    Stats::Distribution ldsBankConflictDist;
+
+    // over all memory instructions executed over all wavefronts
+    // how many touched 0-4 pages, 4-8, ..., 60-64 pages
+    Stats::Distribution pageDivergenceDist;
+    Stats::Scalar dynamicGMemInstrCnt;
+    Stats::Scalar dynamicLMemInstrCnt;
+
+    Stats::Scalar wgBlockedDueLdsAllocation;
+    // Number of instructions executed, i.e. if 64 (or 32 or 7) lanes are active
+    // when the instruction is committed, this number is still incremented by 1
+    Stats::Scalar numInstrExecuted;
+    // Number of cycles among successive instruction executions across all
+    // wavefronts of the same CU
+    Stats::Distribution execRateDist;
+    // number of individual vector operations executed
+    Stats::Scalar numVecOpsExecuted;
+    // Total cycles that something is running on the GPU
+    Stats::Scalar totalCycles;
+    Stats::Formula vpc; // vector ops per cycle
+    Stats::Formula ipc; // vector instructions per cycle
+    Stats::Distribution controlFlowDivergenceDist;
+    Stats::Distribution activeLanesPerGMemInstrDist;
+    Stats::Distribution activeLanesPerLMemInstrDist;
+    // number of vector ALU instructions received
+    Stats::Formula numALUInstsExecuted;
+    // number of times a WG can not start due to lack of free VGPRs in SIMDs
+    Stats::Scalar numTimesWgBlockedDueVgprAlloc;
+    Stats::Scalar numCASOps;
+    Stats::Scalar numFailedCASOps;
+    Stats::Scalar completedWfs;
+    // flag per vector SIMD unit that is set when there is at least one
+    // WV that has a vector ALU instruction as the oldest in its
+    // Instruction Buffer: Defined in the Scoreboard stage, consumed
+    // by the Execute stage.
+    std::vector<bool> vectorAluInstAvail;
+    // number of available (oldest) LDS instructions that could have
+    // been issued to the LDS at a specific issue slot
+    int shrMemInstAvail;
+    // number of available Global memory instructions that could have
+    // been issued to TCP at a specific issue slot
+    int glbMemInstAvail;
+
+    void
+    regStats();
+
+    LdsState &
+    getLds() const
+    {
+        return lds;
+    }
+
+    int32_t
+    getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const;
+
+    bool
+    sendToLds(GPUDynInstPtr gpuDynInst) __attribute__((warn_unused_result));
+
+    typedef std::unordered_map<Addr, std::pair<int, int>> pageDataStruct;
+    pageDataStruct pageAccesses;
+
+    class CUExitCallback : public Callback
+    {
+      private:
+        ComputeUnit *computeUnit;
+
+      public:
+        virtual ~CUExitCallback() { }
+
+        CUExitCallback(ComputeUnit *_cu)
+        {
+            computeUnit = _cu;
+        }
+
+        virtual void
+        process();
+    };
+
+    CUExitCallback *cuExitCallback;
+
+    /** Data access Port **/
+    class DataPort : public MasterPort
+    {
+      public:
+        DataPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index) { }
+
+        bool snoopRangeSent;
+
+        struct SenderState : public Packet::SenderState
+        {
+            GPUDynInstPtr _gpuDynInst;
+            int port_index;
+            Packet::SenderState *saved;
+
+            SenderState(GPUDynInstPtr gpuDynInst, PortID _port_index,
+                        Packet::SenderState *sender_state=nullptr)
+                : _gpuDynInst(gpuDynInst),
+                  port_index(_port_index),
+                  saved(sender_state) { }
+        };
+
+        class MemReqEvent : public Event
+        {
+          private:
+            DataPort *dataPort;
+            PacketPtr pkt;
+
+          public:
+            MemReqEvent(DataPort *_data_port, PacketPtr _pkt)
+                : Event(), dataPort(_data_port), pkt(_pkt)
+            {
+              setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        class MemRespEvent : public Event
+        {
+          private:
+            DataPort *dataPort;
+            PacketPtr pkt;
+
+          public:
+            MemRespEvent(DataPort *_data_port, PacketPtr _pkt)
+                : Event(), dataPort(_data_port), pkt(_pkt)
+            {
+              setFlags(Event::AutoDelete);
+            }
+
+            void process();
+            const char *description() const;
+        };
+
+        std::deque<std::pair<PacketPtr, GPUDynInstPtr>> retries;
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+        {
+            resp.clear();
+            snoop = true;
+        }
+
+    };
+
+    // Instruction cache access port
+    class SQCPort : public MasterPort
+    {
+      public:
+        SQCPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index) { }
+
+        bool snoopRangeSent;
+
+        struct SenderState : public Packet::SenderState
+        {
+            Wavefront *wavefront;
+            Packet::SenderState *saved;
+
+            SenderState(Wavefront *_wavefront, Packet::SenderState
+                    *sender_state=nullptr)
+                : wavefront(_wavefront), saved(sender_state) { }
+        };
+
+        std::deque<std::pair<PacketPtr, Wavefront*>> retries;
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+
+        virtual void
+        getDeviceAddressRanges(AddrRangeList &resp, bool &snoop)
+        {
+            resp.clear();
+            snoop = true;
+        }
+     };
+
+    /** Data TLB port **/
+    class DTLBPort : public MasterPort
+    {
+      public:
+        DTLBPort(const std::string &_name, ComputeUnit *_cu, PortID _index)
+            : MasterPort(_name, _cu), computeUnit(_cu),
+              index(_index), stalled(false)
+        { }
+
+        bool isStalled() { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the translation requests that were
+         * not successfully sent.
+         */
+        std::deque<PacketPtr> retries;
+
+        /** SenderState is information carried along with the packet
+         * throughout the TLB hierarchy
+         */
+        struct SenderState: public Packet::SenderState
+        {
+            // the memInst that this is associated with
+            GPUDynInstPtr _gpuDynInst;
+
+            // the lane in the memInst this is associated with, so we send
+            // the memory request down the right port
+            int portIndex;
+
+            // constructor used for packets involved in timing accesses
+            SenderState(GPUDynInstPtr gpuDynInst, PortID port_index)
+                : _gpuDynInst(gpuDynInst), portIndex(port_index) { }
+
+        };
+
+      protected:
+        ComputeUnit *computeUnit;
+        int index;
+        bool stalled;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+    };
+
+    class ITLBPort : public MasterPort
+    {
+      public:
+        ITLBPort(const std::string &_name, ComputeUnit *_cu)
+            : MasterPort(_name, _cu), computeUnit(_cu), stalled(false) { }
+
+
+        bool isStalled() { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the translation requests that were
+         * not successfully sent.
+         */
+        std::deque<PacketPtr> retries;
+
+        /** SenderState is information carried along with the packet
+         * throughout the TLB hierarchy
+         */
+        struct SenderState: public Packet::SenderState
+        {
+            // The wavefront associated with this request
+            Wavefront *wavefront;
+
+            SenderState(Wavefront *_wavefront) : wavefront(_wavefront) { }
+        };
+
+      protected:
+        ComputeUnit *computeUnit;
+        bool stalled;
+
+        virtual bool recvTimingResp(PacketPtr pkt);
+        virtual Tick recvAtomic(PacketPtr pkt) { return 0; }
+        virtual void recvFunctional(PacketPtr pkt) { }
+        virtual void recvRangeChange() { }
+        virtual void recvReqRetry();
+    };
+
+    /**
+     * the port intended to communicate between the CU and its LDS
+     */
+    class LDSPort : public MasterPort
+    {
+      public:
+        LDSPort(const std::string &_name, ComputeUnit *_cu, PortID _id)
+        : MasterPort(_name, _cu, _id), computeUnit(_cu)
+        {
+        }
+
+        bool isStalled() const { return stalled; }
+        void stallPort() { stalled = true; }
+        void unstallPort() { stalled = false; }
+
+        /**
+         * here we queue all the requests that were
+         * not successfully sent.
+         */
+        std::queue<PacketPtr> retries;
+
+        /**
+         *  SenderState is information carried along with the packet, esp. the
+         *  GPUDynInstPtr
+         */
+        class SenderState: public Packet::SenderState
+        {
+          protected:
+            // The actual read/write/atomic request that goes with this command
+            GPUDynInstPtr _gpuDynInst = nullptr;
+
+          public:
+            SenderState(GPUDynInstPtr gpuDynInst):
+              _gpuDynInst(gpuDynInst)
+            {
+            }
+
+            GPUDynInstPtr
+            getMemInst() const
+            {
+              return _gpuDynInst;
+            }
+        };
+
+        virtual bool
+        sendTimingReq(PacketPtr pkt);
+
+      protected:
+
+        bool stalled = false; ///< whether or not it is stalled
+
+        ComputeUnit *computeUnit;
+
+        virtual bool
+        recvTimingResp(PacketPtr pkt);
+
+        virtual Tick
+        recvAtomic(PacketPtr pkt) { return 0; }
+
+        virtual void
+        recvFunctional(PacketPtr pkt)
+        {
+        }
+
+        virtual void
+        recvRangeChange()
+        {
+        }
+
+        virtual void
+        recvReqRetry();
+    };
+
+    /** The port to access the Local Data Store
+     *  Can be connected to a LDS object
+     */
+    LDSPort *ldsPort = nullptr;
+
+    LDSPort *
+    getLdsPort() const
+    {
+        return ldsPort;
+    }
+
+    /** The memory port for SIMD data accesses.
+     *  Can be connected to PhysMem for Ruby for timing simulations
+     */
+    std::vector<DataPort*> memPort;
+    // port to the TLB hierarchy (i.e., the L1 TLB)
+    std::vector<DTLBPort*> tlbPort;
+    // port to the SQC (i.e. the I-cache)
+    SQCPort *sqcPort;
+    // port to the SQC TLB (there's a separate TLB for each I-cache)
+    ITLBPort *sqcTLBPort;
+
+    virtual BaseMasterPort&
+    getMasterPort(const std::string &if_name, PortID idx)
+    {
+        if (if_name == "memory_port") {
+            memPort[idx] = new DataPort(csprintf("%s-port%d", name(), idx),
+                                        this, idx);
+            return *memPort[idx];
+        } else if (if_name == "translation_port") {
+            tlbPort[idx] = new DTLBPort(csprintf("%s-port%d", name(), idx),
+                                        this, idx);
+            return *tlbPort[idx];
+        } else if (if_name == "sqc_port") {
+            sqcPort = new SQCPort(csprintf("%s-port%d", name(), idx),
+                                  this, idx);
+            return *sqcPort;
+        } else if (if_name == "sqc_tlb_port") {
+            sqcTLBPort = new ITLBPort(csprintf("%s-port", name()), this);
+            return *sqcTLBPort;
+        } else if (if_name == "ldsPort") {
+            if (ldsPort) {
+                fatal("an LDS port was already allocated");
+            }
+            ldsPort = new LDSPort(csprintf("%s-port", name()), this, idx);
+            return *ldsPort;
+        } else {
+            panic("incorrect port name");
+        }
+    }
+
+    // xact_cas_load()
+    class waveIdentifier
+    {
+      public:
+        waveIdentifier() { }
+        waveIdentifier(int _simdId, int _wfSlotId)
+          : simdId(_simdId), wfSlotId(_wfSlotId) { }
+
+        int simdId;
+        int wfSlotId;
+    };
+
+    class waveQueue
+    {
+      public:
+        std::list<waveIdentifier> waveIDQueue;
+    };
+    std::map<unsigned, waveQueue> xactCasLoadMap;
+
+    uint64_t getAndIncSeqNum() { return globalSeqNum++; }
+
+  private:
+    uint64_t globalSeqNum;
+    int wavefrontSize;
+};
+
+#endif // __COMPUTE_UNIT_HH__