1 files changed, 368 insertions, 0 deletions
diff --git a/src/gpu-compute/wavefront.hh b/src/gpu-compute/wavefront.hh
new file mode 100644
index 000000000..0abab8e83
--- /dev/null
+++ b/src/gpu-compute/wavefront.hh
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Lisa Hsu
+ */
+
+#ifndef __WAVEFRONT_HH__
+#define __WAVEFRONT_HH__
+
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <stack>
+#include <vector>
+
+#include "base/misc.hh"
+#include "base/types.hh"
+#include "gpu-compute/condition_register_state.hh"
+#include "gpu-compute/lds_state.hh"
+#include "gpu-compute/misc.hh"
+#include "params/Wavefront.hh"
+#include "sim/sim_object.hh"
+
+static const int MAX_NUM_INSTS_PER_WF = 12;
+
+/*
+ * Arguments for the hsail opcode call, are user defined and variable length.
+ * The hardware/finalizer can support arguments in hardware or use memory to
+ * pass arguments. For now, let's assume that an unlimited number of arguments
+ * are supported in hardware (the compiler inlines functions whenver it can
+ * anyways, so unless someone is interested in the implications of linking/
+ * library functions, I think this is a reasonable assumption given the typical
+ * size of an OpenCL kernel).
+ *
+ * Note that call args are different than kernel arguments:
+ *   * All work-items in a kernel refer the same set of kernel arguments
+ *   * Each work-item has it's on set of call args. So a call argument at
+ *     address 0x4 is different for work-item 0 and work-item 1.
+ *
+ * Ok, the table below shows an example of how we organize the call arguments in
+ * the CallArgMem class.
+ *
+ * int foo(int arg1, double arg2)
+ *  ___________________________________________________
+ * | 0: return.0 | 4: return.1 | ... | 252: return.63  |
+ * |---------------------------------------------------|
+ * | 256: arg1.0 | 260: arg1.1 | ... | 508: arg1.63    |
+ * |---------------------------------------------------|
+ * | 512: arg2.0 | 520: arg2.1 | ... | 1016: arg2.63   |
+ *  ___________________________________________________
+ */
+class CallArgMem
+{
+  public:
+    // pointer to buffer for storing function arguments
+    uint8_t *mem;
+    // size of function args
+    int funcArgsSizePerItem;
+
+    template<typename CType>
+    int
+    getLaneOffset(int lane, int addr)
+    {
+        return addr * VSZ + sizeof(CType) * lane;
+    }
+
+    CallArgMem(int func_args_size_per_item)
+      : funcArgsSizePerItem(func_args_size_per_item)
+    {
+        mem = (uint8_t*)malloc(funcArgsSizePerItem * VSZ);
+    }
+
+    ~CallArgMem()
+    {
+        free(mem);
+    }
+
+    template<typename CType>
+    uint8_t*
+    getLaneAddr(int lane, int addr)
+    {
+        return mem + getLaneOffset<CType>(lane, addr);
+    }
+
+    template<typename CType>
+    void
+    setLaneAddr(int lane, int addr, CType val)
+    {
+        *((CType*)(mem + getLaneOffset<CType>(lane, addr))) = val;
+    }
+};
+
+/**
+ * A reconvergence stack entry conveys the necessary state to implement
+ * control flow divergence.
+ */
+class ReconvergenceStackEntry {
+
+  public:
+    ReconvergenceStackEntry(uint32_t new_pc, uint32_t new_rpc,
+                            VectorMask new_mask) : pc(new_pc), rpc(new_rpc),
+                            execMask(new_mask) {
+    }
+
+    /**
+     * PC of current instruction.
+     */
+    uint32_t pc;
+    /**
+     * PC of the immediate post-dominator instruction, i.e., the value of
+     * @a pc for the first instruction that will be executed by the wavefront
+     * when a reconvergence point is reached.
+     */
+    uint32_t rpc;
+    /**
+     * Execution mask.
+     */
+    VectorMask execMask;
+};
+
+class Wavefront : public SimObject
+{
+  public:
+    enum itype_e {I_ALU,I_GLOBAL,I_SHARED,I_FLAT,I_PRIVATE};
+    enum status_e {S_STOPPED,S_RETURNING,S_RUNNING};
+
+    // Base pointer for array of instruction pointers
+    uint64_t base_ptr;
+
+    uint32_t old_barrier_cnt;
+    uint32_t barrier_cnt;
+    uint32_t barrier_id;
+    uint32_t barrier_slots;
+    status_e status;
+    // HW slot id where the WF is mapped to inside a SIMD unit
+    int wfSlotId;
+    int kern_id;
+    // SIMD unit where the WV has been scheduled
+    int simdId;
+    // pointer to parent CU
+    ComputeUnit *computeUnit;
+
+    std::deque<GPUDynInstPtr> instructionBuffer;
+
+    bool pendingFetch;
+    bool dropFetch;
+
+    // Condition Register State (for HSAIL simulations only)
+    class ConditionRegisterState *condRegState;
+    // number of single precision VGPRs required by WF
+    uint32_t maxSpVgprs;
+    // number of double precision VGPRs required by WF
+    uint32_t maxDpVgprs;
+    // map virtual to physical vector register
+    uint32_t remap(uint32_t vgprIndex, uint32_t size, uint8_t mode=0);
+    void resizeRegFiles(int num_cregs, int num_sregs, int num_dregs);
+    bool isGmInstruction(GPUDynInstPtr ii);
+    bool isLmInstruction(GPUDynInstPtr ii);
+    bool isOldestInstGMem();
+    bool isOldestInstLMem();
+    bool isOldestInstPrivMem();
+    bool isOldestInstFlatMem();
+    bool isOldestInstALU();
+    bool isOldestInstBarrier();
+    // used for passing spill address to DDInstGPU
+    uint64_t last_addr[VSZ];
+    uint32_t workitemid[3][VSZ];
+    uint32_t workitemFlatId[VSZ];
+    uint32_t workgroupid[3];
+    uint32_t workgroupsz[3];
+    uint32_t gridsz[3];
+    uint32_t wg_id;
+    uint32_t wg_sz;
+    uint32_t dynwaveid;
+    uint32_t maxdynwaveid;
+    uint32_t dispatchid;
+    // outstanding global+local memory requests
+    uint32_t outstanding_reqs;
+    // memory requests between scoreboard
+    // and execute stage not yet executed
+    uint32_t mem_reqs_in_pipe;
+    // outstanding global memory write requests
+    uint32_t outstanding_reqs_wr_gm;
+    // outstanding local memory write requests
+    uint32_t outstanding_reqs_wr_lm;
+    // outstanding global memory read requests
+    uint32_t outstanding_reqs_rd_gm;
+    // outstanding local memory read requests
+    uint32_t outstanding_reqs_rd_lm;
+    uint32_t rd_lm_reqs_in_pipe;
+    uint32_t rd_gm_reqs_in_pipe;
+    uint32_t wr_lm_reqs_in_pipe;
+    uint32_t wr_gm_reqs_in_pipe;
+
+    int mem_trace_busy;
+    uint64_t last_trace;
+    // number of vector registers reserved by WF
+    int reservedVectorRegs;
+    // Index into the Vector Register File's namespace where the WF's registers
+    // will live while the WF is executed
+    uint32_t startVgprIndex;
+
+    // Old value of destination gpr (for trace)
+    uint32_t old_vgpr[VSZ];
+    // Id of destination gpr (for trace)
+    uint32_t old_vgpr_id;
+    // Tick count of last old_vgpr copy
+    uint64_t old_vgpr_tcnt;
+
+    // Old value of destination gpr (for trace)
+    uint64_t old_dgpr[VSZ];
+    // Id of destination gpr (for trace)
+    uint32_t old_dgpr_id;
+    // Tick count of last old_vgpr copy
+    uint64_t old_dgpr_tcnt;
+
+    // Execution mask at wavefront start
+    VectorMask init_mask;
+
+    // number of barriers this WF has joined
+    int bar_cnt[VSZ];
+    int max_bar_cnt;
+    // Flag to stall a wave on barrier
+    bool stalledAtBarrier;
+
+    // a pointer to the fraction of the LDS allocated
+    // to this workgroup (thus this wavefront)
+    LdsChunk *ldsChunk;
+
+    // A pointer to the spill area
+    Addr spillBase;
+    // The size of the spill area
+    uint32_t spillSizePerItem;
+    // The vector width of the spill area
+    uint32_t spillWidth;
+
+    // A pointer to the private memory area
+    Addr privBase;
+    // The size of the private memory area
+    uint32_t privSizePerItem;
+
+    // A pointer ot the read-only memory area
+    Addr roBase;
+    // size of the read-only memory area
+    uint32_t roSize;
+
+    // pointer to buffer for storing kernel arguments
+    uint8_t *kernelArgs;
+    // unique WF id over all WFs executed across all CUs
+    uint64_t wfDynId;
+
+    // number of times instruction issue for this wavefront is blocked
+    // due to VRF port availability
+    Stats::Scalar numTimesBlockedDueVrfPortAvail;
+    // number of times an instruction of a WF is blocked from being issued
+    // due to WAR and WAW dependencies
+    Stats::Scalar numTimesBlockedDueWAXDependencies;
+    // number of times an instruction of a WF is blocked from being issued
+    // due to WAR and WAW dependencies
+    Stats::Scalar numTimesBlockedDueRAWDependencies;
+    // distribution of executed instructions based on their register
+    // operands; this is used to highlight the load on the VRF
+    Stats::Distribution srcRegOpDist;
+    Stats::Distribution dstRegOpDist;
+
+    // Functions to operate on call argument memory
+    // argument memory for hsail call instruction
+    CallArgMem *callArgMem;
+    void
+    initCallArgMem(int func_args_size_per_item)
+    {
+        callArgMem = new CallArgMem(func_args_size_per_item);
+    }
+
+    template<typename CType>
+    CType
+    readCallArgMem(int lane, int addr)
+    {
+        return *((CType*)(callArgMem->getLaneAddr<CType>(lane, addr)));
+    }
+
+    template<typename CType>
+    void
+    writeCallArgMem(int lane, int addr, CType val)
+    {
+        callArgMem->setLaneAddr<CType>(lane, addr, val);
+    }
+
+    typedef WavefrontParams Params;
+    Wavefront(const Params *p);
+    ~Wavefront();
+    virtual void init();
+
+    void
+    setParent(ComputeUnit *cu)
+    {
+        computeUnit = cu;
+    }
+
+    void start(uint64_t _wfDynId, uint64_t _base_ptr);
+
+    void exec();
+    void updateResources();
+    int ready(itype_e type);
+    bool instructionBufferHasBranch();
+    void regStats();
+    VectorMask get_pred() { return execMask() & init_mask; }
+
+    bool waitingAtBarrier(int lane);
+
+    void pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
+                                  const VectorMask& exec_mask);
+
+    void popFromReconvergenceStack();
+
+    uint32_t pc() const;
+
+    uint32_t rpc() const;
+
+    VectorMask execMask() const;
+
+    bool execMask(int lane) const;
+
+    void pc(uint32_t new_pc);
+
+    void discardFetch();
+
+  private:
+    /**
+     * Stack containing Control Flow Graph nodes (i.e., kernel instructions)
+     * to be visited by the wavefront, and the associated execution masks. The
+     * reconvergence stack grows every time the wavefront reaches a divergence
+     * point (branch instruction), and shrinks every time the wavefront
+     * reaches a reconvergence point (immediate post-dominator instruction).
+     */
+    std::stack<std::unique_ptr<ReconvergenceStackEntry>> reconvergenceStack;
+};
+
+#endif // __WAVEFRONT_HH__