/*
 * Copyright (c) 2015-2017 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Anthony Gutierrez
 */

#ifndef __GPU_DYN_INST_HH__
#define __GPU_DYN_INST_HH__

#include <cstdint>
#include <string>

#include "base/logging.hh"
#include "enums/MemType.hh"
#include "enums/StorageClassType.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_exec_context.hh"

class GPUStaticInst;
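
// The AtomicOp* functors below encapsulate the read-modify-write performed
// by each atomic memory operation. The memory system invokes execute() on a
// pointer to the current memory value, applying the operand(s) captured at
// construction time; clone() allows the functor to be copied along with the
// request that carries it.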

template<typename T>
class AtomicOpAnd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;

    AtomicOpAnd(T _a) : a(_a) { }
    void execute(T *b) { *b &= a; }
    AtomicOpFunctor* clone () { return new AtomicOpAnd(a); }
};

template<typename T>
class AtomicOpOr : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpOr(T _a) : a(_a) { }
    void execute(T *b) { *b |= a; }
    AtomicOpFunctor* clone () { return new AtomicOpOr(a); }
};

template<typename T>
class AtomicOpXor : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpXor(T _a) : a(_a) {}
    void execute(T *b) { *b ^= a; }
    AtomicOpFunctor* clone () { return new AtomicOpXor(a); }
};
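
// Compare-and-swap: the stored value is replaced with s only if it equals
// the compare value c. The functor also updates the owning ComputeUnit's
// CAS statistics, and clears its transactional CAS load map when
// xact_cas_mode is enabled.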

template<typename T>
class AtomicOpCAS : public TypedAtomicOpFunctor<T>
{
  public:
    T c;
    T s;

    ComputeUnit *computeUnit;

    AtomicOpCAS(T _c, T _s, ComputeUnit *compute_unit)
      : c(_c), s(_s), computeUnit(compute_unit) { }

    void
    execute(T *b)
    {
        computeUnit->numCASOps++;

        if (*b == c) {
            *b = s;
        } else {
            computeUnit->numFailedCASOps++;
        }

        if (computeUnit->xact_cas_mode) {
            computeUnit->xactCasLoadMap.clear();
        }
    }
    AtomicOpFunctor* clone () { return new AtomicOpCAS(c, s, computeUnit); }
};

template<typename T>
class AtomicOpExch : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpExch(T _a) : a(_a) { }
    void execute(T *b) { *b = a; }
    AtomicOpFunctor* clone () { return new AtomicOpExch(a); }
};

template<typename T>
class AtomicOpAdd : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpAdd(T _a) : a(_a) { }
    void execute(T *b) { *b += a; }
    AtomicOpFunctor* clone () { return new AtomicOpAdd(a); }
};

template<typename T>
class AtomicOpSub : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpSub(T _a) : a(_a) { }
    void execute(T *b) { *b -= a; }
    AtomicOpFunctor* clone () { return new AtomicOpSub(a); }
};

template<typename T>
class AtomicOpInc : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpInc() { }
    void execute(T *b) { *b += 1; }
    AtomicOpFunctor* clone () { return new AtomicOpInc(); }
};

template<typename T>
class AtomicOpDec : public TypedAtomicOpFunctor<T>
{
  public:
    AtomicOpDec() {}
    void execute(T *b) { *b -= 1; }
    AtomicOpFunctor* clone () { return new AtomicOpDec(); }
};

template<typename T>
class AtomicOpMax : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMax(T _a) : a(_a) { }

    void
    execute(T *b)
    {
        if (a > *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMax(a); }
};

template<typename T>
class AtomicOpMin : public TypedAtomicOpFunctor<T>
{
  public:
    T a;
    AtomicOpMin(T _a) : a(_a) {}

    void
    execute(T *b)
    {
        if (a < *b)
            *b = a;
    }
    AtomicOpFunctor* clone () { return new AtomicOpMin(a); }
};

typedef enum
{
    VT_32,
    VT_64,
} vgpr_type;

class GPUDynInst : public GPUExecContext
{
  public:
    GPUDynInst(ComputeUnit *_cu, Wavefront *_wf, GPUStaticInst *static_inst,
               uint64_t instSeqNum);
    ~GPUDynInst();
    void execute(GPUDynInstPtr gpuDynInst);
    int numSrcRegOperands();
    int numDstRegOperands();
    int getNumOperands();
    bool isVectorRegister(int operandIdx);
    bool isScalarRegister(int operandIdx);
    bool isCondRegister(int operandIdx);
    int getRegisterIndex(int operandIdx, GPUDynInstPtr gpuDynInst);
    int getOperandSize(int operandIdx);
    bool isDstOperand(int operandIdx);
    bool isSrcOperand(int operandIdx);

    const std::string &disassemble() const;

    uint64_t seqNum() const;

    Enums::StorageClassType executedAs();

    // The address of the memory operation
    std::vector<Addr> addr;
    Addr pAddr;

    // The data to be written
    uint8_t *d_data;
    // Additional data (for atomics)
    uint8_t *a_data;
    // Additional data (for atomics)
    uint8_t *x_data;
    // The execution mask
    VectorMask exec_mask;

    // The memory type (M_U32, M_S32, ...)
    Enums::MemType m_type;

    // The equivalence class
    int equiv;
    // The return VGPR type (VT_32 or VT_64)
    vgpr_type v_type;
    // Number of VGPRs accessed (1, 2, or 4)
    int n_reg;
    // The return VGPR index
    int dst_reg;
    // There can be at most 4 destination registers
    int dst_reg_vec[4];
    // The SIMD unit to which the WF issuing the memory instruction is mapped
    int simdId;
    // Unique id of the WF to which the memory instruction belongs
    int wfDynId;
    // The kernel id of the requesting wf
    int kern_id;
    // The CU id of the requesting wf
    int cu_id;
    // HW slot id to which the WF is mapped within a SIMD unit
    int wfSlotId;
    // Execution pipeline id to which the memory instruction was scheduled
    int pipeId;
    // The execution time of this operation
    Tick time;
    // The latency of this operation
    WaitClass latency;
    // A list of bank conflicts for the 4 cycles.
    uint32_t bc[4];

    // A pointer to ROM
    uint8_t *rom;
    // The size of the READONLY segment
    int sz_rom;

    // Initiate the specified memory operation by creating a memory
    // request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation by writing the returned
    // value back to the RF in the case of a load or returning atomic;
    // in the case of a store there is nothing to do.
    void completeAcc(GPUDynInstPtr gpuDynInst);

    void updateStats();

    GPUStaticInst* staticInstruction() { return _staticInst; }

    bool isALU() const;
    bool isBranch() const;
    bool isNop() const;
    bool isReturn() const;
    bool isUnconditionalJump() const;
    bool isSpecialOp() const;
    bool isWaitcnt() const;

    bool isBarrier() const;
    bool isMemFence() const;
    bool isMemRef() const;
    bool isFlat() const;
    bool isLoad() const;
    bool isStore() const;

    bool isAtomic() const;
    bool isAtomicNoRet() const;
    bool isAtomicRet() const;

    bool isScalar() const;
    bool readsSCC() const;
    bool writesSCC() const;
    bool readsVCC() const;
    bool writesVCC() const;

    bool isAtomicAnd() const;
    bool isAtomicOr() const;
    bool isAtomicXor() const;
    bool isAtomicCAS() const;
    bool isAtomicExch() const;
    bool isAtomicAdd() const;
    bool isAtomicSub() const;
    bool isAtomicInc() const;
    bool isAtomicDec() const;
    bool isAtomicMax() const;
    bool isAtomicMin() const;

    bool isArgLoad() const;
    bool isGlobalMem() const;
    bool isLocalMem() const;

    bool isArgSeg() const;
    bool isGlobalSeg() const;
    bool isGroupSeg() const;
    bool isKernArgSeg() const;
    bool isPrivateSeg() const;
    bool isReadOnlySeg() const;
    bool isSpillSeg() const;

    bool isWorkitemScope() const;
    bool isWavefrontScope() const;
    bool isWorkgroupScope() const;
    bool isDeviceScope() const;
    bool isSystemScope() const;
    bool isNoScope() const;

    bool isRelaxedOrder() const;
    bool isAcquire() const;
    bool isRelease() const;
    bool isAcquireRelease() const;
    bool isNoOrder() const;

    bool isGloballyCoherent() const;
    bool isSystemCoherent() const;

    /*
     * Loads/stores/atomics may have acquire/release semantics associated
     * with them. Some protocols want to see the acquire/release as separate
     * requests from the load/store/atomic. We implement that separation
     * using continuations (i.e., a function pointer with an object
     * associated with it). When, for example, the front-end generates a
     * store with release semantics, we first issue a normal store and set
     * the continuation in the GPUDynInst to a function that generates a
     * release request. That continuation is called when the normal store
     * completes (in ComputeUnit::DataPort::recvTimingResponse), in the
     * context of the same GPUDynInst that generated the initial store.
     */
    std::function<void(GPUStaticInst*, GPUDynInstPtr)> execContinuation;

    // when true, call execContinuation when response arrives
    bool useContinuation;
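
    // A minimal sketch of how the continuation might be populated; the
    // actual producers live in the ISA/front-end code, so the assignment
    // below is illustrative only:
    //
    //     gpuDynInst->useContinuation = true;
    //     gpuDynInst->execContinuation =
    //         [](GPUStaticInst *si, GPUDynInstPtr dyn_inst) {
    //             // issue the release request for dyn_inst here
    //         };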

    template<typename c0> AtomicOpFunctor*
    makeAtomicOpFunctor(c0 *reg0, c0 *reg1)
    {
        if (isAtomicAnd()) {
            return new AtomicOpAnd<c0>(*reg0);
        } else if (isAtomicOr()) {
            return new AtomicOpOr<c0>(*reg0);
        } else if (isAtomicXor()) {
            return new AtomicOpXor<c0>(*reg0);
        } else if (isAtomicCAS()) {
            return new AtomicOpCAS<c0>(*reg0, *reg1, cu);
        } else if (isAtomicExch()) {
            return new AtomicOpExch<c0>(*reg0);
        } else if (isAtomicAdd()) {
            return new AtomicOpAdd<c0>(*reg0);
        } else if (isAtomicSub()) {
            return new AtomicOpSub<c0>(*reg0);
        } else if (isAtomicInc()) {
            return new AtomicOpInc<c0>();
        } else if (isAtomicDec()) {
            return new AtomicOpDec<c0>();
        } else if (isAtomicMax()) {
            return new AtomicOpMax<c0>(*reg0);
        } else if (isAtomicMin()) {
            return new AtomicOpMin<c0>(*reg0);
        } else {
            fatal("Unrecognized atomic operation");
        }
    }
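
    // Hedged usage sketch: given pointers to the operand values read from
    // the register file (reg1 is only meaningful for CAS), a caller can
    // build the functor for the current instruction and attach it to the
    // memory request that carries the atomic. The names src0/src1 below
    // are illustrative:
    //
    //     AtomicOpFunctor *amo =
    //         gpuDynInst->makeAtomicOpFunctor<uint32_t>(&src0, &src1);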

    void
    setRequestFlags(RequestPtr req, bool setMemOrder=true)
    {
        // currently these are the easy segments to deduce
        if (isPrivateSeg()) {
            req->setMemSpaceConfigFlags(Request::PRIVATE_SEGMENT);
        } else if (isSpillSeg()) {
            req->setMemSpaceConfigFlags(Request::SPILL_SEGMENT);
        } else if (isGlobalSeg()) {
            req->setMemSpaceConfigFlags(Request::GLOBAL_SEGMENT);
        } else if (isReadOnlySeg()) {
            req->setMemSpaceConfigFlags(Request::READONLY_SEGMENT);
        } else if (isGroupSeg()) {
            req->setMemSpaceConfigFlags(Request::GROUP_SEGMENT);
        } else if (isFlat()) {
            panic("TODO: translate to correct scope");
        } else {
            fatal("%s has bad segment type\n", disassemble());
        }

        if (isWavefrontScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WAVEFRONT_SCOPE);
        } else if (isWorkgroupScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::WORKGROUP_SCOPE);
        } else if (isDeviceScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::DEVICE_SCOPE);
        } else if (isSystemScope()) {
            req->setMemSpaceConfigFlags(Request::SCOPE_VALID |
                                        Request::SYSTEM_SCOPE);
        } else if (!isNoScope() && !isWorkitemScope()) {
            fatal("%s has bad scope type\n", disassemble());
        }

        if (setMemOrder) {
            // set acquire and release flags
            if (isAcquire()) {
                req->setFlags(Request::ACQUIRE);
            } else if (isRelease()) {
                req->setFlags(Request::RELEASE);
            } else if (isAcquireRelease()) {
                req->setFlags(Request::ACQUIRE | Request::RELEASE);
            } else if (!isNoOrder()) {
                fatal("%s has bad memory order\n", disassemble());
            }
        }

        // set atomic type
        // currently, the instruction generator only produces atomic return
        // but a magic instruction can produce atomic no return
        if (isAtomicRet()) {
            req->setFlags(Request::ATOMIC_RETURN_OP);
        } else if (isAtomicNoRet()) {
            req->setFlags(Request::ATOMIC_NO_RETURN_OP);
        }
    }
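
    // Illustrative usage (the actual call sites live elsewhere in the GPU
    // model, typically where the memory request is built):
    //
    //     gpuDynInst->setRequestFlags(req);        // segment, scope, order
    //     gpuDynInst->setRequestFlags(req, false); // skip memory-order flags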

    // Maps the addresses satisfied by returned packets to the lanes that
    // requested them
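    // For example, an entry mapping address A to {2, 5} indicates that
    // lanes 2 and 5 issued requests to A and are awaiting their responses.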
    typedef std::unordered_map<Addr, std::vector<int>> StatusVector;
    StatusVector memStatusVector;

    // Track the status of memory requests, one bit per lane
    VectorMask statusBitVector;
    // for ld_v# or st_v#
    std::vector<int> statusVector;
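    // Per-lane record of the TLB level at which each translation hit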
    std::vector<int> tlbHitLevel;

  private:
    GPUStaticInst *_staticInst;
    uint64_t _seqNum;
};

#endif // __GPU_DYN_INST_HH__