41 files changed, 2479 insertions, 90 deletions
diff --git a/src/arch/arm/tlb.cc b/src/arch/arm/tlb.cc
index 46056d07b..25e210b55 100644
--- a/src/arch/arm/tlb.cc
+++ b/src/arch/arm/tlb.cc
@@ -539,6 +539,11 @@ TLB::regStats()
         .name(name() + ".prefetch_faults")
         .desc("Number of TLB faults due to prefetch")
         ;
+    
+    specTLBMisses 
+        .name(name() + ".spec_tlb_misses")
+        .desc("Number of TLB misses from a speculative mem instructions")
+        ;
 
     domainFaults
         .name(name() + ".domain_faults")
@@ -1434,6 +1439,17 @@ TLB::getTE(TlbEntry **te, const RequestPtr &req, ThreadContext *tc, Mode mode,
                vaddr_tainted, ArmFault::PrefetchTLBMiss, isStage2);
         }
 
+        if (req->isSpec()) {
+            // if the request is speculative don't attempt to fill the TLB
+            // or go any further with the memory access (here we can safely
+            // use the fault status for the short desc. format in all cases)
+           specTLBMisses++;
+           //FIXME: currently reuse the prefetch TLB miss fault;
+           //do not want to introduce a new fault declaration
+           return std::make_shared<PrefetchAbort>(
+               vaddr_tainted, ArmFault::PrefetchTLBMiss, isStage2);
+        }
+
         if (is_fetch)
             instMisses++;
         else if (is_write)
diff --git a/src/arch/arm/tlb.hh b/src/arch/arm/tlb.hh
index 637240abb..862a48380 100644
--- a/src/arch/arm/tlb.hh
+++ b/src/arch/arm/tlb.hh
@@ -181,6 +181,7 @@ class TLB : public BaseTLB
     mutable Stats::Scalar flushedEntries;
     mutable Stats::Scalar alignFaults;
     mutable Stats::Scalar prefetchFaults;
+    mutable Stats::Scalar specTLBMisses;
     mutable Stats::Scalar domainFaults;
     mutable Stats::Scalar permsFaults;
 
diff --git a/src/arch/generic/memhelpers.hh b/src/arch/generic/memhelpers.hh
index 7fd4f70de..de128cb41 100644
--- a/src/arch/generic/memhelpers.hh
+++ b/src/arch/generic/memhelpers.hh
@@ -53,6 +53,7 @@
 /// Initiate a read from memory in timing mode.  Note that the 'mem'
 /// parameter is unused; only the type of that parameter is used
 /// to determine the size of the access.
+// XC: executeContextPtr [mengjia]
 template <class XC, class MemT>
 Fault
 initiateMemRead(XC *xc, Trace::InstRecord *traceData, Addr addr,
diff --git a/src/arch/x86/tlb.cc b/src/arch/x86/tlb.cc
index 8e83208f4..ba14d7ef3 100644
--- a/src/arch/x86/tlb.cc
+++ b/src/arch/x86/tlb.cc
@@ -340,6 +340,17 @@ TLB::translate(const RequestPtr &req,
                 wrAccesses++;
             }
             if (!entry) {
+                if(req->isSpec()){
+                    // [InvisiSpec] do not perform TLB fill for
+                    // speculative load
+                    specMisses++;
+                    DPRINTF(TLB, "Get a TLB miss for a speculative load "
+                            "address %#x at pc %#x.\n",
+                            vaddr, tc->instAddr());
+                    //FIXME: currently reuse the GeneralProtection fault
+                    //instead of creating new faults
+                    return std::make_shared<GeneralProtection>(0);
+                }
                 DPRINTF(TLB, "Handling a TLB miss for "
                         "address %#x at pc %#x.\n",
                         vaddr, tc->instAddr());
@@ -472,6 +483,9 @@ TLB::regStats()
         .name(name() + ".wrMisses")
         .desc("TLB misses on write requests");
 
+    specMisses
+        .name(name() + ".spec_tlb_misses")
+        .desc("TLB misses on speculative memory requests");
 }
 
 void
diff --git a/src/arch/x86/tlb.hh b/src/arch/x86/tlb.hh
index 827ab8166..7213b8b41 100644
--- a/src/arch/x86/tlb.hh
+++ b/src/arch/x86/tlb.hh
@@ -105,6 +105,7 @@ namespace X86ISA
         Stats::Scalar wrAccesses;
         Stats::Scalar rdMisses;
         Stats::Scalar wrMisses;
+        Stats::Scalar specMisses;
 
         Fault translateInt(const RequestPtr &req, ThreadContext *tc);
 
diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh
index b4431da1f..6301864b7 100644
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@@ -120,6 +120,28 @@ class BaseDynInst : public ExecContext, public RefCounted
                                  /// instructions ahead of it
         SerializeAfter,          /// Needs to serialize instructions behind it
         SerializeHandled,        /// Serialization has been handled
+
+        SpecCompleted,
+        // [mengjia] indicates whether received specReadResp
+        ValidationCompleted,
+        // indicates whether validation finishes
+        ExposeCompleted,
+        ExposeSent,
+        // indicates whether expose finishes
+        // (should set when expose is sent out)
+        PrevInstsCompleted,
+        // indicate whether previous instructions completed
+        PrevBrsResolved,
+        // [mengjia] indicate whether previous branches are resolved
+        PrevInstsCommitted,
+        // indicate whether previous instructions committed
+        PrevBrsCommitted,
+        // [mengjia] indicate whether previous branches are committed
+        L1HitHigh,
+        L1HitLow,
+        SpecBuffObsoleteHigh,
+        SpecBuffObsoleteLow,
+        // [InvisiSpec] it hits in L1 and is open to invalidations
         NumStatus
     };
 
@@ -136,6 +158,22 @@ class BaseDynInst : public ExecContext, public RefCounted
         IsStrictlyOrdered,
         ReqMade,
         MemOpDone,
+        // [mengjia] indicates need validation or expose
+        NeedPostFetch,
+        NeedDeletePostReq,
+        // [mengjia] indicates only need to expose, do not need to validate
+        NeedExposeOnly,
+        // [InvisiSpec] indicate the instruction needs to be delayed
+        // due to virtual fences before to defend against speculative attacks
+        FenceDelay,
+        // [InvisiSpec] indicate the load is legal to be visible
+        ReadyToExpose,
+        HitInvalidation,
+        HitExternalEviction,
+        ValidationFail,
+        OnlyWaitForFence,
+        OnlyWaitForExpose,
+        SpecTLBMiss,
         MaxFlags
     };
 
@@ -222,6 +260,9 @@ class BaseDynInst : public ExecContext, public RefCounted
     /** Pointer to the data for the memory access. */
     uint8_t *memData;
 
+    /** Pointer to the data for the validation result. */
+    uint8_t *vldData;
+
     /** Load queue index. */
     int16_t lqIdx;
 
@@ -238,6 +279,12 @@ class BaseDynInst : public ExecContext, public RefCounted
     RequestPtr savedSreqLow;
     RequestPtr savedSreqHigh;
 
+    /** [InvisiSpec]
+     * Saved memory requests (needed for post-fetch validation/expose).
+     */
+    RequestPtr postReq;
+    RequestPtr postSreqLow;
+    RequestPtr postSreqHigh;
     /////////////////////// Checker //////////////////////
     // Need a copy of main request pointer to verify on writes.
     RequestPtr reqToVerify;
@@ -275,6 +322,43 @@ class BaseDynInst : public ExecContext, public RefCounted
     bool memOpDone() const { return instFlags[MemOpDone]; }
     void memOpDone(bool f) { instFlags[MemOpDone] = f; }
 
+    /** [mengjia] Whether or not need pseudo-validation.
+     * whether speculative load finishes,
+     * whether validation completes or not (success) */
+    bool needExposeOnly() const { return instFlags[NeedExposeOnly]; }
+    void needExposeOnly(bool f) { instFlags[NeedExposeOnly] = f; }
+
+    bool needPostFetch() const { return instFlags[NeedPostFetch]; }
+    void needPostFetch(bool f) { instFlags[NeedPostFetch] = f; }
+
+    bool needDeletePostReq() const { return instFlags[NeedDeletePostReq]; }
+    void needDeletePostReq(bool f) { instFlags[NeedDeletePostReq] = f; }
+
+    // fenceDelay: held by a virtual fence; readyToExpose: may be visible
+    bool fenceDelay() const { return instFlags[FenceDelay]; }
+    void fenceDelay(bool f) { instFlags[FenceDelay] = f; }
+    bool readyToExpose() const { return instFlags[ReadyToExpose]; }
+    void readyToExpose(bool f) { instFlags[ReadyToExpose] = f; }
+
+    bool hitInvalidation() const { return instFlags[HitInvalidation]; }
+    void hitInvalidation(bool f) { instFlags[HitInvalidation] = f; }
+
+    bool hitExternalEviction() const { return instFlags[HitExternalEviction]; }
+    void hitExternalEviction(bool f) { instFlags[HitExternalEviction] = f; }
+
+    bool validationFail() const { return instFlags[ValidationFail]; }
+    void validationFail(bool f) { instFlags[ValidationFail] = f; }
+
+    bool onlyWaitForFence() const { return instFlags[OnlyWaitForFence]; }
+    void onlyWaitForFence(bool f) { instFlags[OnlyWaitForFence] = f; }
+
+    bool onlyWaitForExpose() const { return instFlags[OnlyWaitForExpose]; }
+    void onlyWaitForExpose(bool f) { instFlags[OnlyWaitForExpose] = f; }
+
+    bool specTLBMiss() const { return instFlags[SpecTLBMiss]; }
+    void specTLBMiss(bool f) { instFlags[SpecTLBMiss] = f; }
+    /*[mengjia] added 2 new flags and corresponding functions*/
+
     bool notAnInst() const { return instFlags[NotAnInst]; }
     void setNotAnInst() { instFlags[NotAnInst] = true; }
 
@@ -704,6 +788,49 @@ class BaseDynInst : public ExecContext, public RefCounted
     /** Returns whether or not this instruction is completed. */
     bool isCompleted() const { return status[Completed]; }
 
+    /* [mengjia] new status for load operations */
+    //void setSpecSent() { status.set(SpecSent); }
+    //bool isSpecSent() const { return status[SpecSent]; }
+
+    void setSpecCompleted() { status.set(SpecCompleted); }
+    bool isSpecCompleted() const { return status[SpecCompleted]; }
+
+    void setValidationCompleted() { status.set(ValidationCompleted); }
+    bool isValidationCompleted() const { return status[ValidationCompleted]; }
+
+    void setExposeCompleted() { status.set(ExposeCompleted); }
+    bool isExposeCompleted() const { return status[ExposeCompleted]; }
+
+    void setExposeSent() { status.set(ExposeSent); }
+    bool isExposeSent() const { return status[ExposeSent]; }
+
+    void setL1HitHigh() { status.set(L1HitHigh); }
+    void clearL1HitHigh() { status.reset(L1HitHigh); }
+    bool isL1HitHigh() const { return status[L1HitHigh]; }
+
+    void setL1HitLow() { status.set(L1HitLow); }
+    void clearL1HitLow() { status.reset(L1HitLow); }
+    bool isL1HitLow() const { return status[L1HitLow]; }
+
+    void setPrevInstsCompleted() { status.set(PrevInstsCompleted); }
+    bool isPrevInstsCompleted() const { return status[PrevInstsCompleted]; }
+
+    void setSpecBuffObsoleteHigh() { status.set(SpecBuffObsoleteHigh); }
+    bool isSpecBuffObsoleteHigh() const { return status[SpecBuffObsoleteHigh]; }
+
+    void setSpecBuffObsoleteLow() { status.set(SpecBuffObsoleteLow); }
+    bool isSpecBuffObsoleteLow() const { return status[SpecBuffObsoleteLow]; }
+
+    void setPrevBrsResolved() { status.set(PrevBrsResolved); }
+    bool isPrevBrsResolved() const { return status[PrevBrsResolved]; }
+
+    void setPrevInstsCommitted() { status.set(PrevInstsCommitted); }
+    bool isPrevInstsCommitted() const { return status[PrevInstsCommitted]; }
+
+    void setPrevBrsCommitted() { status.set(PrevBrsCommitted); }
+    bool isPrevBrsCommitted() const { return status[PrevBrsCommitted]; }
+    /* Configure load related status */
+
     /** Marks the result as ready. */
     void setResultReady() { status.set(ResultReady); }
 
@@ -893,7 +1020,25 @@ Fault
 BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
                                    Request::Flags flags)
 {
+    // [InvisiSpec] do not start translation if
+    // there is a virtual fence ahead
+    assert(!fenceDelay());
+
+    if ( (flags.isSet(Request::ATOMIC_RETURN_OP)
+            || flags.isSet(Request::ATOMIC_NO_RETURN_OP)
+            || flags.isSet(Request::UNCACHEABLE)
+            || flags.isSet(Request::LLSC)
+            || flags.isSet(Request::STRICT_ORDER))
+            && !readyToExpose()){
+        onlyWaitForExpose(true);
+        // FIXME: reschedule due to LLSC
+        // reuse TLBMiss for now
+        specTLBMiss(true);
+        return NoFault;
+    }
+
     instFlags[ReqMade] = true;
+    instFlags[SpecTLBMiss] = false;
     RequestPtr req = NULL;
     RequestPtr sreqLow = NULL;
     RequestPtr sreqHigh = NULL;
@@ -908,16 +1053,36 @@ BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
             this->pc.instAddr(), thread->contextId());
 
         req->taskId(cpu->taskId());
-
+        if(!readyToExpose()){
+            req->setFlags(Request::SPEC);
+        }
         // Only split the request if the ISA supports unaligned accesses.
         if (TheISA::HasUnalignedMemAcc) {
             splitRequest(req, sreqLow, sreqHigh);
         }
+
         initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read);
+
     }
 
     if (translationCompleted()) {
+        // [InvisiSpec] to fix the memory leakage problem
+        // in the case the read is squashed and the request
+        // is never sent out due to a virtual fence ahead
         if (fault == NoFault) {
+            /*
+            if (fenceDelay()){
+                translationStarted(false);
+                translationCompleted(false);
+                onlyWaitForFence(true);
+                delete req;
+                if (sreqLow){
+                    delete sreqLow;
+                    delete sreqHigh;
+                }
+                return NoFault;
+            }
+            */
             effAddr = req->getVaddr();
             effSize = size;
             instFlags[EffAddrValid] = true;
@@ -925,10 +1090,30 @@ BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
             if (cpu->checker) {
                 reqToVerify = std::make_shared<Request>(*req);
             }
+
+            // issue load request [mengjia]
             fault = cpu->read(req, sreqLow, sreqHigh, lqIdx);
         } else {
             // Commit will have to clean up whatever happened.  Set this
             // instruction as executed.
+           
+            // [InvisiSpec] If it is a fault on translating a spec load
+            // Deffer it and retry when it is ready to expose
+            if (!readyToExpose()){
+                translationStarted(false);
+                translationCompleted(false);
+                onlyWaitForExpose(true);
+                specTLBMiss(true);
+                //delete req;
+                //if (sreqLow){
+                //    delete sreqLow;
+                //    delete sreqHigh;
+                //}
+                return NoFault;
+            }
+            // set it as executed and fault flag.
+            // when it enters ROB and try to commit,
+            // the commit stage will squash this inst [mengjia]
             this->setExecuted();
         }
     }
@@ -939,6 +1124,7 @@ BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size,
     return fault;
 }
 
+
 template<class Impl>
 Fault
 BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr,
diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh
index cd4740de5..542f37650 100644
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -95,6 +95,7 @@ void
 BaseDynInst<Impl>::initVars()
 {
     memData = NULL;
+    vldData = NULL;
     effAddr = 0;
     physEffAddrLow = 0;
     physEffAddrHigh = 0;
@@ -148,6 +149,10 @@ BaseDynInst<Impl>::~BaseDynInst()
         delete [] memData;
     }
 
+    if (vldData) {
+        delete [] vldData;
+    }
+
     if (traceData) {
         delete traceData;
     }
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py
index b8152f663..371433eef 100644
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -68,6 +68,10 @@ class DerivO3CPU(BaseCPU):
 
     cacheStorePorts = Param.Unsigned(200, "Cache Ports. "
           "Constrains stores only. Loads are constrained by load FUs.")
+    # we deal with validation very similar as store writes back
+    # FIXME: not sure whether it is the correct parameter or not
+    cacheValidationPorts = Param.Unsigned(200, "Validation Ports. "
+          "Constrains validations only. Loads are constrained by load FUs.")
 
     decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
     renameToFetchDelay = Param.Cycles(1 ,"Rename to fetch delay")
@@ -124,7 +128,7 @@ class DerivO3CPU(BaseCPU):
     LFSTSize = Param.Unsigned(1024, "Last fetched store table size")
     SSITSize = Param.Unsigned(1024, "Store set ID table size")
 
-    numRobs = Param.Unsigned(1, "Number of Reorder Buffers");
+    numRobs = Param.Unsigned(1, "Number of Reorder Buffers")
 
     numPhysIntRegs = Param.Unsigned(256, "Number of physical integer registers")
     numPhysFloatRegs = Param.Unsigned(256, "Number of physical floating point "
@@ -157,10 +161,14 @@ class DerivO3CPU(BaseCPU):
     smtCommitPolicy = Param.String('RoundRobin', "SMT Commit Policy")
 
     branchPred = Param.BranchPredictor(TournamentBP(numThreads =
-                                                       Parent.numThreads),
+                                       Parent.numThreads),
                                        "Branch Predictor")
-    needsTSO = Param.Bool(buildEnv['TARGET_ISA'] == 'x86',
-                          "Enable TSO Memory model")
+
+    # [mengjia] add configuration variables
+    simulateScheme = Param.String('UnsafeBaseline',
+                                   "The scheme specificed for simulation")
+    needsTSO = Param.Bool(False, "Enable TSO Memory model")
+    allowSpecBuffHit = Param.Bool(True, "Enable hit/reuse spec buffer entries")
 
     def addCheckerCpu(self):
         if buildEnv['TARGET_ISA'] in ['arm']:
diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh
index b74917da3..1d9839b80 100644
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@@ -450,6 +450,7 @@ class DefaultCommit
 
     /** The sequence number of the last commited instruction. */
     InstSeqNum lastCommitedSeqNum[Impl::MaxThreads];
+    Tick lastCommitTick;
 
     /** Records if there is a trap currently in flight. */
     bool trapInFlight[Impl::MaxThreads];
@@ -479,6 +480,9 @@ class DefaultCommit
     /** Updates commit stats based on this instruction. */
     void updateComInstStats(const DynInstPtr &inst);
 
+    /** [InvisiSpec] Updates squash stats based on this instruction. */
+    void updateSquashStats(const DynInstPtr &inst);
+
     /** Stat for the total number of squashed instructions discarded by commit.
      */
     Stats::Scalar commitSquashedInsts;
@@ -488,6 +492,19 @@ class DefaultCommit
     Stats::Scalar commitNonSpecStalls;
     /** Stat for the total number of branch mispredicts that caused a squash. */
     Stats::Scalar branchMispredicts;
+
+    // [InvisiSpec] count #squash
+    /** Stat for the total number of invalidation packets
+     * that caused a squash. */
+    Stats::Scalar loadHitInvalidations;
+    Stats::Scalar loadHitExternalEvictions;
+    /** Stat for the total number of failed validations
+     * that caused a squash. */
+    Stats::Scalar loadValidationFails;
+    // [InvisiSpec] count cycles stall due to waiting for
+    // validation responses
+    Stats::Scalar validationStalls;
+
     /** Distribution of the number of committed instructions each cycle. */
     Stats::Distribution numCommittedDist;
 
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index 40ce8480e..468b4b39c 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -146,6 +146,7 @@ DefaultCommit<Impl>::DefaultCommit(O3CPU *_cpu, DerivO3CPUParams *params)
         checkEmptyROB[tid] = false;
         renameMap[tid] = nullptr;
     }
+    lastCommitTick = curTick();
     interrupt = NoFault;
 }
 
@@ -186,6 +187,28 @@ DefaultCommit<Impl>::regStats()
         .desc("The number of times a branch was mispredicted")
         .prereq(branchMispredicts);
 
+    // [InvisiSpec] stat for squash due to invalidation, failed validation
+    loadHitInvalidations
+        .name(name() + ".loadHitInvalidations")
+        .desc("The number of times a load hits a invalidation");
+        //.prereq(loadHitInvalidations);
+
+    loadHitExternalEvictions
+        .name(name() + ".loadHitExternalEvictions")
+        .desc("The number of times a load hits an external invalidation");
+        //.prereq(loadHitInvalidations);
+
+    loadValidationFails
+        .name(name() + ".loadValidationFails")
+        .desc("The number of times a load fails validation");
+        //.prereq(loadValidationFails);
+
+    validationStalls
+        .name(name() + ".validationStalls")
+        .desc("The number of ticks the commit is stalled due to waiting "
+                "for validation responses");
+        //.prereq(loadValidationFails);
+
     numCommittedDist
         .init(0,commitWidth,1)
         .name(name() + ".committed_per_cycle")
@@ -582,6 +605,9 @@ DefaultCommit<Impl>::squashAll(ThreadID tid)
     toIEW->commitInfo[tid].squashInst = NULL;
 
     toIEW->commitInfo[tid].pc = pc[tid];
+
+    //TODO: send a packet to SpecBuffer to indicate flush
+    //
 }
 
 template <class Impl>
@@ -708,13 +734,21 @@ DefaultCommit<Impl>::tick()
         } else if (!rob->isEmpty(tid)) {
             const DynInstPtr &inst = rob->readHeadInst(tid);
 
+            if (inst->isExecuted() && inst->needPostFetch()
+                    && !inst->isExposeCompleted()){
+                //stall due to waiting for validation response
+                if (curTick()-lastCommitTick > 0){
+                    validationStalls+= curTick()-lastCommitTick;
+                }
+
+            }
             ppCommitStall->notify(inst);
 
             DPRINTF(Commit,"[tid:%i]: Can't commit, Instruction [sn:%lli] PC "
                     "%s is head of ROB and not ready\n",
                     tid, inst->seqNum, inst->pcState());
         }
-
+        lastCommitTick = curTick();
         DPRINTF(Commit, "[tid:%i]: ROB has %d insts & %d free entries.\n",
                 tid, rob->countInsts(tid), rob->numFreeEntries(tid));
     }
@@ -835,6 +869,7 @@ DefaultCommit<Impl>::commit()
             squashFromTrap(tid);
         } else if (tcSquash[tid]) {
             assert(commitStatus[tid] != TrapPending);
+            //TC: thread context. [mengjia]
             squashFromTC(tid);
         } else if (commitStatus[tid] == SquashAfterPending) {
             // A squash from the previous cycle of the commit stage (i.e.,
@@ -1042,6 +1077,7 @@ DefaultCommit<Impl>::commitInsts()
                 toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum;
 
                 if (tid == 0) {
+                    //maybe we can use this to mask interrupts [mengjia]
                     canHandleInterrupts =  (!head_inst->isDelayedCommit()) &&
                                            ((THE_ISA != ALPHA_ISA) ||
                                              (!(pc[0].instAddr() & 0x3)));
@@ -1222,6 +1258,8 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num)
         // execution doesn't generate extra squashes.
         thread[tid]->noSquashFromTC = true;
 
+        // [InvisiSpec] update squash stat for invalidation or validation fails
+        updateSquashStats(head_inst);
         // Execute the trap.  Although it's slightly unrealistic in
         // terms of timing (as it doesn't wait for the full timing of
         // the trap event to complete before updating state), it's
@@ -1351,6 +1389,7 @@ DefaultCommit<Impl>::markCompletedInsts()
     // Grab completed insts out of the IEW instruction queue, and mark
     // instructions completed within the ROB.
     for (int inst_num = 0; inst_num < fromIEW->size; ++inst_num) {
+        DPRINTF(Commit, "get the inst [num:%d]\n", inst_num);
         assert(fromIEW->insts[inst_num]);
         if (!fromIEW->insts[inst_num]->isSquashed()) {
             DPRINTF(Commit, "[tid:%i]: Marking PC %s, [sn:%lli] ready "
@@ -1363,6 +1402,27 @@ DefaultCommit<Impl>::markCompletedInsts()
             fromIEW->insts[inst_num]->setCanCommit();
         }
     }
+
+    // [InvisiSpec]
+    // update load status
+    // isPrevInstsCompleted; isPrevBrsResolved
+    rob->updateVisibleState();
+}
+
+// [InvisiSpec] Bump the squash-cause counters matching the flags on inst.
+template <class Impl>
+void
+DefaultCommit<Impl>::updateSquashStats(const DynInstPtr &inst)
+{
+    // The three causes are tested independently; one instruction may
+    // increment more than one counter.
+    const bool invalidated = inst->hitInvalidation();
+    const bool failed = inst->validationFail();
+    const bool evicted = inst->hitExternalEviction();
+
+    if (invalidated) ++loadHitInvalidations;
+    if (failed) ++loadValidationFails;
+    if (evicted) ++loadHitExternalEvictions;
 }
 
 template <class Impl>
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 2f793453d..b298b9baa 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -593,6 +593,7 @@ FullO3CPU<Impl>::tick()
 
     activityRec.advance();
 
+    DPRINTF(O3CPU, "activityRec.advance() complete\n");
     if (removeInstsThisCycle) {
         cleanUpRemovedInsts();
     }
@@ -610,6 +611,8 @@ FullO3CPU<Impl>::tick()
             schedule(tickEvent, clockEdge(Cycles(1)));
             DPRINTF(O3CPU, "Scheduling next tick!\n");
         }
+    } else {
+        DPRINTF(O3CPU, "tickEvent.scheduled == false, %lu\n", curTick());
     }
 
     if (!FullSystem)
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index e706b09a1..ee6e76ddd 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -1188,9 +1188,16 @@ DefaultIEW<Impl>::executeInsts()
         fetchRedirect[tid] = false;
     }
 
+    // [mengjia] Validate/Expose any loads which are ready last cycle 
+    // very tricky, need make the state consistent
+    // if we successfully commit sth, then we need to activate the stage or somehow
+    // problems happen when interacting with squash
+    // NOTE: we always send validations before execute load requests 
+    ldstQueue.exposeLoads();
+    
     // Uncomment this if you want to see all available instructions.
     // @todo This doesn't actually work anymore, we should fix it.
-//    printAvailableInsts();
+    // printAvailableInsts();
 
     // Execute/writeback any instructions that are available.
     int insts_to_execute = fromIssue->size;
@@ -1237,18 +1244,40 @@ DefaultIEW<Impl>::executeInsts()
             DPRINTF(IEW, "Execute: Calculating address for memory "
                     "reference.\n");
 
+            DPRINTF(IEW, "Execute: %s\n", inst->staticInst->getName());
             // Tell the LDSTQ to execute this instruction (if it is a load).
             if (inst->isLoad()) {
                 // Loads will mark themselves as executed, and their writeback
                 // event adds the instruction to the queue to commit
+
+                // [InvisiSpec] a lifetime of a load
+                // always let it translate --> translation not complete, defer
+                // if !loadInExec, need to check whether there
+                // is a virtual fence ahead
+                // --> if existing virtual fence, defer
+                if (inst->fenceDelay()){
+                    DPRINTF(IEW, "Deferring load due to virtual fence.\n");
+                    inst->onlyWaitForFence(true);
+                    instQueue.deferMemInst(inst);
+                    continue;
+                }
+
                 fault = ldstQueue.executeLoad(inst);
 
-                if (inst->isTranslationDelayed() &&
+                // [InvisiSpec] defer the load while its translation is pending
+                if ((inst->isTranslationDelayed() ) &&
                     fault == NoFault) {
                     // A hw page table walk is currently going on; the
                     // instruction must be deferred.
-                    DPRINTF(IEW, "Execute: Delayed translation, deferring "
-                            "load.\n");
+                    DPRINTF(IEW, "Execute: Delayed translation,  deferring load.\n");
+                    instQueue.deferMemInst(inst);
+                    continue;
+                }
+                
+                if ((inst->specTLBMiss() ) &&
+                    fault == NoFault) {
+                    DPRINTF(IEW, "Execute: Speculative load gets a TLB miss,"
+                            " deferring load.\n");
                     instQueue.deferMemInst(inst);
                     continue;
                 }
@@ -1383,10 +1412,11 @@ DefaultIEW<Impl>::executeInsts()
                 ++memOrderViolationEvents;
             }
         }
-    }
+    } 
 
     // Update and record activity if we processed any instructions.
     if (inst_num) {
+        
         if (exeStatus == Idle) {
             exeStatus = Running;
         }
@@ -1478,16 +1508,18 @@ DefaultIEW<Impl>::tick()
         dispatch(tid);
     }
 
+    ldstQueue.updateVisibleState();
+
     if (exeStatus != Squashing) {
         executeInsts();
-
+ 
         writebackInsts();
 
         // Have the instruction queue try to schedule any ready instructions.
         // (In actuality, this scheduling is for instructions that will
         // be executed next cycle.)
         instQueue.scheduleReadyInsts();
-
+ 
         // Also should advance its own time buffers if the stage ran.
         // Not the best place for it, but this works (hopefully).
         issueToExecQueue.advance();
@@ -1504,6 +1536,7 @@ DefaultIEW<Impl>::tick()
 
     // Writeback any stores using any leftover bandwidth.
     ldstQueue.writebackStores();
+   
 
     // Check the committed load/store signals to see if there's a load
     // or store to commit.  Also check if it's being told to execute a
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh
index b34e6d980..980f29b35 100644
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@@ -1176,8 +1176,19 @@ InstructionQueue<Impl>::getDeferredMemInstToExecute()
 {
     for (ListIt it = deferredMemInsts.begin(); it != deferredMemInsts.end();
          ++it) {
-        if ((*it)->translationCompleted() || (*it)->isSquashed()) {
+        // [InvisiSpec] we need to check the FenceDelay
+        // a load can be delayed due to
+        // 1. translation delay
+        // 2. virtual fence ahead
+        // 3. not ready to expose and gets a TLB miss
+        // for both (2, 3) we need to restart the translation
+        if ( (*it)->translationCompleted() 
+                || ((*it)->onlyWaitForFence() && !(*it)->fenceDelay())
+                || ((*it)->onlyWaitForExpose() && (*it)->readyToExpose())
+                || (*it)->isSquashed()) {
             DynInstPtr mem_inst = std::move(*it);
+            mem_inst->onlyWaitForFence(false);
+            mem_inst->onlyWaitForExpose(false);
             deferredMemInsts.erase(it);
             return mem_inst;
         }
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 0c93121e3..e5c35a3a6 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -134,6 +134,23 @@ class LSQ {
     /** Same as above, but only for one thread. */
     void writebackStores(ThreadID tid);
 
+
+    /** [mengjia]
+     * Attempts to validate loads until all cache ports are used or the
+     * interface becomes blocked.
+     */
+    int exposeLoads();
+    /** Same as above, but only for one thread. */
+    int exposeLoads(ThreadID tid);
+
+    /** [mengjia]
+     * attempt to update FenceDelay state for load insts
+     */
+    void updateVisibleState();
+    /** Same as above, but only for one thread. */
+    void updateVisibleState(ThreadID tid);
+
+
     /**
      * Squash instructions from a thread until the specified sequence number.
      */
@@ -255,6 +272,10 @@ class LSQ {
     int numStoresToWB(ThreadID tid)
     { return thread[tid].numStoresToWB(); }
 
+    /** Returns the number of loads a specific thread has to validate. */
+    int numLoadsToVLD(ThreadID tid)
+    { return thread[tid].numLoadsToVLD(); }
+
     /** Returns if the LSQ will write back to memory this cycle. */
     bool willWB();
     /** Returns if the LSQ of a specific thread will write back to memory this
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 36bc17bc8..24066cd4b 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -292,6 +292,44 @@ LSQ<Impl>::writebackStores()
     }
 }
 
+// [mengjia] Run exposeLoads() on each active thread and total the results.
+template<class Impl>
+int
+LSQ<Impl>::exposeLoads()
+{
+    int exposedLoads = 0;
+    list<ThreadID>::iterator cur = activeThreads->begin();
+    const list<ThreadID>::iterator last = activeThreads->end();
+
+    // The iterator is stepped before the per-thread call is made.
+    while (cur != last) {
+        const ThreadID tid = *cur++;
+        if (numLoadsToVLD(tid) > 0) {
+            DPRINTF(Writeback,"[tid:%i] Validate loads. %i loads "
+                "available for Validate.\n", tid, numLoadsToVLD(tid));
+        }
+
+        exposedLoads += thread[tid].exposeLoads();
+    }
+    return exposedLoads;
+}
+
+
+// [mengjia] Call updateVisibleState() on every active thread's LSQ unit.
+template<class Impl>
+void
+LSQ<Impl>::updateVisibleState()
+{
+    list<ThreadID>::iterator cur = activeThreads->begin();
+    const list<ThreadID>::iterator last = activeThreads->end();
+
+    // The iterator is stepped before the per-thread call is made.
+    while (cur != last) {
+        const ThreadID tid = *cur++;
+        thread[tid].updateVisibleState();
+    }
+}
+
 template<class Impl>
 bool
 LSQ<Impl>::violation()
@@ -321,6 +359,7 @@ LSQ<Impl>::recvReqRetry()
     }
 }
 
+// [InvisiSpec] Callback function for receiving a response
 template <class Impl>
 bool
 LSQ<Impl>::recvTimingResp(PacketPtr pkt)
@@ -329,6 +368,17 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt)
         DPRINTF(LSQ, "Got error packet back for address: %#X\n",
                 pkt->getAddr());
 
+    // for expose or validate request,
+    // if the instruction is squashed, maybe the req has been deleted
+    if (pkt->isValidate() || pkt->isExpose()){
+        if (!pkt->req){
+            delete pkt;
+            return true;
+        }
+        DPRINTF(LSQ, "Receive an expose/validate response, idx=%d\n",
+                    pkt->reqIdx);
+    }
+
     thread[cpu->contextToThread(pkt->req->contextId())]
         .completeDataAccess(pkt);
 
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 48a06b386..5cb29dc52 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -141,6 +141,12 @@ class LSQUnit {
      */
     void checkSnoop(PacketPtr pkt);
 
+    // [InvisiSpec] check whether current request will hit in the
+    // spec buffer or not
+    int checkSpecBuffHit(const RequestPtr req, const int req_idx);
+    void setSpecBuffState(const RequestPtr req);
+
+    bool checkPrevLoadsExecuted(const int req_idx);
     /** Executes a load instruction. */
     Fault executeLoad(const DynInstPtr &inst);
 
@@ -159,6 +165,15 @@ class LSQUnit {
     /** Writes back stores. */
     void writebackStores();
 
+    /** [mengjia] Validate loads. */
+    int exposeLoads();
+
+    /** [mengjia] Update Visible State.
+     * In the mode defence relying on fence: setup fenceDelay state.
+     * In the mode defence relying on invisibleSpec:
+     * setup readyToExpose*/
+    void updateVisibleState();
+
     /** Completes the data access that has been returned from the
      * memory system. */
     void completeDataAccess(PacketPtr pkt);
@@ -224,6 +239,8 @@ class LSQUnit {
 
     /** Returns the number of stores to writeback. */
     int numStoresToWB() { return storesToWB; }
+    /** [InvisiSpec] Returns the number of loads to validate. */
+    int numLoadsToVLD() { return loadsToVLD; }
 
     /** Returns if the LSQ unit will writeback on this cycle. */
     bool willWB() { return storeQueue[storeWBIdx].canWB &&
@@ -240,18 +257,30 @@ class LSQUnit {
     /** Writes back the instruction, sending it to IEW. */
     void writeback(const DynInstPtr &inst, PacketPtr pkt);
 
+    // [InvisiSpec] complete Validates
+    void completeValidate(DynInstPtr &inst, PacketPtr pkt);
+
     /** Writes back a store that couldn't be completed the previous cycle. */
     void writebackPendingStore();
 
+    /** Validates a load that couldn't be completed the previous cycle. */
+    void validatePendingLoad();
+
     /** Handles completing the send of a store to memory. */
     void storePostSend(PacketPtr pkt);
 
+    /** Handles completing the send of a validation to memory. */
+    //void validationPostSend(PacketPtr pkt, int loadVLDIdx);
+
     /** Completes the store at the specified index. */
     void completeStore(int store_idx);
 
     /** Attempts to send a store to the cache. */
     bool sendStore(PacketPtr data_pkt);
 
+    /** Attempts to send a validation to the cache. */
+    //bool sendValidation(PacketPtr data_pkt, int loadVLDIdx);
+
     /** Increments the given store index (circular queue). */
     inline void incrStIdx(int &store_idx) const;
     /** Decrements the given store index (circular queue). */
@@ -415,6 +444,8 @@ class LSQUnit {
 
     /** The number of load instructions in the LQ. */
     int loads;
+    /** [mengjia] The number of load instructions in the LQ waiting to be validated. */
+    int loadsToVLD;
     /** The number of store instructions in the SQ. */
     int stores;
     /** The number of store instructions in the SQ waiting to writeback. */
@@ -422,6 +453,10 @@ class LSQUnit {
 
     /** The index of the head instruction in the LQ. */
     int loadHead;
+    /** [mengjia] The index of the first instruction that may be ready to be
+     * validated, and has not yet been validated.
+     */
+    //int pendingLoadVLDIdx;
     /** The index of the tail instruction in the LQ. */
     int loadTail;
 
@@ -438,7 +473,7 @@ class LSQUnit {
     /** The number of cache ports available each cycle (stores only). */
     int cacheStorePorts;
 
-    /** The number of used cache ports in this cycle by stores. */
+    /** [InvisiSpec] The number of used cache ports in this cycle by stores. */
     int usedStorePorts;
 
     //list<InstSeqNum> mshrSeqNums;
@@ -464,6 +499,9 @@ class LSQUnit {
     /** Whehter or not a store is blocked due to the memory system. */
     bool isStoreBlocked;
 
+    /** Whether or not a validation is blocked due to the memory system. */
+    bool isValidationBlocked;
+    
     /** Whether or not a store is in flight. */
     bool storeInFlight;
 
@@ -477,9 +515,21 @@ class LSQUnit {
     /** The packet that is pending free cache ports. */
     PacketPtr pendingPkt;
 
+    /* [mengjia] define scheme variables */
+    // Flag for whether issue packets in execution stage
+    bool loadInExec;
+
+    // Flag for whether to use invisible speculative load
+    bool isInvisibleSpec;
+
     /** Flag for memory model. */
     bool needsTSO;
 
+    // Flag for whether defending against spectre attack or future attacks
+    bool isFuturistic;
+    bool allowSpecBuffHit;
+    /* [mengjia] different schemes determine values of 4 variables. */
+
     // Will also need how many read/write ports the Dcache has.  Or keep track
     // of that in stage that is one level up, and only call executeLoad/Store
     // the appropriate number of times.
@@ -514,6 +564,12 @@ class LSQUnit {
     /** Number of times the LSQ is blocked due to the cache. */
     Stats::Scalar lsqCacheBlocked;
 
+    Stats::Scalar specBuffHits;
+    Stats::Scalar specBuffMisses;
+    Stats::Scalar numValidates;
+    Stats::Scalar numExposes;
+    Stats::Scalar numConvertedExposes;
+
   public:
     /** Executes the load at the given index. */
     Fault read(const RequestPtr &req,
@@ -555,6 +611,8 @@ class LSQUnit {
     bool isStalled()  { return stalled; }
 };
 
+
+// IMPORTANT: the function to issue packets, interact with memory [mengjia]
 template <class Impl>
 Fault
 LSQUnit<Impl>::read(const RequestPtr &req,
@@ -584,6 +642,7 @@ LSQUnit<Impl>::read(const RequestPtr &req,
     }
 
     // Check the SQ for any previous stores that might lead to forwarding
+    // why we have store queue index for a load operation? [mengjia]
     int store_idx = load_inst->sqIdx;
 
     int store_size = 0;
@@ -593,6 +652,7 @@ LSQUnit<Impl>::read(const RequestPtr &req,
             load_idx, store_idx, storeHead, req->getPaddr(),
             sreqLow ? " split" : "");
 
+    // LLSC: load-link/store-conditional [mengjia]
     if (req->isLLSC()) {
         assert(!sreqLow);
         // Disable recording the result temporarily.  Writing to misc
@@ -603,12 +663,14 @@ LSQUnit<Impl>::read(const RequestPtr &req,
         load_inst->recordResult(true);
     }
 
+    // request to memory mapped register [mengjia]
     if (req->isMmappedIpr()) {
         assert(!load_inst->memData);
         load_inst->memData = new uint8_t[64];
 
         ThreadContext *thread = cpu->tcBase(lsqID);
         Cycles delay(0);
+
         PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
 
         data_pkt->dataStatic(load_inst->memData);
@@ -756,6 +818,7 @@ LSQUnit<Impl>::read(const RequestPtr &req,
     DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n",
             load_inst->seqNum, load_inst->pcState());
 
+
     // Allocate memory if this is the first time a load is issued.
     if (!load_inst->memData) {
         load_inst->memData = new uint8_t[req->getSize()];
@@ -763,10 +826,35 @@ LSQUnit<Impl>::read(const RequestPtr &req,
 
     // if we the cache is not blocked, do cache access
     bool completedFirst = false;
-    PacketPtr data_pkt = Packet::createRead(req);
+
+    PacketPtr data_pkt = NULL;
     PacketPtr fst_data_pkt = NULL;
     PacketPtr snd_data_pkt = NULL;
 
+    // Create the corresponding type of packet according to the
+    // isInvisibleSpec variable [mengjia]
+    bool sendSpecRead = false;
+    if(isInvisibleSpec){
+        if(!load_inst->readyToExpose()){
+            assert(!req->isLLSC());
+            assert(!req->isStrictlyOrdered());
+            assert(!req->isMmappedIpr());
+            sendSpecRead = true;
+            DPRINTF(LSQUnit, "send a spec read for inst [sn:%lli]\n",
+                    load_inst->seqNum);
+        }
+
+    }
+
+    assert( !(sendSpecRead && load_inst->isSpecCompleted()) &&
+            "Sending specRead twice for the same load insts");
+
+    if(sendSpecRead){
+       data_pkt = Packet::createReadSpec(req);
+    }else{
+        data_pkt = Packet::createRead(req);
+    }
+
     data_pkt->dataStatic(load_inst->memData);
 
     LSQSenderState *state = new LSQSenderState;
@@ -778,17 +866,64 @@ LSQUnit<Impl>::read(const RequestPtr &req,
     if (!TheISA::HasUnalignedMemAcc || !sreqLow) {
         // Point the first packet at the main data packet.
         fst_data_pkt = data_pkt;
+
+        fst_data_pkt->setFirst();
+        if (sendSpecRead){
+            int src_idx = checkSpecBuffHit(req, load_idx);
+            if (src_idx != -1) {
+                if (allowSpecBuffHit){
+                    data_pkt->setOnlyAccessSpecBuff();
+                }
+                data_pkt->srcIdx = src_idx;
+                specBuffHits++;
+            }else{
+                specBuffMisses++;
+            }
+        }
+        fst_data_pkt->reqIdx = load_idx;
     } else {
         // Create the split packets.
-        fst_data_pkt = Packet::createRead(sreqLow);
-        snd_data_pkt = Packet::createRead(sreqHigh);
+        if(sendSpecRead){
+
+            fst_data_pkt = Packet::createReadSpec(sreqLow);
+            int fst_src_idx = checkSpecBuffHit(sreqLow, load_idx);
+            if ( fst_src_idx != -1 ) {
+                if (allowSpecBuffHit){
+                    fst_data_pkt->setOnlyAccessSpecBuff();
+                }
+                fst_data_pkt->srcIdx = fst_src_idx;
+                specBuffHits++;
+            } else {
+                specBuffMisses++;
+            }
+
+            snd_data_pkt = Packet::createReadSpec(sreqHigh);
+            int snd_src_idx = checkSpecBuffHit(sreqHigh, load_idx);
+            if ( snd_src_idx != -1 ) {
+                if (allowSpecBuffHit){
+                    snd_data_pkt->setOnlyAccessSpecBuff();
+                }
+                snd_data_pkt->srcIdx = snd_src_idx;
+                specBuffHits++;
+            } else {
+                specBuffMisses++;
+            }
+        }else{
+            fst_data_pkt = Packet::createRead(sreqLow);
+            snd_data_pkt = Packet::createRead(sreqHigh);
+        }
 
+        fst_data_pkt->setFirst();
         fst_data_pkt->dataStatic(load_inst->memData);
         snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize());
 
         fst_data_pkt->senderState = state;
         snd_data_pkt->senderState = state;
+        fst_data_pkt->reqIdx = load_idx;
+        snd_data_pkt->reqIdx = load_idx;
 
+        fst_data_pkt->isSplit = true;
+        snd_data_pkt->isSplit = true;
         state->isSplit = true;
         state->outstanding = 2;
         state->mainPkt = data_pkt;
@@ -800,6 +935,8 @@ LSQUnit<Impl>::read(const RequestPtr &req,
     // @todo We should account for cache port contention
     // and arbitrate between loads and stores.
     bool successful_load = true;
+    // MARK: here is the place memory request of read is sent [mengjia]
+    // [InvisiSpec] Sending out a memory request
     if (!dcachePort->sendTimingReq(fst_data_pkt)) {
         successful_load = false;
     } else if (TheISA::HasUnalignedMemAcc && sreqLow) {
@@ -856,6 +993,62 @@ LSQUnit<Impl>::read(const RequestPtr &req,
         return NoFault;
     }
 
+    DPRINTF(LSQUnit, "successfully sent out packet(s) for inst [sn:%lli]\n",
+            load_inst->seqNum);
+    // Set everything ready for expose/validation after the read is
+    // successfully sent out
+    if(sendSpecRead){ // sending actual request
+
+            // [mengjia] Here we set the needExposeOnly flag
+            if (needsTSO && !load_inst->isDataPrefetch()){
+                // need to check whether previous load_instructions specComplete or not
+                if ( checkPrevLoadsExecuted(load_idx) ){
+                    load_inst->needExposeOnly(true);
+                    DPRINTF(LSQUnit, "Set load PC %s, [sn:%lli] as "
+                            "needExposeOnly\n",
+                        load_inst->pcState(), load_inst->seqNum);
+                } else {
+                    DPRINTF(LSQUnit, "Set load PC %s, [sn:%lli] as "
+                            "needValidation\n",
+                        load_inst->pcState(), load_inst->seqNum);
+                }
+            }else{
+                //if RC, always only need expose
+                load_inst->needExposeOnly(true);
+                DPRINTF(LSQUnit, "Set load PC %s, [sn:%lli] as needExposeOnly\n",
+                    load_inst->pcState(), load_inst->seqNum);
+            }
+
+            load_inst->needPostFetch(true);
+            assert(!req->isMmappedIpr());
+            //save expose requestPtr
+            if (TheISA::HasUnalignedMemAcc && sreqLow) {
+                load_inst->postSreqLow = std::make_shared<Request>(*sreqLow);
+                load_inst->postSreqHigh = std::make_shared<Request>(*sreqHigh);
+                load_inst->postReq = nullptr;
+            }else{
+                load_inst->postReq = std::make_shared<Request>(*req);
+                load_inst->postSreqLow = nullptr;
+                load_inst->postSreqHigh = nullptr;
+            }
+            load_inst->needDeletePostReq(true);
+            DPRINTF(LSQUnit, "created validation/expose"
+                    " request for inst [sn:%lli]"
+                    "req=%#x, reqLow=%#x, reqHigh=%#x\n",
+                load_inst->seqNum, (Addr)(load_inst->postReq),
+                (Addr)(load_inst->postSreqLow),
+                (Addr)(load_inst->postSreqHigh));
+    } else {
+        load_inst->setExposeCompleted();
+        load_inst->needPostFetch(false);
+        if (TheISA::HasUnalignedMemAcc && sreqLow) {
+            setSpecBuffState(sreqLow);
+            setSpecBuffState(sreqHigh);
+        } else {
+            setSpecBuffState(req);
+        }
+    }
+
     return NoFault;
 }
 
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 13b148768..9b0883a23 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -89,6 +89,9 @@ LSQUnit<Impl>::WritebackEvent::description() const
     return "Store writeback";
 }
 
+
+// [InvisiSpec] This function deals with
+// acknowledge response to memory read/write
 template<class Impl>
 void
 LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
@@ -107,6 +110,17 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
         return;
     }
 
+    // need to update hit info for corresponding instruction
+    if (pkt->isL1Hit() && pkt->isSpec() && pkt->isRead()){
+        if (state->isSplit && ! pkt->isFirst()){
+            inst->setL1HitHigh();
+        } else {
+            inst->setL1HitLow();
+        }
+    } else if (!pkt->isSpec()) {
+        setSpecBuffState(pkt->req);
+    }
+
     // If this is a split access, wait until all packets are received.
     if (TheISA::HasUnalignedMemAcc && !state->complete()) {
         return;
@@ -117,7 +131,9 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
         if (!state->noWB) {
             // Only loads and store conditionals perform the writeback
             // after receving the response from the memory
+            // [mengjia] validation also needs writeback, expose does not need it
             assert(inst->isLoad() || inst->isStoreConditional());
+
             if (!TheISA::HasUnalignedMemAcc || !state->isSplit ||
                 !state->isLoad) {
                 writeback(inst, pkt);
@@ -129,6 +145,10 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
         if (inst->isStore()) {
             completeStore(state->idx);
         }
+
+        if (pkt->isValidate() || pkt->isExpose()) {
+            completeValidate(inst, pkt);
+        }
     }
 
     if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) {
@@ -136,6 +156,7 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt)
     }
 
     pkt->req->setAccessLatency();
+    // probe point, not sure about the mechanism [mengjia]
     cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt));
 
     delete state;
@@ -145,8 +166,8 @@ template <class Impl>
 LSQUnit<Impl>::LSQUnit(uint32_t lqEntries, uint32_t sqEntries)
     : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1),
       LQEntries(lqEntries+1), SQEntries(sqEntries+1),
-      loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
-      isStoreBlocked(false), storeInFlight(false), hasPendingPkt(false),
+      loads(0), loadsToVLD(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false),
+      isStoreBlocked(false), isValidationBlocked(false), storeInFlight(false), hasPendingPkt(false),
       pendingPkt(nullptr)
 {
 }
@@ -168,7 +189,52 @@ LSQUnit<Impl>::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params,
     depCheckShift = params->LSQDepCheckShift;
     checkLoads = params->LSQCheckLoads;
     cacheStorePorts = params->cacheStorePorts;
+
+    // According to the scheme, we need to define actions as follows.
+    // loadInExec: if False, no packets are sent in execution stage;
+    //             if True, send either readReq or readSpecReq
+    // isInvisibleSpec: if True, send readSpecReq in execution stage;
+    //                  if False, send readReq
+    // needsTSO: if True, squash read on receiving invalidations, and only allow one outstanding write at a time;
+    //           if False, no squash on receiving invalidation, and allow multiple outstanding writes.
+    // isFuturistic: if True, react after all preceding instructions complete/no exception;
+    //               if False, react only after all preceding stores/branches complete
+    const std::string scheme = params->simulateScheme;
+    if (scheme.compare("UnsafeBaseline")==0){
+        loadInExec = true;
+        isInvisibleSpec = false; // send real request
+        isFuturistic = false; // not relevant in unsafe mode.
+    }else if (scheme.compare("FuturisticSafeFence")==0){
+        // "LFENCE" before every load
+        loadInExec = false;
+        isInvisibleSpec = false; // not used since loadInExec is false
+        isFuturistic = true; // send readReq at head of ROB
+    }else if (scheme.compare("FuturisticSafeInvisibleSpec")==0){
+        // only make load visible when all preceding instructions
+        // complete and no exception
+        loadInExec = true;
+        isInvisibleSpec = true; // send request but not change cache state
+        isFuturistic = true; // conservative condition to send validations
+    }else if (scheme.compare("SpectreSafeFence")==0){
+        // "LFENCE" after every branch
+        loadInExec = false;
+        isInvisibleSpec = false; // not used since loadInExec is false
+        isFuturistic = false; // commit when preceding branches are resolved
+    }else if (scheme.compare("SpectreSafeInvisibleSpec")==0){
+        // make load visible when all preceding branches are resolved
+        loadInExec = true;
+        isInvisibleSpec = true; // send request but not change cache state
+        isFuturistic = false; // only deal with spectre attacks
+    }else {
+        cprintf("ERROR: unsupported simulation scheme: %s!\n", scheme);
+        exit(1);
+    }
     needsTSO = params->needsTSO;
+    allowSpecBuffHit = params->allowSpecBuffHit;
+    cprintf("Info: simulation uses scheme: %s; "
+                "needsTSO=%d; allowSpecBuffHit=%d\n",
+                scheme, needsTSO, allowSpecBuffHit);
+    // [mengjia] end of setting configuration variables
 
     resetState();
 }
@@ -178,7 +244,7 @@ template<class Impl>
 void
 LSQUnit<Impl>::resetState()
 {
-    loads = stores = storesToWB = 0;
+    loads = stores = loadsToVLD = storesToWB = 0;
 
     loadHead = loadTail = 0;
 
@@ -248,6 +314,26 @@ LSQUnit<Impl>::regStats()
     lsqCacheBlocked
         .name(name() + ".cacheBlocked")
         .desc("Number of times an access to memory failed due to the cache being blocked");
+
+    specBuffHits
+        .name(name() + ".specBuffHits")
+        .desc("Number of times an access hits in speculative buffer");
+
+    specBuffMisses
+        .name(name() + ".specBuffMisses")
+        .desc("Number of times an access misses in speculative buffer");
+
+    numValidates
+        .name(name() + ".numValidates")
+        .desc("Number of validates sent to cache");
+
+    numExposes
+        .name(name() + ".numExposes")
+        .desc("Number of exposes sent to cache");
+
+    numConvertedExposes
+        .name(name() + ".numConvertedExposes")
+        .desc("Number of exposes converted from validation");
 }
 
 template<class Impl>
@@ -279,6 +365,7 @@ LSQUnit<Impl>::drainSanityCheck() const
         assert(!loadQueue[i]);
 
     assert(storesToWB == 0);
+    assert(loadsToVLD == 0);
     assert(!retryPkt);
 }
 
@@ -367,6 +454,7 @@ LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst)
     incrLdIdx(loadTail);
 
     ++loads;
+
 }
 
 template <class Impl>
@@ -390,6 +478,7 @@ LSQUnit<Impl>::insertStore(const DynInstPtr &store_inst)
     ++stores;
 }
 
+// It is an empty function? why? [mengjia]
 template <class Impl>
 typename Impl::DynInstPtr
 LSQUnit<Impl>::getMemDepViolator()
@@ -450,13 +539,16 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
         Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask;
 
         // Check that this snoop didn't just invalidate our lock flag
-        if (ld_inst->effAddrValid() && (load_addr_low == invalidate_addr
-                                        || load_addr_high == invalidate_addr)
+        // [InvisiSpec] also make sure the instruction has been sent out
+        // otherwise, we cause unnecessary squash
+        if (ld_inst->effAddrValid() && !ld_inst->fenceDelay()
+                && (load_addr_low == invalidate_addr
+                    || load_addr_high == invalidate_addr)
             && ld_inst->memReqFlags & Request::LLSC)
             TheISA::handleLockedSnoopHit(ld_inst.get());
     }
 
-    // If this is the only load in the LSQ we don't care
+    // If not match any load entry, then do nothing [mengjia]
     if (load_idx == loadTail)
         return;
 
@@ -467,7 +559,10 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
     while (load_idx != loadTail) {
         DynInstPtr ld_inst = loadQueue[load_idx];
 
-        if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
+        // [InvisiSpec] check snoop violation only when the load has
+        // been sent out; otherwise, unnecessary squash
+        if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()
+                || ld_inst->fenceDelay()) {
             incrLdIdx(load_idx);
             continue;
         }
@@ -485,11 +580,29 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
                 // all other loads, this load as well as *all* subsequent loads
                 // need to be squashed to prevent possible load reordering.
                 force_squash = true;
+
+                // [InvisiSpec] in InvisiSpec, we do not need to squash
+                // the load at the head of LQ,
+                // or a load that does not need validation
+                if (isInvisibleSpec &&
+                    (load_idx==loadHead || ld_inst->needExposeOnly())){
+                    force_squash = false;
+                }
+                if (!pkt->isExternalEviction() && isInvisibleSpec){
+                    force_squash = false;
+                    ld_inst->clearL1HitHigh();
+                    ld_inst->clearL1HitLow();
+                }
             }
             if (ld_inst->possibleLoadViolation() || force_squash) {
                 DPRINTF(LSQUnit, "Conflicting load at addr %#x [sn:%lli]\n",
                         pkt->getAddr(), ld_inst->seqNum);
 
+                //[InvisiSpec] mark the load hit invalidation
+                ld_inst->hitInvalidation(true);
+                if (pkt->isExternalEviction()){
+                    ld_inst->hitExternalEviction(true);
+                }
                 // Mark the load for re-execution
                 ld_inst->fault = std::make_shared<ReExec>();
             } else {
@@ -514,6 +627,103 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt)
 }
 
 template <class Impl>
+bool
+LSQUnit<Impl>::checkPrevLoadsExecuted(int req_idx)
+{
+    // Scan the LQ from the head up to, but not including, req_idx.
+    // The answer stays true only while every load seen so far
+    // reports isExecuted(); the first one that does not ends the
+    // scan with false.
+    bool all_executed = true;
+    for (int idx = loadHead; all_executed && idx != req_idx;
+         incrLdIdx(idx)) {
+        all_executed = loadQueue[idx]->isExecuted();
+    }
+
+    // True when no older load is still unexecuted (also when
+    // req_idx is the head and there is nothing to scan).
+    return all_executed;
+}
+
+template <class Impl>
+void
+LSQUnit<Impl>::setSpecBuffState(RequestPtr expose_req)
+{
+    // Cache-block address of the access to compare against.
+    const Addr target_blk = expose_req->getPaddr() & cacheBlockMask;
+
+    // Walk every LQ entry from head to tail; entries without a valid
+    // effective address are skipped.
+    for (int idx = loadHead; idx != loadTail; incrLdIdx(idx)) {
+        DynInstPtr entry = loadQueue[idx];
+        if (!entry->effAddrValid())
+            continue;
+
+        // Low half is tested first; high is only tried if low differs.
+        if ((entry->physEffAddrLow & cacheBlockMask) == target_blk) {
+            entry->setSpecBuffObsoleteLow();
+        } else if ((entry->physEffAddrHigh & cacheBlockMask) == target_blk) {
+            entry->setSpecBuffObsoleteHigh();
+        }
+    }
+}
+
+
+template <class Impl>
+int
+LSQUnit<Impl>::checkSpecBuffHit(RequestPtr req, int req_idx)
+{
+    // Returns the LQ index of a load whose block matches req, or -1.
+    Addr req_eff_addr1 = req->getPaddr() & cacheBlockMask;
+    //Addr req_eff_addr2 = (req->getPaddr() + req->getSize()-1) & cacheBlockMask;
+    // the req should be within the same cache line
+    //assert (req_eff_addr1 == req_eff_addr2);
+    assert (!loadQueue[req_idx]->isExecuted());
+
+    int load_idx = loadHead;
+    // Scan LQ entries from head to tail; the first decisive entry wins.
+    while (load_idx != loadTail){
+        DynInstPtr ld_inst = loadQueue[load_idx];
+        if (ld_inst->effAddrValid()){
+            Addr ld_eff_addr1 = ld_inst->physEffAddrLow & cacheBlockMask;
+            Addr ld_eff_addr2 = ld_inst->physEffAddrHigh & cacheBlockMask;
+            // A matching half that is flagged as an L1 hit ends the search.
+            if ((req_eff_addr1 == ld_eff_addr1 && ld_inst->isL1HitLow())
+                || (req_eff_addr1 == ld_eff_addr2 && ld_inst->isL1HitHigh())){
+                return -1;
+                // already in L1, do not copy from the spec buffer
+            } else {
+                // Source must be executed, need post-fetch, unsquashed, no fault.
+                if (ld_inst->isExecuted() && ld_inst->needPostFetch()
+                    && !ld_inst->isSquashed() && ld_inst->fault==NoFault){
+                    if (req_eff_addr1 == ld_eff_addr1 && !ld_inst->isL1HitLow()
+                            && !ld_inst->isSpecBuffObsoleteLow()){
+                        DPRINTF(LSQUnit, "Detected Spec Hit with inst [sn:%lli] "
+                            "and [sn:%lli] (low) at address %#x\n",
+                            loadQueue[req_idx]->seqNum, ld_inst->seqNum,
+                            req_eff_addr1);
+                        return load_idx;
+                    } else if ( ld_eff_addr2 !=0  &&
+                        req_eff_addr1 == ld_eff_addr2 && !ld_inst->isL1HitHigh()
+                        && !ld_inst->isSpecBuffObsoleteHigh()){
+                        DPRINTF(LSQUnit, "Detected Spec Hit with inst [sn:%lli] "
+                            "and [sn:%lli] (high) at address %#x\n",
+                            loadQueue[req_idx]->seqNum, ld_inst->seqNum,
+                            req_eff_addr1);
+                        return load_idx;
+                    }
+                }
+            }
+        }
+        incrLdIdx(load_idx);
+    }
+    // No entry qualified as a spec-buffer source.
+    return -1;
+}
+
+
+
+template <class Impl>
 Fault
 LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst)
 {
@@ -527,7 +737,10 @@ LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst)
      */
     while (load_idx != loadTail) {
         DynInstPtr ld_inst = loadQueue[load_idx];
-        if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) {
+        // [InvisiSpec] no need to check violation for unsent load
+        // otherwise, unnecessary squash
+        if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()
+                || ld_inst->fenceDelay()) {
             incrLdIdx(load_idx);
             continue;
         }
@@ -606,14 +819,25 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst)
 
     assert(!inst->isSquashed());
 
+    // use ISA interface to generate correct access request
+    // initiateAcc is implemented in dyn_inst_impl.hh
+    // The interface calls corresponding ISA defined function
+    // check build/ARM/arch/generic/memhelpers.hh for more info [mengjia]
     load_fault = inst->initiateAcc();
 
-    if (inst->isTranslationDelayed() &&
+    // if translation delay, deferMem [mengjia]
+    // in the case it is not the correct time to send the load
+    // also defer it
+    if ( (inst->isTranslationDelayed() || inst->fenceDelay()
+                || inst->specTLBMiss()) &&
         load_fault == NoFault)
         return load_fault;
 
     // If the instruction faulted or predicated false, then we need to send it
     // along to commit without the instruction completing.
+    //
+    // if it is faulty, do not execute it; send it to commit, and the commit stage will deal with it
+    // here we are signaling the ROB that the inst can commit [mengjia]
     if (load_fault != NoFault || !inst->readPredicate()) {
         // Send this instruction to commit, also make sure iew stage
         // realizes there is activity.  Mark it as executed unless it
@@ -661,6 +885,8 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst)
     // address.  If so, then we have a memory ordering violation.
     int load_idx = store_inst->lqIdx;
 
+    // TODO: Check whether this store tries to get an exclusive copy 
+    // of target line [mengjia]
     Fault store_fault = store_inst->initiateAcc();
 
     if (store_inst->isTranslationDelayed() &&
@@ -759,15 +985,343 @@ LSQUnit<Impl>::writebackPendingStore()
     if (hasPendingPkt) {
         assert(pendingPkt != NULL);
 
-        // If the cache is blocked, this will store the packet for retry.
-        if (sendStore(pendingPkt)) {
-            storePostSend(pendingPkt);
+        if(pendingPkt->isWrite()){
+            // If the cache is blocked, this will store the packet for retry.
+            if (sendStore(pendingPkt)) {
+                storePostSend(pendingPkt);
+            }
+            pendingPkt = NULL;
+            hasPendingPkt = false;
         }
-        pendingPkt = NULL;
-        hasPendingPkt = false;
     }
 }
 
+
+// [InvisiSpec] Refresh the fenceDelay and readyToExpose flags of every load
+// in the load queue. loadInExec and isInvisibleSpec pick which flag is
+// driven; isFuturistic picks which predecessor condition is consulted.
+template <class Impl>
+void
+LSQUnit<Impl>::updateVisibleState()
+{
+    int load_idx = loadHead;
+
+    // Walk the loads from loadHead towards loadTail, updating each in turn.
+    while (load_idx != loadTail && loadQueue[load_idx]){
+        DynInstPtr inst = loadQueue[load_idx];
+
+        if (!loadInExec){
+            // fenceDelay is held until the older insts/branches have committed.
+            if ( (isFuturistic && inst->isPrevInstsCommitted()) ||
+                    (!isFuturistic && inst->isPrevBrsCommitted())){
+                if (inst->fenceDelay()){
+                    DPRINTF(LSQUnit, "Clear virtual fence for "
+                            "inst [sn:%lli] PC %s\n",
+                        inst->seqNum, inst->pcState());
+                }
+                inst->fenceDelay(false);
+            }else {
+                if (!inst->fenceDelay()){
+                    DPRINTF(LSQUnit, "Deffering an inst [sn:%lli] PC %s"
+                            " due to virtual fence\n",
+                        inst->seqNum, inst->pcState());
+                }
+                inst->fenceDelay(true);
+            }
+            inst->readyToExpose(true);
+        } else if (loadInExec && isInvisibleSpec){
+            // No fenceDelay; readyToExpose tracks older insts/branches instead.
+            if ( (isFuturistic && inst->isPrevInstsCompleted()) ||
+                    (!isFuturistic && inst->isPrevBrsResolved())){
+                if (!inst->readyToExpose()){
+                    DPRINTF(LSQUnit, "Set readyToExpose for "
+                            "inst [sn:%lli] PC %s\n",
+                        inst->seqNum, inst->pcState());
+                    if (inst->needPostFetch()){
+                        ++loadsToVLD;
+                    }
+                }
+                inst->readyToExpose(true);
+            }else {
+                if (inst->readyToExpose()){
+                    DPRINTF(LSQUnit, "The load can not be validated "
+                            "[sn:%lli] PC %s\n",
+                        inst->seqNum, inst->pcState());
+                    assert(0);
+                    //--loadsToVLD;
+                }
+                inst->readyToExpose(false);
+            }
+            inst->fenceDelay(false);
+        } else {
+            inst->readyToExpose(true);
+            inst->fenceDelay(false);
+        }
+        incrLdIdx(load_idx);
+    }
+}
+
+// [InvisiSpec] Send pending validate/expose requests; returns loads handled.
+template <class Impl>
+int
+LSQUnit<Impl>::exposeLoads()
+{
+    if(!isInvisibleSpec){
+        assert(loadsToVLD==0
+            && "request validation on Non invisible Spec mode");
+    }
+
+    int old_loadsToVLD = loadsToVLD;
+
+    // [InvisiSpec] Note:
+    // We need to iterate from the head every time,
+    // since loads can be exposed out of order.
+    int loadVLDIdx = loadHead;
+
+    while (loadsToVLD > 0 &&
+        loadVLDIdx != loadTail &&
+        loadQueue[loadVLDIdx]) {
+
+        if (loadQueue[loadVLDIdx]->isSquashed()){
+            incrLdIdx(loadVLDIdx);
+            continue;
+        }
+        // Skip loads that either do not need an expose
+        // or have already had theirs sent.
+        if(!loadQueue[loadVLDIdx]->needPostFetch()
+                || loadQueue[loadVLDIdx]->isExposeSent() ){
+            incrLdIdx(loadVLDIdx);
+            continue;
+        }
+
+        DynInstPtr load_inst = loadQueue[loadVLDIdx];
+        if (loadQueue[loadVLDIdx]->fault!=NoFault){
+            // The load has a fault: mark its expose as sent and completed
+            // regardless of whether it is ready to expose and, if it has
+            // already executed, send it straight to commit.
+            load_inst->setExposeCompleted();
+            load_inst->setExposeSent();
+            loadsToVLD--;
+            if (load_inst->isExecuted()){
+                DPRINTF(LSQUnit, "Execute finished and gets violation fault."
+                    "Send inst [sn:%lli] to commit stage.\n",
+                    load_inst->seqNum);
+                    iewStage->instToCommit(load_inst);
+                    iewStage->activityThisCycle();
+            }
+            incrLdIdx(loadVLDIdx);
+            continue;
+        }
+
+        // Skip loads that need an expose but
+        // are not ready for it yet.
+        if (loadQueue[loadVLDIdx]->needPostFetch()
+                && !loadQueue[loadVLDIdx]->readyToExpose()){
+            incrLdIdx(loadVLDIdx);
+            continue;
+        }
+
+        assert(loadQueue[loadVLDIdx]->needPostFetch()
+                && loadQueue[loadVLDIdx]->readyToExpose() );
+
+        assert(!load_inst->isCommitted());
+
+        // Requests stored on the inst: one whole request or two split halves.
+        RequestPtr req = load_inst->postReq;
+        RequestPtr sreqLow = load_inst->postSreqLow;
+        RequestPtr sreqHigh = load_inst->postSreqHigh;
+
+        // req and sreqLow must not both be non-NULL.
+        assert( !(req && sreqLow));
+
+        DPRINTF(LSQUnit, "Validate/Expose request for inst [sn:%lli]"
+            " PC= %s. req=%#x, reqLow=%#x, reqHigh=%#x\n",
+            load_inst->seqNum, load_inst->pcState(),
+            (Addr)(load_inst->postReq),
+            (Addr)(load_inst->postSreqLow), (Addr)(load_inst->postSreqHigh));
+
+        PacketPtr data_pkt = NULL;
+        PacketPtr snd_data_pkt = NULL;
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = false;
+        state->idx = loadVLDIdx;
+        state->inst = load_inst;
+        state->noWB = true;
+
+        bool split = false;
+        if (TheISA::HasUnalignedMemAcc && sreqLow) {
+            split = true;
+        } else {
+            assert(req);
+        }
+
+        bool onlyExpose = false;
+        if (!split) {
+            if (load_inst->needExposeOnly() || load_inst->isL1HitLow()){
+                data_pkt = Packet::createExpose(req);
+                onlyExpose = true;
+            }else {
+                data_pkt = Packet::createValidate(req);
+                if (!load_inst->vldData)
+                    load_inst->vldData = new uint8_t[1];
+                data_pkt->dataStatic(load_inst->vldData);
+            }
+            data_pkt->senderState = state;
+            data_pkt->setFirst();
+            data_pkt->reqIdx = loadVLDIdx;
+            DPRINTF(LSQUnit, "contextid = %d\n", req->contextId());
+        } else {
+            // Allocate the result bytes if at least one half needs validation.
+            if (!load_inst->needExposeOnly() &&
+                (!load_inst->isL1HitLow() || !load_inst->isL1HitHigh())){
+                if (!load_inst->vldData)
+                    load_inst->vldData = new uint8_t[2];
+            } else {
+                onlyExpose = true;
+            }
+
+            // Create the split packets: first (low) one.
+            if (load_inst->needExposeOnly() || load_inst->isL1HitLow()){
+                data_pkt = Packet::createExpose(sreqLow);
+            }else{
+                data_pkt = Packet::createValidate(sreqLow);
+                assert(load_inst->vldData);
+                data_pkt->dataStatic(load_inst->vldData);
+            }
+
+            // Create the split packets: second (high) one.
+            if (load_inst->needExposeOnly() || load_inst->isL1HitHigh()){
+                snd_data_pkt = Packet::createExpose(sreqHigh);
+            } else {
+                snd_data_pkt = Packet::createValidate(sreqHigh);
+                assert(load_inst->vldData);
+                snd_data_pkt->dataStatic(&(load_inst->vldData[1]));
+            }
+
+            data_pkt->senderState = state;
+            data_pkt->setFirst();
+            snd_data_pkt->senderState = state;
+            data_pkt->reqIdx = loadVLDIdx;
+            snd_data_pkt->reqIdx = loadVLDIdx;
+
+            data_pkt->isSplit = true;
+            snd_data_pkt->isSplit = true;
+            state->isSplit = true;
+            state->outstanding = 2;
+            state->mainPkt = data_pkt;
+
+            DPRINTF(LSQUnit, "contextid = %d, %d\n",
+                    sreqLow->contextId(), sreqHigh->contextId());
+            req = sreqLow;
+        }
+
+        assert(!req->isStrictlyOrdered());
+        assert(!req->isMmappedIpr());
+
+        DPRINTF(LSQUnit, "D-Cache: Validating/Exposing load idx:%i PC:%s "
+                "to Addr:%#x, data:%#x [sn:%lli]\n",
+                loadVLDIdx, load_inst->pcState(),
+                //FIXME: resultData not memData
+                req->getPaddr(), (int)*(load_inst->memData),
+                load_inst->seqNum);
+
+        bool successful_expose = true;
+        bool completedFirst = false;
+
+        if (!dcachePort->sendTimingReq(data_pkt)){
+            DPRINTF(IEW, "D-Cache became blocked when "
+                "validating [sn:%lli], will retry later\n",
+                load_inst->seqNum);
+            successful_expose = false;
+        } else {
+            if (split) {
+                // If split, try to send the second packet too.
+                completedFirst = true;
+                assert(snd_data_pkt);
+
+                if (!dcachePort->sendTimingReq(snd_data_pkt)){
+                    state->complete();
+                    state->cacheBlocked = true;
+                    successful_expose = false;
+                    DPRINTF(IEW, "D-Cache became blocked when validating"
+                        " [sn:%lli] second packet, will retry later\n",
+                        load_inst->seqNum);
+                }
+            }
+        }
+
+        if (!successful_expose){
+            if (!split) {
+                delete state;
+                delete data_pkt;
+            }else{
+                if (!completedFirst){
+                    delete state;
+                    delete data_pkt;
+                    delete snd_data_pkt;
+                } else {
+                    delete snd_data_pkt;
+                }
+            }
+            //cpu->wakeCPU();  // This will cause issue(wrong activity count and affects the memory transactions
+            ++lsqCacheBlocked;
+            break;
+        } else {
+            // Here is to fix memory leakage
+            // it is ugly, but we have to do it now.
+            load_inst->needDeletePostReq(false);
+
+            // If every packet we sent out was an expose (no validate),
+            // we assume the expose has already completed.
+            if (onlyExpose) {
+                load_inst->setExposeCompleted();
+                numExposes++;
+            } else {
+                numValidates++;
+            }
+            if (load_inst->needExposeOnly()){
+                numConvertedExposes++;
+            }
+            if (load_inst->isExecuted() && load_inst->isExposeCompleted()
+                    && !load_inst->isSquashed()){
+                DPRINTF(LSQUnit, "Expose finished. Execution done."
+                    "Send inst [sn:%lli] to commit stage.\n",
+                    load_inst->seqNum);
+                    iewStage->instToCommit(load_inst);
+                    iewStage->activityThisCycle();
+            } else{
+                DPRINTF(LSQUnit, "Need validation or execution not finishes."
+                    "Need to wait for readResp/validateResp "
+                    "for inst [sn:%lli].\n",
+                    load_inst->seqNum);
+            }
+
+            load_inst->setExposeSent();
+            --loadsToVLD;
+            incrLdIdx(loadVLDIdx);
+            if (!split){
+                setSpecBuffState(req);
+            } else {
+                setSpecBuffState(sreqLow);
+                setSpecBuffState(sreqHigh);
+            }
+        }
+    }
+
+    DPRINTF(LSQUnit, "Send validate/expose for %d insts. loadsToVLD=%d"
+            ". loadHead=%d. loadTail=%d.\n",
+            old_loadsToVLD-loadsToVLD, loadsToVLD, loadHead,
+            loadTail);
+
+    assert(loads>=0 && loadsToVLD >= 0);
+
+    return old_loadsToVLD-loadsToVLD;
+}
+
+
+
+
 template <class Impl>
 void
 LSQUnit<Impl>::writebackStores()
@@ -787,7 +1341,7 @@ LSQUnit<Impl>::writebackStores()
 
         if (isStoreBlocked) {
             DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
-                    " is blocked!\n");
+                    " is blocked on stores!\n");
             break;
         }
 
@@ -997,6 +1551,12 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
             stallingLoadIdx = 0;
         }
 
+        if (loadQueue[load_idx]->needPostFetch() &&
+                loadQueue[load_idx]->readyToExpose() &&
+                !loadQueue[load_idx]->isExposeSent()){
+            loadsToVLD --;
+        }
+
         // Clear the smart pointer to make sure it is decremented.
         loadQueue[load_idx]->setSquashed();
         loadQueue[load_idx] = NULL;
@@ -1007,6 +1567,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
 
         decrLdIdx(load_idx);
         ++lsqSquashedLoads;
+
     }
 
     if (memDepViolator && squashed_num < memDepViolator->seqNum) {
@@ -1061,6 +1622,10 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
     }
 }
 
+
+// After the store is sent, we assume it is complete;
+// thus, we can wake up and forward data.
+// In TSO, mark storeInFlight as true to block following stores [mengjia]
 template <class Impl>
 void
 LSQUnit<Impl>::storePostSend(PacketPtr pkt)
@@ -1090,9 +1655,58 @@ LSQUnit<Impl>::storePostSend(PacketPtr pkt)
         storeInFlight = true;
     }
 
+    DPRINTF(LSQUnit, "Post sending store for inst [sn:%lli]\n",
+            storeQueue[storeWBIdx].inst->seqNum);
     incrStIdx(storeWBIdx);
 }
 
+// [InvisiSpec] Handle a validate response for a load: if validation failed
+// attach a ReExec fault; mark the expose complete and, if executed, commit.
+template <class Impl>
+void
+LSQUnit<Impl>::completeValidate(DynInstPtr &inst, PacketPtr pkt)
+{
+    iewStage->wakeCPU();
+    // If the expose already completed or the load was squashed there is
+    // nothing left to check; return directly.
+    //assert(!inst->needExposeOnly());
+    if (inst->isExposeCompleted() || inst->isSquashed()){
+        //assert(inst->fault != NoFault);
+        //Already sent to commit, do nothing
+        return;
+    }
+    // Check the validation result; a zero vldData byte means failure.
+    bool validation_fail = false;
+    if (!inst->isL1HitLow() && inst->vldData[0]==0) {
+        validation_fail = true;
+    } else {
+        if (pkt->isSplit && !inst->isL1HitHigh()
+            && inst->vldData[1]==0){
+            validation_fail = true;
+        }
+    }
+    if (validation_fail){
+        // Mark the load for re-execution
+        inst->fault = std::make_shared<ReExec>();
+        inst->validationFail(true);
+        DPRINTF(LSQUnit, "Validation failed for inst [sn:%lli].\n",
+                inst->seqNum);
+    }
+
+    inst->setExposeCompleted();
+    if ( inst->isExecuted() && inst->isExposeCompleted() ){
+        DPRINTF(LSQUnit, "Validation finished. Execution done."
+            "Send inst [sn:%lli] to commit stage.\n",
+            inst->seqNum);
+            iewStage->instToCommit(inst);
+            iewStage->activityThisCycle();
+    } else{
+        DPRINTF(LSQUnit, "Validation done. Execution not finishes."
+            "Need to wait for readResp for inst [sn:%lli].\n",
+            inst->seqNum);
+    }
+}
+
 template <class Impl>
 void
 LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
@@ -1106,6 +1720,11 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
         return;
     }
 
+    //DPRINTF(LSQUnit, "write back for inst [sn:%lli]\n", inst->seqNum);
+    assert(!(inst->isExecuted() && inst->isExposeCompleted() &&
+                inst->fault==NoFault) &&
+            "in this case, we will put it into ROB twice.");
+
     if (!inst->isExecuted()) {
         inst->setExecuted();
 
@@ -1125,8 +1744,42 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
         }
     }
 
-    // Need to insert instruction into queue to commit
-    iewStage->instToCommit(inst);
+    // [mengjia]
+    // check schemes to decide whether to set load can be committed
+    // on receiving readResp or readSpecResp
+    if(!isInvisibleSpec){
+        // if not invisibleSpec mode, we only receive readResp
+        assert(!pkt->isSpec() && !pkt->isValidate() &&
+                "Receiving spec or validation response "
+                "in non invisibleSpec mode");
+        iewStage->instToCommit(inst);
+    } else if (inst->fault != NoFault){
+        inst->setExposeCompleted();
+        inst->setExposeSent();
+        iewStage->instToCommit(inst);
+    } else {
+        //isInvisibleSpec == true
+        if (pkt->isSpec()) {
+            inst->setSpecCompleted();
+        }
+
+        assert(!pkt->isValidate() && "receiving validation response"
+                "in invisibleSpec RC mode");
+        assert(!pkt->isExpose() && "receiving expose response"
+                "on write back path");
+
+        // check whether the instruction can be committed
+        if ( !inst->isExposeCompleted() && inst->needPostFetch() ){
+            DPRINTF(LSQUnit, "Expose not finished. "
+                "Wait until expose completion"
+                " to send inst [sn:%lli] to commit stage\n", inst->seqNum);
+        }else{
+            DPRINTF(LSQUnit, "Expose and execution both finished. "
+                "Send inst [sn:%lli] to commit stage\n", inst->seqNum);
+            iewStage->instToCommit(inst);
+        }
+
+    }
 
     iewStage->activityThisCycle();
 
@@ -1134,6 +1787,8 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt)
     iewStage->checkMisprediction(inst);
 }
 
+// set store to complete [mengjia]
+// complete the store after it commits
 template <class Impl>
 void
 LSQUnit<Impl>::completeStore(int store_idx)
@@ -1209,9 +1864,12 @@ LSQUnit<Impl>::sendStore(PacketPtr data_pkt)
         retryPkt = data_pkt;
         return false;
     }
+    setSpecBuffState(data_pkt->req);
     return true;
 }
 
+
+
 template <class Impl>
 void
 LSQUnit<Impl>::recvRetry()
@@ -1219,6 +1877,7 @@ LSQUnit<Impl>::recvRetry()
     if (isStoreBlocked) {
         DPRINTF(LSQUnit, "Receiving retry: store blocked\n");
         assert(retryPkt != NULL);
+        assert(retryPkt->isWrite());
 
         LSQSenderState *state =
             dynamic_cast<LSQSenderState *>(retryPkt->senderState);
@@ -1267,7 +1926,7 @@ template <class Impl>
 inline void
 LSQUnit<Impl>::incrLdIdx(int &load_idx) const
 {
-    if (++load_idx >= LQEntries)
+    if ((++load_idx) >= LQEntries)
         load_idx = 0;
 }
 
@@ -1275,7 +1934,7 @@ template <class Impl>
 inline void
 LSQUnit<Impl>::decrLdIdx(int &load_idx) const
 {
-    if (--load_idx < 0)
+    if ((--load_idx) < 0)
         load_idx += LQEntries;
 }
 
diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh
index 4331b6d08..0f7839462 100644
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@@ -1380,7 +1380,8 @@ DefaultRename<Impl>::serializeAfter(InstQueue &inst_list, ThreadID tid)
         // Mark a bit to say that I must serialize on the next instruction.
         serializeOnNextInst[tid] = true;
         return;
-    }
+    } 
+    
 
     // Set the next instruction as serializing.
     inst_list.front()->setSerializeBefore();
diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh
index 1896e62a4..b5fc5c0fb 100644
--- a/src/cpu/o3/rob.hh
+++ b/src/cpu/o3/rob.hh
@@ -212,6 +212,10 @@ class ROB
     /** Updates the tail instruction with the new youngest instruction. */
     void updateTail();
 
+    /** [InvisiSpec] Updates the visibility conditions of load instructions:
+     *  sets isPrevInstsCompleted and isPrevBrsResolved. */
+    void updateVisibleState();
+
     /** Reads the PC of the oldest head instruction. */
 //    uint64_t readHeadPC();
 
diff --git a/src/cpu/o3/rob_impl.hh b/src/cpu/o3/rob_impl.hh
index 3a0140b9f..691149e67 100644
--- a/src/cpu/o3/rob_impl.hh
+++ b/src/cpu/o3/rob_impl.hh
@@ -405,6 +405,85 @@ ROB<Impl>::doSquash(ThreadID tid)
 }
 
 
+/* [InvisiSpec] For every active thread, walk its instList front to back and
+ * set on each load the PrevInstsCompleted / PrevBrsResolved /
+ * PrevInstsCommitted / PrevBrsCommitted flags that still hold at that point.
+ */
+template <class Impl>
+void
+ROB<Impl>::updateVisibleState()
+{
+    list<ThreadID>::iterator threads = activeThreads->begin();
+    list<ThreadID>::iterator end = activeThreads->end();
+
+    while (threads != end) {
+        ThreadID tid = *threads++;
+
+        if (instList[tid].empty())
+            continue;
+
+        InstIt inst_it = instList[tid].begin();
+        InstIt tail_inst_it = instList[tid].end();
+
+        bool prevInstsComplete=true;
+        bool prevBrsResolved=true;
+        bool prevInstsCommitted=true;
+        bool prevBrsCommitted=true;
+
+        while (inst_it != tail_inst_it) {
+            DynInstPtr inst = *inst_it++;
+
+            assert(inst!=0);
+            // Flags are only cleared below; no later load can be marked.
+            if (!prevInstsComplete &&
+                    !prevBrsResolved) {
+                break;
+            }
+
+            if (inst->isLoad()) {
+                if (prevInstsComplete) {
+                    inst->setPrevInstsCompleted();
+                }
+                if (prevBrsResolved){
+                    inst->setPrevBrsResolved();
+                }
+                if (prevInstsCommitted) {
+                    inst->setPrevInstsCommitted();
+                }
+                if (prevBrsCommitted) {
+                    inst->setPrevBrsCommitted();
+                }
+            }
+            
+            // Update the state tracked for older control insts
+            if (inst->isControl()){
+                prevBrsCommitted = false;
+                if (!inst->readyToCommit() || inst->getFault()!=NoFault
+                        || inst->isSquashed()){
+                    prevBrsResolved = false;
+                }
+            } 
+            
+            prevInstsCommitted = false;
+
+            // Update the state tracked for older insts in general
+            if (inst->isNonSpeculative() || inst->isStoreConditional()
+               || inst->isMemBarrier() || inst->isWriteBarrier() ||
+               (inst->isLoad() && inst->strictlyOrdered())){
+                // Some special instructions get canCommit set directly
+                // when entering the ROB, so count them as not complete.
+                prevInstsComplete = false;
+            }
+            if (!inst->readyToCommit() || inst->getFault()!=NoFault
+                    || inst->isSquashed()){
+                prevInstsComplete = false;
+            }
+            
+        }
+    }
+}
+
+
 template <class Impl>
 void
 ROB<Impl>::updateHead()
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index 4369e168f..b81193082 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -224,7 +224,24 @@ MemCmd::commandInfo[] =
       InvalidateResp, "InvalidateReq" },
     /* Invalidation Response */
     { SET2(IsInvalidate, IsResponse),
-      InvalidCmd, "InvalidateResp" }
+      InvalidCmd, "InvalidateResp" },
+    /* [InvisiSpec] New command info */
+    { SET4(IsRead, IsRequest, NeedsResponse, IsSpec),
+      ReadSpecResp, "ReadSpecReq" },
+    { SET4(IsRead, IsResponse, HasData, IsSpec),
+      InvalidCmd, "ReadSpecResp" },
+    { SET4(IsRead, IsRequest, NeedsResponse, IsValidate),
+      ValidateResp, "ValidateReq" },
+    { SET4(IsRead, IsResponse, HasData, IsValidate),
+      InvalidCmd, "ValidateResp" },
+    { SET4(IsRead, IsRequest, NeedsResponse, IsExpose),
+      ExposeResp, "ExposeReq" },
+    { SET3(IsRead, IsResponse, IsExpose),
+      InvalidCmd, "ExposeResp" },
+    { SET3(IsRequest, NeedsResponse, IsSpecFlush),
+      SpecFlushResp, "SpecFlushReq" },
+    { SET2(IsResponse, IsSpecFlush),
+      InvalidCmd, "SpecFlushResp" }
 };
 
 bool
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index c59db362e..1e54bd9d9 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -135,6 +135,15 @@ class MemCmd
         FlushReq,      //request for a cache flush
         InvalidateReq,   // request for address to be invalidated
         InvalidateResp,
+        /* [InvisiSpec] New commands */
+        ReadSpecReq,
+        ReadSpecResp,
+        ValidateReq,
+        ValidateResp,
+        ExposeReq,
+        ExposeResp,
+        SpecFlushReq,
+        SpecFlushResp,
         NUM_MEM_CMDS
     };
 
@@ -162,6 +171,11 @@ class MemCmd
         IsPrint,        //!< Print state matching address (for debugging)
         IsFlush,        //!< Flush the address from caches
         FromCache,      //!< Request originated from a caching agent
+        /* [InvisiSpec] New attributes */
+        IsSpec,         //!< Speculatively issued
+        IsValidate,
+        IsExpose,
+        IsSpecFlush,
         NUM_COMMAND_ATTRIBUTES
     };
 
@@ -228,6 +242,12 @@ class MemCmd
     bool isPrint() const        { return testCmdAttrib(IsPrint); }
     bool isFlush() const        { return testCmdAttrib(IsFlush); }
 
+    /// [InvisiSpec] Queries for the InvisiSpec command attributes
+    bool isSpec() const         { return testCmdAttrib(IsSpec); }
+    bool isValidate() const     { return testCmdAttrib(IsValidate); }
+    bool isExpose() const       { return testCmdAttrib(IsExpose); }
+    bool isSpecFlush() const    { return testCmdAttrib(IsSpecFlush); }
+
     Command
     responseCommand() const
     {
@@ -308,7 +328,17 @@ class Packet : public Printable
 
         // Signal block present to squash prefetch and cache evict packets
         // through express snoop flag
-        BLOCK_CACHED          = 0x00010000
+        BLOCK_CACHED          = 0x00010000,
+
+        // [InvisiSpec] ReadSpecReq was L1 hit.
+        L1_HIT                = 0x00020000,
+
+        // [InvisiSpec] this packet is the first one of split packets
+        // maximum split is 2
+        FIRST_IN_SPLIT      = 0x00040000,
+        ONLY_ACCESS_SPEC_BUFF      = 0x00080000,
+
+        EXTERNAL_EVICTION = 0x00100000,
     };
 
     Flags flags;
@@ -381,6 +411,12 @@ class Packet : public Printable
      */
     uint32_t payloadDelay;
 
+    //[InvisiSpec] indicate the source buffer entry
+    //if the load should get data from specbuffer
+    int srcIdx;
+    int reqIdx;
+    bool isSplit;
+
     /**
      * A virtual base opaque structure used to hold state associated
      * with the packet (e.g., an MSHR), specific to a MemObject that
@@ -554,6 +590,45 @@ class Packet : public Printable
     bool isPrint() const             { return cmd.isPrint(); }
     bool isFlush() const             { return cmd.isFlush(); }
 
+    /// [InvisiSpec] Queries: command attributes first, then packet flags
+    bool isSpec() const              { return cmd.isSpec(); }
+    bool isValidate() const          { return cmd.isValidate(); }
+    bool isExpose() const            { return cmd.isExpose(); }
+    bool isSpecFlush() const         { return cmd.isSpecFlush(); }
+    bool isL1Hit() const             { return flags.isSet(L1_HIT); }
+    bool isExternalEviction() const  { return flags.isSet(EXTERNAL_EVICTION); }
+    // [InvisiSpec] True once the packet has been marked via setFirst()
+    bool isFirst() const             { return flags.isSet(FIRST_IN_SPLIT); }
+    bool onlyAccessSpecBuff() const
+        { return flags.isSet(ONLY_ACCESS_SPEC_BUFF); }
+
+    void setL1Hit()
+    {
+        assert(isSpec());                // only valid on a spec command
+        assert(!flags.isSet(L1_HIT));    // must not be set twice
+        flags.set(L1_HIT);
+    }
+
+    void setExternalEviction()
+    {
+        assert(!flags.isSet(EXTERNAL_EVICTION));    // must not be set twice
+        flags.set(EXTERNAL_EVICTION);
+    }
+
+    void setOnlyAccessSpecBuff()
+    {
+        assert(isSpec());    // only valid on a spec command
+        assert(!flags.isSet(ONLY_ACCESS_SPEC_BUFF));    // must not be set twice
+        flags.set(ONLY_ACCESS_SPEC_BUFF);
+    }
+
+    void setFirst()
+    {
+        // isSpec() check disabled: //assert(isSpec());
+        assert(!flags.isSet(FIRST_IN_SPLIT));    // must not be set twice
+        flags.set(FIRST_IN_SPLIT);
+    }
+
     bool isWholeLineWrite(unsigned blk_size)
     {
         return (cmd == MemCmd::WriteReq || cmd == MemCmd::WriteLineReq) &&
@@ -779,7 +854,8 @@ class Packet : public Printable
         :  cmd(_cmd), id((PacketId)_req.get()), req(_req),
            data(nullptr), addr(0), _isSecure(false), size(0),
            _qosValue(0), headerDelay(0), snoopDelay(0),
-           payloadDelay(0), senderState(NULL)
+           payloadDelay(0), srcIdx(-1), reqIdx(-1), isSplit(false),
+           senderState(NULL)
     {
         if (req->hasPaddr()) {
             addr = req->getPaddr();
@@ -801,7 +877,8 @@ class Packet : public Printable
         :  cmd(_cmd), id(_id ? _id : (PacketId)_req.get()), req(_req),
            data(nullptr), addr(0), _isSecure(false),
            _qosValue(0), headerDelay(0),
-           snoopDelay(0), payloadDelay(0), senderState(NULL)
+           snoopDelay(0), payloadDelay(0), srcIdx(-1), reqIdx(-1), isSplit(false),
+           senderState(NULL)
     {
         if (req->hasPaddr()) {
             addr = req->getPaddr() & ~(_blkSize - 1);
@@ -828,6 +905,9 @@ class Packet : public Printable
            headerDelay(pkt->headerDelay),
            snoopDelay(0),
            payloadDelay(pkt->payloadDelay),
+           srcIdx(pkt->srcIdx),
+           reqIdx(pkt->reqIdx),
+           isSplit(pkt->isSplit),
            senderState(pkt->senderState)
     {
         if (!clear_flags)
@@ -904,6 +984,33 @@ class Packet : public Printable
     }
 
     /**
+     * [InvisiSpec] Methods that return Packets for InvisiSpec.
+     */
+    static PacketPtr
+    createReadSpec(const RequestPtr &req)  // by ref: no RequestPtr copy
+    {
+        return new Packet(req, MemCmd::ReadSpecReq);  // new ReadSpecReq packet
+    }
+
+    static PacketPtr
+    createValidate(const RequestPtr &req)  // by ref: no RequestPtr copy
+    {
+        return new Packet(req, MemCmd::ValidateReq);  // new ValidateReq packet
+    }
+
+    static PacketPtr
+    createExpose(const RequestPtr &req)  // by ref: no RequestPtr copy
+    {
+        return new Packet(req, MemCmd::ExposeReq);  // new ExposeReq packet
+    }
+
+    static PacketPtr
+    createSpecFlush(const RequestPtr &req)  // by ref: no RequestPtr copy
+    {
+        return new Packet(req, MemCmd::SpecFlushReq);  // new SpecFlushReq packet
+    }
+
+    /**
      * clean up packet variables
      */
     ~Packet()
diff --git a/src/mem/port.cc b/src/mem/port.cc
index 47f56e633..318a65308 100644
--- a/src/mem/port.cc
+++ b/src/mem/port.cc
@@ -176,6 +176,7 @@ MasterPort::sendFunctional(PacketPtr pkt)
     return _slavePort->recvFunctional(pkt);
 }
 
+// [InvisiSpec] Request from CPU to Ruby
 bool
 MasterPort::sendTimingReq(PacketPtr pkt)
 {
diff --git a/src/mem/protocol/MESI_Two_Level-L1cache.sm b/src/mem/protocol/MESI_Two_Level-L1cache.sm
index 87684ce10..846af7da5 100644
--- a/src/mem/protocol/MESI_Two_Level-L1cache.sm
+++ b/src/mem/protocol/MESI_Two_Level-L1cache.sm
@@ -76,10 +76,12 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     S, AccessPermission:Read_Only, desc="a L1 cache entry Shared";
     E, AccessPermission:Read_Only, desc="a L1 cache entry Exclusive";
     M, AccessPermission:Read_Write, desc="a L1 cache entry Modified", format="!b";
+    X, AccessPermission:Read_Only, desc="a L1 cache entry Speculatively observed";
 
     // Transient States
     IS, AccessPermission:Busy, desc="L1 idle, issued GETS, have not seen response yet";
     IM, AccessPermission:Busy, desc="L1 idle, issued GETX, have not seen response yet";
+    IX, AccessPermission:Busy, desc="L1 idle, issued GETSPEC, have not seen response yet";
     SM, AccessPermission:Read_Only, desc="L1 idle, issued GETX, have not seen response yet";
     IS_I, AccessPermission:Busy, desc="L1 idle, issued GETS, saw Inv before data because directory doesn't block on GETS hit";
 
@@ -99,6 +101,8 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     Load,            desc="Load request from the home processor";
     Ifetch,          desc="I-fetch request from the home processor";
     Store,           desc="Store request from the home processor";
+    SpecLoad,        desc="SpecLoad request from the home processor";
+    Expose,          desc="Expose request from the home processor";
 
     Inv,           desc="Invalidate request from L2 bank";
 
@@ -110,6 +114,8 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     Fwd_GETX,   desc="GETX from other processor";
     Fwd_GETS,   desc="GETS from other processor";
     Fwd_GET_INSTR,   desc="GET_INSTR from other processor";
+    Fwd_GETSPEC,   desc="GETSPEC from other processor";
+    Fwd_EXPOSE,   desc="EXPOSE from other processor";
 
     Data,       desc="Data for processor";
     Data_Exclusive,       desc="Data for processor";
@@ -188,6 +194,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
   }
 
   State getState(TBE tbe, Entry cache_entry, Addr addr) {
+    // [InvisiSpec] The same cache line cannot be present in L1D and L1I at the same time.
     assert((L1Dcache.isTagPresent(addr) && L1Icache.isTagPresent(addr)) == false);
 
     if(is_valid(tbe)) {
@@ -265,6 +272,10 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
       return Event:Ifetch;
     } else if ((type == RubyRequestType:ST) || (type == RubyRequestType:ATOMIC)) {
       return Event:Store;
+    } else if (type == RubyRequestType:SPEC_LD) {
+      return Event:SpecLoad;
+    } else if (type == RubyRequestType:EXPOSE) {
+      return Event:Expose;
     } else {
       error("Invalid RubyRequestType");
     }
@@ -387,6 +398,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
           trigger(Event:Data_Exclusive, in_msg.addr, cache_entry, tbe);
         } else if(in_msg.Type == CoherenceResponseType:DATA) {
           if ((getState(tbe, cache_entry, in_msg.addr) == State:IS ||
+               getState(tbe, cache_entry, in_msg.addr) == State:IX ||
                getState(tbe, cache_entry, in_msg.addr) == State:IS_I ||
                getState(tbe, cache_entry, in_msg.addr) == State:PF_IS ||
                getState(tbe, cache_entry, in_msg.addr) == State:PF_IS_I) &&
@@ -433,6 +445,10 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
           trigger(Event:Fwd_GETS, in_msg.addr, cache_entry, tbe);
         } else if (in_msg.Type == CoherenceRequestType:GET_INSTR) {
           trigger(Event:Fwd_GET_INSTR, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:GETSPEC) {
+          trigger(Event:Fwd_GETSPEC, in_msg.addr, cache_entry, tbe);
+        } else if (in_msg.Type == CoherenceRequestType:EXPOSE) {
+          trigger(Event:Fwd_EXPOSE, in_msg.addr, cache_entry, tbe);
         } else {
           error("Invalid forwarded request type");
         }
@@ -534,6 +550,43 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Prefetch := in_msg.Prefetch;
         out_msg.AccessMode := in_msg.AccessMode;
+        out_msg.idx := in_msg.idx;
+      }
+    }
+  }
+
+  // Send a GETSPEC request for this address to the L2 bank chosen by mapAddressToRange.
+  action(as_issueGETSPEC, "as", desc="Issue GETSPEC") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      enqueue(requestL1Network_out, RequestMsg, l1_request_latency) {
+        out_msg.Type := CoherenceRequestType:GETSPEC;
+        out_msg.MessageSize := MessageSizeType:SPECLD_Control;
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.idx := in_msg.idx;  // copied unchanged from the mandatory-queue request
+        out_msg.AccessMode := in_msg.AccessMode;
+        out_msg.Prefetch := in_msg.Prefetch;
+        out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache,
+                          l2_select_low_bit, l2_select_num_bits, intToID(0)));
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n", address, out_msg.Destination);
+      }
+    }
+  }
+
+  // Send an EXPOSE request for this address to the L2 bank chosen by mapAddressToRange.
+  action(ex_issueEXPOSE, "ex", desc="Issue EXPOSE") {
+    peek(mandatoryQueue_in, RubyRequest) {
+      enqueue(requestL1Network_out, RequestMsg, l1_request_latency) {
+        out_msg.Type := CoherenceRequestType:EXPOSE;
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Control;
+        out_msg.addr := address;
+        out_msg.Requestor := machineID;
+        out_msg.idx := in_msg.idx;  // copied unchanged from the mandatory-queue request
+        out_msg.AccessMode := in_msg.AccessMode;
+        out_msg.Prefetch := in_msg.Prefetch;
+        out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache,
+                          l2_select_low_bit, l2_select_num_bits, intToID(0)));
+        DPRINTF(RubySlicc, "address: %#x, destination: %s\n", address, out_msg.Destination);
       }
     }
   }
@@ -568,6 +621,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Prefetch := in_msg.Prefetch;
         out_msg.AccessMode := in_msg.AccessMode;
+        out_msg.idx := in_msg.idx;
       }
     }
   }
@@ -606,6 +660,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Prefetch := in_msg.Prefetch;
         out_msg.AccessMode := in_msg.AccessMode;
+        out_msg.idx := in_msg.idx;
       }
     }
   }
@@ -643,6 +698,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
         out_msg.MessageSize := MessageSizeType:Control;
         out_msg.Prefetch := in_msg.Prefetch;
         out_msg.AccessMode := in_msg.AccessMode;
+        out_msg.idx := in_msg.idx;
       }
     }
   }
@@ -662,6 +718,36 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     }
   }
 
+  action(dex_sendDataToExposeRequestor, "dex", desc="send data to requestor") {
+    peek(requestL1Network_in, RequestMsg) {  // in_msg is the forwarded request being answered
+      enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
+        assert(is_valid(cache_entry));  // the payload below is read from the cache entry
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Data;
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);  // reply goes to the original requestor
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.DataBlk := cache_entry.DataBlk;
+      }
+    }
+  }
+
+  action(ds_sendDataToSpecRequestor, "ds", desc="send data to requestor") {
+    peek(requestL1Network_in, RequestMsg) {  // in_msg is the forwarded request being answered
+      enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
+        assert(is_valid(cache_entry));  // the payload below is read from the cache entry
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.MessageSize := MessageSizeType:SPECLD_Data;
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);  // reply goes to the original requestor
+        out_msg.Dirty := cache_entry.Dirty;
+        out_msg.DataBlk := cache_entry.DataBlk;
+      }
+    }
+  }
+
   action(d2_sendDataToL2, "d2", desc="send data to the L2 cache because of M downgrade") {
     enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
       assert(is_valid(cache_entry));
@@ -676,6 +762,20 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     }
   }
 
+  action(d2ex_sendExposeDataToL2, "d2ex", desc="send data to the L2 cache because of M downgrade") {
+    enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {  // no peek: nothing is read from an incoming message
+      assert(is_valid(cache_entry));  // data and dirty bit come from the cache entry
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:DATA;
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.Dirty := cache_entry.Dirty;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache,
+                          l2_select_low_bit, l2_select_num_bits, intToID(0)));  // the L2 bank mapped to this address
+      out_msg.MessageSize := MessageSizeType:EXPOSE_Data;  // sized as EXPOSE data, unlike d2_sendDataToL2
+    }
+  }
+
   action(dt_sendDataToRequestor_fromTBE, "dt", desc="send data to requestor") {
     peek(requestL1Network_in, RequestMsg) {
       enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
@@ -691,6 +791,36 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     }
   }
 
+  action(dtex_sendDataToExposeRequestor_fromTBE, "dtex", desc="send data to requestor") {
+    peek(requestL1Network_in, RequestMsg) {  // in_msg is the forwarded request being answered
+      enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
+        assert(is_valid(tbe));  // the payload below is read from the TBE, not the cache entry
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Data;
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);  // reply goes to the original requestor
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.DataBlk := tbe.DataBlk;
+      }
+    }
+  }
+
+  action(dts_sendDataToSpecRequestor_fromTBE, "dts", desc="send data to requestor") {
+    peek(requestL1Network_in, RequestMsg) {  // in_msg is the forwarded request being answered
+      enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
+        assert(is_valid(tbe));  // the payload below is read from the TBE, not the cache entry
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.MessageSize := MessageSizeType:SPECLD_Data;
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);  // reply goes to the original requestor
+        out_msg.Dirty := tbe.Dirty;
+        out_msg.DataBlk := tbe.DataBlk;
+      }
+    }
+  }
+
   action(d2t_sendDataToL2_fromTBE, "d2t", desc="send data to the L2 cache") {
     enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
       assert(is_valid(tbe));
@@ -705,6 +835,20 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     }
   }
 
+  action(d2tex_sendExposeDataToL2_fromTBE, "d2tex", desc="send data to the L2 cache") {
+    enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {  // no peek: nothing is read from an incoming message
+      assert(is_valid(tbe));  // data and dirty bit come from the TBE
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:DATA;
+      out_msg.DataBlk := tbe.DataBlk;
+      out_msg.Dirty := tbe.Dirty;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache,
+                          l2_select_low_bit, l2_select_num_bits, intToID(0)));  // the L2 bank mapped to this address
+      out_msg.MessageSize := MessageSizeType:EXPOSE_Data;  // sized as EXPOSE data
+    }
+  }
+
   action(e_sendAckToRequestor, "e", desc="send invalidate ack to requestor (could be L2 or L1)") {
     peek(requestL1Network_in, RequestMsg) {
       enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) {
@@ -761,7 +905,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
   action(forward_eviction_to_cpu, "\cc", desc="sends eviction information to the processor") {
     if (send_evictions) {
       DPRINTF(RubySlicc, "Sending invalidation for %#x to the CPU\n", address);
-      sequencer.evictionCallback(address);
+      sequencer.evictionCallback(address, false);
+    }
+  }
+
+  action(forward_external_eviction_to_cpu, "\ccc", desc="sends external eviction information to the processor") {
+    if (send_evictions) {  // nothing is forwarded unless send_evictions is set
+      DPRINTF(RubySlicc, "Sending invalidation for %#x to the CPU\n", address);  // same trace text as forward_eviction_to_cpu
+      sequencer.evictionCallback(address, true);  // second argument is true here, false in forward_eviction_to_cpu; presumably flags an externally caused eviction — confirm against Sequencer
     }
   }
 
@@ -822,6 +973,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     sequencer.readCallback(address, cache_entry.DataBlk);
   }
 
+  // SpecLoad hit: pass the block held in the L1 cache entry to the sequencer.
+  action(h_spec_load_hit, "hs", desc="Notify sequencer the spec load completed.")
+  {
+    assert(is_valid(cache_entry));
+    DPRINTF(RubySlicc, "%s\n", cache_entry.DataBlk);
+    sequencer.readCallback(address, cache_entry.DataBlk);
+  }
+
   action(h_ifetch_hit, "hi", desc="Notify sequencer the instruction fetch completed.")
   {
     assert(is_valid(cache_entry));
@@ -839,6 +998,15 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     sequencer.readCallback(address, cache_entry.DataBlk, true);
   }
 
+  action(hsx_spec_load_hit, "hsx", desc="Notify sequencer the external load completed.") {
+    peek(responseL1Network_in, ResponseMsg) {
+      assert(is_valid(tbe));  // the response data is staged in the TBE, so it must exist before it is written
+      // [InvisiSpec] Hack for in_msg.DataBlk returning const DataBlk: copy it into the TBE first
+      tbe.DataBlk := in_msg.DataBlk;
+      sequencer.readCallback(address, tbe.DataBlk, true);  // third argument true, as in the other external-hit callback
+    }
+  }
+
   action(hh_store_hit, "\h", desc="Notify sequencer that store completed.")
   {
     assert(is_valid(cache_entry));
@@ -868,6 +1036,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     tbe.DataBlk := cache_entry.DataBlk;
   }
 
+  action(iw_allocateTBEWithoutCacheEntry, "iw", desc="Allocate TBE without a cache entry") {
+    check_allocate(TBEs);
+    assert(!is_valid(cache_entry) || cache_entry.CacheState == State:I);  // only legal when there is no entry, or the entry is in I
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.isPrefetch := false;  // only field initialised here; no DataBlk/Dirty is copied from a cache entry
+  }
+
   action(k_popMandatoryQueue, "k", desc="Pop mandatory queue.") {
     mandatoryQueue_in.dequeue(clockEdge());
   }
@@ -989,13 +1165,19 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
       cache_entry.isPrefetch := true;
   }
 
+  action(x_expose_done, "xd", desc="Notify sequencer the expose completed.")
+  {
+    assert(is_valid(cache_entry));  // cache_entry.DataBlk is read below; same guard as the other hit callbacks
+    sequencer.readCallback(address, cache_entry.DataBlk);
+  }
+
 
   //*****************************************************
   // TRANSITIONS
   //*****************************************************
 
   // Transitions for Load/Store/Replacement/WriteBack from transient states
-  transition({IS, IM, IS_I, M_I, SM, SINK_WB_ACK}, {Load, Ifetch, Store, L1_Replacement}) {
+  transition({IS, IM, IX, IS_I, M_I, SM, SINK_WB_ACK}, {Load, Expose, SpecLoad, Ifetch, Store, L1_Replacement}) {
     z_stallAndWaitMandatoryQueue;
   }
 
@@ -1003,7 +1185,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     z_stallAndWaitMandatoryQueue;
   }
 
-  transition({PF_IM, PF_SM}, {Load, Ifetch, L1_Replacement}) {
+  transition({PF_IM, PF_SM}, {Load, Expose, SpecLoad, Ifetch, L1_Replacement}) {
     z_stallAndWaitMandatoryQueue;
   }
 
@@ -1016,7 +1198,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     ff_deallocateL1CacheBlock;
   }
 
-  transition({S,E,M,IS,IM,SM,IS_I,PF_IS_I,M_I,SINK_WB_ACK,PF_IS,PF_IM},
+  transition({S,E,M,IS,IM,IX,SM,IS_I,PF_IS_I,M_I,SINK_WB_ACK,PF_IS,PF_IM},
              {PF_Load, PF_Store, PF_Ifetch}) {
       pq_popPrefetchQueue;
   }
@@ -1030,6 +1212,21 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     k_popMandatoryQueue;
   }
 
+  transition({NP,I}, Expose, IS) {  // Expose miss: allocate an L1D block and a TBE, send EXPOSE, wait in IS
+    oo_allocateL1DCacheBlock;
+    i_allocateTBE;
+    ex_issueEXPOSE;
+    uu_profileDataMiss;
+    //po_observeMiss;  // NOTE(review): left disabled by the patch; presumably keeps Expose misses out of prefetcher training — confirm
+    k_popMandatoryQueue;
+  }
+
+  transition({NP,I}, SpecLoad, IX) {  // SpecLoad miss: no L1 cache block is allocated and no miss is profiled
+    iw_allocateTBEWithoutCacheEntry;  // the TBE alone tracks the outstanding request
+    as_issueGETSPEC;
+    k_popMandatoryQueue;
+  }
+
   transition({NP,I}, PF_Load, PF_IS) {
     oo_allocateL1DCacheBlock;
     i_allocateTBE;
@@ -1037,13 +1234,13 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     pq_popPrefetchQueue;
   }
 
-  transition(PF_IS, Load, IS) {
+  transition(PF_IS, {Load, Expose}, IS) {
     uu_profileDataMiss;
     ppm_observePfMiss;
     k_popMandatoryQueue;
   }
 
-  transition(PF_IS_I, Load, IS_I) {
+  transition(PF_IS_I, {Load, Expose}, IS_I) {
     uu_profileDataMiss;
     ppm_observePfMiss;
     k_popMandatoryQueue;
@@ -1055,6 +1252,10 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     k_popMandatoryQueue;
   }
 
+  transition({PF_IS, PF_IS_I}, SpecLoad) {  // state unchanged while the prefetch is outstanding
+    k_popMandatoryQueue;  // NOTE(review): the request is dequeued without issuing GETSPEC or calling back the sequencer — confirm the CPU side does not wait on it
+  }
+
   transition({NP,I}, Ifetch, IS) {
     pp_allocateL1ICacheBlock;
     i_allocateTBE;
@@ -1107,19 +1308,24 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     k_popMandatoryQueue;
   }
 
-  transition({NP, I}, Inv) {
+  transition({NP, I, IX}, Inv) {
     fi_sendInvAck;
     l_popRequestQueue;
   }
 
   // Transitions from Shared
-  transition({S,E,M}, Load) {
+  transition({S,E,M}, {Load, Expose}) {
     h_load_hit;
     uu_profileDataHit;
     po_observeHit;
     k_popMandatoryQueue;
   }
 
+  transition({S,E,M}, SpecLoad) {  // hit: state unchanged
+    h_spec_load_hit;  // unlike the {Load, Expose} hit transition, no uu_profileDataHit / po_observeHit is run
+    k_popMandatoryQueue;
+  }
+
   transition({S,E,M}, Ifetch) {
     h_ifetch_hit;
     uu_profileInstHit;
@@ -1140,7 +1346,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
   }
 
   transition(S, Inv, I) {
-    forward_eviction_to_cpu;
+    forward_external_eviction_to_cpu;
     fi_sendInvAck;
     l_popRequestQueue;
   }
@@ -1164,13 +1370,13 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
 
   transition(E, Inv, I) {
     // don't send data
-    forward_eviction_to_cpu;
+    forward_external_eviction_to_cpu;
     fi_sendInvAck;
     l_popRequestQueue;
   }
 
   transition(E, Fwd_GETX, I) {
-    forward_eviction_to_cpu;
+    forward_external_eviction_to_cpu;
     d_sendDataToRequestor;
     l_popRequestQueue;
   }
@@ -1181,6 +1387,17 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     l_popRequestQueue;
   }
 
+  transition({E, M}, Fwd_GETSPEC) {  // no next state given: the owner stays in E/M
+    ds_sendDataToSpecRequestor;  // data goes only to the requestor; nothing is sent to the L2
+    l_popRequestQueue;
+  }
+
+  transition({E, M}, Fwd_EXPOSE, S) {  // owner moves to S
+    dex_sendDataToExposeRequestor;  // one copy to the requestor ...
+    d2ex_sendExposeDataToL2;  // ... and one copy to the L2 bank
+    l_popRequestQueue;
+  }
+
   // Transitions from Modified
 
   transition(M, {L1_Replacement, PF_L1_Replacement}, M_I) {
@@ -1197,7 +1414,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
   }
 
   transition(M, Inv, I) {
-    forward_eviction_to_cpu;
+    forward_external_eviction_to_cpu;
     f_sendDataToL2;
     l_popRequestQueue;
   }
@@ -1208,7 +1425,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
   }
 
   transition(M, Fwd_GETX, I) {
-    forward_eviction_to_cpu;
+    forward_external_eviction_to_cpu;
     d_sendDataToRequestor;
     l_popRequestQueue;
   }
@@ -1230,6 +1447,17 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     l_popRequestQueue;
   }
 
+  transition(M_I, Fwd_EXPOSE, SINK_WB_ACK) {  // data is served from the TBE in this transient state
+    dtex_sendDataToExposeRequestor_fromTBE;  // one copy to the requestor ...
+    d2tex_sendExposeDataToL2_fromTBE;  // ... and one copy to the L2 bank
+    l_popRequestQueue;
+  }
+
+  transition(M_I, Fwd_GETSPEC) {  // no next state given: stays in M_I
+    dts_sendDataToSpecRequestor_fromTBE;  // data from the TBE goes only to the requestor
+    l_popRequestQueue;
+  }
+
   // Transitions from IS
   transition({IS, IS_I}, Inv, IS_I) {
     fi_sendInvAck;
@@ -1341,6 +1569,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
     kd_wakeUpDependents;
   }
 
+  // [InvisiSpec] Data and Data_Exclusive are not possible at IX
+  transition(IX, {Data_all_Acks, DataS_fromL1}, I) {  // the block ends in I: the data is handed to the sequencer, not kept
+    hsx_spec_load_hit;  // copies the response data into the TBE and calls the sequencer
+    s_deallocateTBE;  // must follow hsx_spec_load_hit, which writes the TBE
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
   // Transitions from IM
   transition(IM, Inv, IM) {
     fi_sendInvAck;
@@ -1384,7 +1620,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP")
 
   // transitions from SM
   transition(SM, Inv, IM) {
-    forward_eviction_to_cpu;
+    forward_external_eviction_to_cpu;
     fi_sendInvAck;
     dg_invalidate_sc;
     l_popRequestQueue;
diff --git a/src/mem/protocol/MESI_Two_Level-L2cache.sm b/src/mem/protocol/MESI_Two_Level-L2cache.sm
index 5a8cfae6d..ea884133e 100644
--- a/src/mem/protocol/MESI_Two_Level-L2cache.sm
+++ b/src/mem/protocol/MESI_Two_Level-L2cache.sm
@@ -72,6 +72,8 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     ISS, AccessPermission:Busy, desc="L2 idle, got single L1_GETS, issued memory fetch, have not seen response yet";
     IS, AccessPermission:Busy, desc="L2 idle, got L1_GET_INSTR or multiple L1_GETS, issued memory fetch, have not seen response yet";
     IM, AccessPermission:Busy, desc="L2 idle, got L1_GETX, issued memory fetch, have not seen response(s) yet";
+    II, AccessPermission:Busy, desc="L2 idle, got single L1_GETSPEC, issued memory fetch, have not seen response yet";
+    IEE, AccessPermission:Busy, desc="L2 idle, got single L1_EXPOSE, issued memory fetch, have not seen response yet";
 
     // Blocking states
     SS_MB, AccessPermission:Busy, desc="Blocked for L1_GETX from SS";
@@ -96,6 +98,9 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     L1_PUTX,                 desc="L1 replacing data";
     L1_PUTX_old,             desc="L1 replacing data, but no longer sharer";
 
+    L1_GETSPEC,              desc="L1 GETSPEC request for a block mapped to us";
+    L1_EXPOSE,               desc="L1 EXPOSE request for a block mapped to us";
+
     // events initiated by this L2
     L2_Replacement,     desc="L2 Replacement", format="!r";
     L2_Replacement_clean,     desc="L2 Replacement, but data is clean", format="!r";
@@ -135,6 +140,8 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     bool Dirty, default="false", desc="Data is Dirty";
 
     NetDest L1_GetS_IDs,            desc="Set of the internal processors that want the block in shared state";
+    NetDest L1_GetSPEC_IDs,            desc="Set of the internal processors that want the block speculatively";
+    NetDest L1_Expose_IDs,            desc="Set of the internal processors that want the block to be exposed";
     MachineID L1_GetX_ID,          desc="ID of the L1 cache to forward the block to once we get a response";
     int pendingAcks,            desc="number of pending acks for invalidates during writeback";
   }
@@ -267,6 +274,10 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
       } else {
         return Event:L1_PUTX_old;
       }
+    } else if (type == CoherenceRequestType:GETSPEC) {
+      return Event:L1_GETSPEC;
+    } else if (type == CoherenceRequestType:EXPOSE) {
+      return Event:L1_EXPOSE;
     } else {
       DPRINTF(RubySlicc, "address: %#x, Request Type: %s\n", addr, type);
       error("Invalid L1 forwarded request type");
@@ -399,10 +410,12 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     peek(L1RequestL2Network_in, RequestMsg) {
       enqueue(DirRequestL2Network_out, RequestMsg, l2_request_latency) {
         out_msg.addr := address;
-        out_msg.Type := CoherenceRequestType:GETS;
+        out_msg.Type := in_msg.Type;
         out_msg.Requestor := machineID;
         out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory));
-        out_msg.MessageSize := MessageSizeType:Control;
+        out_msg.MessageSize := in_msg.MessageSize;
+        out_msg.idx := in_msg.idx;
+        out_msg.origin := in_msg.Requestor;
       }
     }
   }
@@ -420,6 +433,32 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     }
   }
 
+  action(bs_forwardSpecRequestToExclusive, "bs", desc="Forward request to the exclusive L1") {
+    peek(L1RequestL2Network_in, RequestMsg) {  // in_msg is the L1 request being forwarded
+      enqueue(L1RequestL2Network_out, RequestMsg, to_l1_latency) {
+        assert(is_valid(cache_entry));  // the owner is looked up in the cache entry
+        out_msg.Type := in_msg.Type;  // request type is passed through unchanged
+        out_msg.MessageSize := MessageSizeType:SPECLD_Request_Control;
+        out_msg.addr := address;
+        out_msg.Requestor := in_msg.Requestor;  // keep the original requestor
+        out_msg.Destination.add(cache_entry.Exclusive);
+      }
+    }
+  }
+
+  action(bex_forwardExposeRequestToExclusive, "bex", desc="Forward request to the exclusive L1") {
+    peek(L1RequestL2Network_in, RequestMsg) {  // in_msg is the L1 request being forwarded
+      enqueue(L1RequestL2Network_out, RequestMsg, to_l1_latency) {
+        assert(is_valid(cache_entry));  // the owner is looked up in the cache entry
+        out_msg.Type := in_msg.Type;  // request type is passed through unchanged
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Request_Control;
+        out_msg.addr := address;
+        out_msg.Requestor := in_msg.Requestor;  // keep the original requestor
+        out_msg.Destination.add(cache_entry.Exclusive);
+      }
+    }
+  }
+
   action(c_exclusiveReplacement, "c", desc="Send data to memory") {
     enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) {
       assert(is_valid(cache_entry));
@@ -494,6 +533,25 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     }
   }
 
+  action(ddex_sendExclusiveDataToExposeRequestor, "ddex", desc="Send data from cache to reqeustor") {
+    peek(L1RequestL2Network_in, RequestMsg) {
+      enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) {
+        assert(is_valid(cache_entry));
+        out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Data;
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+        if (cache_entry.Sharers.isElement(in_msg.Requestor)) {  // AckCount is minus the sharer count, not counting the requestor itself
+          out_msg.AckCount := 1 - cache_entry.Sharers.count();
+        } else {
+          out_msg.AckCount := 0 - cache_entry.Sharers.count();
+        }
+      }
+    }
+  }
+
   action(ds_sendSharedDataToRequestor, "ds", desc="Send data from cache to reqeustor") {
     peek(L1RequestL2Network_in, RequestMsg) {
       enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) {
@@ -509,9 +567,39 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     }
   }
 
+  action(dss_sendSharedDataToSpecRequestor, "dss", desc="Send data from cache to reqeustor") {
+    peek(L1RequestL2Network_in, RequestMsg) {  // in_msg is the L1 request being answered
+      enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) {
+        assert(is_valid(cache_entry));  // the payload below is read from the L2 cache entry
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.MessageSize := MessageSizeType:SPECLD_Data;
+        out_msg.AckCount := 0;  // the requestor has no acks to wait for
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+      }
+    }
+  }
+
+  action(dsex_sendSharedDataToExposeRequestor, "dsex", desc="Send data from cache to reqeustor") {
+    peek(L1RequestL2Network_in, RequestMsg) {  // in_msg is the L1 request being answered
+      enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) {
+        assert(is_valid(cache_entry));  // the payload below is read from the L2 cache entry
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Data;
+        out_msg.AckCount := 0;  // the requestor has no acks to wait for
+        out_msg.addr := address;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.Requestor);
+        out_msg.DataBlk := cache_entry.DataBlk;
+      }
+    }
+  }
+
   action(e_sendDataToGetSRequestors, "e", desc="Send data from cache to all GetS IDs") {
     assert(is_valid(tbe));
-    assert(tbe.L1_GetS_IDs.count() > 0);
+    assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() > 0);
     enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) {
       assert(is_valid(cache_entry));
       out_msg.addr := address;
@@ -523,9 +611,40 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     }
   }
 
+  action(es_sendDataToGetSpecRequestors, "es", desc="Send data from cache to all GetSpec IDs") {
+    assert(is_valid(tbe));
+    assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() > 0);  // checks the combined count only, so L1_GetSPEC_IDs itself may be empty
+    peek(responseL2Network_in, ResponseMsg) {  // in_msg is the incoming response that carries the data
+      enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination := tbe.L1_GetSPEC_IDs;  // internal nodes
+        out_msg.DataBlk := in_msg.DataBlk;  // taken from the response, not from cache_entry (which this action never touches)
+        out_msg.MessageSize := MessageSizeType:SPECLD_Data;
+      }
+    }
+  }
+
+  action(eex_sendDataToExposeRequestors, "eex", desc="Send data from the incoming response to all Expose IDs") {
+    assert(is_valid(tbe));
+    assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() > 0);  // checks the combined count only, so L1_Expose_IDs itself may be empty
+    peek(responseL2Network_in, ResponseMsg) {  // in_msg is the incoming response that carries the data
+      enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination := tbe.L1_Expose_IDs;  // internal nodes
+        out_msg.DataBlk := in_msg.DataBlk;  // taken from the response, not from cache_entry
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Data;
+      }
+    }
+  }
+
   action(ex_sendExclusiveDataToGetSRequestors, "ex", desc="Send data from cache to all GetS IDs") {
     assert(is_valid(tbe));
     assert(tbe.L1_GetS_IDs.count() == 1);
+    assert(tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() == 0);
     enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) {
       assert(is_valid(cache_entry));
       out_msg.addr := address;
@@ -537,6 +656,21 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     }
   }
 
+  action(exex_sendExclusiveDataToExposeRequestors, "exex", desc="Send exclusive data from cache to the single Expose ID") {
+    assert(is_valid(tbe));
+    assert(tbe.L1_Expose_IDs.count() == 1);  // exclusive data can go to exactly one requestor
+    assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() == 0);  // no other kind of requestor may be recorded
+    enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) {
+      assert(is_valid(cache_entry));  // the payload below is read from the L2 cache entry
+      out_msg.addr := address;
+      out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE;
+      out_msg.Sender := machineID;
+      out_msg.Destination := tbe.L1_Expose_IDs;  // internal nodes
+      out_msg.DataBlk := cache_entry.DataBlk;
+      out_msg.MessageSize := MessageSizeType:EXPOSE_Data;
+    }
+  }
+
   action(ee_sendDataToGetXRequestor, "ee", desc="Send data from cache to GetX ID") {
     enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) {
       assert(is_valid(tbe));
@@ -598,11 +732,23 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     TBEs.allocate(address);
     set_tbe(TBEs[address]);
     tbe.L1_GetS_IDs.clear();
+    tbe.L1_GetSPEC_IDs.clear();
+    tbe.L1_Expose_IDs.clear();
     tbe.DataBlk := cache_entry.DataBlk;
     tbe.Dirty := cache_entry.Dirty;
     tbe.pendingAcks := cache_entry.Sharers.count();
   }
 
+  action(iw_allocateTBEWithoutCacheEntry, "iw", desc="Allocate TBE for request without a cache entry") {
+    check_allocate(TBEs);
+    assert(!is_valid(cache_entry));  // only legal when the L2 holds no entry for this address
+    TBEs.allocate(address);
+    set_tbe(TBEs[address]);
+    tbe.L1_GetS_IDs.clear();  // start with all three requestor sets empty
+    tbe.L1_GetSPEC_IDs.clear();
+    tbe.L1_Expose_IDs.clear();  // DataBlk, Dirty and pendingAcks are not initialised by this action
+  }
+
   action(s_deallocateTBE, "s", desc="Deallocate external TBE") {
     TBEs.deallocate(address);
     unset_tbe();
@@ -668,6 +814,20 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     }
   }
 
+  action(sss_recordGetSPECL1ID, "\sss", desc="Record L1 GetSpec for load response") {
+    peek(L1RequestL2Network_in, RequestMsg) {  // in_msg is the L1 request at the head of the queue
+      assert(is_valid(tbe));  // a TBE must already be allocated for this address
+      tbe.L1_GetSPEC_IDs.add(in_msg.Requestor);  // remembered so es_sendDataToGetSpecRequestors can address the reply
+    }
+  }
+
+  action(ssss_recordExposeL1ID, "\ssss", desc="Record L1 Expose for load response") {
+    peek(L1RequestL2Network_in, RequestMsg) {  // in_msg is the L1 request at the head of the queue
+      assert(is_valid(tbe));  // a TBE must already be allocated for this address
+      tbe.L1_Expose_IDs.add(in_msg.Requestor);  // remembered so the Expose data senders can address the reply
+    }
+  }
+
   action(xx_recordGetXL1ID, "\x", desc="Record L1 GetX for store response") {
     peek(L1RequestL2Network_in, RequestMsg) {
       assert(is_valid(tbe));
@@ -793,21 +953,22 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
   // BASE STATE - I
 
   // Transitions from I (Idle)
-  transition({NP, IS, ISS, IM, SS, M, M_I, I_I, S_I, MT_IB, MT_SB}, L1_PUTX) {
+  transition({NP, IS, ISS, IEE, IM, II, SS, M, M_I, I_I, S_I, MT_IB, MT_SB}, L1_PUTX) {
     t_sendWBAck;
     jj_popL1RequestQueue;
   }
 
-  transition({NP, SS, M, MT, M_I, I_I, S_I, IS, ISS, IM, MT_IB, MT_SB}, L1_PUTX_old) {
+  transition({NP, SS, M, MT, M_I, I_I, S_I, IS, ISS, IEE, IM, II, MT_IB, MT_SB}, L1_PUTX_old) {
     t_sendWBAck;
     jj_popL1RequestQueue;
   }
 
-  transition({IM, IS, ISS, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L2_Replacement, L2_Replacement_clean}) {
+  transition({IM, IS, ISS, IEE, II, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L2_Replacement, L2_Replacement_clean}) {
     zz_stallAndWaitL1RequestQueue;
   }
 
-  transition({IM, IS, ISS, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, MEM_Inv) {
+  // [InvisiSpec] TODO: How should MEM_Inv be handled at II? It is recycled for now, like the other transient states; stall or ignore instead?
+  transition({IM, IS, ISS, IEE, II, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, MEM_Inv) {
     zn_recycleResponseNetwork;
   }
 
@@ -816,7 +977,7 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
   }
 
 
-  transition({SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L1_GETS, L1_GET_INSTR, L1_GETX, L1_UPGRADE}) {
+  transition({SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L1_GETS, L1_EXPOSE, L1_GET_INSTR, L1_GETX, L1_UPGRADE, L1_GETSPEC}) {
     zz_stallAndWaitL1RequestQueue;
   }
 
@@ -832,6 +993,17 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     jj_popL1RequestQueue;
   }
 
+  transition(NP, L1_EXPOSE,  IEE) {  // Expose miss in L2: allocate a block and a TBE, fetch from memory
+    qq_allocateL2CacheBlock;
+    ll_clearSharers;
+    nn_addSharer;  // the exposing L1 is tracked as a sharer
+    i_allocateTBE;
+    ssss_recordExposeL1ID;  // needs the TBE allocated just above
+    a_issueFetchToMemory;
+    uu_profileMiss;
+    jj_popL1RequestQueue;
+  }
+
   transition(NP, L1_GET_INSTR, IS) {
     qq_allocateL2CacheBlock;
     ll_clearSharers;
@@ -854,12 +1026,28 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     jj_popL1RequestQueue;
   }
 
+  transition(NP, L1_GETSPEC, II) {  // unlike the NP miss for L1_EXPOSE: no qq_allocateL2CacheBlock, nn_addSharer or uu_profileMiss
+    iw_allocateTBEWithoutCacheEntry;
+    sss_recordGetSPECL1ID;
+    a_issueFetchToMemory;
+    jj_popL1RequestQueue;
+  }
+
 
   // transitions from IS/IM
 
   transition(ISS, Mem_Data, MT_MB) {
     m_writeDataToCache;
     ex_sendExclusiveDataToGetSRequestors;
+    es_sendDataToGetSpecRequestors;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+  }
+
+  transition(IEE, Mem_Data, MT_MB) {
+    m_writeDataToCache;
+    exex_sendExclusiveDataToExposeRequestors;
+    es_sendDataToGetSpecRequestors;
     s_deallocateTBE;
     o_popIncomingResponseQueue;
   }
@@ -867,6 +1055,8 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
   transition(IS, Mem_Data, SS) {
     m_writeDataToCache;
     e_sendDataToGetSRequestors;
+    es_sendDataToGetSpecRequestors;
+    eex_sendDataToExposeRequestors;
     s_deallocateTBE;
     o_popIncomingResponseQueue;
     kd_wakeUpDependents;
@@ -879,18 +1069,48 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     o_popIncomingResponseQueue;
   }
 
-  transition({IS, ISS}, {L1_GETS, L1_GET_INSTR}, IS) {
+  transition(II, Mem_Data, NP) {  // unlike the IS/ISS/IEE Mem_Data transitions there is no m_writeDataToCache; the state returns to NP
+    es_sendDataToGetSpecRequestors;
+    s_deallocateTBE;
+    o_popIncomingResponseQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition({IS, ISS, IEE}, {L1_GETS, L1_GET_INSTR}, IS) {
     nn_addSharer;
     ss_recordGetSL1ID;
     uu_profileMiss;
     jj_popL1RequestQueue;
   }
 
-  transition({IS, ISS}, L1_GETX) {
+  transition({IS, ISS, IEE}, L1_EXPOSE, IS) {
+    nn_addSharer;
+    ssss_recordExposeL1ID;
+    uu_profileMiss;
+    jj_popL1RequestQueue;
+  }
+
+  transition({IS, ISS, IEE}, L1_GETSPEC, IS) {  // NOTE(review): also moves ISS/IEE to IS, and unlike the GETS/EXPOSE variants adds no sharer — confirm both are intended
+    sss_recordGetSPECL1ID;
+    jj_popL1RequestQueue;
+  }
+
+  transition(II, L1_GETSPEC) {
+    sss_recordGetSPECL1ID;
+    jj_popL1RequestQueue;
+  }
+
+  // [InvisiSpec] L1_GET_INSTR should not be received at II
+  transition(II, {L1_GETS, L1_EXPOSE}) {
     zz_stallAndWaitL1RequestQueue;
   }
 
-  transition(IM, {L1_GETX, L1_GETS, L1_GET_INSTR}) {
+  // [InvisiSpec] TODO: L1_GETX stalls in IS, ISS, IEE and II; maybe we can optimize this?
+  transition({IS, ISS, IEE, II}, L1_GETX) {
+    zz_stallAndWaitL1RequestQueue;
+  }
+
+  transition(IM, {L1_GETX, L1_GETS, L1_EXPOSE, L1_GET_INSTR, L1_GETSPEC}) {
     zz_stallAndWaitL1RequestQueue;
   }
 
@@ -903,6 +1123,19 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     jj_popL1RequestQueue;
   }
 
+  transition(SS, L1_EXPOSE) {
+    dsex_sendSharedDataToExposeRequestor;
+    nn_addSharer;
+    set_setMRU;
+    uu_profileHit;
+    jj_popL1RequestQueue;
+  }
+
+  transition({SS, M}, L1_GETSPEC) {
+    dss_sendSharedDataToSpecRequestor;
+    jj_popL1RequestQueue;
+  }
+
 
   transition(SS, L1_GETX, SS_MB) {
     d_sendDataToRequestor;
@@ -956,6 +1189,14 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     jj_popL1RequestQueue;
   }
 
+  // [InvisiSpec] TODO
+  transition(M, L1_EXPOSE, MT_MB) {
+    ddex_sendExclusiveDataToExposeRequestor;
+    set_setMRU;
+    uu_profileHit;
+    jj_popL1RequestQueue;
+  }
+
   transition(M, {L2_Replacement, MEM_Inv}, M_I) {
     i_allocateTBE;
     c_exclusiveReplacement;
@@ -986,6 +1227,20 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
     jj_popL1RequestQueue;
   }
 
+  // [InvisiSpec] TODO: Ack packets are currently not recorded as EXPOSE traffic.
+  transition(MT, L1_EXPOSE, MT_IIB) {
+    bex_forwardExposeRequestToExclusive;
+    uu_profileMiss;
+    set_setMRU;
+    jj_popL1RequestQueue;
+  }
+
+  // [InvisiSpec] TODO: Do we need to block? The state stays MT (no target state is given).
+  transition(MT, L1_GETSPEC) {
+    bs_forwardSpecRequestToExclusive;
+    jj_popL1RequestQueue;
+  }
+
   transition(MT, {L2_Replacement, MEM_Inv}, MT_I) {
     i_allocateTBE;
     f_sendInvToSharers;
@@ -1039,7 +1294,7 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP")
   }
 
   // writeback states
-  transition({I_I, S_I, MT_I, MCT_I, M_I}, {L1_GETX, L1_UPGRADE, L1_GETS, L1_GET_INSTR}) {
+  transition({I_I, S_I, MT_I, MCT_I, M_I}, {L1_GETX, L1_UPGRADE, L1_GETS, L1_EXPOSE, L1_GET_INSTR, L1_GETSPEC}) {
     zz_stallAndWaitL1RequestQueue;
   }
 
diff --git a/src/mem/protocol/MESI_Two_Level-dir.sm b/src/mem/protocol/MESI_Two_Level-dir.sm
index 991de5a2c..9934f57a8 100644
--- a/src/mem/protocol/MESI_Two_Level-dir.sm
+++ b/src/mem/protocol/MESI_Two_Level-dir.sm
@@ -49,6 +49,8 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
 
     M, AccessPermission:Maybe_Stale, desc="memory copy may be stale, i.e. other modified copies may exist";
     IM, AccessPermission:Busy, desc="Intermediate State I>M";
+    IE, AccessPermission:Busy, desc="Intermediate State I>M";
+    II, AccessPermission:Busy, desc="Intermediate State I>I for SpecFetch";
     MI, AccessPermission:Busy, desc="Intermediate State M>I";
     M_DRD, AccessPermission:Busy, desc="Intermediate State when there is a dma read";
     M_DRDI, AccessPermission:Busy, desc="Intermediate State when there is a dma read";
@@ -59,6 +61,8 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
   // Events
   enumeration(Event, desc="Directory events") {
     Fetch, desc="A memory fetch arrives";
+    Expose, desc="A memory expose arrives";
+    SpecFetch, desc="A memory fetch for speculative execution arrives";
     Data, desc="writeback data arrives";
     Memory_Data, desc="Fetched data from memory arrives";
     Memory_Ack, desc="Writeback Ack from memory arrives";
@@ -198,6 +202,10 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
         assert(in_msg.Destination.isElement(machineID));
         if (isGETRequest(in_msg.Type)) {
           trigger(Event:Fetch, in_msg.addr, TBEs[in_msg.addr]);
+        } else if (in_msg.Type == CoherenceRequestType:EXPOSE) {
+          trigger(Event:Expose, in_msg.addr, TBEs[in_msg.addr]);
+        } else if (in_msg.Type == CoherenceRequestType:GETSPEC) {
+          trigger(Event:SpecFetch, in_msg.addr, TBEs[in_msg.addr]);
         } else if (in_msg.Type == CoherenceRequestType:DMA_READ) {
           trigger(Event:DMA_READ, makeLineAddress(in_msg.addr),
                   TBEs[makeLineAddress(in_msg.addr)]);
@@ -275,6 +283,40 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
     }
   }
 
+  action(dex_sendExposeData, "dex", desc="Send data to requestor") {  // memory reply for an Expose; used by the IE -> M transition
+    peek(memQueue_in, MemoryMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, to_mem_ctrl_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:MEMORY_DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.OriginalRequestorMachId);
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.Dirty := false;
+        out_msg.MessageSize := MessageSizeType:EXPOSE_Data;  // tagged as an EXPOSE data response
+
+        Entry e := getDirectoryEntry(in_msg.addr);
+        e.Owner := in_msg.OriginalRequestorMachId;  // the original requestor is recorded as owner
+      }
+    }
+  }
+
+  action(ds_sendSpecData, "ds", desc="Send data to requestor") {  // memory reply for a SpecFetch; used by the II -> I transition
+    peek(memQueue_in, MemoryMsg) {
+      enqueue(responseNetwork_out, ResponseMsg, to_mem_ctrl_latency) {
+        out_msg.addr := address;
+        out_msg.Type := CoherenceResponseType:MEMORY_DATA;
+        out_msg.Sender := machineID;
+        out_msg.Destination.add(in_msg.OriginalRequestorMachId);
+        out_msg.DataBlk := in_msg.DataBlk;
+        out_msg.Dirty := false;
+        out_msg.MessageSize := MessageSizeType:SPECLD_Data;  // tagged as a SPECLD data response
+
+        Entry e := getDirectoryEntry(in_msg.addr);
+        e.Owner := in_msg.OriginalRequestorMachId;  // NOTE(review): owner is written although II returns to I — confirm this is intended
+      }
+    }
+  }
+
   // Actions
   action(aa_sendAck, "aa", desc="Send ack to L2") {
     peek(memQueue_in, MemoryMsg) {
@@ -306,7 +348,19 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
 
   action(qf_queueMemoryFetchRequest, "qf", desc="Queue off-chip fetch request") {
     peek(requestNetwork_in, RequestMsg) {
-      queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency);
+      queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.origin, in_msg.idx, 0);  // last argument is the read type: 0 = non-spec
+    }
+  }
+
+  action(qfs_queueMemorySpecFetchRequest, "qfs", desc="Queue off-chip fetch request") {
+    peek(requestNetwork_in, RequestMsg) {
+      queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.origin, in_msg.idx, 1);  // last argument is the read type: 1 = spec
+    }
+  }
+
+  action(qfe_queueMemoryExposeRequest, "qfe", desc="Queue off-chip fetch request") {
+    peek(requestNetwork_in, RequestMsg) {
+      queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.origin, in_msg.idx, 2);  // last argument is the read type: 2 = expose
     }
   }
 
@@ -320,7 +374,8 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
 //added by SS for dma
   action(qf_queueMemoryFetchRequestDMA, "qfd", desc="Queue off-chip fetch request") {
     peek(requestNetwork_in, RequestMsg) {
-      queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency);
+      assert(false);  // NOTE(review): this action now traps unconditionally — confirm DMA reads are meant to be unsupported
+      queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.Requestor, -1, -1);  // type -1 is outside the 0..2 range that queueMemoryRead asserts
     }
   }
 
@@ -425,7 +480,18 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
     j_popIncomingRequestQueue;
   }
 
-  transition(M, Fetch) {
+  transition(I, Expose, IE) {
+    qfe_queueMemoryExposeRequest;
+    j_popIncomingRequestQueue;
+  }
+
+  transition(I, SpecFetch, II) {
+    qfs_queueMemorySpecFetchRequest;
+    j_popIncomingRequestQueue;
+  }
+
+  // [InvisiSpec] TODO: Is it secure? A SpecFetch in M triggers inv_sendCacheInvalidate just like a Fetch.
+  transition(M, {Fetch, Expose, SpecFetch}) {
     inv_sendCacheInvalidate;
     z_stallAndWaitRequest;
   }
@@ -435,6 +501,19 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
     l_popMemQueue;
     kd_wakeUpDependents;
   }
+
+  transition(IE, Memory_Data, M) {
+    dex_sendExposeData;
+    l_popMemQueue;
+    kd_wakeUpDependents;
+  }
+
+  transition(II, Memory_Data, I) {
+    ds_sendSpecData;
+    l_popMemQueue;
+    kd_wakeUpDependents;
+  }
+
 //added by SS
   transition(M, CleanReplacement, I) {
     a_sendAck;
@@ -481,11 +560,11 @@ machine(MachineType:Directory, "MESI Two Level directory protocol")
     kd_wakeUpDependents;
   }
 
-  transition({ID, ID_W, M_DRDI, M_DWRI, IM, MI}, {Fetch, Data} ) {
+  transition({ID, ID_W, M_DRDI, M_DWRI, IM, IE, MI, II}, {Fetch, Expose, SpecFetch, Data} ) {
     z_stallAndWaitRequest;
   }
 
-  transition({ID, ID_W, M_DRD, M_DRDI, M_DWR, M_DWRI, IM, MI}, {DMA_WRITE, DMA_READ} ) {
+  transition({ID, ID_W, M_DRD, M_DRDI, M_DWR, M_DWRI, IM, IE, MI, II}, {DMA_WRITE, DMA_READ} ) {
     zz_recycleDMAQueue;
   }
 
diff --git a/src/mem/protocol/MESI_Two_Level-msg.sm b/src/mem/protocol/MESI_Two_Level-msg.sm
index 738019e7b..d4269193d 100644
--- a/src/mem/protocol/MESI_Two_Level-msg.sm
+++ b/src/mem/protocol/MESI_Two_Level-msg.sm
@@ -36,6 +36,8 @@ enumeration(CoherenceRequestType, desc="...") {
   GET_INSTR, desc="Get Instruction";
   INV,       desc="INValidate";
   PUTX,      desc="Replacement message";
+  GETSPEC,   desc="Get Speculatively";
+  EXPOSE,    desc="Expose";
 
   WB_ACK,    desc="Writeback ack";
 
@@ -68,7 +70,9 @@ structure(RequestMsg, desc="...", interface="Message") {
   int Len;
   bool Dirty, default="false",  desc="Dirty bit";
   PrefetchBit Prefetch,         desc="Is this a prefetch request";
-
+  MachineID origin;
+  int idx, default="-1",        desc="LQ index";
+  
   bool functionalRead(Packet *pkt) {
     // Only PUTX messages contains the data block
     if (Type == CoherenceRequestType:PUTX) {
diff --git a/src/mem/protocol/RubySlicc_Defines.sm b/src/mem/protocol/RubySlicc_Defines.sm
index eb235f8f3..7df82847e 100644
--- a/src/mem/protocol/RubySlicc_Defines.sm
+++ b/src/mem/protocol/RubySlicc_Defines.sm
@@ -35,7 +35,7 @@ Cycles recycle_latency;
 // Functions implemented in the AbstractController class for
 // making timing access to the memory maintained by the
 // memory controllers.
-void queueMemoryRead(MachineID id, Addr addr, Cycles latency);
+void queueMemoryRead(MachineID id, Addr addr, Cycles latency, MachineID origin, int idx, int type);
 void queueMemoryWrite(MachineID id, Addr addr, Cycles latency,
                       DataBlock block);
 void queueMemoryWritePartial(MachineID id, Addr addr, Cycles latency,
diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm
index 8e17f9849..be64706b9 100644
--- a/src/mem/protocol/RubySlicc_Exports.sm
+++ b/src/mem/protocol/RubySlicc_Exports.sm
@@ -175,6 +175,11 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") {
   Release,           desc="Release operation";
   Acquire,           desc="Acquire opertion";
   AcquireRelease,    desc="Acquire and Release opertion";
+  // [InvisiSpec] New request types
+  SPEC_LD,           desc="Speculative load";
+  EXPOSE,            desc="Expose";
+  VALIDATE,          desc="Validate";
+  SPEC_FLUSH,        desc="Flush SpecBuffer";
 }
 
 enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") {
@@ -256,6 +261,12 @@ enumeration(MessageSizeType, desc="...") {
   Unblock_Control, desc="Unblock control";
   Persistent_Control, desc="Persistent request activation messages";
   Completion_Control, desc="Completion messages";
+  SPECLD_Control, desc="SPECLD control message";
+  SPECLD_Request_Control, desc="SPECLD forward message";
+  SPECLD_Data, desc="SPECLD data response";
+  EXPOSE_Control, desc="EXPOSE control message";
+  EXPOSE_Request_Control, desc="EXPOSE forward request";
+  EXPOSE_Data, desc="EXPOSE data response";
 }
 
 // AccessType
@@ -345,6 +356,7 @@ enumeration(RequestStatus, desc="...", default="RequestStatus_NULL")  {
   Issued, desc="The sequencer successfully issued the request";
   BufferFull, desc="Can not issue because the sequencer is full";
   Aliased, desc="This request aliased with a currently outstanding request";
+  Merged, desc="This request merged with a currently outstanding request";
   NULL, desc="";
 }
 
diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm
index 27a045d29..5c73b4320 100644
--- a/src/mem/protocol/RubySlicc_Types.sm
+++ b/src/mem/protocol/RubySlicc_Types.sm
@@ -113,7 +113,7 @@ structure (Sequencer, external = "yes") {
                      Cycles, Cycles, Cycles);
 
   void checkCoherence(Addr);
-  void evictionCallback(Addr);
+  void evictionCallback(Addr, bool);
   void recordRequestType(SequencerRequestType);
   bool checkResourceAvailable(CacheResourceType, Addr);
   void invalidateSC(Addr);
@@ -172,6 +172,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") {
   HSAScope scope,            desc="HSA scope";
   HSASegment segment,        desc="HSA segment";
   PacketPtr pkt,             desc="Packet associated with this request";
+  int idx,                   desc="LQ index";
 }
 
 structure(AbstractEntry, primitive="yes", external = "yes") {
diff --git a/src/mem/request.hh b/src/mem/request.hh
index 2a53c21a4..01982e40f 100644
--- a/src/mem/request.hh
+++ b/src/mem/request.hh
@@ -149,6 +149,9 @@ class Request
         MEM_SWAP                    = 0x00400000,
         MEM_SWAP_COND               = 0x00800000,
 
+        /** [InvisiSpec] it is a spec request */
+        SPEC                        = 0x00004000,
+
         /** The request is a prefetch. */
         PREFETCH                    = 0x01000000,
         /** The request should be prefetched into the exclusive state. */
@@ -863,6 +866,7 @@ class Request
     bool isPrefetch() const { return (_flags.isSet(PREFETCH) ||
                                       _flags.isSet(PF_EXCLUSIVE)); }
     bool isPrefetchEx() const { return _flags.isSet(PF_EXCLUSIVE); }
+    bool isSpec() const { return _flags.isSet(SPEC); }
     bool isLLSC() const { return _flags.isSet(LLSC); }
     bool isPriv() const { return _flags.isSet(PRIVILEGED); }
     bool isLockedRMW() const { return _flags.isSet(LOCKED_RMW); }
diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript
index be52c02d0..e6d45a419 100644
--- a/src/mem/ruby/SConscript
+++ b/src/mem/ruby/SConscript
@@ -59,6 +59,9 @@ DebugFlag('RubySystem')
 DebugFlag('RubyTester')
 DebugFlag('RubyStats')
 DebugFlag('RubyResourceStalls')
+DebugFlag('SpecBuffer')
+DebugFlag('SpecBufferValidate')
+DebugFlag('MemSpecBuffer')
 
 CompoundFlag('Ruby', [ 'RubyQueue', 'RubyNetwork', 'RubyTester',
     'RubyGenerated', 'RubySlicc', 'RubySystem', 'RubyCache',
diff --git a/src/mem/ruby/network/Network.cc b/src/mem/ruby/network/Network.cc
index 57834f2e2..7d4d71eb3 100644
--- a/src/mem/ruby/network/Network.cc
+++ b/src/mem/ruby/network/Network.cc
@@ -144,12 +144,18 @@ Network::MessageSizeType_to_int(MessageSizeType size_type)
       case MessageSizeType_Unblock_Control:
       case MessageSizeType_Persistent_Control:
       case MessageSizeType_Completion_Control:
+      case MessageSizeType_SPECLD_Control:
+      case MessageSizeType_SPECLD_Request_Control:
+      case MessageSizeType_EXPOSE_Control:
+      case MessageSizeType_EXPOSE_Request_Control:
         return m_control_msg_size;
       case MessageSizeType_Data:
       case MessageSizeType_Response_Data:
       case MessageSizeType_ResponseLocal_Data:
       case MessageSizeType_ResponseL2hit_Data:
       case MessageSizeType_Writeback_Data:
+      case MessageSizeType_SPECLD_Data:
+      case MessageSizeType_EXPOSE_Data:
         return m_data_msg_size;
       default:
         panic("Invalid range for type MessageSizeType");
diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc
index 101a4ce7f..25903e550 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.cc
+++ b/src/mem/ruby/slicc_interface/AbstractController.cc
@@ -41,6 +41,7 @@
 #include "mem/ruby/slicc_interface/AbstractController.hh"
 
 #include "debug/RubyQueue.hh"
+#include "debug/MemSpecBuffer.hh"
 #include "mem/protocol/MemoryMsg.hh"
 #include "mem/ruby/network/Network.hh"
 #include "mem/ruby/system/GPUCoalescer.hh"
@@ -96,6 +97,14 @@ AbstractController::regStats()
         .name(name() + ".fully_busy_cycles")
         .desc("cycles for which number of transistions == max transitions")
         .flags(Stats::nozero);
+    m_expose_hits
+        .name(name() + ".expose_hits")
+        .desc("number of expose hits at LLC spec buffer")
+        .flags(Stats::nozero);
+    m_expose_misses
+        .name(name() + ".expose_misses")
+        .desc("number of expose misses at LLC spec buffer")
+        .flags(Stats::nozero);
 }
 
 void
@@ -238,8 +247,67 @@ AbstractController::getMasterPort(const std::string &if_name,
 
 void
 AbstractController::queueMemoryRead(const MachineID &id, Addr addr,
-                                    Cycles latency)
+                                    Cycles latency, MachineID origin, int idx, int type)
 {
+    int coreId = origin.num;
+    int sbeId = idx;
+    // type 0: non-spec 1: spec 2: expose
+    // DPRINTFR(MemSpecBuffer, "%10s MemRead (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(addr));
+    // if idx == -1, it is a write request which cannot be spec or expose.
+    assert(!(type != 0 && sbeId == -1));
+    assert(sbeId >= -1 && sbeId <= 65);
+    assert(coreId < 8);
+    assert(type >=0 && type <= 2);
+    if (type == 0) {
+        for (int c = 0; c < 8; ++c) {
+            for (int i = 0; i < 66; ++i) {
+                if (m_specBuf[c][i].address == addr) {
+                    DPRINTFR(MemSpecBuffer, "%10s Cleared by Read (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), c, type, i, printAddress(addr));
+                    m_specBuf[c][i].address = 0;
+                    m_specBuf[c][i].data.clear();
+                }
+            }
+        }
+    } else if (type == 1) {
+
+    } else if (type == 2) {
+        if (m_specBuf[coreId][sbeId].address == addr) {
+            DPRINTFR(MemSpecBuffer, "%10s Expose Hit (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(addr));
+            ++m_expose_hits;
+            assert(getMemoryQueue());
+            std::shared_ptr<MemoryMsg> msg = std::make_shared<MemoryMsg>(clockEdge());
+            (*msg).m_addr = addr;
+            (*msg).m_Sender = m_machineID;
+            (*msg).m_OriginalRequestorMachId = id;
+            (*msg).m_Type = MemoryRequestType_MEMORY_READ;
+            (*msg).m_MessageSize = MessageSizeType_Response_Data;
+            (*msg).m_DataBlk = m_specBuf[coreId][sbeId].data;
+            getMemoryQueue()->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1)));
+            for (int c = 0; c < 8; ++c) {
+                for (int i = 0; i < 66; ++i) {
+                    if (m_specBuf[c][i].address == addr) {
+                        DPRINTFR(MemSpecBuffer, "%10s Cleared by Expose Hit (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), c, type, i, printAddress(addr));
+                        m_specBuf[c][i].address = 0;
+                        m_specBuf[c][i].data.clear();
+                    }
+                }
+            }
+            return;
+        } else {
+            DPRINTFR(MemSpecBuffer, "%10s Expose Miss (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(addr));
+            ++m_expose_misses;
+            for (int c = 0; c < 8; ++c) {
+                for (int i = 0; i < 66; ++i) {
+                    if (m_specBuf[c][i].address == addr) {
+                        DPRINTFR(MemSpecBuffer, "%10s Cleared by Expose Miss (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), c, type, i, printAddress(addr));
+                        m_specBuf[c][i].address = 0;
+                        m_specBuf[c][i].data.clear();
+                    }
+                }
+            }
+        }
+    }
+    
     RequestPtr req = std::make_shared<Request>(
         addr, RubySystem::getBlockSizeBytes(), 0, m_masterId);
 
@@ -248,6 +316,9 @@ AbstractController::queueMemoryRead(const MachineID &id, Addr addr,
     pkt->dataDynamic(newData);
 
     SenderState *s = new SenderState(id);
+    s->type = type;
+    s->coreId = coreId;
+    s->sbeId = sbeId;
     pkt->pushSenderState(s);
 
     // Use functional rather than timing accesses during warmup
@@ -336,6 +407,9 @@ AbstractController::recvTimingResp(PacketPtr pkt)
 
     SenderState *s = dynamic_cast<SenderState *>(pkt->senderState);
     (*msg).m_OriginalRequestorMachId = s->id;
+    int type = s->type;
+    int coreId = s->coreId;
+    int sbeId = s->sbeId;
     delete s;
 
     if (pkt->isRead()) {
@@ -345,6 +419,12 @@ AbstractController::recvTimingResp(PacketPtr pkt)
         // Copy data from the packet
         (*msg).m_DataBlk.setData(pkt->getPtr<uint8_t>(), 0,
                                  RubySystem::getBlockSizeBytes());
+        if (type == 1) {
+            DPRINTFR(MemSpecBuffer, "%10s Updated by ReadSpec (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(pkt->getAddr()));
+            m_specBuf[coreId][sbeId].address = pkt->getAddr();
+            m_specBuf[coreId][sbeId].data.setData(pkt->getPtr<uint8_t>(), 0,
+                                                  RubySystem::getBlockSizeBytes());
+        }
     } else if (pkt->isWrite()) {
         (*msg).m_Type = MemoryRequestType_MEMORY_WB;
         (*msg).m_MessageSize = MessageSizeType_Writeback_Control;
diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh
index 35cd3d2a5..b65a511d0 100644
--- a/src/mem/ruby/slicc_interface/AbstractController.hh
+++ b/src/mem/ruby/slicc_interface/AbstractController.hh
@@ -129,7 +129,7 @@ class AbstractController : public MemObject, public Consumer
     BaseMasterPort& getMasterPort(const std::string& if_name,
                                   PortID idx = InvalidPortID);
 
-    void queueMemoryRead(const MachineID &id, Addr addr, Cycles latency);
+    void queueMemoryRead(const MachineID &id, Addr addr, Cycles latency, MachineID origin, int idx, int type);
     void queueMemoryWrite(const MachineID &id, Addr addr, Cycles latency,
                           const DataBlock &block);
     void queueMemoryWritePartial(const MachineID &id, Addr addr, Cycles latency,
@@ -199,6 +199,8 @@ class AbstractController : public MemObject, public Consumer
     //! Counter for the number of cycles when the transitions carried out
     //! were equal to the maximum allowed
     Stats::Scalar m_fully_busy_cycles;
+    Stats::Scalar m_expose_hits;
+    Stats::Scalar m_expose_misses;
 
     //! Histogram for profiling delay for the messages this controller
     //! cares for
@@ -250,6 +252,9 @@ class AbstractController : public MemObject, public Consumer
     {
         // Id of the machine from which the request originated.
         MachineID id;
+        int type = 0;     // read type given to queueMemoryRead (0 non-spec, 1 spec, 2 expose); defaults to non-spec
+        int coreId = -1;  // origin.num of the request; stays -1 unless queueMemoryRead sets it
+        int sbeId = -1;   // spec-buffer slot (LQ index); stays -1 unless queueMemoryRead sets it
 
         SenderState(MachineID _id) : id(_id)
         {}
@@ -258,6 +263,14 @@ class AbstractController : public MemObject, public Consumer
   private:
     /** The address range to which the controller responds on the CPU side. */
     const AddrRangeList addrRanges;
+
+    struct SBE
+    {
+      Addr address = 0;  // 0 marks an empty slot, the same value queueMemoryRead writes when it clears an entry
+      DataBlock data;    // block copied from the memory response of a type-1 (spec) read
+    };
+
+    SBE m_specBuf[8][66];
 };
 
 #endif // __MEM_RUBY_SLICC_INTERFACE_ABSTRACTCONTROLLER_HH__
diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh
index 6c84f3823..2fc4c9f98 100644
--- a/src/mem/ruby/slicc_interface/RubyRequest.hh
+++ b/src/mem/ruby/slicc_interface/RubyRequest.hh
@@ -60,6 +60,7 @@ class RubyRequest : public Message
     int m_wfid;
     HSAScope m_scope;
     HSASegment m_segment;
+    int m_idx;
 
 
     RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
@@ -82,6 +83,11 @@ class RubyRequest : public Message
           m_segment(_segment)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt->reqIdx == -1) {
+            m_idx = _pkt->reqIdx;
+        } else {
+            m_idx = (_pkt->reqIdx) * 2 + (_pkt->isFirst()? 0 : 1);
+        }
     }
 
     RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
@@ -109,6 +115,11 @@ class RubyRequest : public Message
           m_segment(_segment)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt->reqIdx == -1) {
+            m_idx = _pkt->reqIdx;
+        } else {
+            m_idx = (_pkt->reqIdx) * 2 + (_pkt->isFirst()? 0 : 1);
+        }
     }
 
     RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len,
@@ -137,6 +148,11 @@ class RubyRequest : public Message
           m_segment(_segment)
     {
         m_LineAddress = makeLineAddress(m_PhysicalAddress);
+        if (_pkt->reqIdx == -1) {
+            m_idx = _pkt->reqIdx;
+        } else {
+            m_idx = (_pkt->reqIdx) * 2 + (_pkt->isFirst()? 0 : 1);
+        }
     }
 
 
diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc
index 6c93c3260..943ccdf26 100644
--- a/src/mem/ruby/structures/CacheMemory.cc
+++ b/src/mem/ruby/structures/CacheMemory.cc
@@ -177,7 +177,9 @@ CacheMemory::tryCacheAccess(Addr address, RubyRequestType type,
             return true;
         }
         if ((entry->m_Permission == AccessPermission_Read_Only) &&
-            (type == RubyRequestType_LD || type == RubyRequestType_IFETCH)) {
+            (type == RubyRequestType_LD ||
+             type == RubyRequestType_IFETCH ||
+             type == RubyRequestType_SPEC_LD)) {
             return true;
         }
         // The line must not be accessible
diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc
index 84a70c0f1..15013e056 100644
--- a/src/mem/ruby/system/RubyPort.cc
+++ b/src/mem/ruby/system/RubyPort.cc
@@ -246,6 +246,7 @@ RubyPort::PioSlavePort::recvAtomic(PacketPtr pkt)
     panic("Could not find address in Ruby PIO address ranges!\n");
 }
 
+// [InvisiSpec] Request on the way from CPU to Ruby
 bool
 RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt)
 {
@@ -429,6 +430,7 @@ RubyPort::MemSlavePort::recvFunctional(PacketPtr pkt)
     }
 }
 
+// [InvisiSpec] On the way from Ruby to CPU
 void
 RubyPort::ruby_hit_callback(PacketPtr pkt)
 {
@@ -512,6 +514,7 @@ RubyPort::drain()
     }
 }
 
+// [InvisiSpec] Still on the way from Ruby to CPU
 void
 RubyPort::MemSlavePort::hitCallback(PacketPtr pkt)
 {
@@ -545,7 +548,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt)
     }
 
     // Flush, acquire, release requests don't access physical memory
-    if (pkt->isFlush() || pkt->cmd == MemCmd::MemFenceReq) {
+    if (pkt->isFlush() || pkt->isExpose() || pkt->cmd == MemCmd::MemFenceReq) {
         accessPhysMem = false;
     }
 
@@ -572,6 +575,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt)
         // Ruby protocol.
         schedTimingResp(pkt, curTick());
     } else {
+        // [InvisiSpec] Delete the packet if a response is not required
         delete pkt;
     }
 
@@ -602,7 +606,7 @@ RubyPort::MemSlavePort::isPhysMemAddress(Addr addr) const
 }
 
 void
-RubyPort::ruby_eviction_callback(Addr address)
+RubyPort::ruby_eviction_callback(Addr address, bool external)
 {
     DPRINTF(RubyPort, "Sending invalidations.\n");
     // Allocate the invalidate request and packet on the stack, as it is
@@ -615,6 +619,9 @@ RubyPort::ruby_eviction_callback(Addr address)
     // Use a single packet to signal all snooping ports of the invalidation.
     // This assumes that snooping ports do NOT modify the packet/request
     Packet pkt(request, MemCmd::InvalidateReq);
+    if (external) {
+        pkt.setExternalEviction();
+    }
     for (CpuPortIter p = slave_ports.begin(); p != slave_ports.end(); ++p) {
         // check if the connected master port is snooping
         if ((*p)->isSnooping()) {
diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh
index 146443282..9c0200829 100644
--- a/src/mem/ruby/system/RubyPort.hh
+++ b/src/mem/ruby/system/RubyPort.hh
@@ -172,7 +172,7 @@ class RubyPort : public MemObject
     void trySendRetries();
     void ruby_hit_callback(PacketPtr pkt);
     void testDrainComplete();
-    void ruby_eviction_callback(Addr address);
+    void ruby_eviction_callback(Addr address, bool external);
 
     /**
      * Called by the PIO port when receiving a timing response.
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc
index 41ec6ea6c..4a8e5ae02 100644
--- a/src/mem/ruby/system/Sequencer.cc
+++ b/src/mem/ruby/system/Sequencer.cc
@@ -36,6 +36,8 @@
 #include "debug/ProtocolTrace.hh"
 #include "debug/RubySequencer.hh"
 #include "debug/RubyStats.hh"
+#include "debug/SpecBuffer.hh"
+#include "debug/SpecBufferValidate.hh"
 #include "mem/packet.hh"
 #include "mem/protocol/PrefetchBit.hh"
 #include "mem/protocol/RubyAccessMode.hh"
@@ -54,7 +56,9 @@ RubySequencerParams::create()
 
 Sequencer::Sequencer(const Params *p)
     : RubyPort(p), m_IncompleteTimes(MachineType_NUM),
-      deadlockCheckEvent([this]{ wakeup(); }, "Sequencer deadlock check")
+      deadlockCheckEvent([this]{ wakeup(); }, "Sequencer deadlock check"),
+      m_specBuf(33),
+      specBufferHitEvent([this]{ specBufferHitCallback(); }, "Sequencer spec buffer hit")
 {
     m_outstanding_count = 0;
 
@@ -160,6 +164,7 @@ void Sequencer::resetStats()
     }
 }
 
+// [InvisiSpec] Request on the way from CPU to Ruby
 // Insert the request on the correct request table.  Return true if
 // the entry was already present.
 RequestStatus
@@ -190,6 +195,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
     RequestTable::value_type default_entry(line_addr,
                                            (SequencerRequest*) NULL);
 
+    // [InvisiSpec] If store
     if ((request_type == RubyRequestType_ST) ||
         (request_type == RubyRequestType_RMW_Read) ||
         (request_type == RubyRequestType_RMW_Write) ||
@@ -217,6 +223,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
           m_store_waiting_on_store++;
           return RequestStatus_Aliased;
         }
+    // [InvisiSpec] If load
     } else {
         // Check if there is any outstanding write request for the same
         // cache line.
@@ -232,6 +239,16 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType request_type)
             RequestTable::iterator i = r.first;
             i->second = new SequencerRequest(pkt, request_type, curCycle());
             m_outstanding_count++;
+        } else if (request_type == RubyRequestType_SPEC_LD) {
+            auto i = m_readRequestTable.find(line_addr);
+            if (i->second->m_type == RubyRequestType_SPEC_LD) {
+                DPRINTFR(SpecBuffer, "%10s Merging (idx=%d-%d, addr=%#x) with %d\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()), i->second->pkt->reqIdx);
+                i->second->dependentSpecRequests.push_back(pkt);
+                return RequestStatus_Merged;
+            } else {
+                m_load_waiting_on_load++;
+                return RequestStatus_Aliased;
+            }
         } else {
             // There is an outstanding read request for the cache line
             m_load_waiting_on_load++;
@@ -412,6 +429,19 @@ Sequencer::writeCallback(Addr address, DataBlock& data,
                 initialRequestTime, forwardRequestTime, firstResponseTime);
 }
 
+// [InvisiSpec] Store `data` in the spec-buffer block chosen by pkt->reqIdx /
+// pkt->isFirst() iff that block's request lies in line dataAddress.
+bool Sequencer::updateSBB(PacketPtr pkt, DataBlock& data, Addr dataAddress) {
+    const uint8_t idx = pkt->reqIdx;
+    assert(idx < m_specBuf.size());  // m_specBuf is sized once in the ctor
+    SBB& sbb = m_specBuf[idx].blocks[pkt->isFirst() ? 0 : 1];
+    const bool lineMatches = makeLineAddress(sbb.reqAddress) == dataAddress;
+    if (lineMatches)
+        sbb.data = data;
+    return lineMatches;  // false: buffer left untouched
+}
+
+// [InvisiSpec] Called by Ruby to send a response to CPU.
 void
 Sequencer::readCallback(Addr address, DataBlock& data,
                         bool externalHit, const MachineType mach,
@@ -430,13 +460,79 @@ Sequencer::readCallback(Addr address, DataBlock& data,
     markRemoved();
 
     assert((request->m_type == RubyRequestType_LD) ||
+           (request->m_type == RubyRequestType_SPEC_LD) ||
+           (request->m_type == RubyRequestType_EXPOSE) ||
            (request->m_type == RubyRequestType_IFETCH));
+    
+    PacketPtr pkt = request->pkt;
+    if (pkt->isSpec()) {
+        assert(!pkt->onlyAccessSpecBuff());
+        DPRINTFR(SpecBuffer, "%10s SPEC_LD callback (idx=%d-%d, addr=%#x)\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()));
+        updateSBB(pkt, data, address);
+        if (!externalHit) {
+            pkt->setL1Hit();
+        }
+    } else if (pkt->isExpose()) {
+        DPRINTFR(SpecBuffer, "%10s EXPOSE callback (idx=%d-%d, addr=%#x)\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()));
+    } else if (pkt->isValidate()) {
+        DPRINTFR(SpecBuffer, "%10s VALIDATE callback (idx=%d-%d, addr=%#x)\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()));
+        uint8_t idx = pkt->reqIdx;
+        SBE& sbe = m_specBuf[idx];
+        int blkIdx = pkt->isFirst() ? 0 : 1;
+        SBB& sbb = sbe.blocks[blkIdx];
+        assert(makeLineAddress(sbb.reqAddress) == address);
+        if (!memcmp(sbb.data.getData(getOffset(pkt->getAddr()), pkt->getSize()), data.getData(getOffset(pkt->getAddr()), pkt->getSize()), pkt->getSize())) {
+            *(pkt->getPtr<uint8_t>()) = 1;
+        } else {
+            // std::ostringstream os;
+            // sbb.data.print(os);
+            // DPRINTFR(SpecBufferValidate, "%s\n", os.str());
+            // os.str("");
+            // data.print(os);
+            // DPRINTFR(SpecBufferValidate, "%s\n", os.str());
+            *(pkt->getPtr<uint8_t>()) = 0;
+        }
+    }
+
+    for (auto& dependentPkt : request->dependentSpecRequests) {
+        assert(!dependentPkt->onlyAccessSpecBuff());
+        DPRINTFR(SpecBuffer, "%10s Merged SPEC_LD callback (idx=%d-%d, addr=%#x)\n", curTick(), dependentPkt->reqIdx, dependentPkt->isFirst()? 0 : 1, printAddress(dependentPkt->getAddr()));
+        assert(dependentPkt->isSpec());
+        updateSBB(dependentPkt, data, address);
+        if (!externalHit) {
+            dependentPkt->setL1Hit();
+        }
+        memcpy(dependentPkt->getPtr<uint8_t>(),
+               data.getData(getOffset(dependentPkt->getAddr()), dependentPkt->getSize()),
+               dependentPkt->getSize());
+        ruby_hit_callback(dependentPkt);
+    }
 
     hitCallback(request, data, true, mach, externalHit,
                 initialRequestTime, forwardRequestTime, firstResponseTime);
 }
 
 void
+Sequencer::specBufferHitCallback()
+{
+    // [InvisiSpec] Answer queued spec-buffer-only loads in FIFO order.
+    assert(!m_specRequestQueue.empty());
+    while (!m_specRequestQueue.empty()) {
+        const auto head = m_specRequestQueue.front();
+        if (head.second > curTick()) {
+            // Head is not due yet: re-arm the event for its tick and stop.
+            schedule(specBufferHitEvent, head.second);
+            break;
+        }
+        assert(head.first->onlyAccessSpecBuff());
+        DPRINTFR(SpecBuffer, "%10s SB Hit Callback (idx=%d, addr=%#x)\n", curTick(), head.first->reqIdx, printAddress(head.first->getAddr()));
+        ruby_hit_callback(head.first);
+        m_specRequestQueue.pop();
+    }
+}
+
+// [InvisiSpec] Response on the way from Ruby to CPU
+void
 Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data,
                        bool llscSuccess,
                        const MachineType mach, const bool externalHit,
@@ -470,8 +566,9 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data,
     if (RubySystem::getWarmupEnabled()) {
         data.setData(pkt->getConstPtr<uint8_t>(),
                      getOffset(request_address), pkt->getSize());
-    } else if (!pkt->isFlush()) {
+    } else if (!pkt->isFlush() && !pkt->isExpose() && !pkt->isValidate()) {
         if ((type == RubyRequestType_LD) ||
+            (type == RubyRequestType_SPEC_LD) ||
             (type == RubyRequestType_IFETCH) ||
             (type == RubyRequestType_RMW_Read) ||
             (type == RubyRequestType_Locked_RMW_Read) ||
@@ -530,6 +627,7 @@ Sequencer::empty() const
     return m_writeRequestTable.empty() && m_readRequestTable.empty();
 }
 
+// [InvisiSpec] Request on the way from CPU to Ruby
 RequestStatus
 Sequencer::makeRequest(PacketPtr pkt)
 {
@@ -540,7 +638,56 @@ Sequencer::makeRequest(PacketPtr pkt)
     RubyRequestType primary_type = RubyRequestType_NULL;
     RubyRequestType secondary_type = RubyRequestType_NULL;
 
-    if (pkt->isLLSC()) {
+    // [InvisiSpec] Handle new requests
+    if (pkt->isSpec()) {
+        assert(pkt->cmd == MemCmd::ReadSpecReq);
+        assert(pkt->isSplit || pkt->isFirst());
+        uint8_t idx = pkt->reqIdx;
+        SBE& sbe = m_specBuf[idx];
+        sbe.isSplit = pkt->isSplit;
+        int blkIdx = pkt->isFirst() ? 0 : 1;
+        SBB& sbb = sbe.blocks[blkIdx];
+        sbb.reqAddress = pkt->getAddr();
+        sbb.reqSize = pkt->getSize();
+        if (pkt->onlyAccessSpecBuff()) {
+            int srcIdx = pkt->srcIdx;
+            SBE& srcEntry = m_specBuf[srcIdx];
+            if (makeLineAddress(sbb.reqAddress) == makeLineAddress(srcEntry.blocks[0].reqAddress)) {
+                sbb.data = srcEntry.blocks[0].data;
+            } else if (makeLineAddress(sbb.reqAddress) == makeLineAddress(srcEntry.blocks[1].reqAddress)) {
+                sbb.data = srcEntry.blocks[1].data;
+            } else {
+                fatal("Requested address %#x is not present in the spec buffer\n", printAddress(sbb.reqAddress));
+            }
+            memcpy(pkt->getPtr<uint8_t>(),
+                   sbb.data.getData(getOffset(sbb.reqAddress), sbb.reqSize),
+                   sbb.reqSize);
+            m_specRequestQueue.push({pkt, curTick()});
+            DPRINTFR(SpecBuffer, "%10s SB Hit (idx=%d, addr=%#x) on (srcIdx=%d)\n", curTick(), idx, printAddress(sbb.reqAddress), srcIdx);
+            if (!specBufferHitEvent.scheduled()) {
+                schedule(specBufferHitEvent, clockEdge(Cycles(1)));
+            }
+            return RequestStatus_Issued;
+        } else {
+            // assert it is not in the buffer
+            primary_type = secondary_type = RubyRequestType_SPEC_LD;
+        }
+    } else if (pkt->isExpose() || pkt->isValidate()) {
+        assert(pkt->cmd == MemCmd::ExposeReq || pkt->cmd == MemCmd::ValidateReq);
+        assert(pkt->isSplit || pkt->isFirst());
+        uint8_t idx = pkt->reqIdx;
+        SBE& sbe = m_specBuf[idx];
+        sbe.isSplit = pkt->isSplit;
+        int blkIdx = pkt->isFirst() ? 0 : 1;
+        SBB& sbb = sbe.blocks[blkIdx];
+        if (sbb.reqAddress != pkt->getAddr()) {
+            fatal("sbb.reqAddress != pkt->getAddr: %#x != %#x\n", printAddress(sbb.reqAddress), printAddress(pkt->getAddr()));
+        }
+        if (sbb.reqSize != pkt->getSize()) {
+            fatal("sbb.reqSize != pkt->getSize(): %d != %d\n", sbb.reqSize, pkt->getSize());
+        }
+        primary_type = secondary_type = RubyRequestType_EXPOSE;
+    } else if (pkt->isLLSC()) {
         //
         // Alpha LL/SC instructions need to be handled carefully by the cache
         // coherence protocol to ensure they follow the proper semantics. In
@@ -611,8 +758,22 @@ Sequencer::makeRequest(PacketPtr pkt)
     }
 
     RequestStatus status = insertRequest(pkt, primary_type);
-    if (status != RequestStatus_Ready)
+    if (status == RequestStatus_Merged) {
+        return RequestStatus_Issued;
+    } else if (status != RequestStatus_Ready) {
         return status;
+    }
+
+    if (pkt->isSpec()) {
+        DPRINTFR(SpecBuffer, "%10s Issuing SPEC_LD (idx=%d-%d, addr=%#x)\n",
+                 curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()));
+    } else if (pkt->isExpose()) {
+        DPRINTFR(SpecBuffer, "%10s Issuing EXPOSE (idx=%d-%d, addr=%#x)\n",
+                 curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()));
+    } else if (pkt->isValidate()) {
+        DPRINTFR(SpecBuffer, "%10s Issuing VALIDATE (idx=%d-%d, addr=%#x)\n",
+                 curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()));
+    }
 
     issueRequest(pkt, secondary_type);
 
@@ -639,7 +800,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type)
     // requests do not
     std::shared_ptr<RubyRequest> msg =
         std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
-                                      pkt->isFlush() ?
+                                      pkt->isFlush() || pkt->isExpose() ?
                                       nullptr : pkt->getPtr<uint8_t>(),
                                       pkt->getSize(), pc, secondary_type,
                                       RubyAccessMode_Supervisor, pkt,
@@ -713,9 +874,9 @@ Sequencer::recordRequestType(SequencerRequestType requestType) {
 
 
 void
-Sequencer::evictionCallback(Addr address)
+Sequencer::evictionCallback(Addr address, bool external)
 {
-    ruby_eviction_callback(address);
+    ruby_eviction_callback(address, external);  // forward both args unchanged
 }
 
 void
diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh
index fcfa8ad86..66ff92777 100644
--- a/src/mem/ruby/system/Sequencer.hh
+++ b/src/mem/ruby/system/Sequencer.hh
@@ -31,6 +31,7 @@
 
 #include <iostream>
 #include <unordered_map>
+#include <queue>
 
 #include "mem/protocol/MachineType.hh"
 #include "mem/protocol/RubyRequestType.hh"
@@ -45,6 +46,7 @@ struct SequencerRequest
     PacketPtr pkt;
     RubyRequestType m_type;
     Cycles issue_time;
+    std::vector<PacketPtr> dependentSpecRequests;
 
     SequencerRequest(PacketPtr _pkt, RubyRequestType _m_type,
                      Cycles _issue_time)
@@ -54,6 +56,19 @@ struct SequencerRequest
 
 std::ostream& operator<<(std::ostream& out, const SequencerRequest& obj);
 
+struct SBB // SpecBufferBlock: one of the two blocks of a spec buffer entry
+{
+  Addr reqAddress;   // request address as given by pkt->getAddr()
+  unsigned reqSize;  // request size as given by pkt->getSize()
+  DataBlock data;    // line data stored for this request
+};
+
+struct SBE // SpecBufferEntry: selected in m_specBuf by pkt->reqIdx
+{
+  bool isSplit;   // copied from pkt->isSplit when the request is made
+  SBB blocks[2];  // [0] for the pkt->isFirst() packet, [1] otherwise
+};
+
 class Sequencer : public RubyPort
 {
   public:
@@ -83,6 +98,9 @@ class Sequencer : public RubyPort
                       const Cycles forwardRequestTime = Cycles(0),
                       const Cycles firstResponseTime = Cycles(0));
 
+    void specBufferHitCallback();
+    bool updateSBB(PacketPtr pkt, DataBlock& data, Addr dataAddress);
+
     RequestStatus makeRequest(PacketPtr pkt);
     bool empty() const;
     int outstandingCount() const { return m_outstanding_count; }
@@ -97,7 +115,7 @@ class Sequencer : public RubyPort
     void checkCoherence(Addr address);
 
     void markRemoved();
-    void evictionCallback(Addr address);
+    void evictionCallback(Addr address, bool external);
     void invalidateSC(Addr address);
     int coreId() const { return m_coreId; }
 
@@ -238,6 +256,10 @@ class Sequencer : public RubyPort
     std::vector<Stats::Counter> m_IncompleteTimes;
 
     EventFunctionWrapper deadlockCheckEvent;
+
+    std::vector<SBE> m_specBuf;
+    std::queue<std::pair<PacketPtr, Tick>> m_specRequestQueue;
+    EventFunctionWrapper specBufferHitEvent;
 };
 
 inline std::ostream&