diff options
Diffstat (limited to 'src')
46 files changed, 2556 insertions, 95 deletions
diff --git a/src/arch/arm/tlb.cc b/src/arch/arm/tlb.cc index 5f104e96d..fab26d8cb 100644 --- a/src/arch/arm/tlb.cc +++ b/src/arch/arm/tlb.cc @@ -535,6 +535,11 @@ TLB::regStats() .name(name() + ".prefetch_faults") .desc("Number of TLB faults due to prefetch") ; + + specTLBMisses + .name(name() + ".spec_tlb_misses") + .desc("Number of TLB misses from a speculative mem instructions") + ; domainFaults .name(name() + ".domain_faults") @@ -1423,6 +1428,17 @@ TLB::getTE(TlbEntry **te, RequestPtr req, ThreadContext *tc, Mode mode, vaddr_tainted, ArmFault::PrefetchTLBMiss, isStage2); } + if (req->isSpec()) { + // if the request is speculative don't attempt to fill the TLB or go + // any further with the memory access (here we can safely use the + // fault status for the short desc. format in all cases) + specTLBMisses++; + //FIXME: currently reuse the prefetch tlbmiss fault + //do not want to introduce new fault declaration + return std::make_shared<PrefetchAbort>( + vaddr_tainted, ArmFault::PrefetchTLBMiss, isStage2); + } + if (is_fetch) instMisses++; else if (is_write) diff --git a/src/arch/arm/tlb.hh b/src/arch/arm/tlb.hh index 212a79f79..5f92a3e8a 100644 --- a/src/arch/arm/tlb.hh +++ b/src/arch/arm/tlb.hh @@ -169,6 +169,7 @@ class TLB : public BaseTLB mutable Stats::Scalar flushedEntries; mutable Stats::Scalar alignFaults; mutable Stats::Scalar prefetchFaults; + mutable Stats::Scalar specTLBMisses; mutable Stats::Scalar domainFaults; mutable Stats::Scalar permsFaults; diff --git a/src/arch/generic/memhelpers.hh b/src/arch/generic/memhelpers.hh index 35e666b92..0a38b780c 100644 --- a/src/arch/generic/memhelpers.hh +++ b/src/arch/generic/memhelpers.hh @@ -53,6 +53,7 @@ /// Initiate a read from memory in timing mode. Note that the 'mem' /// parameter is unused; only the type of that parameter is used /// to determine the size of the access. 
+// XC: executeContextPtr [mengjia] template <class XC, class MemT> Fault initiateMemRead(XC *xc, Trace::InstRecord *traceData, Addr addr, diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa index aa60e4c48..bc8edf416 100644 --- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa +++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa @@ -133,7 +133,8 @@ 0x7: decode MODRM_MOD { 0x3: decode MODRM_RM { 0x0: Inst::SWAPGS(); - 0x1: rdtscp(); + 0x1: Inst::RDTSCP(); + //rdtscp(); default: Inst::UD2(); } default: Inst::INVLPG(M); diff --git a/src/arch/x86/isa/insts/system/msrs.py b/src/arch/x86/isa/insts/system/msrs.py index d0e2675de..f269742dd 100644 --- a/src/arch/x86/isa/insts/system/msrs.py +++ b/src/arch/x86/isa/insts/system/msrs.py @@ -65,4 +65,14 @@ def macroop RDTSC srli t1, t1, 32, dataSize=8 mov rdx, rdx, t1, dataSize=4 }; + + +def macroop RDTSCP +{ + .block + rdtsc t1 + mov rax, rax, t1, dataSize=4 + srli t1, t1, 32, dataSize=8 + mov rdx, rdx, t1, dataSize=4 +}; ''' diff --git a/src/arch/x86/isa/macroop.isa b/src/arch/x86/isa/macroop.isa index 3a1a84a7d..aff0b942c 100644 --- a/src/arch/x86/isa/macroop.isa +++ b/src/arch/x86/isa/macroop.isa @@ -146,6 +146,9 @@ let {{ self.adjust_disp += val def serializing(self): self.serializing = True + # define directive [mengjia] + def block(self): + self.block = True def function_call(self): self.function_call = True @@ -159,6 +162,8 @@ let {{ "adjust_imm" : self.adjustImm, "adjust_disp" : self.adjustDisp, "serializing" : self.serializing, + # add directives block [mengjia] + "block" : self.block, "function_call" : self.function_call, "function_return" : self.function_return } @@ -176,6 +181,8 @@ let {{ adjustedDisp = adjustedDisp; ''' self.serializing = False + # initialize as false [mengjia] + self.block = False self.function_call = False self.function_return = False @@ -212,6 +219,10 @@ let {{ if self.serializing: flags.append("IsSerializing") 
flags.append("IsSerializeAfter") + # add new attribute for block [mengjia] + if self.block: + flags.append("IsBlock") + flags.append("IsSerializeBefore") if self.function_call: flags.append("IsCall") diff --git a/src/arch/x86/tlb.cc b/src/arch/x86/tlb.cc index a3aec1676..248f929f9 100644 --- a/src/arch/x86/tlb.cc +++ b/src/arch/x86/tlb.cc @@ -338,6 +338,17 @@ TLB::translate(RequestPtr req, ThreadContext *tc, Translation *translation, wrAccesses++; } if (!entry) { + if(req->isSpec()){ + // [InvisiSpec] do not perform TLB fill for + // speculative load + specMisses++; + DPRINTF(TLB, "Get a TLB miss for a speculative load " + "address %#x at pc %#x.\n", + vaddr, tc->instAddr()); + //FIXME: currently reuse the GeneralProtection fault + //instead of creating new faults + return std::make_shared<GeneralProtection>(0); + } DPRINTF(TLB, "Handling a TLB miss for " "address %#x at pc %#x.\n", vaddr, tc->instAddr()); @@ -470,6 +481,9 @@ TLB::regStats() .name(name() + ".wrMisses") .desc("TLB misses on write requests"); + specMisses + .name(name() + ".spec_tlb_misses") + .desc("TLB misses on speculative memory requests"); } void diff --git a/src/arch/x86/tlb.hh b/src/arch/x86/tlb.hh index 08804a455..9e9c8fa05 100644 --- a/src/arch/x86/tlb.hh +++ b/src/arch/x86/tlb.hh @@ -105,6 +105,7 @@ namespace X86ISA Stats::Scalar wrAccesses; Stats::Scalar rdMisses; Stats::Scalar wrMisses; + Stats::Scalar specMisses; Fault translateInt(RequestPtr req, ThreadContext *tc); diff --git a/src/cpu/StaticInstFlags.py b/src/cpu/StaticInstFlags.py index 55ef456ce..091606327 100644 --- a/src/cpu/StaticInstFlags.py +++ b/src/cpu/StaticInstFlags.py @@ -84,6 +84,7 @@ class StaticInstFlags(Enum): 'IsSerializing', # Serializes pipeline: won't execute until all # older instructions have committed. + 'IsBlock', # Block issuing. 
[mengjia] 'IsSerializeBefore', 'IsSerializeAfter', 'IsMemBarrier', # Is a memory barrier diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index ae408e3fb..e29dbb5e6 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -120,6 +120,28 @@ class BaseDynInst : public ExecContext, public RefCounted /// instructions ahead of it SerializeAfter, /// Needs to serialize instructions behind it SerializeHandled, /// Serialization has been handled + + SpecCompleted, + // [mengjia] indicates whether received specReadResp + ValidationCompleted, + // indicates whether validation finishes + ExposeCompleted, + ExposeSent, + // indicates whether expose finishes + // (should set when expose is sent out) + PrevInstsCompleted, + // indicate whether previous instructions completed + PrevBrsResolved, + // [mengjia] indicate whether previous branches are resolved + PrevInstsCommitted, + // indicate whether previous instructions committed + PrevBrsCommitted, + // [mengjia] indicate whether previous branches are committed + L1HitHigh, + L1HitLow, + SpecBuffObsoleteHigh, + SpecBuffObsoleteLow, + // [InvisiSpec] it hits in L1 and is open to invalidations NumStatus }; @@ -136,6 +158,22 @@ class BaseDynInst : public ExecContext, public RefCounted IsStrictlyOrdered, ReqMade, MemOpDone, + // [mengjia] indicates need validation or expose + NeedPostFetch, + NeedDeletePostReq, + // [mengjia] indicates only need to expose, do not need to validate + NeedExposeOnly, + // [InvisiSpec] indicate the instruction needs to be delayed + // due to virtual fences before to defend against speculative attacks + FenceDelay, + // [InvisiSpec] indicate the load is legal to be visible + ReadyToExpose, + HitInvalidation, + HitExternalEviction, + ValidationFail, + OnlyWaitForFence, + OnlyWaitForExpose, + SpecTLBMiss, MaxFlags }; @@ -222,6 +260,9 @@ class BaseDynInst : public ExecContext, public RefCounted /** Pointer to the data for the memory access. 
*/ uint8_t *memData; + /** Pointer to the data for the validation result. */ + uint8_t *vldData; + /** Load queue index. */ int16_t lqIdx; @@ -238,6 +279,12 @@ class BaseDynInst : public ExecContext, public RefCounted RequestPtr savedSreqLow; RequestPtr savedSreqHigh; + /** [InvisiSpec] + * Saved memory requests (needed for post-fetch validation/expose). + */ + RequestPtr postReq; + RequestPtr postSreqLow; + RequestPtr postSreqHigh; /////////////////////// Checker ////////////////////// // Need a copy of main request pointer to verify on writes. RequestPtr reqToVerify; @@ -275,6 +322,43 @@ class BaseDynInst : public ExecContext, public RefCounted bool memOpDone() const { return instFlags[MemOpDone]; } void memOpDone(bool f) { instFlags[MemOpDone] = f; } + /** [mengjia] Whether or not need pseudo-validation. + * whether speculative load finishes, + * whether validation completes or not (success) */ + bool needExposeOnly() const { return instFlags[NeedExposeOnly]; } + void needExposeOnly(bool f) { instFlags[NeedExposeOnly] = f; } + + bool needPostFetch() const { return instFlags[NeedPostFetch]; } + void needPostFetch(bool f) { instFlags[NeedPostFetch] = f; } + + bool needDeletePostReq() const { return instFlags[NeedDeletePostReq]; } + void needDeletePostReq(bool f) { instFlags[NeedDeletePostReq] = f; } + + bool fenceDelay() const { return instFlags[FenceDelay]; } + void fenceDelay(bool f) { instFlags[FenceDelay] = f; } + + bool readyToExpose() const { return instFlags[ReadyToExpose]; } + void readyToExpose(bool f) { instFlags[ReadyToExpose] = f; } + + bool hitInvalidation() const { return instFlags[HitInvalidation]; } + void hitInvalidation(bool f) { instFlags[HitInvalidation] = f; } + + bool hitExternalEviction() const { return instFlags[HitExternalEviction]; } + void hitExternalEviction(bool f) { instFlags[HitExternalEviction] = f; } + + bool validationFail() const { return instFlags[ValidationFail]; } + void validationFail(bool f) { instFlags[ValidationFail] = f; 
} + + bool onlyWaitForFence() const { return instFlags[OnlyWaitForFence]; } + void onlyWaitForFence(bool f) { instFlags[OnlyWaitForFence] = f; } + + bool onlyWaitForExpose() const { return instFlags[OnlyWaitForExpose]; } + void onlyWaitForExpose(bool f) { instFlags[OnlyWaitForExpose] = f; } + + bool specTLBMiss() const { return instFlags[SpecTLBMiss]; } + void specTLBMiss(bool f) { instFlags[SpecTLBMiss] = f; } + /*[mengjia] added new flags and corresponding functions*/ + bool notAnInst() const { return instFlags[NotAnInst]; } void setNotAnInst() { instFlags[NotAnInst] = true; } @@ -522,6 +606,10 @@ class BaseDynInst : public ExecContext, public RefCounted bool isCondDelaySlot() const { return staticInst->isCondDelaySlot(); } bool isThreadSync() const { return staticInst->isThreadSync(); } bool isSerializing() const { return staticInst->isSerializing(); } + + // add block attribute for dynamic instruction type [mengjia] + bool isBlock() const { return staticInst->isBlock(); } + bool isSerializeBefore() const { return staticInst->isSerializeBefore() || status[SerializeBefore]; } bool isSerializeAfter() const @@ -703,6 +791,49 @@ class BaseDynInst : public ExecContext, public RefCounted /** Returns whether or not this instruction is completed. 
*/ bool isCompleted() const { return status[Completed]; } + /* [mengjia] new status for load operations */ + //void setSpecSent() { status.set(SpecSent); } + //bool isSpecSent() const { return status[SpecSent]; } + + void setSpecCompleted() { status.set(SpecCompleted); } + bool isSpecCompleted() const { return status[SpecCompleted]; } + + void setValidationCompleted() { status.set(ValidationCompleted); } + bool isValidationCompleted() const { return status[ValidationCompleted]; } + + void setExposeCompleted() { status.set(ExposeCompleted); } + bool isExposeCompleted() const { return status[ExposeCompleted]; } + + void setExposeSent() { status.set(ExposeSent); } + bool isExposeSent() const { return status[ExposeSent]; } + + void setL1HitHigh() { status.set(L1HitHigh); } + void clearL1HitHigh() { status.reset(L1HitHigh); } + bool isL1HitHigh() const { return status[L1HitHigh]; } + + void setL1HitLow() { status.set(L1HitLow); } + void clearL1HitLow() { status.reset(L1HitLow); } + bool isL1HitLow() const { return status[L1HitLow]; } + + void setPrevInstsCompleted() { status.set(PrevInstsCompleted); } + bool isPrevInstsCompleted() const { return status[PrevInstsCompleted]; } + + void setSpecBuffObsoleteHigh() { status.set(SpecBuffObsoleteHigh); } + bool isSpecBuffObsoleteHigh() const { return status[SpecBuffObsoleteHigh]; } + + void setSpecBuffObsoleteLow() { status.set(SpecBuffObsoleteLow); } + bool isSpecBuffObsoleteLow() const { return status[SpecBuffObsoleteLow]; } + + void setPrevBrsResolved() { status.set(PrevBrsResolved); } + bool isPrevBrsResolved() const { return status[PrevBrsResolved]; } + + void setPrevInstsCommitted() { status.set(PrevInstsCommitted); } + bool isPrevInstsCommitted() const { return status[PrevInstsCommitted]; } + + void setPrevBrsCommitted() { status.set(PrevBrsCommitted); } + bool isPrevBrsCommitted() const { return status[PrevBrsCommitted]; } + /* Configure load related status */ + /** Marks the result as ready. 
*/ void setResultReady() { status.set(ResultReady); } @@ -892,7 +1023,25 @@ Fault BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size, Request::Flags flags) { + // [InvisiSpec] do not start translation if + // there is a virtual fence ahead + assert(!fenceDelay()); + + if ( (flags.isSet(Request::ATOMIC_RETURN_OP) + || flags.isSet(Request::ATOMIC_NO_RETURN_OP) + || flags.isSet(Request::UNCACHEABLE) + || flags.isSet(Request::LLSC) + || flags.isSet(Request::STRICT_ORDER)) + && !readyToExpose()){ + onlyWaitForExpose(true); + // FIXME: reschedule due to LLSC + // reuse TLBMiss for now + specTLBMiss(true); + return NoFault; + } + instFlags[ReqMade] = true; + instFlags[SpecTLBMiss] = false; Request *req = NULL; Request *sreqLow = NULL; Request *sreqHigh = NULL; @@ -906,16 +1055,36 @@ BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size, thread->contextId()); req->taskId(cpu->taskId()); - + if(!readyToExpose()){ + req->setFlags(Request::SPEC); + } // Only split the request if the ISA supports unaligned accesses. if (TheISA::HasUnalignedMemAcc) { splitRequest(req, sreqLow, sreqHigh); } + initiateTranslation(req, sreqLow, sreqHigh, NULL, BaseTLB::Read); + } if (translationCompleted()) { + // [InvisiSpec] to fix the memory leakage problem + // in the case the read is squashed and the request + // is never sent out due to a virtual fence ahead if (fault == NoFault) { + /* + if (fenceDelay()){ + translationStarted(false); + translationCompleted(false); + onlyWaitForFence(true); + delete req; + if (sreqLow){ + delete sreqLow; + delete sreqHigh; + } + return NoFault; + } + */ effAddr = req->getVaddr(); effSize = size; instFlags[EffAddrValid] = true; @@ -926,10 +1095,30 @@ BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size, } reqToVerify = new Request(*req); } + + // issue load request [mengjia] fault = cpu->read(req, sreqLow, sreqHigh, lqIdx); } else { // Commit will have to clean up whatever happened. Set this // instruction as executed. 
+ + // [InvisiSpec] If it is a fault on translating a spec load + // Deffer it and retry when it is ready to expose + if (!readyToExpose()){ + translationStarted(false); + translationCompleted(false); + onlyWaitForExpose(true); + specTLBMiss(true); + //delete req; + //if (sreqLow){ + // delete sreqLow; + // delete sreqHigh; + //} + return NoFault; + } + // set it as executed and fault flag. + // when it enters ROB and try to commit, + // the commit stage will squash this inst [mengjia] this->setExecuted(); } } @@ -940,6 +1129,7 @@ BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size, return fault; } + template<class Impl> Fault BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr, diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh index f55bd8ed5..f5bcff2df 100644 --- a/src/cpu/base_dyn_inst_impl.hh +++ b/src/cpu/base_dyn_inst_impl.hh @@ -87,6 +87,7 @@ void BaseDynInst<Impl>::initVars() { memData = NULL; + vldData = NULL; effAddr = 0; physEffAddrLow = 0; physEffAddrHigh = 0; @@ -132,6 +133,9 @@ BaseDynInst<Impl>::initVars() #endif reqToVerify = NULL; + postReq = NULL; + postSreqLow = NULL; + postSreqHigh = NULL; } template <class Impl> @@ -141,6 +145,10 @@ BaseDynInst<Impl>::~BaseDynInst() delete [] memData; } + if (vldData) { + delete [] vldData; + } + if (traceData) { delete traceData; } @@ -160,6 +168,19 @@ BaseDynInst<Impl>::~BaseDynInst() if (reqToVerify) delete reqToVerify; + + if (needDeletePostReq()){ + if (postReq){ + delete postReq; + postReq = NULL; + } + if (postSreqLow) { + delete postSreqLow; + delete postSreqHigh; + postSreqLow = NULL; + postSreqHigh = NULL; + } + } } #ifdef DEBUG diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py index b8152f663..371433eef 100644 --- a/src/cpu/o3/O3CPU.py +++ b/src/cpu/o3/O3CPU.py @@ -68,6 +68,10 @@ class DerivO3CPU(BaseCPU): cacheStorePorts = Param.Unsigned(200, "Cache Ports. " "Constrains stores only. 
Loads are constrained by load FUs.") + # we deal with validation very similar as store writes back + # FIXME: not sure whether it is the correct parameter or not + cacheValidationPorts = Param.Unsigned(200, "Validation Ports. " + "Constrains validations only. Loads are constrained by load FUs.") decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay") renameToFetchDelay = Param.Cycles(1 ,"Rename to fetch delay") @@ -124,7 +128,7 @@ class DerivO3CPU(BaseCPU): LFSTSize = Param.Unsigned(1024, "Last fetched store table size") SSITSize = Param.Unsigned(1024, "Store set ID table size") - numRobs = Param.Unsigned(1, "Number of Reorder Buffers"); + numRobs = Param.Unsigned(1, "Number of Reorder Buffers") numPhysIntRegs = Param.Unsigned(256, "Number of physical integer registers") numPhysFloatRegs = Param.Unsigned(256, "Number of physical floating point " @@ -157,10 +161,14 @@ class DerivO3CPU(BaseCPU): smtCommitPolicy = Param.String('RoundRobin', "SMT Commit Policy") branchPred = Param.BranchPredictor(TournamentBP(numThreads = - Parent.numThreads), + Parent.numThreads), "Branch Predictor") - needsTSO = Param.Bool(buildEnv['TARGET_ISA'] == 'x86', - "Enable TSO Memory model") + + # [mengjia] add configuration variables + simulateScheme = Param.String('UnsafeBaseline', + "The scheme specificed for simulation") + needsTSO = Param.Bool(False, "Enable TSO Memory model") + allowSpecBuffHit = Param.Bool(True, "Enable hit/reuse spec buffer entries") def addCheckerCpu(self): if buildEnv['TARGET_ISA'] in ['arm']: diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index f508a372e..7fe4ad731 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -450,6 +450,7 @@ class DefaultCommit /** The sequence number of the last commited instruction. */ InstSeqNum lastCommitedSeqNum[Impl::MaxThreads]; + Tick lastCommitTick; /** Records if there is a trap currently in flight. 
*/ bool trapInFlight[Impl::MaxThreads]; @@ -479,6 +480,9 @@ class DefaultCommit /** Updates commit stats based on this instruction. */ void updateComInstStats(DynInstPtr &inst); + /** [InvisiSpec] Updates squash stats based on this instruction. */ + void updateSquashStats(DynInstPtr &inst); + /** Stat for the total number of squashed instructions discarded by commit. */ Stats::Scalar commitSquashedInsts; @@ -488,6 +492,19 @@ class DefaultCommit Stats::Scalar commitNonSpecStalls; /** Stat for the total number of branch mispredicts that caused a squash. */ Stats::Scalar branchMispredicts; + + // [InvisiSpec] count #squash + /** Stat for the total number of invalidation packets + * that caused a squash. */ + Stats::Scalar loadHitInvalidations; + Stats::Scalar loadHitExternalEvictions; + /** Stat for the total number of failed validations + * that caused a squash. */ + Stats::Scalar loadValidationFails; + // [InvisiSpec] count cycles stall due to waiting for + // validation responses + Stats::Scalar validationStalls; + /** Distribution of the number of committed instructions each cycle. 
*/ Stats::Distribution numCommittedDist; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index d32493cbc..d83011a66 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -143,6 +143,7 @@ DefaultCommit<Impl>::DefaultCommit(O3CPU *_cpu, DerivO3CPUParams *params) lastCommitedSeqNum[tid] = 0; squashAfterInst[tid] = NULL; } + lastCommitTick = curTick(); interrupt = NoFault; } @@ -183,6 +184,28 @@ DefaultCommit<Impl>::regStats() .desc("The number of times a branch was mispredicted") .prereq(branchMispredicts); + // [InvisiSpec] stat for squash due to invalidation, failed validation + loadHitInvalidations + .name(name() + ".loadHitInvalidations") + .desc("The number of times a load hits a invalidation"); + //.prereq(loadHitInvalidations); + + loadHitExternalEvictions + .name(name() + ".loadHitExternalEvictions") + .desc("The number of times a load hits an external invalidation"); + //.prereq(loadHitInvalidations); + + loadValidationFails + .name(name() + ".loadValidationFails") + .desc("The number of times a load fails validation"); + //.prereq(loadValidationFails); + + validationStalls + .name(name() + ".validationStalls") + .desc("The number of ticks the commit is stalled due to waiting " + "for validation responses"); + //.prereq(loadValidationFails); + numCommittedDist .init(0,commitWidth,1) .name(name() + ".committed_per_cycle") @@ -579,6 +602,9 @@ DefaultCommit<Impl>::squashAll(ThreadID tid) toIEW->commitInfo[tid].squashInst = NULL; toIEW->commitInfo[tid].pc = pc[tid]; + + //TODO: send a packet to SpecBuffer to indicate flush + // } template <class Impl> @@ -705,13 +731,21 @@ DefaultCommit<Impl>::tick() } else if (!rob->isEmpty(tid)) { DynInstPtr inst = rob->readHeadInst(tid); + if (inst->isExecuted() && inst->needPostFetch() + && !inst->isExposeCompleted()){ + //stall due to waiting for validation response + if (curTick()-lastCommitTick > 0){ + validationStalls+= curTick()-lastCommitTick; + } + + } ppCommitStall->notify(inst); 
DPRINTF(Commit,"[tid:%i]: Can't commit, Instruction [sn:%lli] PC " "%s is head of ROB and not ready\n", tid, inst->seqNum, inst->pcState()); } - + lastCommitTick = curTick(); DPRINTF(Commit, "[tid:%i]: ROB has %d insts & %d free entries.\n", tid, rob->countInsts(tid), rob->numFreeEntries(tid)); } @@ -832,6 +866,7 @@ DefaultCommit<Impl>::commit() squashFromTrap(tid); } else if (tcSquash[tid]) { assert(commitStatus[tid] != TrapPending); + //TC: thread context. [mengjia] squashFromTC(tid); } else if (commitStatus[tid] == SquashAfterPending) { // A squash from the previous cycle of the commit stage (i.e., @@ -1039,6 +1074,7 @@ DefaultCommit<Impl>::commitInsts() toIEW->commitInfo[tid].doneSeqNum = head_inst->seqNum; if (tid == 0) { + //maybe we can use this to mask interrupts [mengjia] canHandleInterrupts = (!head_inst->isDelayedCommit()) && ((THE_ISA != ALPHA_ISA) || (!(pc[0].instAddr() & 0x3))); @@ -1219,6 +1255,8 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // execution doesn't generate extra squashes. thread[tid]->noSquashFromTC = true; + // [InvisiSpec] update squash stat for invalidation or validation fails + updateSquashStats(head_inst); // Execute the trap. Although it's slightly unrealistic in // terms of timing (as it doesn't wait for the full timing of // the trap event to complete before updating state), it's @@ -1350,6 +1388,7 @@ DefaultCommit<Impl>::markCompletedInsts() // Grab completed insts out of the IEW instruction queue, and mark // instructions completed within the ROB. 
for (int inst_num = 0; inst_num < fromIEW->size; ++inst_num) { + DPRINTF(Commit, "get the inst [num:%d]\n", inst_num); assert(fromIEW->insts[inst_num]); if (!fromIEW->insts[inst_num]->isSquashed()) { DPRINTF(Commit, "[tid:%i]: Marking PC %s, [sn:%lli] ready " @@ -1362,6 +1401,27 @@ DefaultCommit<Impl>::markCompletedInsts() fromIEW->insts[inst_num]->setCanCommit(); } } + + // [InvisiSpec] + // update load status + // isPrevInstsCompleted; isPrevBrsResolved + rob->updateVisibleState(); +} + +// [InvisiSpec] update squash stat for loads +template <class Impl> +void +DefaultCommit<Impl>::updateSquashStats(DynInstPtr &inst) +{ + if (inst->hitInvalidation()){ + loadHitInvalidations++; + } + if (inst->validationFail()){ + loadValidationFails++; + } + if (inst->hitExternalEviction()){ + loadHitExternalEvictions++; + } } template <class Impl> diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index c4bc13fb4..27ad78e2e 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -593,6 +593,7 @@ FullO3CPU<Impl>::tick() activityRec.advance(); + DPRINTF(O3CPU, "activityRec.advance() complete\n"); if (removeInstsThisCycle) { cleanUpRemovedInsts(); } @@ -610,6 +611,8 @@ FullO3CPU<Impl>::tick() schedule(tickEvent, clockEdge(Cycles(1))); DPRINTF(O3CPU, "Scheduling next tick!\n"); } + } else { + DPRINTF(O3CPU, "tickEvent.scheduled == false, %lu", curTick()); } if (!FullSystem) diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 8270a71b5..063394fdd 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -1186,9 +1186,16 @@ DefaultIEW<Impl>::executeInsts() fetchRedirect[tid] = false; } + // [mengjia] Validate/Expose any loads which are ready last cycle + // very tricky, need make the state consistent + // if we successfully commit sth, then we need to activate the stage or somehow + // problems happen when interacting with squash + // NOTE: we always send validations before execute load requests + ldstQueue.exposeLoads(); + // Uncomment this if you want to 
see all available instructions. // @todo This doesn't actually work anymore, we should fix it. -// printAvailableInsts(); + // printAvailableInsts(); // Execute/writeback any instructions that are available. int insts_to_execute = fromIssue->size; @@ -1235,18 +1242,40 @@ DefaultIEW<Impl>::executeInsts() DPRINTF(IEW, "Execute: Calculating address for memory " "reference.\n"); + DPRINTF(IEW, "Execute: %s\n", inst->staticInst->getName()); // Tell the LDSTQ to execute this instruction (if it is a load). if (inst->isLoad()) { // Loads will mark themselves as executed, and their writeback // event adds the instruction to the queue to commit + + // [InvisiSpec] a lifetime of a load + // always let it translate --> translation not complete, defer + // if !loadInExec, need to check whether there + // is a virtual fence ahead + // --> if existing virtual fence, defer + if (inst->fenceDelay()){ + DPRINTF(IEW, "Deferring load due to virtual fence.\n"); + inst->onlyWaitForFence(true); + instQueue.deferMemInst(inst); + continue; + } + fault = ldstQueue.executeLoad(inst); - if (inst->isTranslationDelayed() && + // [InvisiSpec] delay the load if there is a virtual fence ahead + if ((inst->isTranslationDelayed() ) && fault == NoFault) { // A hw page table walk is currently going on; the // instruction must be deferred. - DPRINTF(IEW, "Execute: Delayed translation, deferring " - "load.\n"); + DPRINTF(IEW, "Execute: Delayed translation, deferring load.\n"); + instQueue.deferMemInst(inst); + continue; + } + + if ((inst->specTLBMiss() ) && + fault == NoFault) { + DPRINTF(IEW, "Execute: Speculative load gets a TLB miss," + " deferring load.\n"); instQueue.deferMemInst(inst); continue; } @@ -1381,10 +1410,11 @@ DefaultIEW<Impl>::executeInsts() ++memOrderViolationEvents; } } - } + } // Update and record activity if we processed any instructions. 
if (inst_num) { + if (exeStatus == Idle) { exeStatus = Running; } @@ -1476,16 +1506,18 @@ DefaultIEW<Impl>::tick() dispatch(tid); } + ldstQueue.updateVisibleState(); + if (exeStatus != Squashing) { executeInsts(); - + writebackInsts(); // Have the instruction queue try to schedule any ready instructions. // (In actuality, this scheduling is for instructions that will // be executed next cycle.) instQueue.scheduleReadyInsts(); - + // Also should advance its own time buffers if the stage ran. // Not the best place for it, but this works (hopefully). issueToExecQueue.advance(); @@ -1502,6 +1534,7 @@ DefaultIEW<Impl>::tick() // Writeback any stores using any leftover bandwidth. ldstQueue.writebackStores(); + // Check the committed load/store signals to see if there's a load // or store to commit. Also check if it's being told to execute a diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index f70f66274..86dfd9214 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -1172,8 +1172,19 @@ InstructionQueue<Impl>::getDeferredMemInstToExecute() { for (ListIt it = deferredMemInsts.begin(); it != deferredMemInsts.end(); ++it) { - if ((*it)->translationCompleted() || (*it)->isSquashed()) { + // [InvisiSpec] we need to check the FenceDelay + // a load can be delayed due to + // 1. translation delay + // 2. virtual fence ahead + // 3. 
not ready to expose and gets a TLB miss + // for both (2, 3) we need to restart the translation + if ( (*it)->translationCompleted() + || ((*it)->onlyWaitForFence() && !(*it)->fenceDelay()) + || ((*it)->onlyWaitForExpose() && (*it)->readyToExpose()) + || (*it)->isSquashed()) { DynInstPtr mem_inst = *it; + mem_inst->onlyWaitForFence(false); + mem_inst->onlyWaitForExpose(false); deferredMemInsts.erase(it); return mem_inst; } diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 6bc9b3d73..5a028e0c2 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -136,6 +136,23 @@ class LSQ { /** Same as above, but only for one thread. */ void writebackStores(ThreadID tid); + + /** [mengjia] + * Attempts to validate loads until all cache ports are used or the + * interface becomes blocked. + */ + int exposeLoads(); + /** Same as above, but only for one thread. */ + int exposeLoads(ThreadID tid); + + /** [mengjia] + * attempt to update FenceDelay state for load insts + */ + void updateVisibleState(); + /** Same as above, but only for one thread. */ + void updateVisibleState(ThreadID tid); + + /** * Squash instructions from a thread until the specified sequence number. */ @@ -257,6 +274,10 @@ class LSQ { int numStoresToWB(ThreadID tid) { return thread[tid].numStoresToWB(); } + /** Returns the number of loads a specific thread has to validate. */ + int numLoadsToVLD(ThreadID tid) + { return thread[tid].numLoadsToVLD(); } + /** Returns if the LSQ will write back to memory this cycle. 
*/ bool willWB(); /** Returns if the LSQ of a specific thread will write back to memory this diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 9080907fe..d227f36ad 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -310,6 +310,44 @@ LSQ<Impl>::writebackStores() } } +// [mengjia] +template<class Impl> +int +LSQ<Impl>::exposeLoads() +{ + list<ThreadID>::iterator threads = activeThreads->begin(); + list<ThreadID>::iterator end = activeThreads->end(); + + int exposedLoads = 0; + while (threads != end) { + ThreadID tid = *threads++; + + if (numLoadsToVLD(tid) > 0) { + DPRINTF(Writeback,"[tid:%i] Validate loads. %i loads " + "available for Validate.\n", tid, numLoadsToVLD(tid)); + } + + exposedLoads += thread[tid].exposeLoads(); + } + return exposedLoads; +} + + +// [mengjia] +template<class Impl> +void +LSQ<Impl>::updateVisibleState() +{ + list<ThreadID>::iterator threads = activeThreads->begin(); + list<ThreadID>::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + thread[tid].updateVisibleState(); + } +} + template<class Impl> bool LSQ<Impl>::violation() @@ -339,6 +377,7 @@ LSQ<Impl>::recvReqRetry() } } +// [InvisiSpec] Callback function for receiving a response template <class Impl> bool LSQ<Impl>::recvTimingResp(PacketPtr pkt) @@ -347,6 +386,17 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt) DPRINTF(LSQ, "Got error packet back for address: %#X\n", pkt->getAddr()); + // for expose or validate request, + // if the instruction is squashed, maybe the req has been deleted + if (pkt->isValidate() || pkt->isExpose()){ + if (!pkt->req){ + delete pkt; + return true; + } + DPRINTF(LSQ, "Receive an expose/validate response, idx=%d\n", + pkt->reqIdx); + } + thread[cpu->contextToThread(pkt->req->contextId())] .completeDataAccess(pkt); @@ -370,7 +420,10 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt) } } - delete pkt->req; + //TODO: also not validation + if (!pkt->isExpose() && !pkt->isValidate()){ + delete 
pkt->req; + } delete pkt; return true; } diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index a2813b3dc..115cd035d 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -136,6 +136,12 @@ class LSQUnit { */ void checkSnoop(PacketPtr pkt); + // [InvisiSpec] check whether current request will hit in the + // spec buffer or not + int checkSpecBuffHit(const RequestPtr req, const int req_idx); + void setSpecBuffState(const RequestPtr req); + + bool checkPrevLoadsExecuted(const int req_idx); /** Executes a load instruction. */ Fault executeLoad(DynInstPtr &inst); @@ -154,6 +160,15 @@ class LSQUnit { /** Writes back stores. */ void writebackStores(); + /** [mengjia] Validate loads. */ + int exposeLoads(); + + /** [mengjia] Update Visbible State. + * In the mode defence relying on fence: setup fenceDelay state. + * In the mode defence relying on invisibleSpec: + * setup readyToExpose*/ + void updateVisibleState(); + /** Completes the data access that has been returned from the * memory system. */ void completeDataAccess(PacketPtr pkt); @@ -219,6 +234,8 @@ class LSQUnit { /** Returns the number of stores to writeback. */ int numStoresToWB() { return storesToWB; } + /** [InvisiSpec] Returns the number of loads to validate. */ + int numLoadsToVLD() { return loadsToVLD; } /** Returns if the LSQ unit will writeback on this cycle. */ bool willWB() { return storeQueue[storeWBIdx].canWB && @@ -235,18 +252,30 @@ class LSQUnit { /** Writes back the instruction, sending it to IEW. */ void writeback(DynInstPtr &inst, PacketPtr pkt); + // [InvisiSpec] complete Validates + void completeValidate(DynInstPtr &inst, PacketPtr pkt); + /** Writes back a store that couldn't be completed the previous cycle. */ void writebackPendingStore(); + /** Validates a load that couldn't be completed the previous cycle. */ + void validatePendingLoad(); + /** Handles completing the send of a store to memory. 
*/ void storePostSend(PacketPtr pkt); + /** Handles completing the send of a validation to memory. */ + //void validationPostSend(PacketPtr pkt, int loadVLDIdx); + /** Completes the store at the specified index. */ void completeStore(int store_idx); /** Attempts to send a store to the cache. */ bool sendStore(PacketPtr data_pkt); + /** Attempts to send a validation to the cache. */ + //bool sendValidation(PacketPtr data_pkt, int loadVLDIdx); + /** Increments the given store index (circular queue). */ inline void incrStIdx(int &store_idx) const; /** Decrements the given store index (circular queue). */ @@ -409,6 +438,8 @@ class LSQUnit { /** The number of load instructions in the LQ. */ int loads; + /** [mengjia] The number of store instructions in the SQ waiting to writeback. */ + int loadsToVLD; /** The number of store instructions in the SQ. */ int stores; /** The number of store instructions in the SQ waiting to writeback. */ @@ -416,6 +447,10 @@ class LSQUnit { /** The index of the head instruction in the LQ. */ int loadHead; + /** [mengjia] The index of the first instruction that may be ready to be + * validated, and has not yet been validated. + */ + //int pendingLoadVLDIdx; /** The index of the tail instruction in the LQ. */ int loadTail; @@ -432,7 +467,7 @@ class LSQUnit { /** The number of cache ports available each cycle (stores only). */ int cacheStorePorts; - /** The number of used cache ports in this cycle by stores. */ + /** [InvisiSpec] The number of used cache ports in this cycle by stores. */ int usedStorePorts; //list<InstSeqNum> mshrSeqNums; @@ -458,6 +493,9 @@ class LSQUnit { /** Whehter or not a store is blocked due to the memory system. */ bool isStoreBlocked; + /** Whehter or not a validation is blocked due to the memory system. */ + bool isValidationBlocked; + /** Whether or not a store is in flight. */ bool storeInFlight; @@ -471,9 +509,21 @@ class LSQUnit { /** The packet that is pending free cache ports. 
*/ PacketPtr pendingPkt; + /* [mengjia] define scheme variables */ + // Flag for whether issue packets in execution stage + bool loadInExec; + + // Flag for whether to use invisible speculative load + bool isInvisibleSpec; + /** Flag for memory model. */ bool needsTSO; + // Flag for whether defending against spectre attack or future attacks + bool isFuturistic; + bool allowSpecBuffHit; + /* [mengjia] different schemes determine values of 4 variables. */ + // Will also need how many read/write ports the Dcache has. Or keep track // of that in stage that is one level up, and only call executeLoad/Store // the appropriate number of times. @@ -508,6 +558,12 @@ class LSQUnit { /** Number of times the LSQ is blocked due to the cache. */ Stats::Scalar lsqCacheBlocked; + Stats::Scalar specBuffHits; + Stats::Scalar specBuffMisses; + Stats::Scalar numValidates; + Stats::Scalar numExposes; + Stats::Scalar numConvertedExposes; + public: /** Executes the load at the given index. */ Fault read(Request *req, Request *sreqLow, Request *sreqHigh, @@ -547,6 +603,8 @@ class LSQUnit { bool isStalled() { return stalled; } }; + +// IMPORTANT: the function to issue packets, interact with memory [mengjia] template <class Impl> Fault LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, @@ -583,6 +641,7 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, } // Check the SQ for any previous stores that might lead to forwarding + // why we have store queue index for a load operation? [mengjia] int store_idx = load_inst->sqIdx; int store_size = 0; @@ -592,6 +651,7 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, load_idx, store_idx, storeHead, req->getPaddr(), sreqLow ? " split" : ""); + // LLSC: load-link/store-conditional [mengjia] if (req->isLLSC()) { assert(!sreqLow); // Disable recording the result temporarily. 
Writing to misc @@ -602,12 +662,14 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, load_inst->recordResult(true); } + // request to memory mapped register [mengjia] if (req->isMmappedIpr()) { assert(!load_inst->memData); load_inst->memData = new uint8_t[64]; ThreadContext *thread = cpu->tcBase(lsqID); Cycles delay(0); + PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq); data_pkt->dataStatic(load_inst->memData); @@ -772,6 +834,7 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, DPRINTF(LSQUnit, "Doing memory access for inst [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); + // Allocate memory if this is the first time a load is issued. if (!load_inst->memData) { load_inst->memData = new uint8_t[req->getSize()]; @@ -779,10 +842,35 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, // if we the cache is not blocked, do cache access bool completedFirst = false; - PacketPtr data_pkt = Packet::createRead(req); + + PacketPtr data_pkt = NULL; PacketPtr fst_data_pkt = NULL; PacketPtr snd_data_pkt = NULL; + // According to the isInsivisibleSpec variable to create + // corresponding type of packets [mengjia] + bool sendSpecRead = false; + if(isInvisibleSpec){ + if(!load_inst->readyToExpose()){ + assert(!req->isLLSC()); + assert(!req->isStrictlyOrdered()); + assert(!req->isMmappedIpr()); + sendSpecRead = true; + DPRINTF(LSQUnit, "send a spec read for inst [sn:%lli]\n", + load_inst->seqNum); + } + + } + + assert( !(sendSpecRead && load_inst->isSpecCompleted()) && + "Sending specRead twice for the same load insts"); + + if(sendSpecRead){ + data_pkt = Packet::createReadSpec(req); + }else{ + data_pkt = Packet::createRead(req); + } + data_pkt->dataStatic(load_inst->memData); LSQSenderState *state = new LSQSenderState; @@ -794,17 +882,64 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, if (!TheISA::HasUnalignedMemAcc || !sreqLow) { // Point the first packet at the main 
data packet. fst_data_pkt = data_pkt; + + fst_data_pkt->setFirst(); + if (sendSpecRead){ + int src_idx = checkSpecBuffHit(req, load_idx); + if (src_idx != -1) { + if (allowSpecBuffHit){ + data_pkt->setOnlyAccessSpecBuff(); + } + data_pkt->srcIdx = src_idx; + specBuffHits++; + }else{ + specBuffMisses++; + } + } + fst_data_pkt->reqIdx = load_idx; } else { // Create the split packets. - fst_data_pkt = Packet::createRead(sreqLow); - snd_data_pkt = Packet::createRead(sreqHigh); + if(sendSpecRead){ + + fst_data_pkt = Packet::createReadSpec(sreqLow); + int fst_src_idx = checkSpecBuffHit(sreqLow, load_idx); + if ( fst_src_idx != -1 ) { + if (allowSpecBuffHit){ + fst_data_pkt->setOnlyAccessSpecBuff(); + } + fst_data_pkt->srcIdx = fst_src_idx; + specBuffHits++; + } else { + specBuffMisses++; + } + + snd_data_pkt = Packet::createReadSpec(sreqHigh); + int snd_src_idx = checkSpecBuffHit(sreqHigh, load_idx); + if ( snd_src_idx != -1 ) { + if (allowSpecBuffHit){ + snd_data_pkt->setOnlyAccessSpecBuff(); + } + snd_data_pkt->srcIdx = snd_src_idx; + specBuffHits++; + } else { + specBuffMisses++; + } + }else{ + fst_data_pkt = Packet::createRead(sreqLow); + snd_data_pkt = Packet::createRead(sreqHigh); + } + fst_data_pkt->setFirst(); fst_data_pkt->dataStatic(load_inst->memData); snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize()); fst_data_pkt->senderState = state; snd_data_pkt->senderState = state; + fst_data_pkt->reqIdx = load_idx; + snd_data_pkt->reqIdx = load_idx; + fst_data_pkt->isSplit = true; + snd_data_pkt->isSplit = true; state->isSplit = true; state->outstanding = 2; state->mainPkt = data_pkt; @@ -816,6 +951,8 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, // @todo We should account for cache port contention // and arbitrate between loads and stores. 
bool successful_load = true; + // MARK: here is the place memory request of read is sent [mengjia] + // [InvisiSpec] Sending out a memory request if (!dcachePort->sendTimingReq(fst_data_pkt)) { successful_load = false; } else if (TheISA::HasUnalignedMemAcc && sreqLow) { @@ -878,6 +1015,62 @@ LSQUnit<Impl>::read(Request *req, Request *sreqLow, Request *sreqHigh, return NoFault; } + DPRINTF(LSQUnit, "successfully sent out packet(s) for inst [sn:%lli]\n", + load_inst->seqNum); + // Set everything ready for expose/validation after the read is + // successfully sent out + if(sendSpecRead){ // sending actual request + + // [mengjia] Here we set the needExposeOnly flag + if (needsTSO && !load_inst->isDataPrefetch()){ + // need to check whether previous load_instructions specComplete or not + if ( checkPrevLoadsExecuted(load_idx) ){ + load_inst->needExposeOnly(true); + DPRINTF(LSQUnit, "Set load PC %s, [sn:%lli] as " + "needExposeOnly\n", + load_inst->pcState(), load_inst->seqNum); + } else { + DPRINTF(LSQUnit, "Set load PC %s, [sn:%lli] as " + "needValidation\n", + load_inst->pcState(), load_inst->seqNum); + } + }else{ + //if RC, always only need expose + load_inst->needExposeOnly(true); + DPRINTF(LSQUnit, "Set load PC %s, [sn:%lli] as needExposeOnly\n", + load_inst->pcState(), load_inst->seqNum); + } + + load_inst->needPostFetch(true); + assert(!req->isMmappedIpr()); + //save expose requestPtr + if (TheISA::HasUnalignedMemAcc && sreqLow) { + load_inst->postSreqLow = new Request(*sreqLow); + load_inst->postSreqHigh = new Request(*sreqHigh); + load_inst->postReq = NULL; + }else{ + load_inst->postReq = new Request(*req); + load_inst->postSreqLow = NULL; + load_inst->postSreqHigh = NULL; + } + load_inst->needDeletePostReq(true); + DPRINTF(LSQUnit, "created validation/expose" + " request for inst [sn:%lli]" + "req=%#x, reqLow=%#x, reqHigh=%#x\n", + load_inst->seqNum, (Addr)(load_inst->postReq), + (Addr)(load_inst->postSreqLow), + (Addr)(load_inst->postSreqHigh)); + } else { + 
load_inst->setExposeCompleted(); + load_inst->needPostFetch(false); + if (TheISA::HasUnalignedMemAcc && sreqLow) { + setSpecBuffState(sreqLow); + setSpecBuffState(sreqHigh); + } else { + setSpecBuffState(req); + } + } + return NoFault; } diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index ca6a7f399..46382cb0b 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -79,7 +79,9 @@ LSQUnit<Impl>::WritebackEvent::process() if (pkt->senderState) delete pkt->senderState; - delete pkt->req; + if (!pkt->isValidate() && !pkt->isExpose()){ + delete pkt->req; + } delete pkt; } @@ -90,6 +92,9 @@ LSQUnit<Impl>::WritebackEvent::description() const return "Store writeback"; } + +// [InvisiSpec] This function deals with +// acknowledge response to memory read/write template<class Impl> void LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) @@ -108,8 +113,25 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) return; } + // need to update hit info for corresponding instruction + if (pkt->isL1Hit() && pkt->isSpec() && pkt->isRead()){ + if (state->isSplit && ! pkt->isFirst()){ + inst->setL1HitHigh(); + } else { + inst->setL1HitLow(); + } + } else if (!pkt->isSpec()) { + setSpecBuffState(pkt->req); + } + // If this is a split access, wait until all packets are received. 
if (TheISA::HasUnalignedMemAcc && !state->complete()) { + // Not the good place, but we need to fix the memory leakage + if (pkt->isExpose() || pkt->isValidate()){ + assert(!inst->needDeletePostReq()); + assert(!pkt->isInvalidate()); + delete pkt->req; + } return; } @@ -118,7 +140,9 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) if (!state->noWB) { // Only loads and store conditionals perform the writeback // after receving the response from the memory + // [mengjia] validation also needs writeback, expose do not need assert(inst->isLoad() || inst->isStoreConditional()); + if (!TheISA::HasUnalignedMemAcc || !state->isSplit || !state->isLoad) { writeback(inst, pkt); @@ -130,6 +154,10 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) if (inst->isStore()) { completeStore(state->idx); } + + if (pkt->isValidate() || pkt->isExpose()) { + completeValidate(inst, pkt); + } } if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) { @@ -138,15 +166,22 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) } pkt->req->setAccessLatency(); + // probe point, not sure about the mechanism [mengjia] cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt)); + // Not the good place, but we need to fix the memory leakage + if (pkt->isExpose() || pkt->isValidate()){ + assert(!inst->needDeletePostReq()); + assert(!pkt->isInvalidate()); + delete pkt->req; + } delete state; } template <class Impl> LSQUnit<Impl>::LSQUnit() - : loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), - isStoreBlocked(false), storeInFlight(false), hasPendingPkt(false), + : loads(0), loadsToVLD(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), + isStoreBlocked(false), isValidationBlocked(false), storeInFlight(false), hasPendingPkt(false), pendingPkt(nullptr) { } @@ -180,7 +215,52 @@ LSQUnit<Impl>::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params, depCheckShift = params->LSQDepCheckShift; checkLoads = params->LSQCheckLoads; cacheStorePorts = 
params->cacheStorePorts; + + // According to the scheme, we need to define actions as follows. + // loadInExec: if False, no packets are sent in execution stage; + // if True, send either readReq or readSpecReq + // isInvisibleSpec: if True, send readSpecReq in execution statge; + // if False, send readReq + // needsTSO: if True, squash read on receiving invalidations, and only allow one outstanding write at a time; + // if False, no squash on receiving invalidaiton, and allow multiple outstanding writes. + // isConservative: if True, react after all preceding instructions complete/no exception; + // if False, react only after all preceding stores/brancehs complete + const std::string scheme = params->simulateScheme; + if (scheme.compare("UnsafeBaseline")==0){ + loadInExec = true; + isInvisibleSpec = false; // send real request + isFuturistic = false; // not relevant in unsafe mode. + }else if (scheme.compare("FuturisticSafeFence")==0){ + // "LFENCE" before every load + loadInExec = false; + isInvisibleSpec = false; // not used since loadInExec is false + isFuturistic = true; // send readReq at head of ROB + }else if (scheme.compare("FuturisticSafeInvisibleSpec")==0){ + // only make load visible when all preceding instructions + // complete and no exception + loadInExec = true; + isInvisibleSpec = true; // send request but not change cache state + isFuturistic = true; // conservative condition to send validations + }else if (scheme.compare("SpectreSafeFence")==0){ + // "LFENCE" after every branch + loadInExec = false; + isInvisibleSpec = false; // not used since loadInExec is false + isFuturistic = false; // commit when preceding branches are resolved + }else if (scheme.compare("SpectreSafeInvisibleSpec")==0){ + // make load visible when all preceiding branches are resolved + loadInExec = true; + isInvisibleSpec = true; // send request but not change cache state + isFuturistic = false; // only deal with spectre attacks + }else { + cprintf("ERROR: unsupported 
simulation scheme: %s!\n", scheme); + exit(1); + } needsTSO = params->needsTSO; + allowSpecBuffHit = params->allowSpecBuffHit; + cprintf("Info: simulation uses scheme: %s; " + "needsTSO=%d; allowSpecBuffHit=%d\n", + scheme, needsTSO, allowSpecBuffHit); + // [mengjia] end of setting configuration variables resetState(); } @@ -190,7 +270,7 @@ template<class Impl> void LSQUnit<Impl>::resetState() { - loads = stores = storesToWB = 0; + loads = stores = loadsToVLD = storesToWB = 0; loadHead = loadTail = 0; @@ -260,6 +340,26 @@ LSQUnit<Impl>::regStats() lsqCacheBlocked .name(name() + ".cacheBlocked") .desc("Number of times an access to memory failed due to the cache being blocked"); + + specBuffHits + .name(name() + ".specBuffHits") + .desc("Number of times an access hits in speculative buffer"); + + specBuffMisses + .name(name() + ".specBuffMisses") + .desc("Number of times an access misses in speculative buffer"); + + numValidates + .name(name() + ".numValidates") + .desc("Number of validates sent to cache"); + + numExposes + .name(name() + ".numExposes") + .desc("Number of exposes sent to cache"); + + numConvertedExposes + .name(name() + ".numConvertedExposes") + .desc("Number of exposes converted from validation"); } template<class Impl> @@ -291,6 +391,7 @@ LSQUnit<Impl>::drainSanityCheck() const assert(!loadQueue[i]); assert(storesToWB == 0); + assert(loadsToVLD == 0); assert(!retryPkt); } @@ -379,6 +480,7 @@ LSQUnit<Impl>::insertLoad(DynInstPtr &load_inst) incrLdIdx(loadTail); ++loads; + } template <class Impl> @@ -402,6 +504,7 @@ LSQUnit<Impl>::insertStore(DynInstPtr &store_inst) ++stores; } +// It is an empty function? why? 
[mengjia] template <class Impl> typename Impl::DynInstPtr LSQUnit<Impl>::getMemDepViolator() @@ -462,13 +565,16 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask; // Check that this snoop didn't just invalidate our lock flag - if (ld_inst->effAddrValid() && (load_addr_low == invalidate_addr - || load_addr_high == invalidate_addr) + // [InvisiSpec] also make sure the instruction has been sent out + // otherwise, we cause unneccessary squash + if (ld_inst->effAddrValid() && !ld_inst->fenceDelay() + && (load_addr_low == invalidate_addr + || load_addr_high == invalidate_addr) && ld_inst->memReqFlags & Request::LLSC) TheISA::handleLockedSnoopHit(ld_inst.get()); } - // If this is the only load in the LSQ we don't care + // If not match any load entry, then do nothing [mengjia] if (load_idx == loadTail) return; @@ -479,7 +585,10 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) while (load_idx != loadTail) { DynInstPtr ld_inst = loadQueue[load_idx]; - if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { + // [SafeSpce] check snoop violation when the load has + // been sent out; otherwise, unneccessary squash + if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered() + || ld_inst->fenceDelay()) { incrLdIdx(load_idx); continue; } @@ -497,11 +606,29 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) // all other loads, this load as well as *all* subsequent loads // need to be squashed to prevent possible load reordering. 
force_squash = true; + + // [InvisiSpec] in InvisiSpec, we do not need to squash + // the load at the head of LQ, + // as well as the one do not need validation + if (isInvisibleSpec && + (load_idx==loadHead || ld_inst->needExposeOnly())){ + force_squash = false; + } + if (!pkt->isExternalEviction() && isInvisibleSpec){ + force_squash = false; + ld_inst->clearL1HitHigh(); + ld_inst->clearL1HitLow(); + } } if (ld_inst->possibleLoadViolation() || force_squash) { DPRINTF(LSQUnit, "Conflicting load at addr %#x [sn:%lli]\n", pkt->getAddr(), ld_inst->seqNum); + //[InvisiSpec] mark the load hit invalidation + ld_inst->hitInvalidation(true); + if (pkt->isExternalEviction()){ + ld_inst->hitExternalEviction(true); + } // Mark the load for re-execution ld_inst->fault = std::make_shared<ReExec>(); } else { @@ -526,6 +653,103 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) } template <class Impl> +bool +LSQUnit<Impl>::checkPrevLoadsExecuted(int req_idx) +{ + int load_idx = loadHead; + while (load_idx != req_idx){ + if (!loadQueue[load_idx]->isExecuted()){ + // if at least on load ahead of current load + // does not finish spec access, + // then return false + return false; + } + incrLdIdx(load_idx); + } + + //if all executed, return true + return true; +} + +template <class Impl> +void +LSQUnit<Impl>::setSpecBuffState(RequestPtr expose_req) +{ + Addr req_eff_addr1 = expose_req->getPaddr() & cacheBlockMask; + + int load_idx = loadHead; + while (load_idx != loadTail){ + DynInstPtr ld_inst = loadQueue[load_idx]; + if (ld_inst->effAddrValid()){ + + Addr ld_eff_addr1 = ld_inst->physEffAddrLow & cacheBlockMask; + Addr ld_eff_addr2 = ld_inst->physEffAddrHigh & cacheBlockMask; + if (ld_eff_addr1 == req_eff_addr1){ + ld_inst->setSpecBuffObsoleteLow(); + } else if (ld_eff_addr2 == req_eff_addr1){ + ld_inst->setSpecBuffObsoleteHigh(); + } + } + incrLdIdx(load_idx); + } +} + + +template <class Impl> +int +LSQUnit<Impl>::checkSpecBuffHit(RequestPtr req, int req_idx) +{ + + Addr req_eff_addr1 = 
req->getPaddr() & cacheBlockMask; + //Addr req_eff_addr2 = (req->getPaddr() + req->getSize()-1) & cacheBlockMask; + // the req should be within the same cache line + //assert (req_eff_addr1 == req_eff_addr2); + assert (!loadQueue[req_idx]->isExecuted()); + + int load_idx = loadHead; + + while (load_idx != loadTail){ + DynInstPtr ld_inst = loadQueue[load_idx]; + if (ld_inst->effAddrValid()){ + Addr ld_eff_addr1 = ld_inst->physEffAddrLow & cacheBlockMask; + Addr ld_eff_addr2 = ld_inst->physEffAddrHigh & cacheBlockMask; + + if ((req_eff_addr1 == ld_eff_addr1 && ld_inst->isL1HitLow()) + || (req_eff_addr1 == ld_eff_addr2 && ld_inst->isL1HitHigh())){ + return -1; + //already in L1, do not copy from buffer + } else { + + if (ld_inst->isExecuted() && ld_inst->needPostFetch() + && !ld_inst->isSquashed() && ld_inst->fault==NoFault){ + if (req_eff_addr1 == ld_eff_addr1 && !ld_inst->isL1HitLow() + && !ld_inst->isSpecBuffObsoleteLow()){ + DPRINTF(LSQUnit, "Detected Spec Hit with inst [sn:%lli] " + "and [sn:%lli] (low) at address %#x\n", + loadQueue[req_idx]->seqNum, ld_inst->seqNum, + req_eff_addr1); + return load_idx; + } else if ( ld_eff_addr2 !=0 && + req_eff_addr1 == ld_eff_addr2 && !ld_inst->isL1HitHigh() + && !ld_inst->isSpecBuffObsoleteHigh()){ + DPRINTF(LSQUnit, "Detected Spec Hit with inst [sn:%lli] " + "and [sn:%lli] (high) at address %#x\n", + loadQueue[req_idx]->seqNum, ld_inst->seqNum, + req_eff_addr1); + return load_idx; + } + } + } + } + incrLdIdx(load_idx); + } + + return -1; +} + + + +template <class Impl> Fault LSQUnit<Impl>::checkViolations(int load_idx, DynInstPtr &inst) { @@ -539,7 +763,10 @@ LSQUnit<Impl>::checkViolations(int load_idx, DynInstPtr &inst) */ while (load_idx != loadTail) { DynInstPtr ld_inst = loadQueue[load_idx]; - if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { + // [InvisiSpec] no need to check violation for unsent load + // otherwise, unneccessary squash + if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered() + || 
ld_inst->fenceDelay()) { incrLdIdx(load_idx); continue; } @@ -618,14 +845,25 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst) assert(!inst->isSquashed()); + // use ISA interface to generate correct access request + // initiateAcc is implemented in dyn_inst_impl.hh + // The interface calls corresponding ISA defined function + // check buld/ARM/arch/generic/memhelper.hh for more info [mengjia] load_fault = inst->initiateAcc(); - if (inst->isTranslationDelayed() && + // if translation delay, deferMem [mengjia] + // in the case it is not the correct time to send the load + // also defer it + if ( (inst->isTranslationDelayed() || inst->fenceDelay() + || inst->specTLBMiss()) && load_fault == NoFault) return load_fault; // If the instruction faulted or predicated false, then we need to send it // along to commit without the instruction completing. + // + // if it is faulty, not execute it, send it to commit, and commit statge will deal with it + // here is signling the ROB, the inst can commit [mengjia] if (load_fault != NoFault || !inst->readPredicate()) { // Send this instruction to commit, also make sure iew stage // realizes there is activity. Mark it as executed unless it @@ -673,6 +911,8 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst) // address. If so, then we have a memory ordering violation. int load_idx = store_inst->lqIdx; + // TODO: Check whether this store tries to get an exclusive copy + // of target line [mengjia] Fault store_fault = store_inst->initiateAcc(); if (store_inst->isTranslationDelayed() && @@ -771,15 +1011,343 @@ LSQUnit<Impl>::writebackPendingStore() if (hasPendingPkt) { assert(pendingPkt != NULL); - // If the cache is blocked, this will store the packet for retry. - if (sendStore(pendingPkt)) { - storePostSend(pendingPkt); + if(pendingPkt->isWrite()){ + // If the cache is blocked, this will store the packet for retry. 
+ if (sendStore(pendingPkt)) { + storePostSend(pendingPkt); + } + pendingPkt = NULL; + hasPendingPkt = false; + } + } +} + + + + +// [InvisiSpec] update FenceDelay State +template <class Impl> +void +LSQUnit<Impl>::updateVisibleState() +{ + int load_idx = loadHead; + + //iterate all the loads and update its fencedelay state accordingly + while (load_idx != loadTail && loadQueue[load_idx]){ + DynInstPtr inst = loadQueue[load_idx]; + + if (!loadInExec){ + + if ( (isFuturistic && inst->isPrevInstsCommitted()) || + (!isFuturistic && inst->isPrevBrsCommitted())){ + if (inst->fenceDelay()){ + DPRINTF(LSQUnit, "Clear virtual fence for " + "inst [sn:%lli] PC %s\n", + inst->seqNum, inst->pcState()); + } + inst->fenceDelay(false); + }else { + if (!inst->fenceDelay()){ + DPRINTF(LSQUnit, "Deffering an inst [sn:%lli] PC %s" + " due to virtual fence\n", + inst->seqNum, inst->pcState()); + } + inst->fenceDelay(true); + } + inst->readyToExpose(true); + } else if (loadInExec && isInvisibleSpec){ + + if ( (isFuturistic && inst->isPrevInstsCompleted()) || + (!isFuturistic && inst->isPrevBrsResolved())){ + if (!inst->readyToExpose()){ + DPRINTF(LSQUnit, "Set readyToExpose for " + "inst [sn:%lli] PC %s\n", + inst->seqNum, inst->pcState()); + if (inst->needPostFetch()){ + ++loadsToVLD; + } + } + inst->readyToExpose(true); + }else { + if (inst->readyToExpose()){ + DPRINTF(LSQUnit, "The load can not be validated " + "[sn:%lli] PC %s\n", + inst->seqNum, inst->pcState()); + assert(0); + //--loadsToVLD; + } + inst->readyToExpose(false); + } + inst->fenceDelay(false); + } else { + inst->readyToExpose(true); + inst->fenceDelay(false); + } + incrLdIdx(load_idx); + } +} + +// [InvisiSpec] validate loads +template <class Impl> +int +LSQUnit<Impl>::exposeLoads() +{ + if(!isInvisibleSpec){ + assert(loadsToVLD==0 + && "request validation on Non invisible Spec mode"); + } + + int old_loadsToVLD = loadsToVLD; + + // [InvisiSpec] Note: + // need to iterate from the head every time + // since the load 
can be exposed out-of-order + int loadVLDIdx = loadHead; + + while (loadsToVLD > 0 && + loadVLDIdx != loadTail && + loadQueue[loadVLDIdx]) { + + if (loadQueue[loadVLDIdx]->isSquashed()){ + incrLdIdx(loadVLDIdx); + continue; + } + // skip the loads that either do not need to expose + // or exposed already + if(!loadQueue[loadVLDIdx]->needPostFetch() + || loadQueue[loadVLDIdx]->isExposeSent() ){ + incrLdIdx(loadVLDIdx); + continue; + } + + DynInstPtr load_inst = loadQueue[loadVLDIdx]; + if (loadQueue[loadVLDIdx]->fault!=NoFault){ + //load is executed, so it wait for expose complete + //to send it to commit, regardless of whether it is ready + //to expose + load_inst->setExposeCompleted(); + load_inst->setExposeSent(); + loadsToVLD--; + if (load_inst->isExecuted()){ + DPRINTF(LSQUnit, "Execute finished and gets violation fault." + "Send inst [sn:%lli] to commit stage.\n", + load_inst->seqNum); + iewStage->instToCommit(load_inst); + iewStage->activityThisCycle(); + } + incrLdIdx(loadVLDIdx); + continue; + } + + // skip the loads that need expose but + // are not ready + if (loadQueue[loadVLDIdx]->needPostFetch() + && !loadQueue[loadVLDIdx]->readyToExpose()){ + incrLdIdx(loadVLDIdx); + continue; + } + + assert(loadQueue[loadVLDIdx]->needPostFetch() + && loadQueue[loadVLDIdx]->readyToExpose() ); + + assert(!load_inst->isCommitted()); + + + Request *req = load_inst->postReq; + Request *sreqLow = load_inst->postSreqLow; + Request *sreqHigh = load_inst->postSreqHigh; + + // we should not have both req and sreqLow not NULL + assert( !(req && sreqLow)); + + DPRINTF(LSQUnit, "Validate/Expose request for inst [sn:%lli]" + " PC= %s. 
req=%#x, reqLow=%#x, reqHigh=%#x\n", + load_inst->seqNum, load_inst->pcState(), + (Addr)(load_inst->postReq), + (Addr)(load_inst->postSreqLow), (Addr)(load_inst->postSreqHigh)); + + PacketPtr data_pkt = NULL; + PacketPtr snd_data_pkt = NULL; + + LSQSenderState *state = new LSQSenderState; + state->isLoad = false; + state->idx = loadVLDIdx; + state->inst = load_inst; + state->noWB = true; + + bool split = false; + if (TheISA::HasUnalignedMemAcc && sreqLow) { + split = true; + } else { + assert(req); + } + + bool onlyExpose = false; + if (!split) { + if (load_inst->needExposeOnly() || load_inst->isL1HitLow()){ + data_pkt = Packet::createExpose(req); + onlyExpose = true; + }else { + data_pkt = Packet::createValidate(req); + if (!load_inst->vldData) + load_inst->vldData = new uint8_t[1]; + data_pkt->dataStatic(load_inst->vldData); + } + data_pkt->senderState = state; + data_pkt->setFirst(); + data_pkt->reqIdx = loadVLDIdx; + DPRINTF(LSQUnit, "contextid = %d\n", req->contextId()); + } else { + // allocate memory if we need at least one validation + if (!load_inst->needExposeOnly() && + (!load_inst->isL1HitLow() || !load_inst->isL1HitHigh())){ + if (!load_inst->vldData) + load_inst->vldData = new uint8_t[2]; + } else { + onlyExpose = true; + } + + // Create the split packets. - first one + if (load_inst->needExposeOnly() || load_inst->isL1HitLow()){ + data_pkt = Packet::createExpose(sreqLow); + }else{ + data_pkt = Packet::createValidate(sreqLow); + assert(load_inst->vldData); + data_pkt->dataStatic(load_inst->vldData); + } + + // Create the split packets. 
- second one + if (load_inst->needExposeOnly() || load_inst->isL1HitHigh()){ + snd_data_pkt = Packet::createExpose(sreqHigh); + } else { + snd_data_pkt = Packet::createValidate(sreqHigh); + assert(load_inst->vldData); + snd_data_pkt->dataStatic(&(load_inst->vldData[1])); + } + + data_pkt->senderState = state; + data_pkt->setFirst(); + snd_data_pkt->senderState = state; + data_pkt->reqIdx = loadVLDIdx; + snd_data_pkt->reqIdx = loadVLDIdx; + + data_pkt->isSplit = true; + snd_data_pkt->isSplit = true; + state->isSplit = true; + state->outstanding = 2; + state->mainPkt = data_pkt; + + DPRINTF(LSQUnit, "contextid = %d, %d\n", + sreqLow->contextId(), sreqHigh->contextId()); + req = sreqLow; + } + + assert(!req->isStrictlyOrdered()); + assert(!req->isMmappedIpr()); + + DPRINTF(LSQUnit, "D-Cache: Validating/Exposing load idx:%i PC:%s " + "to Addr:%#x, data:%#x [sn:%lli]\n", + loadVLDIdx, load_inst->pcState(), + //FIXME: resultData not memData + req->getPaddr(), (int)*(load_inst->memData), + load_inst->seqNum); + + bool successful_expose = true; + bool completedFirst = false; + + if (!dcachePort->sendTimingReq(data_pkt)){ + DPRINTF(IEW, "D-Cache became blocked when " + "validating [sn:%lli], will retry later\n", + load_inst->seqNum); + successful_expose = false; + } else { + if (split) { + // If split, try to send the second packet too + completedFirst = true; + assert(snd_data_pkt); + + if (!dcachePort->sendTimingReq(snd_data_pkt)){ + state->complete(); + state->cacheBlocked = true; + successful_expose = false; + DPRINTF(IEW, "D-Cache became blocked when validating" + " [sn:%lli] second packet, will retry later\n", + load_inst->seqNum); + } + } + } + + if (!successful_expose){ + if (!split) { + delete state; + delete data_pkt; + }else{ + if (!completedFirst){ + delete state; + delete data_pkt; + delete snd_data_pkt; + } else { + delete snd_data_pkt; + } + } + //cpu->wakeCPU(); // This will cause issue(wrong activity count and affects the memory transactions + 
++lsqCacheBlocked; + break; + } else { + // Here is to fix memory leakage + // it is ugly, but we have to do it now. + load_inst->needDeletePostReq(false); + + // if all the packets we sent out are exposes, + // we assume the expose is already completed + if (onlyExpose) { + load_inst->setExposeCompleted(); + numExposes++; + } else { + numValidates++; + } + if (load_inst->needExposeOnly()){ + numConvertedExposes++; + } + if (load_inst->isExecuted() && load_inst->isExposeCompleted() + && !load_inst->isSquashed()){ + DPRINTF(LSQUnit, "Expose finished. Execution done." + "Send inst [sn:%lli] to commit stage.\n", + load_inst->seqNum); + iewStage->instToCommit(load_inst); + iewStage->activityThisCycle(); + } else{ + DPRINTF(LSQUnit, "Need validation or execution not finishes." + "Need to wait for readResp/validateResp " + "for inst [sn:%lli].\n", + load_inst->seqNum); + } + + load_inst->setExposeSent(); + --loadsToVLD; + incrLdIdx(loadVLDIdx); + if (!split){ + setSpecBuffState(req); + } else { + setSpecBuffState(sreqLow); + setSpecBuffState(sreqHigh); + } } - pendingPkt = NULL; - hasPendingPkt = false; } + + DPRINTF(LSQUnit, "Send validate/expose for %d insts. loadsToVLD=%d" + ". loadHead=%d. loadTail=%d.\n", + old_loadsToVLD-loadsToVLD, loadsToVLD, loadHead, + loadTail); + + assert(loads>=0 && loadsToVLD >= 0); + + return old_loadsToVLD-loadsToVLD; } + + + template <class Impl> void LSQUnit<Impl>::writebackStores() @@ -799,7 +1367,7 @@ LSQUnit<Impl>::writebackStores() if (isStoreBlocked) { DPRINTF(LSQUnit, "Unable to write back any more stores, cache" - " is blocked!\n"); + " is blocked on stores!\n"); break; } @@ -1013,6 +1581,12 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num) stallingLoadIdx = 0; } + if (loadQueue[load_idx]->needPostFetch() && + loadQueue[load_idx]->readyToExpose() && + !loadQueue[load_idx]->isExposeSent()){ + loadsToVLD --; + } + // Clear the smart pointer to make sure it is decremented. 
loadQueue[load_idx]->setSquashed(); loadQueue[load_idx] = NULL; @@ -1023,6 +1597,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num) decrLdIdx(load_idx); ++lsqSquashedLoads; + } if (memDepViolator && squashed_num < memDepViolator->seqNum) { @@ -1081,6 +1656,10 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num) } } + +// after sent, we assume the store is complete +// thus, we can wake up and forward data +// In TSO, mark inFlightStore as true to block following stores [mengjia] template <class Impl> void LSQUnit<Impl>::storePostSend(PacketPtr pkt) @@ -1110,9 +1689,58 @@ LSQUnit<Impl>::storePostSend(PacketPtr pkt) storeInFlight = true; } + DPRINTF(LSQUnit, "Post sending store for inst [sn:%lli]\n", + storeQueue[storeWBIdx].inst->seqNum); incrStIdx(storeWBIdx); } + +// Check a load's validation result (a zero vldData byte => ReExec fault), then send it to commit if executed. +template <class Impl> +void +LSQUnit<Impl>::completeValidate(DynInstPtr &inst, PacketPtr pkt) +{ + iewStage->wakeCPU(); + // if the expose was already completed or the load was squashed, + // there is nothing left to validate: return directly + //assert(!inst->needExposeOnly()); + if (inst->isExposeCompleted() || inst->isSquashed()){ + //assert(inst->fault != NoFault); + //Already sent to commit, do nothing + return; + } + //Check validation result + bool validation_fail = false; + if (!inst->isL1HitLow() && inst->vldData[0]==0) { + validation_fail = true; + } else { + if (pkt->isSplit && !inst->isL1HitHigh() + && inst->vldData[1]==0){ + validation_fail = true; + } + } + if (validation_fail){ + // Mark the load for re-execution + inst->fault = std::make_shared<ReExec>(); + inst->validationFail(true); + DPRINTF(LSQUnit, "Validation failed for inst [sn:%lli].\n", + inst->seqNum); + } + + inst->setExposeCompleted(); + if ( inst->isExecuted() && inst->isExposeCompleted() ){ + DPRINTF(LSQUnit, "Validation finished. Execution done. 
" + "Send inst [sn:%lli] to commit stage.\n", + inst->seqNum); + iewStage->instToCommit(inst); + iewStage->activityThisCycle(); + } else{ + DPRINTF(LSQUnit, "Validation done. Execution not finishes. "
+ "Need to wait for readResp for inst [sn:%lli].\n", + inst->seqNum); + } +} + template <class Impl> void LSQUnit<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) @@ -1126,6 +1754,11 @@ LSQUnit<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) return; } + //DPRINTF(LSQUnit, "write back for inst [sn:%lli]\n", inst->seqNum); + assert(!(inst->isExecuted() && inst->isExposeCompleted() && + inst->fault==NoFault) && + "in this case, we will put it into ROB twice."); + if (!inst->isExecuted()) { inst->setExecuted(); @@ -1145,8 +1778,42 @@ LSQUnit<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) } } - // Need to insert instruction into queue to commit - iewStage->instToCommit(inst); + // [mengjia] + // check schemes to decide whether to set load can be committed + // on receiving readResp or readSpecResp + if(!isInvisibleSpec){ + // if not invisibleSpec mode, we only receive readResp + assert(!pkt->isSpec() && !pkt->isValidate() && + "Receiving spec or validation response " + "in non invisibleSpec mode"); + iewStage->instToCommit(inst); + } else if (inst->fault != NoFault){ + inst->setExposeCompleted(); + inst->setExposeSent(); + iewStage->instToCommit(inst); + } else { + //isInvisibleSpec == true + if (pkt->isSpec()) { + inst->setSpecCompleted(); + } + + assert(!pkt->isValidate() && "receiving validation response" + "in invisibleSpec RC mode"); + assert(!pkt->isExpose() && "receiving expose response" + "on write back path"); + + // check whether the instruction can be committed + if ( !inst->isExposeCompleted() && inst->needPostFetch() ){ + DPRINTF(LSQUnit, "Expose not finished. " + "Wait until expose completion" + " to send inst [sn:%lli] to commit stage\n", inst->seqNum); + }else{ + DPRINTF(LSQUnit, "Expose and execution both finished. 
" + "Send inst [sn:%lli] to commit stage\n", inst->seqNum); + iewStage->instToCommit(inst); + } + + } iewStage->activityThisCycle(); @@ -1154,6 +1821,8 @@ LSQUnit<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) iewStage->checkMisprediction(inst); } +// set store to complete [mengjia] +// complete the store after it commits template <class Impl> void LSQUnit<Impl>::completeStore(int store_idx) @@ -1229,9 +1898,12 @@ LSQUnit<Impl>::sendStore(PacketPtr data_pkt) retryPkt = data_pkt; return false; } + setSpecBuffState(data_pkt->req); return true; } + + template <class Impl> void LSQUnit<Impl>::recvRetry() @@ -1239,6 +1911,7 @@ LSQUnit<Impl>::recvRetry() if (isStoreBlocked) { DPRINTF(LSQUnit, "Receiving retry: store blocked\n"); assert(retryPkt != NULL); + assert(retryPkt->isWrite()); LSQSenderState *state = dynamic_cast<LSQSenderState *>(retryPkt->senderState); @@ -1287,7 +1960,7 @@ template <class Impl> inline void LSQUnit<Impl>::incrLdIdx(int &load_idx) const { - if (++load_idx >= LQEntries) + if ((++load_idx) >= LQEntries) load_idx = 0; } @@ -1295,7 +1968,7 @@ template <class Impl> inline void LSQUnit<Impl>::decrLdIdx(int &load_idx) const { - if (--load_idx < 0) + if ((--load_idx) < 0) load_idx += LQEntries; } diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index bc024f603..66700563a 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -580,7 +580,12 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) tid, instsInProgress[tid], fromIEW->iewInfo[tid].dispatched); // Handle serializing the next instruction if necessary. 
- if (serializeOnNextInst[tid]) { + // Add one more serializing condition [mengjia] + if (serializeOnNextInst[tid] || insts_to_rename.front()->isBlock()) { + if(insts_to_rename.front()->isBlock()){ + DPRINTF(Rename, "Rename got rdtscp instructions for thread %d and ROB empty: %d, instProgress: %d, handled: %d.\n", + tid, emptyROB[tid], instsInProgress[tid], insts_to_rename.front()->isSerializeHandled()); + } if (emptyROB[tid] && instsInProgress[tid] == 0) { // ROB already empty; no need to serialize. serializeOnNextInst[tid] = false; @@ -669,6 +674,12 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) // instructions. This is mainly due to lack of support for // out-of-order operations of either of those classes of // instructions. + + // added debug infor for block state / rdtscp [mengjia] + if(inst->isBlock()){ + DPRINTF(Rename, "Rename got rdtscp instructions for thread %d and ROB empty: %d, instProgress: %d, handled: %d.\n", + tid, emptyROB[tid], instsInProgress[tid], inst->isSerializeHandled()); + } if ((inst->isIprAccess() || inst->isSerializeBefore()) && !inst->isSerializeHandled()) { DPRINTF(Rename, "Serialize before instruction encountered.\n"); @@ -685,7 +696,7 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) renameStatus[tid] = SerializeStall; serializeInst[tid] = inst; - + blockThisCycle = true; break; @@ -1370,7 +1381,8 @@ DefaultRename<Impl>::serializeAfter(InstQueue &inst_list, ThreadID tid) // Mark a bit to say that I must serialize on the next instruction. serializeOnNextInst[tid] = true; return; - } + } + // Set the next instruction as serializing. inst_list.front()->setSerializeBefore(); diff --git a/src/cpu/o3/rob.hh b/src/cpu/o3/rob.hh index 1c3cc2815..7024d9920 100644 --- a/src/cpu/o3/rob.hh +++ b/src/cpu/o3/rob.hh @@ -212,6 +212,10 @@ class ROB /** Updates the tail instruction with the new youngest instruction. 
*/ void updateTail(); + /** [InvisiSpec] Updates the visibility conditions of load instructions: + * sets PrevInstsCompleted, PrevBrsResolved, PrevInstsCommitted and PrevBrsCommitted. */ + void updateVisibleState(); + /** Reads the PC of the oldest head instruction. */ // uint64_t readHeadPC(); diff --git a/src/cpu/o3/rob_impl.hh b/src/cpu/o3/rob_impl.hh index 5a9dc90f9..ebfbb9754 100644 --- a/src/cpu/o3/rob_impl.hh +++ b/src/cpu/o3/rob_impl.hh @@ -402,6 +402,85 @@ ROB<Impl>::doSquash(ThreadID tid) } +/* ************************** + * [InvisiSpec] update load insts state + * isPrevInstsCompleted; isPrevBrsResolved; isPrevInstsCommitted; isPrevBrsCommitted + * *************************/ +template <class Impl> +void +ROB<Impl>::updateVisibleState() +{ + list<ThreadID>::iterator threads = activeThreads->begin(); + list<ThreadID>::iterator end = activeThreads->end(); + + while (threads != end) { + ThreadID tid = *threads++; + + if (instList[tid].empty()) + continue; + + InstIt inst_it = instList[tid].begin(); + InstIt tail_inst_it = instList[tid].end(); + + bool prevInstsComplete=true; + bool prevBrsResolved=true; + bool prevInstsCommitted=true; + bool prevBrsCommitted=true; + + while (inst_it != tail_inst_it) { + DynInstPtr inst = *inst_it++; + + assert(inst!=0); + + if (!prevInstsComplete && + !prevBrsResolved) { + break; + } + + if (inst->isLoad()) { + if (prevInstsComplete) { + inst->setPrevInstsCompleted(); + } + if (prevBrsResolved){ + inst->setPrevBrsResolved(); + } + if (prevInstsCommitted) { + inst->setPrevInstsCommitted(); + } + if (prevBrsCommitted) { + inst->setPrevBrsCommitted(); + } + } + + // Update prev control insts state + if (inst->isControl()){ + prevBrsCommitted = false; + if (!inst->readyToCommit() || inst->getFault()!=NoFault + || inst->isSquashed()){ + prevBrsResolved = false; + } + } + + prevInstsCommitted = false; + + // Update prev insts state + if (inst->isNonSpeculative() || inst->isStoreConditional() + || inst->isMemBarrier() || 
inst->isWriteBarrier() || + (inst->isLoad() && inst->strictlyOrdered())){ + //Some special 
instructions, directly set canCommit + //when entering ROB + prevInstsComplete = false; + } + if (!inst->readyToCommit() || inst->getFault()!=NoFault + || inst->isSquashed()){ + prevInstsComplete = false; + } + + } + } +} + + template <class Impl> void ROB<Impl>::updateHead() diff --git a/src/cpu/static_inst.hh b/src/cpu/static_inst.hh index 79f45d828..8e98eb940 100644 --- a/src/cpu/static_inst.hh +++ b/src/cpu/static_inst.hh @@ -166,6 +166,7 @@ class StaticInst : public RefCounted, public StaticInstFlags bool isSerializing() const { return flags[IsSerializing] || flags[IsSerializeBefore] || flags[IsSerializeAfter]; } + bool isBlock() const { return flags[IsBlock]; } // add attribute for blocking issue stage [mengjia] bool isSerializeBefore() const { return flags[IsSerializeBefore]; } bool isSerializeAfter() const { return flags[IsSerializeAfter]; } bool isSquashAfter() const { return flags[IsSquashAfter]; } diff --git a/src/mem/packet.cc b/src/mem/packet.cc index ffda3d5af..f664878e2 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -219,7 +219,24 @@ MemCmd::commandInfo[] = InvalidateResp, "InvalidateReq" }, /* Invalidation Response */ { SET2(IsInvalidate, IsResponse), - InvalidCmd, "InvalidateResp" } + InvalidCmd, "InvalidateResp" }, + /* [InvisiSpec] New command info */ + { SET4(IsRead, IsRequest, NeedsResponse, IsSpec), + ReadSpecResp, "ReadSpecReq" }, + { SET4(IsRead, IsResponse, HasData, IsSpec), + InvalidCmd, "ReadSpecResp" }, + { SET4(IsRead, IsRequest, NeedsResponse, IsValidate), + ValidateResp, "ValidateReq" }, + { SET4(IsRead, IsResponse, HasData, IsValidate), + InvalidCmd, "ValidateResp" }, + { SET4(IsRead, IsRequest, NeedsResponse, IsExpose), + ExposeResp, "ExposeReq" }, + { SET3(IsRead, IsResponse, IsExpose), + InvalidCmd, "ExposeResp" }, + { SET3(IsRequest, NeedsResponse, IsSpecFlush), + SpecFlushResp, "SpecFlushReq" }, + { SET2(IsResponse, IsSpecFlush), + InvalidCmd, "SpecFlushResp" } }; bool diff --git a/src/mem/packet.hh 
b/src/mem/packet.hh index 66625b382..026cdabfc 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -133,6 +133,15 @@ class MemCmd FlushReq, //request for a cache flush InvalidateReq, // request for address to be invalidated InvalidateResp, + /* [InvisiSpec] New commands */ + ReadSpecReq, + ReadSpecResp, + ValidateReq, + ValidateResp, + ExposeReq, + ExposeResp, + SpecFlushReq, + SpecFlushResp, NUM_MEM_CMDS }; @@ -160,6 +169,11 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent + /* [InvisiSpec] New attributes */ + IsSpec, //!< Speculatively issued + IsValidate, + IsExpose, + IsSpecFlush, NUM_COMMAND_ATTRIBUTES }; @@ -226,6 +240,12 @@ class MemCmd bool isPrint() const { return testCmdAttrib(IsPrint); } bool isFlush() const { return testCmdAttrib(IsFlush); } + /// [InvisiSpec] InvisiSpec attributes + bool isSpec() const { return testCmdAttrib(IsSpec); } + bool isValidate() const { return testCmdAttrib(IsValidate); } + bool isExpose() const { return testCmdAttrib(IsExpose); } + bool isSpecFlush() const { return testCmdAttrib(IsSpecFlush); } + Command responseCommand() const { @@ -306,7 +326,17 @@ class Packet : public Printable // Signal block present to squash prefetch and cache evict packets // through express snoop flag - BLOCK_CACHED = 0x00010000 + BLOCK_CACHED = 0x00010000, + + // [InvisiSpec] ReadSpecReq was L1 hit. 
+ L1_HIT = 0x00020000, + + // [InvisiSpec] this packet is the first one of split packets + // maximum split is 2 + FIRST_IN_SPLIT = 0x00040000, + ONLY_ACCESS_SPEC_BUFF = 0x00080000, + + EXTERNAL_EVICTION = 0x00100000, }; Flags flags; @@ -376,6 +406,12 @@ class Packet : public Printable */ uint32_t payloadDelay; + //[InvisiSpec] indicate the source buffer entry + //if the load should get data from specbuffer + int srcIdx; + int reqIdx; + bool isSplit; + /** * A virtual base opaque structure used to hold state associated * with the packet (e.g., an MSHR), specific to a MemObject that @@ -549,6 +585,45 @@ class Packet : public Printable bool isPrint() const { return cmd.isPrint(); } bool isFlush() const { return cmd.isFlush(); } + /// [InvisiSpec] InvisiSpec flags + bool isSpec() const { return cmd.isSpec(); } + bool isValidate() const { return cmd.isValidate(); } + bool isExpose() const { return cmd.isExpose(); } + bool isSpecFlush() const { return cmd.isSpecFlush(); } + bool isL1Hit() const { return flags.isSet(L1_HIT); } + bool isExternalEviction() const { return flags.isSet(EXTERNAL_EVICTION); } + // [InvisiSpec] Check whether it is the first in split packets + bool isFirst() const { return flags.isSet(FIRST_IN_SPLIT); } + bool onlyAccessSpecBuff() const + { return flags.isSet(ONLY_ACCESS_SPEC_BUFF); } + + void setL1Hit() + { + assert(isSpec()); + assert(!flags.isSet(L1_HIT)); + flags.set(L1_HIT); + } + + void setExternalEviction() + { + assert(!flags.isSet(EXTERNAL_EVICTION)); + flags.set(EXTERNAL_EVICTION); + } + + void setOnlyAccessSpecBuff() + { + assert(isSpec()); + assert(!flags.isSet(ONLY_ACCESS_SPEC_BUFF)); + flags.set(ONLY_ACCESS_SPEC_BUFF); + } + + void setFirst() + { + //assert(isSpec()); + assert(!flags.isSet(FIRST_IN_SPLIT)); + flags.set(FIRST_IN_SPLIT); + } + //@{ /// Snoop flags /** @@ -748,7 +823,8 @@ class Packet : public Printable Packet(const RequestPtr _req, MemCmd _cmd) : cmd(_cmd), id((PacketId)_req), req(_req), data(nullptr), addr(0), 
_isSecure(false), size(0), headerDelay(0), snoopDelay(0), - payloadDelay(0), senderState(NULL) + payloadDelay(0), srcIdx(-1), reqIdx(-1), isSplit(false), + senderState(NULL) { if (req->hasPaddr()) { addr = req->getPaddr(); @@ -769,7 +845,8 @@ class Packet : public Printable Packet(const RequestPtr _req, MemCmd _cmd, int _blkSize, PacketId _id = 0) : cmd(_cmd), id(_id ? _id : (PacketId)_req), req(_req), data(nullptr), addr(0), _isSecure(false), headerDelay(0), snoopDelay(0), - payloadDelay(0), senderState(NULL) + payloadDelay(0), srcIdx(-1), reqIdx(-1), isSplit(false), + senderState(NULL) { if (req->hasPaddr()) { addr = req->getPaddr() & ~(_blkSize - 1); @@ -795,6 +872,9 @@ class Packet : public Printable headerDelay(pkt->headerDelay), snoopDelay(0), payloadDelay(pkt->payloadDelay), + srcIdx(pkt->srcIdx), + reqIdx(pkt->reqIdx), + isSplit(pkt->isSplit), senderState(pkt->senderState) { if (!clear_flags) @@ -869,6 +949,33 @@ class Packet : public Printable } /** + * [InvisiSpec] Methods that return Packets for InvisiSpec. 
+ */ + static PacketPtr + createReadSpec(const RequestPtr req) + { + return new Packet(req, MemCmd::ReadSpecReq); + } + + static PacketPtr + createValidate(const RequestPtr req) + { + return new Packet(req, MemCmd::ValidateReq); + } + + static PacketPtr + createExpose(const RequestPtr req) + { + return new Packet(req, MemCmd::ExposeReq); + } + + static PacketPtr + createSpecFlush(const RequestPtr req) + { + return new Packet(req, MemCmd::SpecFlushReq); + } + + /** * clean up packet variables */ ~Packet() diff --git a/src/mem/port.cc b/src/mem/port.cc index 756eb8bdd..924a2188c 100644 --- a/src/mem/port.cc +++ b/src/mem/port.cc @@ -176,6 +176,7 @@ MasterPort::sendFunctional(PacketPtr pkt) return _slavePort->recvFunctional(pkt); } +// [InvisiSpec] Request from CPU to Ruby bool MasterPort::sendTimingReq(PacketPtr pkt) { diff --git a/src/mem/protocol/MESI_Two_Level-L1cache.sm b/src/mem/protocol/MESI_Two_Level-L1cache.sm index 87684ce10..846af7da5 100644 --- a/src/mem/protocol/MESI_Two_Level-L1cache.sm +++ b/src/mem/protocol/MESI_Two_Level-L1cache.sm @@ -76,10 +76,12 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") S, AccessPermission:Read_Only, desc="a L1 cache entry Shared"; E, AccessPermission:Read_Only, desc="a L1 cache entry Exclusive"; M, AccessPermission:Read_Write, desc="a L1 cache entry Modified", format="!b"; + X, AccessPermission:Read_Only, desc="a L1 cache entry Speculatively observed"; // Transient States IS, AccessPermission:Busy, desc="L1 idle, issued GETS, have not seen response yet"; IM, AccessPermission:Busy, desc="L1 idle, issued GETX, have not seen response yet"; + IX, AccessPermission:Busy, desc="L1 idle, issued GETSPEC, have not seen response yet"; SM, AccessPermission:Read_Only, desc="L1 idle, issued GETX, have not seen response yet"; IS_I, AccessPermission:Busy, desc="L1 idle, issued GETS, saw Inv before data because directory doesn't block on GETS hit"; @@ -99,6 +101,8 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") 
Load, desc="Load request from the home processor"; Ifetch, desc="I-fetch request from the home processor"; Store, desc="Store request from the home processor"; + SpecLoad, desc="SpecLoad request from the home processor"; + Expose, desc="Expose request from the home processor"; Inv, desc="Invalidate request from L2 bank"; @@ -110,6 +114,8 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") Fwd_GETX, desc="GETX from other processor"; Fwd_GETS, desc="GETS from other processor"; Fwd_GET_INSTR, desc="GET_INSTR from other processor"; + Fwd_GETSPEC, desc="GETSPEC from other processor"; + Fwd_EXPOSE, desc="EXPOSE from other processor"; Data, desc="Data for processor"; Data_Exclusive, desc="Data for processor"; @@ -188,6 +194,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } State getState(TBE tbe, Entry cache_entry, Addr addr) { + // [InvisiSpec] The same cache line cannot be present in L1D and L1I at the same time. assert((L1Dcache.isTagPresent(addr) && L1Icache.isTagPresent(addr)) == false); if(is_valid(tbe)) { @@ -265,6 +272,10 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") return Event:Ifetch; } else if ((type == RubyRequestType:ST) || (type == RubyRequestType:ATOMIC)) { return Event:Store; + } else if (type == RubyRequestType:SPEC_LD) { + return Event:SpecLoad; + } else if (type == RubyRequestType:EXPOSE) { + return Event:Expose; } else { error("Invalid RubyRequestType"); } @@ -387,6 +398,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") trigger(Event:Data_Exclusive, in_msg.addr, cache_entry, tbe); } else if(in_msg.Type == CoherenceResponseType:DATA) { if ((getState(tbe, cache_entry, in_msg.addr) == State:IS || + getState(tbe, cache_entry, in_msg.addr) == State:IX || getState(tbe, cache_entry, in_msg.addr) == State:IS_I || getState(tbe, cache_entry, in_msg.addr) == State:PF_IS || getState(tbe, cache_entry, in_msg.addr) == State:PF_IS_I) && @@ -433,6 +445,10 @@ machine(MachineType:L1Cache, "MESI Directory L1 
Cache CMP") trigger(Event:Fwd_GETS, in_msg.addr, cache_entry, tbe); } else if (in_msg.Type == CoherenceRequestType:GET_INSTR) { trigger(Event:Fwd_GET_INSTR, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:GETSPEC) { + trigger(Event:Fwd_GETSPEC, in_msg.addr, cache_entry, tbe); + } else if (in_msg.Type == CoherenceRequestType:EXPOSE) { + trigger(Event:Fwd_EXPOSE, in_msg.addr, cache_entry, tbe); } else { error("Invalid forwarded request type"); } @@ -534,6 +550,43 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") out_msg.MessageSize := MessageSizeType:Control; out_msg.Prefetch := in_msg.Prefetch; out_msg.AccessMode := in_msg.AccessMode; + out_msg.idx := in_msg.idx; + } + } + } + + action(as_issueGETSPEC, "as", desc="Issue GETSPEC") { + peek(mandatoryQueue_in, RubyRequest) { + enqueue(requestL1Network_out, RequestMsg, l1_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:GETSPEC; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache, + l2_select_low_bit, l2_select_num_bits, intToID(0))); + DPRINTF(RubySlicc, "address: %#x, destination: %s\n", + address, out_msg.Destination); + out_msg.MessageSize := MessageSizeType:SPECLD_Control; + out_msg.Prefetch := in_msg.Prefetch; + out_msg.AccessMode := in_msg.AccessMode; + out_msg.idx := in_msg.idx; + } + } + } + + action(ex_issueEXPOSE, "ex", desc="Issue EXPOSE") { + peek(mandatoryQueue_in, RubyRequest) { + enqueue(requestL1Network_out, RequestMsg, l1_request_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceRequestType:EXPOSE; + out_msg.Requestor := machineID; + out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache, + l2_select_low_bit, l2_select_num_bits, intToID(0))); + DPRINTF(RubySlicc, "address: %#x, destination: %s\n", + address, out_msg.Destination); + out_msg.MessageSize := MessageSizeType:EXPOSE_Control; + out_msg.Prefetch := in_msg.Prefetch; + 
out_msg.AccessMode := in_msg.AccessMode; + out_msg.idx := in_msg.idx; } } } @@ -568,6 +621,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") out_msg.MessageSize := MessageSizeType:Control; out_msg.Prefetch := in_msg.Prefetch; out_msg.AccessMode := in_msg.AccessMode; + out_msg.idx := in_msg.idx; } } } @@ -606,6 +660,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") out_msg.MessageSize := MessageSizeType:Control; out_msg.Prefetch := in_msg.Prefetch; out_msg.AccessMode := in_msg.AccessMode; + out_msg.idx := in_msg.idx; } } } @@ -643,6 +698,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") out_msg.MessageSize := MessageSizeType:Control; out_msg.Prefetch := in_msg.Prefetch; out_msg.AccessMode := in_msg.AccessMode; + out_msg.idx := in_msg.idx; } } } @@ -662,6 +718,36 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } } + action(dex_sendDataToExposeRequestor, "dex", desc="send data to requestor") { + peek(requestL1Network_in, RequestMsg) { + enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + } + } + } + + action(ds_sendDataToSpecRequestor, "ds", desc="send data to requestor") { + peek(requestL1Network_in, RequestMsg) { + enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:SPECLD_Data; + } + } + } + action(d2_sendDataToL2, "d2", desc="send data to the L2 cache because of M 
downgrade") { enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { assert(is_valid(cache_entry)); @@ -676,6 +762,20 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } } + action(d2ex_sendExposeDataToL2, "d2ex", desc="send data to the L2 cache because of M downgrade") { + enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.Dirty := cache_entry.Dirty; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache, + l2_select_low_bit, l2_select_num_bits, intToID(0))); + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + } + } + action(dt_sendDataToRequestor_fromTBE, "dt", desc="send data to requestor") { peek(requestL1Network_in, RequestMsg) { enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { @@ -691,6 +791,36 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } } + action(dtex_sendDataToExposeRequestor_fromTBE, "dtex", desc="send data to requestor") { + peek(requestL1Network_in, RequestMsg) { + enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + } + } + } + + action(dts_sendDataToSpecRequestor_fromTBE, "dts", desc="send data to requestor") { + peek(requestL1Network_in, RequestMsg) { + enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + 
out_msg.MessageSize := MessageSizeType:SPECLD_Data; + } + } + } + action(d2t_sendDataToL2_fromTBE, "d2t", desc="send data to the L2 cache") { enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { assert(is_valid(tbe)); @@ -705,6 +835,20 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } } + action(d2tex_sendExposeDataToL2_fromTBE, "d2tex", desc="send data to the L2 cache") { + enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { + assert(is_valid(tbe)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.DataBlk := tbe.DataBlk; + out_msg.Dirty := tbe.Dirty; + out_msg.Sender := machineID; + out_msg.Destination.add(mapAddressToRange(address, MachineType:L2Cache, + l2_select_low_bit, l2_select_num_bits, intToID(0))); + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + } + } + action(e_sendAckToRequestor, "e", desc="send invalidate ack to requestor (could be L2 or L1)") { peek(requestL1Network_in, RequestMsg) { enqueue(responseL1Network_out, ResponseMsg, l1_response_latency) { @@ -761,7 +905,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") action(forward_eviction_to_cpu, "\cc", desc="sends eviction information to the processor") { if (send_evictions) { DPRINTF(RubySlicc, "Sending invalidation for %#x to the CPU\n", address); - sequencer.evictionCallback(address); + sequencer.evictionCallback(address, false); + } + } + + action(forward_external_eviction_to_cpu, "\ccc", desc="sends external eviction information to the processor") { + if (send_evictions) { + DPRINTF(RubySlicc, "Sending invalidation for %#x to the CPU\n", address); + sequencer.evictionCallback(address, true); } } @@ -822,6 +973,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") sequencer.readCallback(address, cache_entry.DataBlk); } + action(h_spec_load_hit, "hs", + desc="Notify sequencer the spec load completed.") + { + assert(is_valid(cache_entry)); + DPRINTF(RubySlicc, "%s\n", 
cache_entry.DataBlk); + sequencer.readCallback(address, cache_entry.DataBlk); + } + action(h_ifetch_hit, "hi", desc="Notify sequencer the instruction fetch completed.") { assert(is_valid(cache_entry)); @@ -839,6 +998,15 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") sequencer.readCallback(address, cache_entry.DataBlk, true); } + action(hsx_spec_load_hit, "hsx", desc="Notify sequencer the external load completed.") + { + peek(responseL1Network_in, ResponseMsg) { + // [InvisiSpec] Hack for in_msg.DataBlk returning const DataBlk + tbe.DataBlk := in_msg.DataBlk; + sequencer.readCallback(address, tbe.DataBlk, true); + } + } + action(hh_store_hit, "\h", desc="Notify sequencer that store completed.") { assert(is_valid(cache_entry)); @@ -868,6 +1036,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") tbe.DataBlk := cache_entry.DataBlk; } + action(iw_allocateTBEWithoutCacheEntry, "iw", desc="Allocate TBE without a cache entry") { + check_allocate(TBEs); + assert(!is_valid(cache_entry) || cache_entry.CacheState == State:I); + TBEs.allocate(address); + set_tbe(TBEs[address]); + tbe.isPrefetch := false; + } + action(k_popMandatoryQueue, "k", desc="Pop mandatory queue.") { mandatoryQueue_in.dequeue(clockEdge()); } @@ -989,13 +1165,19 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") cache_entry.isPrefetch := true; } + action(x_expose_done, "xd", + desc="Notify sequencer the expose completed.") + { + sequencer.readCallback(address, cache_entry.DataBlk); + } + //***************************************************** // TRANSITIONS //***************************************************** // Transitions for Load/Store/Replacement/WriteBack from transient states - transition({IS, IM, IS_I, M_I, SM, SINK_WB_ACK}, {Load, Ifetch, Store, L1_Replacement}) { + transition({IS, IM, IX, IS_I, M_I, SM, SINK_WB_ACK}, {Load, Expose, SpecLoad, Ifetch, Store, L1_Replacement}) { z_stallAndWaitMandatoryQueue; } @@ -1003,7 +1185,7 @@ 
machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") z_stallAndWaitMandatoryQueue; } - transition({PF_IM, PF_SM}, {Load, Ifetch, L1_Replacement}) { + transition({PF_IM, PF_SM}, {Load, Expose, SpecLoad, Ifetch, L1_Replacement}) { z_stallAndWaitMandatoryQueue; } @@ -1016,7 +1198,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") ff_deallocateL1CacheBlock; } - transition({S,E,M,IS,IM,SM,IS_I,PF_IS_I,M_I,SINK_WB_ACK,PF_IS,PF_IM}, + transition({S,E,M,IS,IM,IX,SM,IS_I,PF_IS_I,M_I,SINK_WB_ACK,PF_IS,PF_IM}, {PF_Load, PF_Store, PF_Ifetch}) { pq_popPrefetchQueue; } @@ -1030,6 +1212,21 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") k_popMandatoryQueue; } + transition({NP,I}, Expose, IS) { + oo_allocateL1DCacheBlock; + i_allocateTBE; + ex_issueEXPOSE; + uu_profileDataMiss; + //po_observeMiss; + k_popMandatoryQueue; + } + + transition({NP,I}, SpecLoad, IX) { + iw_allocateTBEWithoutCacheEntry; + as_issueGETSPEC; + k_popMandatoryQueue; + } + transition({NP,I}, PF_Load, PF_IS) { oo_allocateL1DCacheBlock; i_allocateTBE; @@ -1037,13 +1234,13 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") pq_popPrefetchQueue; } - transition(PF_IS, Load, IS) { + transition(PF_IS, {Load, Expose}, IS) { uu_profileDataMiss; ppm_observePfMiss; k_popMandatoryQueue; } - transition(PF_IS_I, Load, IS_I) { + transition(PF_IS_I, {Load, Expose}, IS_I) { uu_profileDataMiss; ppm_observePfMiss; k_popMandatoryQueue; @@ -1055,6 +1252,10 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") k_popMandatoryQueue; } + transition({PF_IS, PF_IS_I}, SpecLoad) { + k_popMandatoryQueue; + } + transition({NP,I}, Ifetch, IS) { pp_allocateL1ICacheBlock; i_allocateTBE; @@ -1107,19 +1308,24 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") k_popMandatoryQueue; } - transition({NP, I}, Inv) { + transition({NP, I, IX}, Inv) { fi_sendInvAck; l_popRequestQueue; } // Transitions from Shared - transition({S,E,M}, Load) { + transition({S,E,M}, {Load, Expose}) { 
h_load_hit; uu_profileDataHit; po_observeHit; k_popMandatoryQueue; } + transition({S,E,M}, SpecLoad) { + h_spec_load_hit; + k_popMandatoryQueue; + } + transition({S,E,M}, Ifetch) { h_ifetch_hit; uu_profileInstHit; @@ -1140,7 +1346,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } transition(S, Inv, I) { - forward_eviction_to_cpu; + forward_external_eviction_to_cpu; fi_sendInvAck; l_popRequestQueue; } @@ -1164,13 +1370,13 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") transition(E, Inv, I) { // don't send data - forward_eviction_to_cpu; + forward_external_eviction_to_cpu; fi_sendInvAck; l_popRequestQueue; } transition(E, Fwd_GETX, I) { - forward_eviction_to_cpu; + forward_external_eviction_to_cpu; d_sendDataToRequestor; l_popRequestQueue; } @@ -1181,6 +1387,17 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") l_popRequestQueue; } + transition({E, M}, Fwd_GETSPEC) { + ds_sendDataToSpecRequestor; + l_popRequestQueue; + } + + transition({E, M}, Fwd_EXPOSE, S) { + dex_sendDataToExposeRequestor; + d2ex_sendExposeDataToL2; + l_popRequestQueue; + } + // Transitions from Modified transition(M, {L1_Replacement, PF_L1_Replacement}, M_I) { @@ -1197,7 +1414,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } transition(M, Inv, I) { - forward_eviction_to_cpu; + forward_external_eviction_to_cpu; f_sendDataToL2; l_popRequestQueue; } @@ -1208,7 +1425,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") } transition(M, Fwd_GETX, I) { - forward_eviction_to_cpu; + forward_external_eviction_to_cpu; d_sendDataToRequestor; l_popRequestQueue; } @@ -1230,6 +1447,17 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") l_popRequestQueue; } + transition(M_I, Fwd_EXPOSE, SINK_WB_ACK) { + dtex_sendDataToExposeRequestor_fromTBE; + d2tex_sendExposeDataToL2_fromTBE; + l_popRequestQueue; + } + + transition(M_I, Fwd_GETSPEC) { + dts_sendDataToSpecRequestor_fromTBE; + l_popRequestQueue; + } + // Transitions from IS 
transition({IS, IS_I}, Inv, IS_I) { fi_sendInvAck; @@ -1341,6 +1569,14 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") kd_wakeUpDependents; } + // [InvisiSpec] Data and Data_Exclusive are not possible at IX + transition(IX, {Data_all_Acks, DataS_fromL1}, I) { + hsx_spec_load_hit; + s_deallocateTBE; + o_popIncomingResponseQueue; + kd_wakeUpDependents; + } + // Transitions from IM transition(IM, Inv, IM) { fi_sendInvAck; @@ -1384,7 +1620,7 @@ machine(MachineType:L1Cache, "MESI Directory L1 Cache CMP") // transitions from SM transition(SM, Inv, IM) { - forward_eviction_to_cpu; + forward_external_eviction_to_cpu; fi_sendInvAck; dg_invalidate_sc; l_popRequestQueue; diff --git a/src/mem/protocol/MESI_Two_Level-L2cache.sm b/src/mem/protocol/MESI_Two_Level-L2cache.sm index 5a8cfae6d..ea884133e 100644 --- a/src/mem/protocol/MESI_Two_Level-L2cache.sm +++ b/src/mem/protocol/MESI_Two_Level-L2cache.sm @@ -72,6 +72,8 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") ISS, AccessPermission:Busy, desc="L2 idle, got single L1_GETS, issued memory fetch, have not seen response yet"; IS, AccessPermission:Busy, desc="L2 idle, got L1_GET_INSTR or multiple L1_GETS, issued memory fetch, have not seen response yet"; IM, AccessPermission:Busy, desc="L2 idle, got L1_GETX, issued memory fetch, have not seen response(s) yet"; + II, AccessPermission:Busy, desc="L2 idle, got single L1_GETSPEC, issued memory fetch, have not seen response yet"; + IEE, AccessPermission:Busy, desc="L2 idle, got single L1_EXPOSE, issued memory fetch, have not seen response yet"; // Blocking states SS_MB, AccessPermission:Busy, desc="Blocked for L1_GETX from SS"; @@ -96,6 +98,9 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") L1_PUTX, desc="L1 replacing data"; L1_PUTX_old, desc="L1 replacing data, but no longer sharer"; + L1_GETSPEC, desc="L1 GETSPEC request for a block mapped to us"; + L1_EXPOSE, desc="L1 EXPOSE request for a block mapped to us"; + // events initiated by this 
L2 L2_Replacement, desc="L2 Replacement", format="!r"; L2_Replacement_clean, desc="L2 Replacement, but data is clean", format="!r"; @@ -135,6 +140,8 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") bool Dirty, default="false", desc="Data is Dirty"; NetDest L1_GetS_IDs, desc="Set of the internal processors that want the block in shared state"; + NetDest L1_GetSPEC_IDs, desc="Set of the internal processors that want the block speculatively"; + NetDest L1_Expose_IDs, desc="Set of the internal processors that want the block to be exposed"; MachineID L1_GetX_ID, desc="ID of the L1 cache to forward the block to once we get a response"; int pendingAcks, desc="number of pending acks for invalidates during writeback"; } @@ -267,6 +274,10 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } else { return Event:L1_PUTX_old; } + } else if (type == CoherenceRequestType:GETSPEC) { + return Event:L1_GETSPEC; + } else if (type == CoherenceRequestType:EXPOSE) { + return Event:L1_EXPOSE; } else { DPRINTF(RubySlicc, "address: %#x, Request Type: %s\n", addr, type); error("Invalid L1 forwarded request type"); @@ -399,10 +410,12 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") peek(L1RequestL2Network_in, RequestMsg) { enqueue(DirRequestL2Network_out, RequestMsg, l2_request_latency) { out_msg.addr := address; - out_msg.Type := CoherenceRequestType:GETS; + out_msg.Type := in_msg.Type; out_msg.Requestor := machineID; out_msg.Destination.add(mapAddressToMachine(address, MachineType:Directory)); - out_msg.MessageSize := MessageSizeType:Control; + out_msg.MessageSize := in_msg.MessageSize; + out_msg.idx := in_msg.idx; + out_msg.origin := in_msg.Requestor; } } } @@ -420,6 +433,32 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } } + action(bs_forwardSpecRequestToExclusive, "bs", desc="Forward request to the exclusive L1") { + peek(L1RequestL2Network_in, RequestMsg) { + enqueue(L1RequestL2Network_out, RequestMsg, to_l1_latency) { + 
assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(cache_entry.Exclusive); + out_msg.MessageSize := MessageSizeType:SPECLD_Request_Control; + } + } + } + + action(bex_forwardExposeRequestToExclusive, "bex", desc="Forward request to the exclusive L1") { + peek(L1RequestL2Network_in, RequestMsg) { + enqueue(L1RequestL2Network_out, RequestMsg, to_l1_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := in_msg.Type; + out_msg.Requestor := in_msg.Requestor; + out_msg.Destination.add(cache_entry.Exclusive); + out_msg.MessageSize := MessageSizeType:EXPOSE_Request_Control; + } + } + } + action(c_exclusiveReplacement, "c", desc="Send data to memory") { enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) { assert(is_valid(cache_entry)); @@ -494,6 +533,25 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } } + action(ddex_sendExclusiveDataToExposeRequestor, "ddex", desc="Send data from cache to reqeustor") { + peek(L1RequestL2Network_in, RequestMsg) { + enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + + out_msg.AckCount := 0 - cache_entry.Sharers.count(); + if (cache_entry.Sharers.isElement(in_msg.Requestor)) { + out_msg.AckCount := out_msg.AckCount + 1; + } + } + } + } + action(ds_sendSharedDataToRequestor, "ds", desc="Send data from cache to reqeustor") { peek(L1RequestL2Network_in, RequestMsg) { enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) { @@ -509,9 +567,39 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } } + action(dss_sendSharedDataToSpecRequestor, "dss", desc="Send 
data from cache to reqeustor") { + peek(L1RequestL2Network_in, RequestMsg) { + enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:SPECLD_Data; + out_msg.AckCount := 0; + } + } + } + + action(dsex_sendSharedDataToExposeRequestor, "dsex", desc="Send data from cache to reqeustor") { + peek(L1RequestL2Network_in, RequestMsg) { + enqueue(responseL2Network_out, ResponseMsg, l2_response_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.Requestor); + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + out_msg.AckCount := 0; + } + } + } + action(e_sendDataToGetSRequestors, "e", desc="Send data from cache to all GetS IDs") { assert(is_valid(tbe)); - assert(tbe.L1_GetS_IDs.count() > 0); + assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() > 0); enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) { assert(is_valid(cache_entry)); out_msg.addr := address; @@ -523,9 +611,40 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } } + action(es_sendDataToGetSpecRequestors, "es", desc="Send data from cache to all GetSpec IDs") { + assert(is_valid(tbe)); + assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() > 0); + peek(responseL2Network_in, ResponseMsg) { + enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination := tbe.L1_GetSPEC_IDs; // internal nodes + out_msg.DataBlk := in_msg.DataBlk; + out_msg.MessageSize := 
MessageSizeType:SPECLD_Data; + } + } + } + + action(eex_sendDataToExposeRequestors, "eex", desc="Send data from cache to all GetSpec IDs") { + assert(is_valid(tbe)); + assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() > 0); + peek(responseL2Network_in, ResponseMsg) { + enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA; + out_msg.Sender := machineID; + out_msg.Destination := tbe.L1_Expose_IDs; // internal nodes + out_msg.DataBlk := in_msg.DataBlk; + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + } + } + } + action(ex_sendExclusiveDataToGetSRequestors, "ex", desc="Send data from cache to all GetS IDs") { assert(is_valid(tbe)); assert(tbe.L1_GetS_IDs.count() == 1); + assert(tbe.L1_GetSPEC_IDs.count() + tbe.L1_Expose_IDs.count() == 0); enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) { assert(is_valid(cache_entry)); out_msg.addr := address; @@ -537,6 +656,21 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } } + action(exex_sendExclusiveDataToExposeRequestors, "exex", desc="Send data from cache to all GetS IDs") { + assert(is_valid(tbe)); + assert(tbe.L1_Expose_IDs.count() == 1); + assert(tbe.L1_GetS_IDs.count() + tbe.L1_GetSPEC_IDs.count() == 0); + enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) { + assert(is_valid(cache_entry)); + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:DATA_EXCLUSIVE; + out_msg.Sender := machineID; + out_msg.Destination := tbe.L1_Expose_IDs; // internal nodes + out_msg.DataBlk := cache_entry.DataBlk; + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + } + } + action(ee_sendDataToGetXRequestor, "ee", desc="Send data from cache to GetX ID") { enqueue(responseL2Network_out, ResponseMsg, to_l1_latency) { assert(is_valid(tbe)); @@ -598,11 +732,23 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") TBEs.allocate(address); set_tbe(TBEs[address]); 
tbe.L1_GetS_IDs.clear(); + tbe.L1_GetSPEC_IDs.clear(); + tbe.L1_Expose_IDs.clear(); tbe.DataBlk := cache_entry.DataBlk; tbe.Dirty := cache_entry.Dirty; tbe.pendingAcks := cache_entry.Sharers.count(); } + action(iw_allocateTBEWithoutCacheEntry, "iw", desc="Allocate TBE for request without a cache entry") { + check_allocate(TBEs); + assert(!is_valid(cache_entry)); + TBEs.allocate(address); + set_tbe(TBEs[address]); + tbe.L1_GetS_IDs.clear(); + tbe.L1_GetSPEC_IDs.clear(); + tbe.L1_Expose_IDs.clear(); + } + action(s_deallocateTBE, "s", desc="Deallocate external TBE") { TBEs.deallocate(address); unset_tbe(); @@ -668,6 +814,20 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } } + action(sss_recordGetSPECL1ID, "\sss", desc="Record L1 GetSpec for load response") { + peek(L1RequestL2Network_in, RequestMsg) { + assert(is_valid(tbe)); + tbe.L1_GetSPEC_IDs.add(in_msg.Requestor); + } + } + + action(ssss_recordExposeL1ID, "\ssss", desc="Record L1 Expose for load response") { + peek(L1RequestL2Network_in, RequestMsg) { + assert(is_valid(tbe)); + tbe.L1_Expose_IDs.add(in_msg.Requestor); + } + } + action(xx_recordGetXL1ID, "\x", desc="Record L1 GetX for store response") { peek(L1RequestL2Network_in, RequestMsg) { assert(is_valid(tbe)); @@ -793,21 +953,22 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") // BASE STATE - I // Transitions from I (Idle) - transition({NP, IS, ISS, IM, SS, M, M_I, I_I, S_I, MT_IB, MT_SB}, L1_PUTX) { + transition({NP, IS, ISS, IEE, IM, II, SS, M, M_I, I_I, S_I, MT_IB, MT_SB}, L1_PUTX) { t_sendWBAck; jj_popL1RequestQueue; } - transition({NP, SS, M, MT, M_I, I_I, S_I, IS, ISS, IM, MT_IB, MT_SB}, L1_PUTX_old) { + transition({NP, SS, M, MT, M_I, I_I, S_I, IS, ISS, IEE, IM, II, MT_IB, MT_SB}, L1_PUTX_old) { t_sendWBAck; jj_popL1RequestQueue; } - transition({IM, IS, ISS, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L2_Replacement, L2_Replacement_clean}) { + transition({IM, IS, ISS, IEE, II, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, 
{L2_Replacement, L2_Replacement_clean}) { zz_stallAndWaitL1RequestQueue; } - transition({IM, IS, ISS, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, MEM_Inv) { + // [InvisiSpec] TODO: How to handle Mem_Inv at II? Stall or ignore? + transition({IM, IS, ISS, IEE, II, SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, MEM_Inv) { zn_recycleResponseNetwork; } @@ -816,7 +977,7 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } - transition({SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L1_GETS, L1_GET_INSTR, L1_GETX, L1_UPGRADE}) { + transition({SS_MB, MT_MB, MT_IIB, MT_IB, MT_SB}, {L1_GETS, L1_EXPOSE, L1_GET_INSTR, L1_GETX, L1_UPGRADE, L1_GETSPEC}) { zz_stallAndWaitL1RequestQueue; } @@ -832,6 +993,17 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") jj_popL1RequestQueue; } + transition(NP, L1_EXPOSE, IEE) { + qq_allocateL2CacheBlock; + ll_clearSharers; + nn_addSharer; + i_allocateTBE; + ssss_recordExposeL1ID; + a_issueFetchToMemory; + uu_profileMiss; + jj_popL1RequestQueue; + } + transition(NP, L1_GET_INSTR, IS) { qq_allocateL2CacheBlock; ll_clearSharers; @@ -854,12 +1026,28 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") jj_popL1RequestQueue; } + transition(NP, L1_GETSPEC, II) { + iw_allocateTBEWithoutCacheEntry; + sss_recordGetSPECL1ID; + a_issueFetchToMemory; + jj_popL1RequestQueue; + } + // transitions from IS/IM transition(ISS, Mem_Data, MT_MB) { m_writeDataToCache; ex_sendExclusiveDataToGetSRequestors; + es_sendDataToGetSpecRequestors; + s_deallocateTBE; + o_popIncomingResponseQueue; + } + + transition(IEE, Mem_Data, MT_MB) { + m_writeDataToCache; + exex_sendExclusiveDataToExposeRequestors; + es_sendDataToGetSpecRequestors; s_deallocateTBE; o_popIncomingResponseQueue; } @@ -867,6 +1055,8 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") transition(IS, Mem_Data, SS) { m_writeDataToCache; e_sendDataToGetSRequestors; + es_sendDataToGetSpecRequestors; + eex_sendDataToExposeRequestors; s_deallocateTBE; o_popIncomingResponseQueue; 
kd_wakeUpDependents; @@ -879,18 +1069,48 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") o_popIncomingResponseQueue; } - transition({IS, ISS}, {L1_GETS, L1_GET_INSTR}, IS) { + transition(II, Mem_Data, NP) { + es_sendDataToGetSpecRequestors; + s_deallocateTBE; + o_popIncomingResponseQueue; + kd_wakeUpDependents; + } + + transition({IS, ISS, IEE}, {L1_GETS, L1_GET_INSTR}, IS) { nn_addSharer; ss_recordGetSL1ID; uu_profileMiss; jj_popL1RequestQueue; } - transition({IS, ISS}, L1_GETX) { + transition({IS, ISS, IEE}, L1_EXPOSE, IS) { + nn_addSharer; + ssss_recordExposeL1ID; + uu_profileMiss; + jj_popL1RequestQueue; + } + + transition({IS, ISS, IEE}, L1_GETSPEC, IS) { + sss_recordGetSPECL1ID; + jj_popL1RequestQueue; + } + + transition(II, L1_GETSPEC) { + sss_recordGetSPECL1ID; + jj_popL1RequestQueue; + } + + // [InvisiSpec] L1_GET_INSTR should not be received at II + transition(II, {L1_GETS, L1_EXPOSE}) { zz_stallAndWaitL1RequestQueue; } - transition(IM, {L1_GETX, L1_GETS, L1_GET_INSTR}) { + // [InvisiSpec] TODO: Maybe we can optimize this? 
+ transition({IS, ISS, IEE, II}, L1_GETX) { + zz_stallAndWaitL1RequestQueue; + } + + transition(IM, {L1_GETX, L1_GETS, L1_EXPOSE, L1_GET_INSTR, L1_GETSPEC}) { zz_stallAndWaitL1RequestQueue; } @@ -903,6 +1123,19 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") jj_popL1RequestQueue; } + transition(SS, L1_EXPOSE) { + dsex_sendSharedDataToExposeRequestor; + nn_addSharer; + set_setMRU; + uu_profileHit; + jj_popL1RequestQueue; + } + + transition({SS, M}, L1_GETSPEC) { + dss_sendSharedDataToSpecRequestor; + jj_popL1RequestQueue; + } + transition(SS, L1_GETX, SS_MB) { d_sendDataToRequestor; @@ -956,6 +1189,14 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") jj_popL1RequestQueue; } + // [InvisiSpec] TODO + transition(M, L1_EXPOSE, MT_MB) { + ddex_sendExclusiveDataToExposeRequestor; + set_setMRU; + uu_profileHit; + jj_popL1RequestQueue; + } + transition(M, {L2_Replacement, MEM_Inv}, M_I) { i_allocateTBE; c_exclusiveReplacement; @@ -986,6 +1227,20 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") jj_popL1RequestQueue; } + // [InvisiSpec] TODO: Ack packets are currently not recorded as EXPOSE traffic. + transition(MT, L1_EXPOSE, MT_IIB) { + bex_forwardExposeRequestToExclusive; + uu_profileMiss; + set_setMRU; + jj_popL1RequestQueue; + } + + // [InvisiSpec] Do we need to block? 
+ transition(MT, L1_GETSPEC) { + bs_forwardSpecRequestToExclusive; + jj_popL1RequestQueue; + } + transition(MT, {L2_Replacement, MEM_Inv}, MT_I) { i_allocateTBE; f_sendInvToSharers; @@ -1039,7 +1294,7 @@ machine(MachineType:L2Cache, "MESI Directory L2 Cache CMP") } // writeback states - transition({I_I, S_I, MT_I, MCT_I, M_I}, {L1_GETX, L1_UPGRADE, L1_GETS, L1_GET_INSTR}) { + transition({I_I, S_I, MT_I, MCT_I, M_I}, {L1_GETX, L1_UPGRADE, L1_GETS, L1_EXPOSE, L1_GET_INSTR, L1_GETSPEC}) { zz_stallAndWaitL1RequestQueue; } diff --git a/src/mem/protocol/MESI_Two_Level-dir.sm b/src/mem/protocol/MESI_Two_Level-dir.sm index 991de5a2c..9934f57a8 100644 --- a/src/mem/protocol/MESI_Two_Level-dir.sm +++ b/src/mem/protocol/MESI_Two_Level-dir.sm @@ -49,6 +49,8 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") M, AccessPermission:Maybe_Stale, desc="memory copy may be stale, i.e. other modified copies may exist"; IM, AccessPermission:Busy, desc="Intermediate State I>M"; + IE, AccessPermission:Busy, desc="Intermediate State I>M"; + II, AccessPermission:Busy, desc="Intermediate State I>I for SpecFetch"; MI, AccessPermission:Busy, desc="Intermediate State M>I"; M_DRD, AccessPermission:Busy, desc="Intermediate State when there is a dma read"; M_DRDI, AccessPermission:Busy, desc="Intermediate State when there is a dma read"; @@ -59,6 +61,8 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") // Events enumeration(Event, desc="Directory events") { Fetch, desc="A memory fetch arrives"; + Expose, desc="A memory expose arrives"; + SpecFetch, desc="A memory fetch for speculative execution arrives"; Data, desc="writeback data arrives"; Memory_Data, desc="Fetched data from memory arrives"; Memory_Ack, desc="Writeback Ack from memory arrives"; @@ -198,6 +202,10 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") assert(in_msg.Destination.isElement(machineID)); if (isGETRequest(in_msg.Type)) { trigger(Event:Fetch, in_msg.addr, 
TBEs[in_msg.addr]); + } else if (in_msg.Type == CoherenceRequestType:EXPOSE) { + trigger(Event:Expose, in_msg.addr, TBEs[in_msg.addr]); + } else if (in_msg.Type == CoherenceRequestType:GETSPEC) { + trigger(Event:SpecFetch, in_msg.addr, TBEs[in_msg.addr]); } else if (in_msg.Type == CoherenceRequestType:DMA_READ) { trigger(Event:DMA_READ, makeLineAddress(in_msg.addr), TBEs[makeLineAddress(in_msg.addr)]); @@ -275,6 +283,40 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") } } + action(dex_sendExposeData, "dex", desc="Send data to requestor") { + peek(memQueue_in, MemoryMsg) { + enqueue(responseNetwork_out, ResponseMsg, to_mem_ctrl_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:MEMORY_DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.OriginalRequestorMachId); + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := false; + out_msg.MessageSize := MessageSizeType:EXPOSE_Data; + + Entry e := getDirectoryEntry(in_msg.addr); + e.Owner := in_msg.OriginalRequestorMachId; + } + } + } + + action(ds_sendSpecData, "ds", desc="Send data to requestor") { + peek(memQueue_in, MemoryMsg) { + enqueue(responseNetwork_out, ResponseMsg, to_mem_ctrl_latency) { + out_msg.addr := address; + out_msg.Type := CoherenceResponseType:MEMORY_DATA; + out_msg.Sender := machineID; + out_msg.Destination.add(in_msg.OriginalRequestorMachId); + out_msg.DataBlk := in_msg.DataBlk; + out_msg.Dirty := false; + out_msg.MessageSize := MessageSizeType:SPECLD_Data; + + Entry e := getDirectoryEntry(in_msg.addr); + e.Owner := in_msg.OriginalRequestorMachId; + } + } + } + // Actions action(aa_sendAck, "aa", desc="Send ack to L2") { peek(memQueue_in, MemoryMsg) { @@ -306,7 +348,19 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") action(qf_queueMemoryFetchRequest, "qf", desc="Queue off-chip fetch request") { peek(requestNetwork_in, RequestMsg) { - queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency); + 
queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.origin, in_msg.idx, 0); + } + } + + action(qfs_queueMemorySpecFetchRequest, "qfs", desc="Queue off-chip fetch request") { + peek(requestNetwork_in, RequestMsg) { + queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.origin, in_msg.idx, 1); + } + } + + action(qfe_queueMemoryExposeRequest, "qfe", desc="Queue off-chip fetch request") { + peek(requestNetwork_in, RequestMsg) { + queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.origin, in_msg.idx, 2); } } @@ -320,7 +374,8 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") //added by SS for dma action(qf_queueMemoryFetchRequestDMA, "qfd", desc="Queue off-chip fetch request") { peek(requestNetwork_in, RequestMsg) { - queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency); + assert(false); + queueMemoryRead(in_msg.Requestor, address, to_mem_ctrl_latency, in_msg.Requestor, -1, -1); } } @@ -425,7 +480,18 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") j_popIncomingRequestQueue; } - transition(M, Fetch) { + transition(I, Expose, IE) { + qfe_queueMemoryExposeRequest; + j_popIncomingRequestQueue; + } + + transition(I, SpecFetch, II) { + qfs_queueMemorySpecFetchRequest; + j_popIncomingRequestQueue; + } + + // [InvisiSpec] Is it secure? 
+ transition(M, {Fetch, Expose, SpecFetch}) { inv_sendCacheInvalidate; z_stallAndWaitRequest; } @@ -435,6 +501,19 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") l_popMemQueue; kd_wakeUpDependents; } + + transition(IE, Memory_Data, M) { + dex_sendExposeData; + l_popMemQueue; + kd_wakeUpDependents; + } + + transition(II, Memory_Data, I) { + ds_sendSpecData; + l_popMemQueue; + kd_wakeUpDependents; + } + //added by SS transition(M, CleanReplacement, I) { a_sendAck; @@ -481,11 +560,11 @@ machine(MachineType:Directory, "MESI Two Level directory protocol") kd_wakeUpDependents; } - transition({ID, ID_W, M_DRDI, M_DWRI, IM, MI}, {Fetch, Data} ) { + transition({ID, ID_W, M_DRDI, M_DWRI, IM, IE, MI, II}, {Fetch, Expose, SpecFetch, Data} ) { z_stallAndWaitRequest; } - transition({ID, ID_W, M_DRD, M_DRDI, M_DWR, M_DWRI, IM, MI}, {DMA_WRITE, DMA_READ} ) { + transition({ID, ID_W, M_DRD, M_DRDI, M_DWR, M_DWRI, IM, IE, MI, II}, {DMA_WRITE, DMA_READ} ) { zz_recycleDMAQueue; } diff --git a/src/mem/protocol/MESI_Two_Level-msg.sm b/src/mem/protocol/MESI_Two_Level-msg.sm index 738019e7b..d4269193d 100644 --- a/src/mem/protocol/MESI_Two_Level-msg.sm +++ b/src/mem/protocol/MESI_Two_Level-msg.sm @@ -36,6 +36,8 @@ enumeration(CoherenceRequestType, desc="...") { GET_INSTR, desc="Get Instruction"; INV, desc="INValidate"; PUTX, desc="Replacement message"; + GETSPEC, desc="Get Speculatively"; + EXPOSE, desc="Expose"; WB_ACK, desc="Writeback ack"; @@ -68,7 +70,9 @@ structure(RequestMsg, desc="...", interface="Message") { int Len; bool Dirty, default="false", desc="Dirty bit"; PrefetchBit Prefetch, desc="Is this a prefetch request"; - + MachineID origin; + int idx, default="-1", desc="LQ index"; + bool functionalRead(Packet *pkt) { // Only PUTX messages contains the data block if (Type == CoherenceRequestType:PUTX) { diff --git a/src/mem/protocol/RubySlicc_Defines.sm b/src/mem/protocol/RubySlicc_Defines.sm index eb235f8f3..7df82847e 100644 --- 
a/src/mem/protocol/RubySlicc_Defines.sm +++ b/src/mem/protocol/RubySlicc_Defines.sm @@ -35,7 +35,7 @@ Cycles recycle_latency; // Functions implemented in the AbstractController class for // making timing access to the memory maintained by the // memory controllers. -void queueMemoryRead(MachineID id, Addr addr, Cycles latency); +void queueMemoryRead(MachineID id, Addr addr, Cycles latency, MachineID origin, int idx, int type); void queueMemoryWrite(MachineID id, Addr addr, Cycles latency, DataBlock block); void queueMemoryWritePartial(MachineID id, Addr addr, Cycles latency, diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm index 1beb3f2e0..a330e731e 100644 --- a/src/mem/protocol/RubySlicc_Exports.sm +++ b/src/mem/protocol/RubySlicc_Exports.sm @@ -175,6 +175,11 @@ enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") { Release, desc="Release operation"; Acquire, desc="Acquire opertion"; AcquireRelease, desc="Acquire and Release opertion"; + // [InvisiSpec] New request types + SPEC_LD, desc="Speculative load"; + EXPOSE, desc="Expose"; + VALIDATE, desc="Validate"; + SPEC_FLUSH, desc="Flush SpecBuffer"; } enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL") { @@ -255,6 +260,12 @@ enumeration(MessageSizeType, desc="...") { Unblock_Control, desc="Unblock control"; Persistent_Control, desc="Persistent request activation messages"; Completion_Control, desc="Completion messages"; + SPECLD_Control, desc="SPECLD control message"; + SPECLD_Request_Control, desc="SPECLD forward message"; + SPECLD_Data, desc="SPECLD data response"; + EXPOSE_Control, desc="EXPOSE control message"; + EXPOSE_Request_Control, desc="EXPOSE forward request"; + EXPOSE_Data, desc="EXPOSE data response"; } // AccessType @@ -344,6 +355,7 @@ enumeration(RequestStatus, desc="...", default="RequestStatus_NULL") { Issued, desc="The sequencer successfully issued the request"; BufferFull, desc="Can not issue because the 
sequencer is full"; Aliased, desc="This request aliased with a currently outstanding request"; + Merged, desc="This request merged with a currently outstanding request"; NULL, desc=""; } diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm index 27a045d29..5c73b4320 100644 --- a/src/mem/protocol/RubySlicc_Types.sm +++ b/src/mem/protocol/RubySlicc_Types.sm @@ -113,7 +113,7 @@ structure (Sequencer, external = "yes") { Cycles, Cycles, Cycles); void checkCoherence(Addr); - void evictionCallback(Addr); + void evictionCallback(Addr, bool); void recordRequestType(SequencerRequestType); bool checkResourceAvailable(CacheResourceType, Addr); void invalidateSC(Addr); @@ -172,6 +172,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { HSAScope scope, desc="HSA scope"; HSASegment segment, desc="HSA segment"; PacketPtr pkt, desc="Packet associated with this request"; + int idx, desc="LQ index"; } structure(AbstractEntry, primitive="yes", external = "yes") { diff --git a/src/mem/request.hh b/src/mem/request.hh index 5cb08ca39..f912a0a6c 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -149,6 +149,9 @@ class Request MEM_SWAP = 0x00400000, MEM_SWAP_COND = 0x00800000, + /** [InvisiSpec] it is a spec request */ + SPEC = 0x00004000, + /** The request is a prefetch. */ PREFETCH = 0x01000000, /** The request should be prefetched into the exclusive state. 
*/ @@ -784,6 +787,7 @@ class Request bool isStrictlyOrdered() const { return _flags.isSet(STRICT_ORDER); } bool isInstFetch() const { return _flags.isSet(INST_FETCH); } bool isPrefetch() const { return _flags.isSet(PREFETCH); } + bool isSpec() const { return _flags.isSet(SPEC); } bool isLLSC() const { return _flags.isSet(LLSC); } bool isPriv() const { return _flags.isSet(PRIVILEGED); } bool isLockedRMW() const { return _flags.isSet(LOCKED_RMW); } diff --git a/src/mem/ruby/SConscript b/src/mem/ruby/SConscript index 64e798fd5..b79306c04 100644 --- a/src/mem/ruby/SConscript +++ b/src/mem/ruby/SConscript @@ -56,6 +56,9 @@ DebugFlag('RubySystem') DebugFlag('RubyTester') DebugFlag('RubyStats') DebugFlag('RubyResourceStalls') +DebugFlag('SpecBuffer') +DebugFlag('SpecBufferValidate') +DebugFlag('MemSpecBuffer') CompoundFlag('Ruby', [ 'RubyQueue', 'RubyNetwork', 'RubyTester', 'RubyGenerated', 'RubySlicc', 'RubySystem', 'RubyCache', diff --git a/src/mem/ruby/network/Network.cc b/src/mem/ruby/network/Network.cc index 57834f2e2..7d4d71eb3 100644 --- a/src/mem/ruby/network/Network.cc +++ b/src/mem/ruby/network/Network.cc @@ -144,12 +144,18 @@ Network::MessageSizeType_to_int(MessageSizeType size_type) case MessageSizeType_Unblock_Control: case MessageSizeType_Persistent_Control: case MessageSizeType_Completion_Control: + case MessageSizeType_SPECLD_Control: + case MessageSizeType_SPECLD_Request_Control: + case MessageSizeType_EXPOSE_Control: + case MessageSizeType_EXPOSE_Request_Control: return m_control_msg_size; case MessageSizeType_Data: case MessageSizeType_Response_Data: case MessageSizeType_ResponseLocal_Data: case MessageSizeType_ResponseL2hit_Data: case MessageSizeType_Writeback_Data: + case MessageSizeType_SPECLD_Data: + case MessageSizeType_EXPOSE_Data: return m_data_msg_size; default: panic("Invalid range for type MessageSizeType"); diff --git a/src/mem/ruby/slicc_interface/AbstractController.cc b/src/mem/ruby/slicc_interface/AbstractController.cc index 
b920ff7b0..742c07705 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.cc +++ b/src/mem/ruby/slicc_interface/AbstractController.cc @@ -41,6 +41,7 @@ #include "mem/ruby/slicc_interface/AbstractController.hh" #include "debug/RubyQueue.hh" +#include "debug/MemSpecBuffer.hh" #include "mem/protocol/MemoryMsg.hh" #include "mem/ruby/network/Network.hh" #include "mem/ruby/system/GPUCoalescer.hh" @@ -96,6 +97,14 @@ AbstractController::regStats() .name(name() + ".fully_busy_cycles") .desc("cycles for which number of transistions == max transitions") .flags(Stats::nozero); + m_expose_hits + .name(name() + ".expose_hits") + .desc("number of expose hits at LLC spec buffer") + .flags(Stats::nozero); + m_expose_misses + .name(name() + ".expose_misses") + .desc("number of expose misses at LLC spec buffer") + .flags(Stats::nozero); } void @@ -238,8 +247,67 @@ AbstractController::getMasterPort(const std::string &if_name, void AbstractController::queueMemoryRead(const MachineID &id, Addr addr, - Cycles latency) + Cycles latency, MachineID origin, int idx, int type) { + int coreId = origin.num; + int sbeId = idx; + // type 0: non-spec 1: spec 2: expose + // DPRINTFR(MemSpecBuffer, "%10s MemRead (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(addr)); + // if idx == -1, it is a write request which cannot be spec or expose. 
+ assert(!(type != 0 && sbeId == -1)); + assert(sbeId >= -1 && sbeId <= 65); + assert(coreId < 8); + assert(type >=0 && type <= 2); + if (type == 0) { + for (int c = 0; c < 8; ++c) { + for (int i = 0; i < 66; ++i) { + if (m_specBuf[c][i].address == addr) { + DPRINTFR(MemSpecBuffer, "%10s Cleared by Read (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), c, type, i, printAddress(addr)); + m_specBuf[c][i].address = 0; + m_specBuf[c][i].data.clear(); + } + } + } + } else if (type == 1) { + + } else if (type == 2) { + if (m_specBuf[coreId][sbeId].address == addr) { + DPRINTFR(MemSpecBuffer, "%10s Expose Hit (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(addr)); + ++m_expose_hits; + assert(getMemoryQueue()); + std::shared_ptr<MemoryMsg> msg = std::make_shared<MemoryMsg>(clockEdge()); + (*msg).m_addr = addr; + (*msg).m_Sender = m_machineID; + (*msg).m_OriginalRequestorMachId = id; + (*msg).m_Type = MemoryRequestType_MEMORY_READ; + (*msg).m_MessageSize = MessageSizeType_Response_Data; + (*msg).m_DataBlk = m_specBuf[coreId][sbeId].data; + getMemoryQueue()->enqueue(msg, clockEdge(), cyclesToTicks(Cycles(1))); + for (int c = 0; c < 8; ++c) { + for (int i = 0; i < 66; ++i) { + if (m_specBuf[c][i].address == addr) { + DPRINTFR(MemSpecBuffer, "%10s Cleared by Expose Hit (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), c, type, i, printAddress(addr)); + m_specBuf[c][i].address = 0; + m_specBuf[c][i].data.clear(); + } + } + } + return; + } else { + DPRINTFR(MemSpecBuffer, "%10s Expose Miss (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(addr)); + ++m_expose_misses; + for (int c = 0; c < 8; ++c) { + for (int i = 0; i < 66; ++i) { + if (m_specBuf[c][i].address == addr) { + DPRINTFR(MemSpecBuffer, "%10s Cleared by Expose Miss (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), c, type, i, printAddress(addr)); + m_specBuf[c][i].address = 0; + m_specBuf[c][i].data.clear(); + } + } + } + } + } + RequestPtr 
req = new Request(addr, RubySystem::getBlockSizeBytes(), 0, m_masterId); @@ -248,6 +316,9 @@ AbstractController::queueMemoryRead(const MachineID &id, Addr addr, pkt->dataDynamic(newData); SenderState *s = new SenderState(id); + s->type = type; + s->coreId = coreId; + s->sbeId = sbeId; pkt->pushSenderState(s); // Use functional rather than timing accesses during warmup @@ -339,6 +410,9 @@ AbstractController::recvTimingResp(PacketPtr pkt) SenderState *s = dynamic_cast<SenderState *>(pkt->senderState); (*msg).m_OriginalRequestorMachId = s->id; + int type = s->type; + int coreId = s->coreId; + int sbeId = s->sbeId; delete s; if (pkt->isRead()) { @@ -348,6 +422,12 @@ AbstractController::recvTimingResp(PacketPtr pkt) // Copy data from the packet (*msg).m_DataBlk.setData(pkt->getPtr<uint8_t>(), 0, RubySystem::getBlockSizeBytes()); + if (type == 1) { + DPRINTFR(MemSpecBuffer, "%10s Updated by ReadSpec (core=%d, type=%d, idx=%d, addr=%#x)\n", curTick(), coreId, type, sbeId, printAddress(pkt->getAddr())); + m_specBuf[coreId][sbeId].address = pkt->getAddr(); + m_specBuf[coreId][sbeId].data.setData(pkt->getPtr<uint8_t>(), 0, + RubySystem::getBlockSizeBytes()); + } } else if (pkt->isWrite()) { (*msg).m_Type = MemoryRequestType_MEMORY_WB; (*msg).m_MessageSize = MessageSizeType_Writeback_Control; diff --git a/src/mem/ruby/slicc_interface/AbstractController.hh b/src/mem/ruby/slicc_interface/AbstractController.hh index 35cd3d2a5..b65a511d0 100644 --- a/src/mem/ruby/slicc_interface/AbstractController.hh +++ b/src/mem/ruby/slicc_interface/AbstractController.hh @@ -129,7 +129,7 @@ class AbstractController : public MemObject, public Consumer BaseMasterPort& getMasterPort(const std::string& if_name, PortID idx = InvalidPortID); - void queueMemoryRead(const MachineID &id, Addr addr, Cycles latency); + void queueMemoryRead(const MachineID &id, Addr addr, Cycles latency, MachineID origin, int idx, int type); void queueMemoryWrite(const MachineID &id, Addr addr, Cycles latency, const 
DataBlock &block); void queueMemoryWritePartial(const MachineID &id, Addr addr, Cycles latency, @@ -199,6 +199,8 @@ class AbstractController : public MemObject, public Consumer //! Counter for the number of cycles when the transitions carried out //! were equal to the maximum allowed Stats::Scalar m_fully_busy_cycles; + Stats::Scalar m_expose_hits; + Stats::Scalar m_expose_misses; //! Histogram for profiling delay for the messages this controller //! cares for @@ -250,6 +252,9 @@ class AbstractController : public MemObject, public Consumer { // Id of the machine from which the request originated. MachineID id; + int type; + int coreId; + int sbeId; SenderState(MachineID _id) : id(_id) {} @@ -258,6 +263,14 @@ class AbstractController : public MemObject, public Consumer private: /** The address range to which the controller responds on the CPU side. */ const AddrRangeList addrRanges; + + struct SBE + { + Addr address; + DataBlock data; + }; + + SBE m_specBuf[8][66]; }; #endif // __MEM_RUBY_SLICC_INTERFACE_ABSTRACTCONTROLLER_HH__ diff --git a/src/mem/ruby/slicc_interface/RubyRequest.hh b/src/mem/ruby/slicc_interface/RubyRequest.hh index 6c84f3823..2fc4c9f98 100644 --- a/src/mem/ruby/slicc_interface/RubyRequest.hh +++ b/src/mem/ruby/slicc_interface/RubyRequest.hh @@ -60,6 +60,7 @@ class RubyRequest : public Message int m_wfid; HSAScope m_scope; HSASegment m_segment; + int m_idx; RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len, @@ -82,6 +83,11 @@ class RubyRequest : public Message m_segment(_segment) { m_LineAddress = makeLineAddress(m_PhysicalAddress); + if (_pkt->reqIdx == -1) { + m_idx = _pkt->reqIdx; + } else { + m_idx = (_pkt->reqIdx) * 2 + (_pkt->isFirst()? 
0 : 1); + } } RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len, @@ -109,6 +115,11 @@ class RubyRequest : public Message m_segment(_segment) { m_LineAddress = makeLineAddress(m_PhysicalAddress); + if (_pkt->reqIdx == -1) { + m_idx = _pkt->reqIdx; + } else { + m_idx = (_pkt->reqIdx) * 2 + (_pkt->isFirst()? 0 : 1); + } } RubyRequest(Tick curTime, uint64_t _paddr, uint8_t* _data, int _len, @@ -137,6 +148,11 @@ class RubyRequest : public Message m_segment(_segment) { m_LineAddress = makeLineAddress(m_PhysicalAddress); + if (_pkt->reqIdx == -1) { + m_idx = _pkt->reqIdx; + } else { + m_idx = (_pkt->reqIdx) * 2 + (_pkt->isFirst()? 0 : 1); + } } diff --git a/src/mem/ruby/structures/CacheMemory.cc b/src/mem/ruby/structures/CacheMemory.cc index 8d99c90aa..dc5898bea 100644 --- a/src/mem/ruby/structures/CacheMemory.cc +++ b/src/mem/ruby/structures/CacheMemory.cc @@ -176,7 +176,9 @@ CacheMemory::tryCacheAccess(Addr address, RubyRequestType type, return true; } if ((entry->m_Permission == AccessPermission_Read_Only) && - (type == RubyRequestType_LD || type == RubyRequestType_IFETCH)) { + (type == RubyRequestType_LD || + type == RubyRequestType_IFETCH || + type == RubyRequestType_SPEC_LD)) { return true; } // The line must not be accessible diff --git a/src/mem/ruby/system/RubyPort.cc b/src/mem/ruby/system/RubyPort.cc index 5977ce9ef..bf030034b 100644 --- a/src/mem/ruby/system/RubyPort.cc +++ b/src/mem/ruby/system/RubyPort.cc @@ -246,6 +246,7 @@ RubyPort::PioSlavePort::recvAtomic(PacketPtr pkt) panic("Could not find address in Ruby PIO address ranges!\n"); } +// [InvisiSpec] Request on the way from CPU to Ruby bool RubyPort::MemSlavePort::recvTimingReq(PacketPtr pkt) { @@ -428,6 +429,7 @@ RubyPort::MemSlavePort::recvFunctional(PacketPtr pkt) } } +// [InvisiSpec] On the way from Ruby to CPU void RubyPort::ruby_hit_callback(PacketPtr pkt) { @@ -511,6 +513,7 @@ RubyPort::drain() } } +// [InvisiSpec] Still on the way from Ruby to CPU void 
RubyPort::MemSlavePort::hitCallback(PacketPtr pkt) { @@ -544,7 +547,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt) } // Flush, acquire, release requests don't access physical memory - if (pkt->isFlush() || pkt->cmd == MemCmd::MemFenceReq) { + if (pkt->isFlush() || pkt->isExpose() || pkt->cmd == MemCmd::MemFenceReq) { accessPhysMem = false; } @@ -571,6 +574,7 @@ RubyPort::MemSlavePort::hitCallback(PacketPtr pkt) // Ruby protocol. schedTimingResp(pkt, curTick()); } else { + // [InvisiSpec] Delete the packet if a response is not required delete pkt; } @@ -601,7 +605,7 @@ RubyPort::MemSlavePort::isPhysMemAddress(Addr addr) const } void -RubyPort::ruby_eviction_callback(Addr address) +RubyPort::ruby_eviction_callback(Addr address, bool external) { DPRINTF(RubyPort, "Sending invalidations.\n"); // Allocate the invalidate request and packet on the stack, as it is @@ -612,6 +616,9 @@ RubyPort::ruby_eviction_callback(Addr address) // Use a single packet to signal all snooping ports of the invalidation. // This assumes that snooping ports do NOT modify the packet/request Packet pkt(&request, MemCmd::InvalidateReq); + if (external) { + pkt.setExternalEviction(); + } for (CpuPortIter p = slave_ports.begin(); p != slave_ports.end(); ++p) { // check if the connected master port is snooping if ((*p)->isSnooping()) { diff --git a/src/mem/ruby/system/RubyPort.hh b/src/mem/ruby/system/RubyPort.hh index 146443282..9c0200829 100644 --- a/src/mem/ruby/system/RubyPort.hh +++ b/src/mem/ruby/system/RubyPort.hh @@ -172,7 +172,7 @@ class RubyPort : public MemObject void trySendRetries(); void ruby_hit_callback(PacketPtr pkt); void testDrainComplete(); - void ruby_eviction_callback(Addr address); + void ruby_eviction_callback(Addr address, bool external); /** * Called by the PIO port when receiving a timing response.
diff --git a/src/mem/ruby/system/Sequencer.cc b/src/mem/ruby/system/Sequencer.cc index 4037fb8f1..ed663f9c6 100644 --- a/src/mem/ruby/system/Sequencer.cc +++ b/src/mem/ruby/system/Sequencer.cc @@ -36,6 +36,8 @@ #include "debug/ProtocolTrace.hh" #include "debug/RubySequencer.hh" #include "debug/RubyStats.hh" +#include "debug/SpecBuffer.hh" +#include "debug/SpecBufferValidate.hh" #include "mem/packet.hh" #include "mem/protocol/PrefetchBit.hh" #include "mem/protocol/RubyAccessMode.hh" @@ -54,7 +56,9 @@ RubySequencerParams::create() Sequencer::Sequencer(const Params *p) : RubyPort(p), m_IncompleteTimes(MachineType_NUM), - deadlockCheckEvent([this]{ wakeup(); }, "Sequencer deadlock check") + deadlockCheckEvent([this]{ wakeup(); }, "Sequencer deadlock check"), + m_specBuf(33), + specBufferHitEvent([this]{ specBufferHitCallback(); }, "Sequencer spec buffer hit") { m_outstanding_count = 0; @@ -160,6 +164,7 @@ void Sequencer::resetStats() } } +// [InvisiSpec] Request on the way from CPU to Ruby // Insert the request on the correct request table. Return true if // the entry was already present. RequestStatus @@ -190,6 +195,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType request_type) RequestTable::value_type default_entry(line_addr, (SequencerRequest*) NULL); + // [InvisiSpec] If store if ((request_type == RubyRequestType_ST) || (request_type == RubyRequestType_RMW_Read) || (request_type == RubyRequestType_RMW_Write) || @@ -217,6 +223,7 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType request_type) m_store_waiting_on_store++; return RequestStatus_Aliased; } + // [InvisiSpec] If load } else { // Check if there is any outstanding write request for the same // cache line. 
@@ -232,6 +239,16 @@ Sequencer::insertRequest(PacketPtr pkt, RubyRequestType request_type) RequestTable::iterator i = r.first; i->second = new SequencerRequest(pkt, request_type, curCycle()); m_outstanding_count++; + } else if (request_type == RubyRequestType_SPEC_LD) { + auto i = m_readRequestTable.find(line_addr); + if (i->second->m_type == RubyRequestType_SPEC_LD) { + DPRINTFR(SpecBuffer, "%10s Merging (idx=%d-%d, addr=%#x) with %d\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr()), i->second->pkt->reqIdx); + i->second->dependentSpecRequests.push_back(pkt); + return RequestStatus_Merged; + } else { + m_load_waiting_on_load++; + return RequestStatus_Aliased; + } } else { // There is an outstanding read request for the cache line m_load_waiting_on_load++; @@ -412,6 +429,19 @@ Sequencer::writeCallback(Addr address, DataBlock& data, initialRequestTime, forwardRequestTime, firstResponseTime); } +bool Sequencer::updateSBB(PacketPtr pkt, DataBlock& data, Addr dataAddress) { + uint8_t idx = pkt->reqIdx; + SBE& sbe = m_specBuf[idx]; + int blkIdx = pkt->isFirst() ? 0 : 1; + SBB& sbb = sbe.blocks[blkIdx]; + if (makeLineAddress(sbb.reqAddress) == dataAddress) { + sbb.data = data; + return true; + } + return false; +} + +// [InvisiSpec] Called by Ruby to send a response to CPU. void Sequencer::readCallback(Addr address, DataBlock& data, bool externalHit, const MachineType mach, @@ -430,13 +460,79 @@ Sequencer::readCallback(Addr address, DataBlock& data, markRemoved(); assert((request->m_type == RubyRequestType_LD) || + (request->m_type == RubyRequestType_SPEC_LD) || + (request->m_type == RubyRequestType_EXPOSE) || (request->m_type == RubyRequestType_IFETCH)); + + PacketPtr pkt = request->pkt; + if (pkt->isSpec()) { + assert(!pkt->onlyAccessSpecBuff()); + DPRINTFR(SpecBuffer, "%10s SPEC_LD callback (idx=%d-%d, addr=%#x)\n", curTick(), pkt->reqIdx, pkt->isFirst()? 
0 : 1, printAddress(pkt->getAddr())); + updateSBB(pkt, data, address); + if (!externalHit) { + pkt->setL1Hit(); + } + } else if (pkt->isExpose()) { + DPRINTFR(SpecBuffer, "%10s EXPOSE callback (idx=%d-%d, addr=%#x)\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr())); + } else if (pkt->isValidate()) { + DPRINTFR(SpecBuffer, "%10s VALIDATE callback (idx=%d-%d, addr=%#x)\n", curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr())); + uint8_t idx = pkt->reqIdx; + SBE& sbe = m_specBuf[idx]; + int blkIdx = pkt->isFirst() ? 0 : 1; + SBB& sbb = sbe.blocks[blkIdx]; + assert(makeLineAddress(sbb.reqAddress) == address); + if (!memcmp(sbb.data.getData(getOffset(pkt->getAddr()), pkt->getSize()), data.getData(getOffset(pkt->getAddr()), pkt->getSize()), pkt->getSize())) { + *(pkt->getPtr<uint8_t>()) = 1; + } else { + // std::ostringstream os; + // sbb.data.print(os); + // DPRINTFR(SpecBufferValidate, "%s\n", os.str()); + // os.str(""); + // data.print(os); + // DPRINTFR(SpecBufferValidate, "%s\n", os.str()); + *(pkt->getPtr<uint8_t>()) = 0; + } + } + + for (auto& dependentPkt : request->dependentSpecRequests) { + assert(!dependentPkt->onlyAccessSpecBuff()); + DPRINTFR(SpecBuffer, "%10s Merged SPEC_LD callback (idx=%d-%d, addr=%#x)\n", curTick(), dependentPkt->reqIdx, dependentPkt->isFirst()? 
0 : 1, printAddress(dependentPkt->getAddr())); + assert(dependentPkt->isSpec()); + updateSBB(dependentPkt, data, address); + if (!externalHit) { + dependentPkt->setL1Hit(); + } + memcpy(dependentPkt->getPtr<uint8_t>(), + data.getData(getOffset(dependentPkt->getAddr()), dependentPkt->getSize()), + dependentPkt->getSize()); + ruby_hit_callback(dependentPkt); + } hitCallback(request, data, true, mach, externalHit, initialRequestTime, forwardRequestTime, firstResponseTime); } void +Sequencer::specBufferHitCallback() +{ + assert(m_specRequestQueue.size()); + while (m_specRequestQueue.size()) { + auto specReq = m_specRequestQueue.front(); + if (specReq.second <= curTick()) { + PacketPtr pkt = specReq.first; + assert(pkt->onlyAccessSpecBuff()); + DPRINTFR(SpecBuffer, "%10s SB Hit Callback (idx=%d, addr=%#x)\n", curTick(), pkt->reqIdx, printAddress(pkt->getAddr())); + ruby_hit_callback(pkt); + m_specRequestQueue.pop(); + } else { + schedule(specBufferHitEvent, specReq.second); + break; + } + } +} + +// [InvisiSpec] Response on the way from Ruby to CPU +void Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, bool llscSuccess, const MachineType mach, const bool externalHit, @@ -470,8 +566,9 @@ Sequencer::hitCallback(SequencerRequest* srequest, DataBlock& data, if (RubySystem::getWarmupEnabled()) { data.setData(pkt->getConstPtr<uint8_t>(), getOffset(request_address), pkt->getSize()); - } else if (!pkt->isFlush()) { + } else if (!pkt->isFlush() && !pkt->isExpose() && !pkt->isValidate()) { if ((type == RubyRequestType_LD) || + (type == RubyRequestType_SPEC_LD) || (type == RubyRequestType_IFETCH) || (type == RubyRequestType_RMW_Read) || (type == RubyRequestType_Locked_RMW_Read) || @@ -534,6 +631,7 @@ Sequencer::empty() const return m_writeRequestTable.empty() && m_readRequestTable.empty(); } +// [InvisiSpec] Request on the way from CPU to Ruby RequestStatus Sequencer::makeRequest(PacketPtr pkt) { @@ -544,7 +642,56 @@ Sequencer::makeRequest(PacketPtr pkt) 
RubyRequestType primary_type = RubyRequestType_NULL; RubyRequestType secondary_type = RubyRequestType_NULL; - if (pkt->isLLSC()) { + // [InvisiSpec] Handle new requests + if (pkt->isSpec()) { + assert(pkt->cmd == MemCmd::ReadSpecReq); + assert(pkt->isSplit || pkt->isFirst()); + uint8_t idx = pkt->reqIdx; + SBE& sbe = m_specBuf[idx]; + sbe.isSplit = pkt->isSplit; + int blkIdx = pkt->isFirst() ? 0 : 1; + SBB& sbb = sbe.blocks[blkIdx]; + sbb.reqAddress = pkt->getAddr(); + sbb.reqSize = pkt->getSize(); + if (pkt->onlyAccessSpecBuff()) { + int srcIdx = pkt->srcIdx; + SBE& srcEntry = m_specBuf[srcIdx]; + if (makeLineAddress(sbb.reqAddress) == makeLineAddress(srcEntry.blocks[0].reqAddress)) { + sbb.data = srcEntry.blocks[0].data; + } else if (makeLineAddress(sbb.reqAddress) == makeLineAddress(srcEntry.blocks[1].reqAddress)) { + sbb.data = srcEntry.blocks[1].data; + } else { + fatal("Requested address %#x is not present in the spec buffer\n", printAddress(sbb.reqAddress)); + } + memcpy(pkt->getPtr<uint8_t>(), + sbb.data.getData(getOffset(sbb.reqAddress), sbb.reqSize), + sbb.reqSize); + m_specRequestQueue.push({pkt, curTick()}); + DPRINTFR(SpecBuffer, "%10s SB Hit (idx=%d, addr=%#x) on (srcIdx=%d)\n", curTick(), idx, printAddress(sbb.reqAddress), srcIdx); + if (!specBufferHitEvent.scheduled()) { + schedule(specBufferHitEvent, clockEdge(Cycles(1))); + } + return RequestStatus_Issued; + } else { + // assert it is not in the buffer + primary_type = secondary_type = RubyRequestType_SPEC_LD; + } + } else if (pkt->isExpose() || pkt->isValidate()) { + assert(pkt->cmd == MemCmd::ExposeReq || pkt->cmd == MemCmd::ValidateReq); + assert(pkt->isSplit || pkt->isFirst()); + uint8_t idx = pkt->reqIdx; + SBE& sbe = m_specBuf[idx]; + sbe.isSplit = pkt->isSplit; + int blkIdx = pkt->isFirst() ? 
0 : 1; + SBB& sbb = sbe.blocks[blkIdx]; + if (sbb.reqAddress != pkt->getAddr()) { + fatal("sbb.reqAddress != pkt->getAddr: %#x != %#x\n", printAddress(sbb.reqAddress), printAddress(pkt->getAddr())); + } + if (sbb.reqSize != pkt->getSize()) { + fatal("sbb.reqSize != pkt->getSize(): %d != %d\n", sbb.reqSize, pkt->getSize()); + } + primary_type = secondary_type = RubyRequestType_EXPOSE; + } else if (pkt->isLLSC()) { // // Alpha LL/SC instructions need to be handled carefully by the cache // coherence protocol to ensure they follow the proper semantics. In @@ -615,8 +762,22 @@ Sequencer::makeRequest(PacketPtr pkt) } RequestStatus status = insertRequest(pkt, primary_type); - if (status != RequestStatus_Ready) + if (status == RequestStatus_Merged) { + return RequestStatus_Issued; + } else if (status != RequestStatus_Ready) { return status; + } + + if (pkt->isSpec()) { + DPRINTFR(SpecBuffer, "%10s Issuing SPEC_LD (idx=%d-%d, addr=%#x)\n", + curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr())); + } else if (pkt->isExpose()) { + DPRINTFR(SpecBuffer, "%10s Issuing EXPOSE (idx=%d-%d, addr=%#x)\n", + curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr())); + } else if (pkt->isValidate()) { + DPRINTFR(SpecBuffer, "%10s Issuing VALIDATE (idx=%d-%d, addr=%#x)\n", + curTick(), pkt->reqIdx, pkt->isFirst()? 0 : 1, printAddress(pkt->getAddr())); + } issueRequest(pkt, secondary_type); @@ -643,7 +804,7 @@ Sequencer::issueRequest(PacketPtr pkt, RubyRequestType secondary_type) // requests do not std::shared_ptr<RubyRequest> msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(), - pkt->isFlush() ? + pkt->isFlush() || pkt->isExpose() ? 
nullptr : pkt->getPtr<uint8_t>(), pkt->getSize(), pc, secondary_type, RubyAccessMode_Supervisor, pkt, @@ -717,9 +878,9 @@ Sequencer::recordRequestType(SequencerRequestType requestType) { void -Sequencer::evictionCallback(Addr address) +Sequencer::evictionCallback(Addr address, bool external) { - ruby_eviction_callback(address); + ruby_eviction_callback(address, external); } void diff --git a/src/mem/ruby/system/Sequencer.hh b/src/mem/ruby/system/Sequencer.hh index fcfa8ad86..66ff92777 100644 --- a/src/mem/ruby/system/Sequencer.hh +++ b/src/mem/ruby/system/Sequencer.hh @@ -31,6 +31,7 @@ #include <iostream> #include <unordered_map> +#include <queue> #include "mem/protocol/MachineType.hh" #include "mem/protocol/RubyRequestType.hh" @@ -45,6 +46,7 @@ struct SequencerRequest PacketPtr pkt; RubyRequestType m_type; Cycles issue_time; + std::vector<PacketPtr> dependentSpecRequests; SequencerRequest(PacketPtr _pkt, RubyRequestType _m_type, Cycles _issue_time) @@ -54,6 +56,19 @@ struct SequencerRequest std::ostream& operator<<(std::ostream& out, const SequencerRequest& obj); +struct SBB // SpecBufferBlock +{ + Addr reqAddress; + unsigned reqSize; + DataBlock data; +}; + +struct SBE // SpecBufferEntry +{ + bool isSplit; + SBB blocks[2]; +}; + class Sequencer : public RubyPort { public: @@ -83,6 +98,9 @@ class Sequencer : public RubyPort const Cycles forwardRequestTime = Cycles(0), const Cycles firstResponseTime = Cycles(0)); + void specBufferHitCallback(); + bool updateSBB(PacketPtr pkt, DataBlock& data, Addr dataAddress); + RequestStatus makeRequest(PacketPtr pkt); bool empty() const; int outstandingCount() const { return m_outstanding_count; } @@ -97,7 +115,7 @@ class Sequencer : public RubyPort void checkCoherence(Addr address); void markRemoved(); - void evictionCallback(Addr address); + void evictionCallback(Addr address, bool external); void invalidateSC(Addr address); int coreId() const { return m_coreId; } @@ -238,6 +256,10 @@ class Sequencer : public RubyPort 
std::vector<Stats::Counter> m_IncompleteTimes; EventFunctionWrapper deadlockCheckEvent; + + std::vector<SBE> m_specBuf; + std::queue<std::pair<PacketPtr, Tick>> m_specRequestQueue; + EventFunctionWrapper specBufferHitEvent; }; inline std::ostream& |