Diffstat (limited to 'src'): 28 files changed, 413 insertions, 101 deletions
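Conceptually, the change below threads an AtomicOpFunctor through every CPU model's memory interface so that the read-modify-write is performed at the memory side and the CPU only gets the old value back (the AtomicSimpleCPU hunk further down issues AMO accesses as SwapReq-style writes whose data buffer carries the returned old contents). The standalone sketch that follows is not gem5 source; AtomicOpFunctor, AmoAdd32, and amoAccess are simplified stand-ins meant only to illustrate that division of labour with a 32-bit fetch-and-add.

    // Standalone illustration (not gem5 source): an AtomicOpFunctor-style
    // callback that the memory system applies in place, so the CPU never has
    // to supply store data for an AMO request.
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Hypothetical stand-in for gem5's AtomicOpFunctor interface.
    struct AtomicOpFunctor {
        virtual void operator()(uint8_t *mem) = 0;  // update memory in place
        virtual ~AtomicOpFunctor() = default;
    };

    // Example op: 32-bit fetch-and-add, as a RISC-V AMOADD.W might use.
    struct AmoAdd32 : AtomicOpFunctor {
        uint32_t operand;
        explicit AmoAdd32(uint32_t v) : operand(v) {}
        void operator()(uint8_t *mem) override {
            uint32_t old;
            std::memcpy(&old, mem, sizeof(old));
            uint32_t updated = old + operand;
            std::memcpy(mem, &updated, sizeof(updated));
        }
    };

    // Memory-side handling, sketched after the SwapReq-like behaviour the
    // patch relies on: hand the old value back to the CPU, then apply the
    // functor to the stored word.
    uint32_t amoAccess(uint8_t *mem_word, AtomicOpFunctor &op)
    {
        uint32_t old;
        std::memcpy(&old, mem_word, sizeof(old));  // value returned to the CPU
        op(mem_word);                              // atomic update "in memory"
        return old;
    }

    int main()
    {
        uint8_t word[4] = {5, 0, 0, 0};  // 32-bit value 5 (little-endian host)
        AmoAdd32 add(7);
        std::cout << "old=" << amoAccess(word, add) << "\n";  // old=5
        uint32_t now;
        std::memcpy(&now, word, sizeof(now));
        std::cout << "new=" << now << "\n";                   // new=12
        return 0;
    }

Carrying the functor instead of store data is also why BaseDynInst::initiateMemAMO() in the hunks below passes a nullptr data pointer to pushRequest().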
diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 878e65551..30f6baf20 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -409,7 +409,7 @@ BaseCPU::probeInstCommit(const StaticInstPtr &inst) if (inst->isLoad()) ppRetiredLoads->notify(1); - if (inst->isStore()) + if (inst->isStore() || inst->isAtomic()) ppRetiredStores->notify(1); if (inst->isControl()) diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index c24517937..9a1ab062c 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -303,6 +303,9 @@ class BaseDynInst : public ExecContext, public RefCounted Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res); + Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags, + AtomicOpFunctor *amo_op); + /** True if the DTB address translation has started. */ bool translationStarted() const { return instFlags[TranslationStarted]; } void translationStarted(bool f) { instFlags[TranslationStarted] = f; } @@ -920,4 +923,20 @@ BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr, /* st */ false, data, size, addr, flags, res); } +template<class Impl> +Fault +BaseDynInst<Impl>::initiateMemAMO(Addr addr, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) +{ + // atomic memory instructions do not have data to be written to memory yet + // since the atomic operations will be executed directly in cache/memory. + // Therefore, its `data` field is nullptr. + // Atomic memory requests need to carry their `amo_op` fields to cache/ + // memory + return cpu->pushRequest( + dynamic_cast<typename DynInstPtr::PtrType>(this), + /* atomic */ false, nullptr, size, addr, flags, nullptr, amo_op); +} + #endif // __CPU_BASE_DYN_INST_HH__ diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index e32c015bf..5f830d7a9 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -536,9 +536,16 @@ class CheckerCPU : public BaseCPU, public ExecContext Fault readMem(Addr addr, uint8_t *data, unsigned size, Request::Flags flags) override; + Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) override; + Fault amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, AtomicOpFunctor *amo_op) override + { + panic("AMO is not supported yet in CPU checker\n"); + } + unsigned int readStCondFailures() const override { return thread->readStCondFailures(); diff --git a/src/cpu/exec_context.hh b/src/cpu/exec_context.hh index 1c1c8956a..d46cc1315 100644 --- a/src/cpu/exec_context.hh +++ b/src/cpu/exec_context.hh @@ -261,6 +261,28 @@ class ExecContext { Request::Flags flags, uint64_t *res) = 0; /** + * For atomic-mode contexts, perform an atomic AMO (a.k.a., Atomic + * Read-Modify-Write Memory Operation) + */ + virtual Fault amoMem(Addr addr, uint8_t *data, unsigned int size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { + panic("ExecContext::amoMem() should be overridden\n"); + } + + /** + * For timing-mode contexts, initiate an atomic AMO (atomic + * read-modify-write memory operation) + */ + virtual Fault initiateMemAMO(Addr addr, unsigned int size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { + panic("ExecContext::initiateMemAMO() should be overridden\n"); + } + + /** * Sets the number of consecutive store conditional failures. 
*/ virtual void setStCondFailures(unsigned int sc_failures) = 0; diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh index 179883ecc..02b3dae1c 100644 --- a/src/cpu/minor/exec_context.hh +++ b/src/cpu/minor/exec_context.hh @@ -108,7 +108,7 @@ class ExecContext : public ::ExecContext Request::Flags flags) override { execute.getLSQ().pushRequest(inst, true /* load */, nullptr, - size, addr, flags, NULL); + size, addr, flags, NULL, nullptr); return NoFault; } @@ -117,7 +117,17 @@ class ExecContext : public ::ExecContext Request::Flags flags, uint64_t *res) override { execute.getLSQ().pushRequest(inst, false /* store */, data, - size, addr, flags, res); + size, addr, flags, res, nullptr); + return NoFault; + } + + Fault + initiateMemAMO(Addr addr, unsigned int size, Request::Flags flags, + AtomicOpFunctor *amo_op) override + { + // AMO requests are pushed through the store path + execute.getLSQ().pushRequest(inst, false /* amo */, nullptr, + size, addr, flags, nullptr, amo_op); return NoFault; } diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc index 234a233c2..6a418202f 100644 --- a/src/cpu/minor/execute.cc +++ b/src/cpu/minor/execute.cc @@ -337,6 +337,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst, bool is_load = inst->staticInst->isLoad(); bool is_store = inst->staticInst->isStore(); + bool is_atomic = inst->staticInst->isAtomic(); bool is_prefetch = inst->staticInst->isDataPrefetch(); /* If true, the trace's predicate value will be taken from the exec @@ -368,7 +369,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst, *inst); fatal("Received error response packet for inst: %s\n", *inst); - } else if (is_store || is_load || is_prefetch) { + } else if (is_store || is_load || is_prefetch || is_atomic) { assert(packet); DPRINTF(MinorMem, "Memory response inst: %s addr: 0x%x size: %d\n", diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc index 180890147..9347e4ccb 100644 --- a/src/cpu/minor/fetch2.cc +++ b/src/cpu/minor/fetch2.cc @@ -421,6 +421,8 @@ Fetch2::evaluate() loadInstructions++; else if (decoded_inst->isStore()) storeInstructions++; + else if (decoded_inst->isAtomic()) + amoInstructions++; else if (decoded_inst->isVector()) vecInstructions++; else if (decoded_inst->isFloating()) @@ -636,6 +638,11 @@ Fetch2::regStats() .name(name() + ".store_instructions") .desc("Number of memory store instructions successfully decoded") .flags(total); + + amoInstructions + .name(name() + ".amo_instructions") + .desc("Number of memory atomic instructions successfully decoded") + .flags(total); } void diff --git a/src/cpu/minor/fetch2.hh b/src/cpu/minor/fetch2.hh index 2230560f1..114dec0f5 100644 --- a/src/cpu/minor/fetch2.hh +++ b/src/cpu/minor/fetch2.hh @@ -171,6 +171,7 @@ class Fetch2 : public Named Stats::Scalar vecInstructions; Stats::Scalar loadInstructions; Stats::Scalar storeInstructions; + Stats::Scalar amoInstructions; public: /** Dump the whole contents of the input buffer. 
Useful after a diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc index b836ed22d..6fe6c3738 100644 --- a/src/cpu/minor/lsq.cc +++ b/src/cpu/minor/lsq.cc @@ -676,9 +676,9 @@ LSQ::StoreBuffer::canForwardDataToLoad(LSQRequestPtr request, while (ret == NoAddrRangeCoverage && i != slots.rend()) { LSQRequestPtr slot = *i; - /* Cache maintenance instructions go down via the store path * - * but they carry no data and they shouldn't be considered for - * forwarding */ + /* Cache maintenance instructions go down via the store path but + * they carry no data and they shouldn't be considered + * for forwarding */ if (slot->packet && slot->inst->id.threadId == request->inst->id.threadId && !slot->packet->req->isCacheMaintenance()) { @@ -931,8 +931,9 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request) bool is_load = request->isLoad; bool is_llsc = request->request->isLLSC(); bool is_swap = request->request->isSwap(); + bool is_atomic = request->request->isAtomic(); bool bufferable = !(request->request->isStrictlyOrdered() || - is_llsc || is_swap); + is_llsc || is_swap || is_atomic); if (is_load) { if (numStoresInTransfers != 0) { @@ -965,9 +966,16 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request) if (storeBuffer.canForwardDataToLoad(request, forwarding_slot) != NoAddrRangeCoverage) { + // There's at least another request that targets the same + // address and is staying in the storeBuffer. Since our + // request is non-bufferable (e.g., strictly ordered or atomic), + // we must wait for the other request in the storeBuffer to + // complete before we can issue this non-bufferable request. + // This is to make sure that the order they access the cache is + // correct. DPRINTF(MinorMem, "Memory access can receive forwarded data" - " from the store buffer, need to wait for store buffer to" - " drain\n"); + " from the store buffer, but need to wait for store buffer" + " to drain\n"); return; } } @@ -1469,9 +1477,21 @@ LSQ::needsToTick() void LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res) + uint64_t *res, AtomicOpFunctor *amo_op) { bool needs_burst = transferNeedsBurst(addr, size, lineWidth); + + if (needs_burst && inst->staticInst->isAtomic()) { + // AMO requests that access across a cache line boundary are not + // allowed since the cache does not guarantee AMO ops to be executed + // atomically in two cache lines + // For ISAs such as x86 that requires AMO operations to work on + // accesses that cross cache-line boundaries, the cache needs to be + // modified to support locking both cache lines to guarantee the + // atomicity. + panic("Do not expect cross-cache-line atomic memory request\n"); + } + LSQRequestPtr request; /* Copy given data into the request. The request will pass this to the @@ -1480,15 +1500,16 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:" " 0x%x%s lineWidth : 0x%x\n", - (isLoad ? "load" : "store"), addr, size, flags, + (isLoad ? "load" : "store/atomic"), addr, size, flags, (needs_burst ? 
" (needs burst)" : ""), lineWidth); if (!isLoad) { - /* request_data becomes the property of a ...DataRequest (see below) + /* Request_data becomes the property of a ...DataRequest (see below) * and destroyed by its destructor */ request_data = new uint8_t[size]; - if (flags & Request::STORE_NO_DATA) { - /* For cache zeroing, just use zeroed data */ + if (inst->staticInst->isAtomic() || + (flags & Request::STORE_NO_DATA)) { + /* For atomic or store-no-data, just use zeroed data */ std::memset(request_data, 0, size); } else { std::memcpy(request_data, data, size); @@ -1511,7 +1532,7 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, request->request->setVirt(0 /* asid */, addr, size, flags, cpu.dataMasterId(), /* I've no idea why we need the PC, but give it */ - inst->pc.instAddr()); + inst->pc.instAddr(), amo_op); requests.push(request); request->startAddrTranslation(); diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh index da873b4ac..11fa8774f 100644 --- a/src/cpu/minor/lsq.hh +++ b/src/cpu/minor/lsq.hh @@ -696,11 +696,11 @@ class LSQ : public Named void completeMemBarrierInst(MinorDynInstPtr inst, bool committed); - /** Single interface for readMem/writeMem to issue requests into + /** Single interface for readMem/writeMem/amoMem to issue requests into * the LSQ */ void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res); + uint64_t *res, AtomicOpFunctor *amo_op); /** Push a predicate failed-representing request into the queues just * to maintain commit order */ diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 4e32f865d..e624557c8 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -498,6 +498,8 @@ class DefaultCommit Stats::Vector statComRefs; /** Stat for the total number of committed loads. */ Stats::Vector statComLoads; + /** Stat for the total number of committed atomics. */ + Stats::Vector statComAmos; /** Total number of committed memory barriers. */ Stats::Vector statComMembars; /** Total number of committed branches. */ diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 2891ce331..ec3d61050 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -208,6 +208,13 @@ DefaultCommit<Impl>::regStats() .flags(total) ; + statComAmos + .init(cpu->numThreads) + .name(name() + ".amos") + .desc("Number of atomic instructions committed") + .flags(total) + ; + statComMembars .init(cpu->numThreads) .name(name() + ".membars") @@ -1158,8 +1165,9 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num) // Make sure we are only trying to commit un-executed instructions we // think are possible. assert(head_inst->isNonSpeculative() || head_inst->isStoreConditional() - || head_inst->isMemBarrier() || head_inst->isWriteBarrier() || - (head_inst->isLoad() && head_inst->strictlyOrdered())); + || head_inst->isMemBarrier() || head_inst->isWriteBarrier() + || head_inst->isAtomic() + || (head_inst->isLoad() && head_inst->strictlyOrdered())); DPRINTF(Commit, "Encountered a barrier or non-speculative " "instruction [sn:%lli] at the head of the ROB, PC %s.\n", @@ -1306,7 +1314,7 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num) #endif // If this was a store, record it for this cycle. - if (head_inst->isStore()) + if (head_inst->isStore() || head_inst->isAtomic()) committedStores[tid] = true; // Return true to indicate that we have committed an instruction. 
@@ -1399,6 +1407,10 @@ DefaultCommit<Impl>::updateComInstStats(const DynInstPtr &inst) if (inst->isLoad()) { statComLoads[tid]++; } + + if (inst->isAtomic()) { + statComAmos[tid]++; + } } if (inst->isMemBarrier()) { diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index ec6be657a..21cae444b 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -793,10 +793,10 @@ class FullO3CPU : public BaseO3CPU /** CPU pushRequest function, forwards request to LSQ. */ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res) + uint64_t *res, AtomicOpFunctor *amo_op = nullptr) { return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr, - flags, res); + flags, res, amo_op); } /** CPU read function, forwards read to LSQ. */ diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 251389631..6434ec8c3 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -479,7 +479,8 @@ DefaultIEW<Impl>::squash(ThreadID tid) if (skidBuffer[tid].front()->isLoad()) { toRename->iewInfo[tid].dispatchedToLQ++; } - if (skidBuffer[tid].front()->isStore()) { + if (skidBuffer[tid].front()->isStore() || + skidBuffer[tid].front()->isAtomic()) { toRename->iewInfo[tid].dispatchedToSQ++; } @@ -862,7 +863,8 @@ DefaultIEW<Impl>::emptyRenameInsts(ThreadID tid) if (insts[tid].front()->isLoad()) { toRename->iewInfo[tid].dispatchedToLQ++; } - if (insts[tid].front()->isStore()) { + if (insts[tid].front()->isStore() || + insts[tid].front()->isAtomic()) { toRename->iewInfo[tid].dispatchedToSQ++; } @@ -1004,7 +1006,7 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid) if (inst->isLoad()) { toRename->iewInfo[tid].dispatchedToLQ++; } - if (inst->isStore()) { + if (inst->isStore() || inst->isAtomic()) { toRename->iewInfo[tid].dispatchedToSQ++; } @@ -1030,7 +1032,8 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid) } // Check LSQ if inst is LD/ST - if ((inst->isLoad() && ldstQueue.lqFull(tid)) || + if ((inst->isAtomic() && ldstQueue.sqFull(tid)) || + (inst->isLoad() && ldstQueue.lqFull(tid)) || (inst->isStore() && ldstQueue.sqFull(tid))) { DPRINTF(IEW, "[tid:%i]: Issue: %s has become full.\n",tid, inst->isLoad() ? "LQ" : "SQ"); @@ -1048,7 +1051,25 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid) } // Otherwise issue the instruction just fine. - if (inst->isLoad()) { + if (inst->isAtomic()) { + DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction " + "encountered, adding to LSQ.\n", tid); + + ldstQueue.insertStore(inst); + + ++iewDispStoreInsts; + + // AMOs need to be set as "canCommit()" + // so that commit can process them when they reach the + // head of commit. + inst->setCanCommit(); + instQueue.insertNonSpec(inst); + add_to_iq = false; + + ++iewDispNonSpecInsts; + + toRename->iewInfo[tid].dispatchedToSQ++; + } else if (inst->isLoad()) { DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction " "encountered, adding to LSQ.\n", tid); @@ -1243,7 +1264,20 @@ DefaultIEW<Impl>::executeInsts() "reference.\n"); // Tell the LDSTQ to execute this instruction (if it is a load). - if (inst->isLoad()) { + if (inst->isAtomic()) { + // AMOs are treated like store requests + fault = ldstQueue.executeStore(inst); + + if (inst->isTranslationDelayed() && + fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. 
+ DPRINTF(IEW, "Execute: Delayed translation, deferring " + "store.\n"); + instQueue.deferMemInst(inst); + continue; + } + } else if (inst->isLoad()) { // Loads will mark themselves as executed, and their writeback // event adds the instruction to the queue to commit fault = ldstQueue.executeLoad(inst); diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index ddd7b6d5f..aa12297d6 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -1251,13 +1251,15 @@ InstructionQueue<Impl>::doSquash(ThreadID tid) bool is_acq_rel = squashed_inst->isMemBarrier() && (squashed_inst->isLoad() || - (squashed_inst->isStore() && + squashed_inst->isAtomic() || + (squashed_inst->isStore() && !squashed_inst->isStoreConditional())); // Remove the instruction from the dependency list. if (is_acq_rel || (!squashed_inst->isNonSpeculative() && !squashed_inst->isStoreConditional() && + !squashed_inst->isAtomic() && !squashed_inst->isMemBarrier() && !squashed_inst->isWriteBarrier())) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 81b7c04a5..f576dd3f4 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -191,7 +191,7 @@ class LSQ enum Flag : FlagsStorage { IsLoad = 0x00000001, - /** True if this is a store that writes registers (SC). */ + /** True if this is a store/atomic that writes registers (SC). */ WbStore = 0x00000002, Delayed = 0x00000004, IsSplit = 0x00000008, @@ -211,7 +211,9 @@ class LSQ LSQEntryFreed = 0x00000800, /** Store written back. */ WritebackScheduled = 0x00001000, - WritebackDone = 0x00002000 + WritebackDone = 0x00002000, + /** True if this is an atomic request */ + IsAtomic = 0x00004000 }; FlagsType flags; @@ -250,32 +252,39 @@ class LSQ const uint32_t _size; const Request::Flags _flags; uint32_t _numOutstandingPackets; + AtomicOpFunctor *_amo_op; protected: LSQUnit* lsqUnit() { return &_port; } LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad) : _state(State::NotIssued), _senderState(nullptr), _port(*port), _inst(inst), _data(nullptr), _res(nullptr), _addr(0), _size(0), _flags(0), - _numOutstandingPackets(0) + _numOutstandingPackets(0), _amo_op(nullptr) { flags.set(Flag::IsLoad, isLoad); - flags.set(Flag::WbStore, _inst->isStoreConditional()); + flags.set(Flag::WbStore, + _inst->isStoreConditional() || _inst->isAtomic()); + flags.set(Flag::IsAtomic, _inst->isAtomic()); install(); } LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, - PacketDataPtr data = nullptr, uint64_t* res = nullptr) + PacketDataPtr data = nullptr, uint64_t* res = nullptr, + AtomicOpFunctor* amo_op = nullptr) : _state(State::NotIssued), _senderState(nullptr), numTranslatedFragments(0), numInTranslationFragments(0), _port(*port), _inst(inst), _data(data), _res(res), _addr(addr), _size(size), _flags(flags_), - _numOutstandingPackets(0) + _numOutstandingPackets(0), + _amo_op(amo_op) { flags.set(Flag::IsLoad, isLoad); - flags.set(Flag::WbStore, _inst->isStoreConditional()); + flags.set(Flag::WbStore, + _inst->isStoreConditional() || _inst->isAtomic()); + flags.set(Flag::IsAtomic, _inst->isAtomic()); install(); } @@ -285,12 +294,20 @@ class LSQ return flags.isSet(Flag::IsLoad); } + bool + isAtomic() const + { + return flags.isSet(Flag::IsAtomic); + } + /** Install the request in the LQ/SQ. 
*/ void install() { if (isLoad()) { _port.loadQueue[_inst->lqIdx].setRequest(this); } else { + // Store, StoreConditional, and Atomic requests are pushed + // to this storeQueue _port.storeQueue[_inst->sqIdx].setRequest(this); } } @@ -609,17 +626,21 @@ class LSQ using LSQRequest::numInTranslationFragments; using LSQRequest::numTranslatedFragments; using LSQRequest::_numOutstandingPackets; + using LSQRequest::_amo_op; public: SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, PacketDataPtr data = nullptr, - uint64_t* res = nullptr) : - LSQRequest(port, inst, isLoad, addr, size, flags_, data, res) + uint64_t* res = nullptr, + AtomicOpFunctor* amo_op = nullptr) : + LSQRequest(port, inst, isLoad, addr, size, flags_, data, res, + amo_op) { LSQRequest::_requests.push_back( - std::make_shared<Request>(inst->getASID(), addr, size, flags_, - inst->masterId(), inst->instAddr(), inst->contextId())); + std::make_shared<Request>(inst->getASID(), addr, size, + flags_, inst->masterId(), inst->instAddr(), + inst->contextId(), amo_op)); LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum); } inline virtual ~SingleDataRequest() {} @@ -928,7 +949,7 @@ class LSQ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res); + uint64_t *res, AtomicOpFunctor *amo_op); /** The CPU pointer. */ O3CPU *cpu; diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 8a221a8d5..abe751c88 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -680,13 +680,26 @@ template<class Impl> Fault LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res) + uint64_t *res, AtomicOpFunctor *amo_op) { + // This comming request can be either load, store or atomic. + // Atomic request has a corresponding pointer to its atomic memory + // operation + bool isAtomic = !isLoad && amo_op; + ThreadID tid = cpu->contextToThread(inst->contextId()); auto cacheLineSize = cpu->cacheLineSize(); bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize); LSQRequest* req = nullptr; + // Atomic requests that access data across cache line boundary are + // currently not allowed since the cache does not guarantee corresponding + // atomic memory operations to be executed atomically across a cache line. + // For ISAs such as x86 that supports cross-cache-line atomic instructions, + // the cache needs to be modified to perform atomic update to both cache + // lines. For now, such cross-line update is not supported. + assert(!isAtomic || (isAtomic && !needs_burst)); + if (inst->translationStarted()) { req = inst->savedReq; assert(req); @@ -696,7 +709,7 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, size, flags, data, res); } else { req = new SingleDataRequest(&thread[tid], inst, isLoad, addr, - size, flags, data, res); + size, flags, data, res, amo_op); } assert(req); inst->setRequest(); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 5b90da4f5..3be67bec4 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -702,10 +702,12 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx) bool lower_load_has_store_part = req_s < st_e; bool upper_load_has_store_part = req_e > st_s; - // If the store's data has all of the data needed and the load - // isn't LLSC then - // we can forward. 
- if (store_has_lower_limit && store_has_upper_limit && + // If the store entry is not atomic (atomic does not have valid + // data), the store has all of the data needed, and + // the load is not LLSC, then + // we can forward data from the store to the load + if (!store_it->instruction()->isAtomic() && + store_has_lower_limit && store_has_upper_limit && !req->mainRequest()->isLLSC()) { // Get shift amount for offset into the store's data. @@ -755,17 +757,22 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx) return NoFault; } else if ( + // This is the partial store-load forwarding case where a store + // has only part of the load's data and the load isn't LLSC (!req->mainRequest()->isLLSC() && ((store_has_lower_limit && lower_load_has_store_part) || (store_has_upper_limit && upper_load_has_store_part) || (lower_load_has_store_part && upper_load_has_store_part))) || + // The load is LLSC, and the store has all or part of the + // load's data (req->mainRequest()->isLLSC() && ((store_has_lower_limit || upper_load_has_store_part) && - (store_has_upper_limit || lower_load_has_store_part)))) { - // This is the partial store-load forwarding case where a store - // has only part of the load's data and the load isn't LLSC or - // the load is LLSC and the store has all or part of the load's + (store_has_upper_limit || lower_load_has_store_part))) || + // The store entry is atomic and has all or part of the load's // data + (store_it->instruction()->isAtomic() && + ((store_has_lower_limit || upper_load_has_store_part) && + (store_has_upper_limit || lower_load_has_store_part)))) { // If it's already been written back, then don't worry about // stalling on it. @@ -857,8 +864,10 @@ LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx) storeQueue[store_idx].isAllZeros() = store_no_data; assert(size <= SQEntry::DataSize || store_no_data); + // copy data into the storeQueue only if the store request has valid data if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) && - !req->request()->isCacheMaintenance()) + !req->request()->isCacheMaintenance() && + !req->request()->isAtomic()) memcpy(storeQueue[store_idx].data(), data, size); // This function only writes the data to the store queue, so no fault diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 9756a9ef1..48179ceb8 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -124,16 +124,19 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) assert(!cpu->switchedOut()); if (!inst->isSquashed()) { if (state->needWB) { - // Only loads and store conditionals perform the writeback + // Only loads, store conditionals and atomics perform the writeback // after receving the response from the memory - assert(inst->isLoad() || inst->isStoreConditional()); + assert(inst->isLoad() || inst->isStoreConditional() || + inst->isAtomic()); writeback(inst, state->request()->mainPacket()); - if (inst->isStore()) { + if (inst->isStore() || inst->isAtomic()) { auto ss = dynamic_cast<SQSenderState*>(state); ss->writebackDone(); completeStore(ss->idx); } } else if (inst->isStore()) { + // This is a regular store (i.e., not store conditionals and + // atomics), so it can complete without writing back completeStore(dynamic_cast<SQSenderState*>(state)->idx); } } @@ -274,7 +277,7 @@ LSQUnit<Impl>::insert(const DynInstPtr &inst) { assert(inst->isMemRef()); - assert(inst->isLoad() || inst->isStore()); + assert(inst->isLoad() || inst->isStore() || inst->isAtomic()); if (inst->isLoad()) { insertLoad(inst); @@ 
-614,8 +617,8 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst) assert(store_fault == NoFault); - if (store_inst->isStoreConditional()) { - // Store conditionals need to set themselves as able to + if (store_inst->isStoreConditional() || store_inst->isAtomic()) { + // Store conditionals and Atomics need to set themselves as able to // writeback if we haven't had a fault by here. storeQueue[store_idx].canWB() = true; @@ -751,8 +754,8 @@ LSQUnit<Impl>::writebackStores() state->inst = inst; req->senderState(state); - if (inst->isStoreConditional()) { - /* Only store conditionals need a writeback. */ + if (inst->isStoreConditional() || inst->isAtomic()) { + /* Only store conditionals and atomics need a writeback. */ state->needWB = true; } } diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index 26c4b4d6e..f1d0e2313 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -191,11 +191,11 @@ MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst) // Check any barriers and the dependence predictor for any // producing memrefs/stores. InstSeqNum producing_store; - if (inst->isLoad() && loadBarrier) { + if ((inst->isLoad() || inst->isAtomic()) && loadBarrier) { DPRINTF(MemDepUnit, "Load barrier [sn:%lli] in flight\n", loadBarrierSN); producing_store = loadBarrierSN; - } else if (inst->isStore() && storeBarrier) { + } else if ((inst->isStore() || inst->isAtomic()) && storeBarrier) { DPRINTF(MemDepUnit, "Store barrier [sn:%lli] in flight\n", storeBarrierSN); producing_store = storeBarrierSN; @@ -252,8 +252,8 @@ MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst) } } - if (inst->isStore()) { - DPRINTF(MemDepUnit, "Inserting store PC %s [sn:%lli].\n", + if (inst->isStore() || inst->isAtomic()) { + DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n", inst->pcState(), inst->seqNum); depPred.insertStore(inst->instAddr(), inst->seqNum, inst->threadNumber); @@ -288,8 +288,8 @@ MemDepUnit<MemDepPred, Impl>::insertNonSpec(const DynInstPtr &inst) // Might want to turn this part into an inline function or something. // It's shared between both insert functions. - if (inst->isStore()) { - DPRINTF(MemDepUnit, "Inserting store PC %s [sn:%lli].\n", + if (inst->isStore() || inst->isAtomic()) { + DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n", inst->pcState(), inst->seqNum); depPred.insertStore(inst->instAddr(), inst->seqNum, inst->threadNumber); @@ -451,8 +451,9 @@ template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::wakeDependents(const DynInstPtr &inst) { - // Only stores and barriers have dependents. - if (!inst->isStore() && !inst->isMemBarrier() && !inst->isWriteBarrier()) { + // Only stores, atomics and barriers have dependents. 
+ if (!inst->isStore() && !inst->isAtomic() && !inst->isMemBarrier() && + !inst->isWriteBarrier()) { return; } diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index fd9b09e20..c24a09711 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -647,7 +647,7 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) } } - if (inst->isStore()) { + if (inst->isStore() || inst->isAtomic()) { if (calcFreeSQEntries(tid) <= 0) { DPRINTF(Rename, "[tid:%u]: Cannot rename due to no free SQ\n"); source = SQ; @@ -741,12 +741,12 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) renameDestRegs(inst, inst->threadNumber); - if (inst->isLoad()) { - loadsInProgress[tid]++; - } - if (inst->isStore()) { - storesInProgress[tid]++; + if (inst->isAtomic() || inst->isStore()) { + storesInProgress[tid]++; + } else if (inst->isLoad()) { + loadsInProgress[tid]++; } + ++renamed_insts; // Notify potential listeners that source and destination registers for // this instruction have been renamed. diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc index e91fafbcc..caf2427ef 100644 --- a/src/cpu/simple/atomic.cc +++ b/src/cpu/simple/atomic.cc @@ -72,6 +72,7 @@ AtomicSimpleCPU::init() ifetch_req->setContext(cid); data_read_req->setContext(cid); data_write_req->setContext(cid); + data_amo_req->setContext(cid); } AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p) @@ -90,6 +91,7 @@ AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p) ifetch_req = std::make_shared<Request>(); data_read_req = std::make_shared<Request>(); data_write_req = std::make_shared<Request>(); + data_amo_req = std::make_shared<Request>(); } @@ -417,14 +419,6 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size, } Fault -AtomicSimpleCPU::initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) -{ - panic("initiateMemRead() is for timing accesses, and should " - "never be called on AtomicSimpleCPU.\n"); -} - -Fault AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) { @@ -534,6 +528,70 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, } } +Fault +AtomicSimpleCPU::amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, AtomicOpFunctor *amo_op) +{ + SimpleExecContext& t_info = *threadInfo[curThread]; + SimpleThread* thread = t_info.thread; + + // use the CPU's statically allocated amo request and packet objects + const RequestPtr &req = data_amo_req; + + if (traceData) + traceData->setMem(addr, size, flags); + + //The address of the second part of this access if it needs to be split + //across a cache line boundary. + Addr secondAddr = roundDown(addr + size - 1, cacheLineSize()); + + // AMO requests that access across a cache line boundary are not + // allowed since the cache does not guarantee AMO ops to be executed + // atomically in two cache lines + // For ISAs such as x86 that requires AMO operations to work on + // accesses that cross cache-line boundaries, the cache needs to be + // modified to support locking both cache lines to guarantee the + // atomicity. + if (secondAddr > addr) { + panic("AMO request should not access across a cache line boundary\n"); + } + + dcache_latency = 0; + + req->taskId(taskId()); + req->setVirt(0, addr, size, flags, dataMasterId(), + thread->pcState().instAddr(), amo_op); + + // translate to physical address + Fault fault = thread->dtb->translateAtomic(req, thread->getTC(), + BaseTLB::Write); + + // Now do the access. 
+ if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) { + // We treat AMO accesses as Write accesses with SwapReq command + // data will hold the return data of the AMO access + Packet pkt(req, Packet::makeWriteCmd(req)); + pkt.dataStatic(data); + + if (req->isMmappedIpr()) + dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt); + else { + dcache_latency += sendPacket(dcachePort, &pkt); + } + + dcache_access = true; + + assert(!pkt.isError()); + assert(!req->isLLSC()); + } + + if (fault != NoFault && req->isPrefetch()) { + return NoFault; + } + + //If there's a fault and we're not doing prefetch, return it + return fault; +} void AtomicSimpleCPU::tick() @@ -550,6 +608,7 @@ AtomicSimpleCPU::tick() ifetch_req->setContext(cid); data_read_req->setContext(cid); data_write_req->setContext(cid); + data_amo_req->setContext(cid); } SimpleExecContext& t_info = *threadInfo[curThread]; diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh index a5151aa18..84f379121 100644 --- a/src/cpu/simple/atomic.hh +++ b/src/cpu/simple/atomic.hh @@ -163,6 +163,7 @@ class AtomicSimpleCPU : public BaseSimpleCPU RequestPtr ifetch_req; RequestPtr data_read_req; RequestPtr data_write_req; + RequestPtr data_amo_req; bool dcache_access; Tick dcache_latency; @@ -197,12 +198,12 @@ class AtomicSimpleCPU : public BaseSimpleCPU Fault readMem(Addr addr, uint8_t *data, unsigned size, Request::Flags flags) override; - Fault initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) override; - Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) override; + Fault amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, AtomicOpFunctor *amo_op) override; + void regProbePoints() override; /** diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index f71277d1c..422c73298 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -644,7 +644,7 @@ BaseSimpleCPU::postExecute() t_info.numLoadInsts++; } - if (curStaticInst->isStore()){ + if (curStaticInst->isStore() || curStaticInst->isAtomic()){ t_info.numStoreInsts++; } /* End power model statistics */ diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh index e62fcf4d1..8060b07ad 100644 --- a/src/cpu/simple/base.hh +++ b/src/cpu/simple/base.hh @@ -143,13 +143,26 @@ class BaseSimpleCPU : public BaseCPU void startup() override; virtual Fault readMem(Addr addr, uint8_t* data, unsigned size, - Request::Flags flags) = 0; + Request::Flags flags) + { panic("readMem() is not implemented\n"); } virtual Fault initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) = 0; + Request::Flags flags) + { panic("initiateMemRead() is not implemented\n"); } virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr, - Request::Flags flags, uint64_t* res) = 0; + Request::Flags flags, uint64_t* res) + { panic("writeMem() is not implemented\n"); } + + virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { panic("amoMem() is not implemented\n"); } + + virtual Fault initiateMemAMO(Addr addr, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { panic("initiateMemAMO() is not implemented\n"); } void countInst(); Counter totalInsts() const override; diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh index 0552dc0c6..de5cc7fd7 100644 --- a/src/cpu/simple/exec_context.hh +++ b/src/cpu/simple/exec_context.hh @@ -456,6 +456,19 @@ class SimpleExecContext : public ExecContext { 
return cpu->writeMem(data, size, addr, flags, res); } + Fault amoMem(Addr addr, uint8_t *data, unsigned int size, + Request::Flags flags, AtomicOpFunctor *amo_op) override + { + return cpu->amoMem(addr, data, size, flags, amo_op); + } + + Fault initiateMemAMO(Addr addr, unsigned int size, + Request::Flags flags, + AtomicOpFunctor *amo_op) override + { + return cpu->initiateMemAMO(addr, size, flags, amo_op); + } + /** * Sets the number of consecutive store conditional failures. */ diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc index b5450cf5f..637308a96 100644 --- a/src/cpu/simple/timing.cc +++ b/src/cpu/simple/timing.cc @@ -293,6 +293,7 @@ TimingSimpleCPU::sendData(const RequestPtr &req, uint8_t *data, uint64_t *res, PacketPtr pkt = buildPacket(req, read); pkt->dataDynamic<uint8_t>(data); + if (req->getFlags().isSet(Request::NO_ACCESS)) { assert(!dcache_pkt); pkt->makeResponse(); @@ -415,14 +416,6 @@ TimingSimpleCPU::buildSplitPacket(PacketPtr &pkt1, PacketPtr &pkt2, } Fault -TimingSimpleCPU::readMem(Addr addr, uint8_t *data, - unsigned size, Request::Flags flags) -{ - panic("readMem() is for atomic accesses, and should " - "never be called on TimingSimpleCPU.\n"); -} - -Fault TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size, Request::Flags flags) { @@ -556,6 +549,54 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size, return NoFault; } +Fault +TimingSimpleCPU::initiateMemAMO(Addr addr, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) +{ + SimpleExecContext &t_info = *threadInfo[curThread]; + SimpleThread* thread = t_info.thread; + + Fault fault; + const int asid = 0; + const Addr pc = thread->instAddr(); + unsigned block_size = cacheLineSize(); + BaseTLB::Mode mode = BaseTLB::Write; + + if (traceData) + traceData->setMem(addr, size, flags); + + RequestPtr req = make_shared<Request>(asid, addr, size, flags, + dataMasterId(), pc, thread->contextId(), amo_op); + + assert(req->hasAtomicOpFunctor()); + + req->taskId(taskId()); + + Addr split_addr = roundDown(addr + size - 1, block_size); + + // AMO requests that access across a cache line boundary are not + // allowed since the cache does not guarantee AMO ops to be executed + // atomically in two cache lines + // For ISAs such as x86 that requires AMO operations to work on + // accesses that cross cache-line boundaries, the cache needs to be + // modified to support locking both cache lines to guarantee the + // atomicity. 
+ if (split_addr > addr) { + panic("AMO requests should not access across a cache line boundary\n"); + } + + _status = DTBWaitResponse; + + WholeTranslationState *state = + new WholeTranslationState(req, new uint8_t[size], NULL, mode); + DataTranslation<TimingSimpleCPU *> *translation + = new DataTranslation<TimingSimpleCPU *>(this, state); + thread->dtb->translateTiming(req, thread->getTC(), translation, mode); + + return NoFault; +} + void TimingSimpleCPU::threadSnoop(PacketPtr pkt, ThreadID sender) { diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh index 0300d38eb..ce0a4dbfc 100644 --- a/src/cpu/simple/timing.hh +++ b/src/cpu/simple/timing.hh @@ -282,15 +282,15 @@ class TimingSimpleCPU : public BaseSimpleCPU void activateContext(ThreadID thread_num) override; void suspendContext(ThreadID thread_num) override; - Fault readMem(Addr addr, uint8_t *data, unsigned size, - Request::Flags flags) override; - Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags) override; Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) override; + Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags, + AtomicOpFunctor *amo_op) override; + void fetch(); void sendFetch(const Fault &fault, const RequestPtr &req, ThreadContext *tc); |
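Taken together, the SimpleCPU changes above expose two call styles: atomic-mode amoMem() performs the whole access inline and returns the old value through the caller's data buffer, while timing-mode initiateMemAMO() only starts translation and the old value arrives later through the normal writeback path. The minimal standalone sketch below illustrates that contrast; FakeMemory, TimingCtx, and the callback shapes are assumptions for illustration, not gem5 classes.

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <queue>

    using AmoOp = std::function<uint32_t(uint32_t)>;  // old value -> new value

    struct FakeMemory {
        uint32_t word = 10;
    };

    // Atomic-mode flavour: do everything now; *data receives the old value.
    void amoMem(FakeMemory &mem, uint32_t *data, const AmoOp &op)
    {
        uint32_t old = mem.word;
        mem.word = op(old);
        *data = old;  // the destination register gets the old value
    }

    // Timing-mode flavour: only queue the access; the "response" arrives on a
    // later tick and triggers the deferred writeback of the old value.
    struct TimingCtx {
        FakeMemory &mem;
        std::queue<std::function<void()>> pending;

        void initiateMemAMO(const AmoOp &op,
                            std::function<void(uint32_t)> writeback)
        {
            pending.push([this, op, writeback] {
                uint32_t old = mem.word;
                mem.word = op(old);
                writeback(old);
            });
        }

        void tick()
        {
            if (!pending.empty()) {
                pending.front()();
                pending.pop();
            }
        }
    };

    int main()
    {
        AmoOp amoadd = [](uint32_t old) { return old + 5; };

        FakeMemory m1;
        uint32_t rd = 0;
        amoMem(m1, &rd, amoadd);
        std::cout << "atomic-mode: rd=" << rd << " mem=" << m1.word << "\n";

        FakeMemory m2;
        TimingCtx ctx{m2};
        ctx.initiateMemAMO(amoadd, [](uint32_t old) {
            std::cout << "timing-mode writeback: old=" << old << "\n";
        });
        ctx.tick();
        std::cout << "timing-mode: mem=" << m2.word << "\n";
        return 0;
    }

The deferred shape on the timing side is also why the O3 changes above mark AMOs as needing a writeback (WbStore / needWB), exactly like store conditionals.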