diff options
Diffstat (limited to 'src/cpu/o3')
-rw-r--r-- | src/cpu/o3/cpu.cc | 1 | ||||
-rw-r--r-- | src/cpu/o3/cpu.hh | 25 | ||||
-rw-r--r-- | src/cpu/o3/iew_impl.hh | 16 | ||||
-rw-r--r-- | src/cpu/o3/inst_queue_impl.hh | 7 | ||||
-rw-r--r-- | src/cpu/o3/lsq.hh | 773 | ||||
-rw-r--r-- | src/cpu/o3/lsq_impl.hh | 537 | ||||
-rw-r--r-- | src/cpu/o3/lsq_unit.hh | 852 | ||||
-rw-r--r-- | src/cpu/o3/lsq_unit_impl.hh | 762 | ||||
-rw-r--r-- | src/cpu/o3/probe/elastic_trace.cc | 2 |
9 files changed, 1903 insertions, 1072 deletions
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 600c89aa5..7261f0c9e 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -850,7 +850,6 @@ FullO3CPU<Impl>::insertThread(ThreadID tid) //Reset ROB/IQ/LSQ Entries commit.rob->resetEntries(); - iew.resetEntries(); } template <class Impl> diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 90024bc84..1159850f8 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2013, 2016 ARM Limited + * Copyright (c) 2011-2013, 2016-2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -125,6 +125,7 @@ class FullO3CPU : public BaseO3CPU BaseTLB *itb; BaseTLB *dtb; + using LSQRequest = typename LSQ<Impl>::LSQRequest; /** Overall CPU status. */ Status _status; @@ -733,21 +734,25 @@ class FullO3CPU : public BaseO3CPU /** Available thread ids in the cpu*/ std::vector<ThreadID> tids; + /** CPU pushRequest function, forwards request to LSQ. */ + Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, + unsigned int size, Addr addr, Request::Flags flags, + uint64_t *res) + { + return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr, + flags, res); + } + /** CPU read function, forwards read to LSQ. */ - Fault read(const RequestPtr &req, - RequestPtr &sreqLow, RequestPtr &sreqHigh, - int load_idx) + Fault read(LSQRequest* req, int load_idx) { - return this->iew.ldstQueue.read(req, sreqLow, sreqHigh, load_idx); + return this->iew.ldstQueue.read(req, load_idx); } /** CPU write function, forwards write to LSQ. */ - Fault write(const RequestPtr &req, - const RequestPtr &sreqLow, const RequestPtr &sreqHigh, - uint8_t *data, int store_idx) + Fault write(LSQRequest* req, uint8_t *data, int store_idx) { - return this->iew.ldstQueue.write(req, sreqLow, sreqHigh, - data, store_idx); + return this->iew.ldstQueue.write(req, data, store_idx); } /** Used by the fetch unit to get a hold of the instruction port. */ diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index e706b09a1..3d5d84886 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2013 ARM Limited + * Copyright (c) 2010-2013, 2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved. * @@ -744,14 +744,6 @@ DefaultIEW<Impl>::updateStatus() } template <class Impl> -void -DefaultIEW<Impl>::resetEntries() -{ - instQueue.resetEntries(); - ldstQueue.resetEntries(); -} - -template <class Impl> bool DefaultIEW<Impl>::checkStall(ThreadID tid) { @@ -1353,7 +1345,7 @@ DefaultIEW<Impl>::executeInsts() DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: %s " "[sn:%lli], inst PC: %s [sn:%lli]. Addr is: %#x.\n", violator->pcState(), violator->seqNum, - inst->pcState(), inst->seqNum, inst->physEffAddrLow); + inst->pcState(), inst->seqNum, inst->physEffAddr); fetchRedirect[tid] = true; @@ -1376,7 +1368,7 @@ DefaultIEW<Impl>::executeInsts() DPRINTF(IEW, "LDSTQ detected a violation. Violator PC: " "%s, inst PC: %s. Addr is: %#x.\n", violator->pcState(), inst->pcState(), - inst->physEffAddrLow); + inst->physEffAddr); DPRINTF(IEW, "Violation will not be handled because " "already squashing\n"); @@ -1460,6 +1452,8 @@ DefaultIEW<Impl>::tick() wroteToTimeBuffer = false; updatedQueues = false; + ldstQueue.tick(); + sortInsts(); // Free function units marked as being freed this cycle. 
diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index a8895f8ff..4a55a91ea 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2014 ARM Limited + * Copyright (c) 2011-2014, 2017-2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved. * @@ -1140,9 +1140,6 @@ template <class Impl> void InstructionQueue<Impl>::blockMemInst(const DynInstPtr &blocked_inst) { - blocked_inst->translationStarted(false); - blocked_inst->translationCompleted(false); - blocked_inst->clearIssued(); blocked_inst->clearCanIssue(); blockedMemInsts.push_back(blocked_inst); @@ -1285,9 +1282,9 @@ InstructionQueue<Impl>::doSquash(ThreadID tid) squashed_inst); } - ++iqSquashedOperandsExamined; } + } else if (!squashed_inst->isStoreConditional() || !squashed_inst->isCompleted()) { NonSpecMapIt ns_inst_it = diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 2b2d39bf7..003726c7c 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012, 2014 ARM Limited + * Copyright (c) 2011-2012, 2014, 2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -47,8 +47,9 @@ #include <map> #include <queue> -#include "cpu/o3/lsq_unit.hh" +#include "arch/generic/tlb.hh" #include "cpu/inst_seq.hh" +#include "cpu/o3/lsq_unit.hh" #include "enums/SMTQueuePolicy.hh" #include "mem/port.hh" #include "sim/sim_object.hh" @@ -56,13 +57,659 @@ struct DerivO3CPUParams; template <class Impl> -class LSQ { +class LSQ + +{ public: typedef typename Impl::O3CPU O3CPU; typedef typename Impl::DynInstPtr DynInstPtr; typedef typename Impl::CPUPol::IEW IEW; typedef typename Impl::CPUPol::LSQUnit LSQUnit; + class LSQRequest; + /** Derived class to hold any sender state the LSQ needs. */ + class LSQSenderState : public Packet::SenderState + { + protected: + /** The senderState needs to know the LSQRequest who owns it. */ + LSQRequest* _request; + + /** Default constructor. */ + LSQSenderState(LSQRequest* request, bool isLoad_) + : _request(request), mainPkt(nullptr), pendingPacket(nullptr), + outstanding(0), isLoad(isLoad_), needWB(isLoad_), isSplit(false), + pktToSend(false), deleted(false) + { } + public: + + /** Instruction which initiated the access to memory. */ + DynInstPtr inst; + /** The main packet from a split load, used during writeback. */ + PacketPtr mainPkt; + /** A second packet from a split store that needs sending. */ + PacketPtr pendingPacket; + /** Number of outstanding packets to complete. */ + uint8_t outstanding; + /** Whether or not it is a load. */ + bool isLoad; + /** Whether or not the instruction will need to writeback. */ + bool needWB; + /** Whether or not this access is split in two. */ + bool isSplit; + /** Whether or not there is a packet that needs sending. */ + bool pktToSend; + /** Has the request been deleted? + * LSQ entries can be squashed before the response comes back. in that + * case the SenderState knows. + */ + bool deleted; + ContextID contextId() { return inst->contextId(); } + + /** Completes a packet and returns whether the access is finished. */ + inline bool isComplete() { return outstanding == 0; } + inline void deleteRequest() { deleted = true; } + inline bool alive() { return !deleted; } + LSQRequest* request() { return _request; } + virtual void complete() = 0; + void writebackDone() { _request->writebackDone(); } + }; + + /** Memory operation metadata. 
+ * This class holds the information about a memory operation. It lives + * from initiateAcc to resource deallocation at commit or squash. + * LSQRequest objects are owned by the LQ/SQ Entry in the LSQUnit that + * holds the operation. It is also used by the LSQSenderState. In addition, + * the LSQRequest is a TranslationState, therefore, upon squash, there must + * be a defined ownership transferal in case the LSQ resources are + * deallocated before the TLB is done using the TranslationState. If that + * happens, the LSQRequest will be self-owned, and responsible to detect + * that its services are no longer required and self-destruct. + * + * Lifetime of a LSQRequest: + * +--------------------+ + * |LSQ creates and owns| + * +--------------------+ + * | + * +--------------------+ + * | Initate translation| + * +--------------------+ + * | + * ___^___ + * ___/ \___ + * ______/ Squashed? \ + * | \___ ___/ + * | \___ ___/ + * | v + * | | + * | +--------------------+ + * | | Translation done | + * | +--------------------+ + * | | + * | +--------------------+ + * | | Send packet |<------+ + * | +--------------------+ | + * | | | + * | ___^___ | + * | ___/ \___ | + * | ____/ Squashed? \ | + * | | \___ ___/ | + * | | \___ ___/ | + * | | v | + * | | | | + * | | ___^___ | + * | | ___/ \___ | + * | | / Done? \__________| + * | | \___ ___/ + * | | \___ ___/ + * | | v + * | | | + * | | +--------------------+ + * | | | Manage stuff | + * | | | Free resources | + * | | +--------------------+ + * | | + * | | +--------------------+ + * | | | senderState owns | + * | +->| onRecvTimingResp | + * | | free resources | + * | +--------------------+ + * | + * | +----------------------+ + * | | self owned (Trans) | + * +-->| on TranslationFinish | + * | free resources | + * +----------------------+ + * + * + */ + class LSQRequest : public BaseTLB::Translation + { + protected: + typedef uint32_t FlagsStorage; + typedef ::Flags<FlagsStorage> FlagsType; + + enum Flag : FlagsStorage + { + IsLoad = 0x00000001, + /** True if this is a store that writes registers (SC). */ + WbStore = 0x00000002, + Delayed = 0x00000004, + IsSplit = 0x00000008, + /** True if any translation has been sent to TLB. */ + TranslationStarted = 0x00000010, + /** True if there are un-replied outbound translations.. */ + TranslationFinished = 0x00000020, + Sent = 0x00000040, + Retry = 0x00000080, + Complete = 0x00000100, + /** Ownership tracking flags. */ + /** Translation squashed. */ + TranslationSquashed = 0x00000200, + /** Request discarded */ + Discarded = 0x00000400, + /** LSQ resources freed. */ + LSQEntryFreed = 0x00000800, + /** Store written back. */ + WritebackScheduled = 0x00001000, + WritebackDone = 0x00002000 + }; + FlagsType flags; + + enum class State + { + NotIssued, + Translation, + Request, + Complete, + Squashed, + Fault, + }; + State _state; + LSQSenderState* _senderState; + void setState(const State& newState) { _state = newState; } + + uint32_t numTranslatedFragments; + uint32_t numInTranslationFragments; + + /** LQ/SQ entry idx. 
*/ + uint32_t _entryIdx; + + void markDelayed() { flags.set(Flag::Delayed); } + bool isDelayed() { return flags.isSet(Flag::Delayed); } + + public: + LSQUnit& _port; + const DynInstPtr _inst; + uint32_t _taskId; + PacketDataPtr _data; + std::vector<PacketPtr> _packets; + std::vector<RequestPtr> _requests; + std::vector<Fault> _fault; + uint64_t* _res; + const Addr _addr; + const uint32_t _size; + const Request::Flags _flags; + uint32_t _numOutstandingPackets; + protected: + LSQUnit* lsqUnit() { return &_port; } + LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad) : + _state(State::NotIssued), _senderState(nullptr), + _port(*port), _inst(inst), _data(nullptr), + _res(nullptr), _addr(0), _size(0), _flags(0), + _numOutstandingPackets(0) + { + flags.set(Flag::IsLoad, isLoad); + flags.set(Flag::WbStore, _inst->isStoreConditional()); + install(); + } + LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, + const Addr& addr, const uint32_t& size, + const Request::Flags& flags_, + PacketDataPtr data = nullptr, uint64_t* res = nullptr) + : _state(State::NotIssued), _senderState(nullptr), + numTranslatedFragments(0), + numInTranslationFragments(0), + _port(*port), _inst(inst), _data(data), + _res(res), _addr(addr), _size(size), + _flags(flags_), + _numOutstandingPackets(0) + { + flags.set(Flag::IsLoad, isLoad); + flags.set(Flag::WbStore, _inst->isStoreConditional()); + install(); + } + + bool + isLoad() const + { + return flags.isSet(Flag::IsLoad); + } + + /** Install the request in the LQ/SQ. */ + void install() + { + if (isLoad()) { + _port.loadQueue[_inst->lqIdx].setRequest(this); + } else { + _port.storeQueue[_inst->sqIdx].setRequest(this); + } + } + virtual bool + squashed() const override + { + return _inst->isSquashed(); + } + + /** + * Test if the LSQRequest has been released, i.e. self-owned. + * An LSQRequest manages itself when the resources on the LSQ are freed + * but the translation is still going on and the LSQEntry was freed. + */ + bool + isReleased() + { + return flags.isSet(Flag::LSQEntryFreed) || + flags.isSet(Flag::Discarded); + } + + /** Release the LSQRequest. + * Notify the sender state that the request it points to is not valid + * anymore. Understand if the request is orphan (self-managed) and if + * so, mark it as freed, else destroy it, as this means + * the end of its life cycle. + * An LSQRequest is orphan when its resources are released + * but there is any in-flight translation request to the TLB or access + * request to the memory. + */ + void release(Flag reason) + { + assert(reason == Flag::LSQEntryFreed || reason == Flag::Discarded); + if (!isAnyOutstandingRequest()) { + delete this; + } else { + if (_senderState) { + _senderState->deleteRequest(); + } + flags.set(reason); + } + } + + /** Destructor. + * The LSQRequest owns the request. If the packet has already been + * sent, the sender state will be deleted upon receiving the reply. + */ + virtual ~LSQRequest() + { + assert(!isAnyOutstandingRequest()); + _inst->savedReq = nullptr; + if (_senderState) + delete _senderState; + + for (auto r: _packets) + delete r; + }; + + + public: + /** Convenience getters/setters. */ + /** @{ */ + /** Set up Context numbers. */ + void + setContext(const ContextID& context_id) + { + request()->setContext(context_id); + } + + const DynInstPtr& + instruction() + { + return _inst; + } + + /** Set up virtual request. + * For a previously allocated Request objects. 
+ */ + void + setVirt(int asid, Addr vaddr, unsigned size, Request::Flags flags_, + MasterID mid, Addr pc) + { + request()->setVirt(asid, vaddr, size, flags_, mid, pc); + } + + void + taskId(const uint32_t& v) + { + _taskId = v; + for (auto& r: _requests) + r->taskId(v); + } + + uint32_t taskId() const { return _taskId; } + RequestPtr request(int idx = 0) { return _requests.at(idx); } + + const RequestPtr + request(int idx = 0) const + { + return _requests.at(idx); + } + + Addr getVaddr(int idx = 0) const { return request(idx)->getVaddr(); } + virtual void initiateTranslation() = 0; + + PacketPtr packet(int idx = 0) { return _packets.at(idx); } + + virtual PacketPtr + mainPacket() + { + assert (_packets.size() == 1); + return packet(); + } + + virtual RequestPtr + mainRequest() + { + assert (_requests.size() == 1); + return request(); + } + + void + senderState(LSQSenderState* st) + { + _senderState = st; + for (auto& pkt: _packets) { + if (pkt) + pkt->senderState = st; + } + } + + const LSQSenderState* + senderState() const + { + return _senderState; + } + + /** + * Mark senderState as discarded. This will cause to discard response + * packets from the cache. + */ + void + discardSenderState() + { + assert(_senderState); + _senderState->deleteRequest(); + } + + /** + * Test if there is any in-flight translation or mem access request + */ + bool + isAnyOutstandingRequest() + { + return numInTranslationFragments > 0 || + _numOutstandingPackets > 0 || + (flags.isSet(Flag::WritebackScheduled) && + !flags.isSet(Flag::WritebackDone)); + } + + bool + isSplit() const + { + return flags.isSet(Flag::IsSplit); + } + /** @} */ + virtual bool recvTimingResp(PacketPtr pkt) = 0; + virtual void sendPacketToCache() = 0; + virtual void buildPackets() = 0; + + /** + * Memory mapped IPR accesses + */ + virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt) = 0; + virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt) = 0; + + /** + * Test if the request accesses a particular cache line. + */ + virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask) = 0; + + /** Update the status to reflect that a packet was sent. */ + void + packetSent() + { + flags.set(Flag::Sent); + } + /** Update the status to reflect that a packet was not sent. + * When a packet fails to be sent, we mark the request as needing a + * retry. Note that Retry flag is sticky. + */ + void + packetNotSent() + { + flags.set(Flag::Retry); + flags.clear(Flag::Sent); + } + + void sendFragmentToTranslation(int i); + bool + isComplete() + { + return flags.isSet(Flag::Complete); + } + + bool + isInTranslation() + { + return _state == State::Translation; + } + + bool + isTranslationComplete() + { + return flags.isSet(Flag::TranslationStarted) && + !isInTranslation(); + } + + bool + isTranslationBlocked() + { + return _state == State::Translation && + flags.isSet(Flag::TranslationStarted) && + !flags.isSet(Flag::TranslationFinished); + } + + bool + isSent() + { + return flags.isSet(Flag::Sent); + } + + /** + * The LSQ entry is cleared + */ + void + freeLSQEntry() + { + release(Flag::LSQEntryFreed); + } + + /** + * The request is discarded (e.g. 
partial store-load forwarding) + */ + void + discard() + { + release(Flag::Discarded); + } + + void + packetReplied() + { + assert(_numOutstandingPackets > 0); + _numOutstandingPackets--; + if (_numOutstandingPackets == 0 && isReleased()) + delete this; + } + + void + writebackScheduled() + { + assert(!flags.isSet(Flag::WritebackScheduled)); + flags.set(Flag::WritebackScheduled); + } + + void + writebackDone() + { + flags.set(Flag::WritebackDone); + /* If the lsq resources are already free */ + if (isReleased()) { + delete this; + } + } + + void + squashTranslation() + { + assert(numInTranslationFragments == 0); + flags.set(Flag::TranslationSquashed); + /* If we are on our own, self-destruct. */ + if (isReleased()) { + delete this; + } + } + + void + complete() + { + flags.set(Flag::Complete); + } + }; + + class SingleDataRequest : public LSQRequest + { + protected: + /* Given that we are inside templates, children need explicit + * declaration of the names in the parent class. */ + using Flag = typename LSQRequest::Flag; + using State = typename LSQRequest::State; + using LSQRequest::_fault; + using LSQRequest::_inst; + using LSQRequest::_packets; + using LSQRequest::_port; + using LSQRequest::_res; + using LSQRequest::_senderState; + using LSQRequest::_state; + using LSQRequest::flags; + using LSQRequest::isLoad; + using LSQRequest::isTranslationComplete; + using LSQRequest::lsqUnit; + using LSQRequest::request; + using LSQRequest::sendFragmentToTranslation; + using LSQRequest::setState; + using LSQRequest::numInTranslationFragments; + using LSQRequest::numTranslatedFragments; + using LSQRequest::_numOutstandingPackets; + public: + SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, + const Addr& addr, const uint32_t& size, + const Request::Flags& flags_, + PacketDataPtr data = nullptr, + uint64_t* res = nullptr) : + LSQRequest(port, inst, isLoad, addr, size, flags_, data, res) + { + LSQRequest::_requests.push_back( + std::make_shared<Request>(inst->getASID(), addr, size, flags_, + inst->masterId(), inst->instAddr(), inst->contextId())); + LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum); + } + inline virtual ~SingleDataRequest() {} + virtual void initiateTranslation(); + virtual void finish(const Fault &fault, const RequestPtr &req, + ThreadContext* tc, BaseTLB::Mode mode); + virtual bool recvTimingResp(PacketPtr pkt); + virtual void sendPacketToCache(); + virtual void buildPackets(); + virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt); + virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt); + virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask); + }; + + class SplitDataRequest : public LSQRequest + { + protected: + /* Given that we are inside templates, children need explicit + * declaration of the names in the parent class. 
*/ + using Flag = typename LSQRequest::Flag; + using State = typename LSQRequest::State; + using LSQRequest::_addr; + using LSQRequest::_data; + using LSQRequest::_fault; + using LSQRequest::_flags; + using LSQRequest::_inst; + using LSQRequest::_packets; + using LSQRequest::_port; + using LSQRequest::_requests; + using LSQRequest::_res; + using LSQRequest::_senderState; + using LSQRequest::_size; + using LSQRequest::_state; + using LSQRequest::_taskId; + using LSQRequest::flags; + using LSQRequest::isLoad; + using LSQRequest::isTranslationComplete; + using LSQRequest::lsqUnit; + using LSQRequest::numInTranslationFragments; + using LSQRequest::numTranslatedFragments; + using LSQRequest::request; + using LSQRequest::sendFragmentToTranslation; + using LSQRequest::setState; + using LSQRequest::_numOutstandingPackets; + + uint32_t numFragments; + uint32_t numReceivedPackets; + RequestPtr mainReq; + PacketPtr _mainPacket; + + + public: + SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, + const Addr& addr, const uint32_t& size, + const Request::Flags & flags_, + PacketDataPtr data = nullptr, + uint64_t* res = nullptr) : + LSQRequest(port, inst, isLoad, addr, size, flags_, data, res), + numFragments(0), + numReceivedPackets(0), + mainReq(nullptr), + _mainPacket(nullptr) + { + flags.set(Flag::IsSplit); + } + virtual ~SplitDataRequest() + { + if (mainReq) { + mainReq = nullptr; + } + if (_mainPacket) { + delete _mainPacket; + _mainPacket = nullptr; + } + } + virtual void finish(const Fault &fault, const RequestPtr &req, + ThreadContext* tc, BaseTLB::Mode mode); + virtual bool recvTimingResp(PacketPtr pkt); + virtual void initiateTranslation(); + virtual void sendPacketToCache(); + virtual void buildPackets(); + + virtual void handleIprWrite(ThreadContext *thread, PacketPtr pkt); + virtual Cycles handleIprRead(ThreadContext *thread, PacketPtr pkt); + virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask); + + virtual RequestPtr mainRequest(); + virtual PacketPtr mainPacket(); + }; + /** Constructs an LSQ with the given parameters. */ LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params); ~LSQ() { } @@ -85,17 +732,9 @@ class LSQ { /** Number of entries needed for the given amount of threads.*/ int entryAmount(ThreadID num_threads); - void removeEntries(ThreadID tid); - /** Reset the max entries for each thread. */ - void resetEntries(); - /** Resize the max entries for a thread. */ - void resizeEntries(unsigned size, ThreadID tid); /** Ticks the LSQ. */ - void tick(); - /** Ticks a specific LSQ Unit. */ - void tick(ThreadID tid) - { thread[tid].tick(); } + void tick() { usedStorePorts = 0; } /** Inserts a load into the LSQ. */ void insertLoad(const DynInstPtr &load_inst); @@ -112,13 +751,13 @@ class LSQ { * Commits loads up until the given sequence number for a specific thread. */ void commitLoads(InstSeqNum &youngest_inst, ThreadID tid) - { thread[tid].commitLoads(youngest_inst); } + { thread.at(tid).commitLoads(youngest_inst); } /** * Commits stores up until the given sequence number for a specific thread. */ void commitStores(InstSeqNum &youngest_inst, ThreadID tid) - { thread[tid].commitStores(youngest_inst); } + { thread.at(tid).commitStores(youngest_inst); } /** * Attempts to write back stores until all cache ports are used or the @@ -131,8 +770,11 @@ class LSQ { /** * Squash instructions from a thread until the specified sequence number. 
*/ - void squash(const InstSeqNum &squashed_num, ThreadID tid) - { thread[tid].squash(squashed_num); } + void + squash(const InstSeqNum &squashed_num, ThreadID tid) + { + thread.at(tid).squash(squashed_num); + } /** Returns whether or not there was a memory ordering violation. */ bool violation(); @@ -140,50 +782,49 @@ class LSQ { * Returns whether or not there was a memory ordering violation for a * specific thread. */ - bool violation(ThreadID tid) - { return thread[tid].violation(); } + bool violation(ThreadID tid) { return thread.at(tid).violation(); } /** Gets the instruction that caused the memory ordering violation. */ - DynInstPtr getMemDepViolator(ThreadID tid) - { return thread[tid].getMemDepViolator(); } + DynInstPtr + getMemDepViolator(ThreadID tid) + { + return thread.at(tid).getMemDepViolator(); + } /** Returns the head index of the load queue for a specific thread. */ - int getLoadHead(ThreadID tid) - { return thread[tid].getLoadHead(); } + int getLoadHead(ThreadID tid) { return thread.at(tid).getLoadHead(); } /** Returns the sequence number of the head of the load queue. */ - InstSeqNum getLoadHeadSeqNum(ThreadID tid) + InstSeqNum + getLoadHeadSeqNum(ThreadID tid) { - return thread[tid].getLoadHeadSeqNum(); + return thread.at(tid).getLoadHeadSeqNum(); } /** Returns the head index of the store queue. */ - int getStoreHead(ThreadID tid) - { return thread[tid].getStoreHead(); } + int getStoreHead(ThreadID tid) { return thread.at(tid).getStoreHead(); } /** Returns the sequence number of the head of the store queue. */ - InstSeqNum getStoreHeadSeqNum(ThreadID tid) + InstSeqNum + getStoreHeadSeqNum(ThreadID tid) { - return thread[tid].getStoreHeadSeqNum(); + return thread.at(tid).getStoreHeadSeqNum(); } /** Returns the number of instructions in all of the queues. */ int getCount(); /** Returns the number of instructions in the queues of one thread. */ - int getCount(ThreadID tid) - { return thread[tid].getCount(); } + int getCount(ThreadID tid) { return thread.at(tid).getCount(); } /** Returns the total number of loads in the load queue. */ int numLoads(); /** Returns the total number of loads for a single thread. */ - int numLoads(ThreadID tid) - { return thread[tid].numLoads(); } + int numLoads(ThreadID tid) { return thread.at(tid).numLoads(); } /** Returns the total number of stores in the store queue. */ int numStores(); /** Returns the total number of stores for a single thread. */ - int numStores(ThreadID tid) - { return thread[tid].numStores(); } + int numStores(ThreadID tid) { return thread.at(tid).numStores(); } /** Returns the number of free load entries. */ unsigned numFreeLoadEntries(); @@ -242,46 +883,39 @@ class LSQ { /** Returns whether or not a specific thread has any stores to write back * to memory. */ - bool hasStoresToWB(ThreadID tid) - { return thread[tid].hasStoresToWB(); } + bool hasStoresToWB(ThreadID tid) { return thread.at(tid).hasStoresToWB(); } /** Returns the number of stores a specific thread has to write back. */ - int numStoresToWB(ThreadID tid) - { return thread[tid].numStoresToWB(); } + int numStoresToWB(ThreadID tid) { return thread.at(tid).numStoresToWB(); } /** Returns if the LSQ will write back to memory this cycle. */ bool willWB(); /** Returns if the LSQ of a specific thread will write back to memory this * cycle. */ - bool willWB(ThreadID tid) - { return thread[tid].willWB(); } + bool willWB(ThreadID tid) { return thread.at(tid).willWB(); } /** Debugging function to print out all instructions. 
*/ void dumpInsts() const; /** Debugging function to print out instructions from a specific thread. */ - void dumpInsts(ThreadID tid) const - { thread[tid].dumpInsts(); } + void dumpInsts(ThreadID tid) const { thread.at(tid).dumpInsts(); } /** Executes a read operation, using the load specified at the load * index. */ - Fault read(const RequestPtr &req, - RequestPtr &sreqLow, RequestPtr &sreqHigh, - int load_idx); + Fault read(LSQRequest* req, int load_idx); /** Executes a store operation, using the store specified at the store * index. */ - Fault write(const RequestPtr &req, - const RequestPtr &sreqLow, const RequestPtr &sreqHigh, - uint8_t *data, int store_idx); + Fault write(LSQRequest* req, uint8_t *data, int store_idx); /** * Retry the previous send that failed. */ void recvReqRetry(); + void completeDataAccess(PacketPtr pkt); /** * Handles writing back and completing the load or store that has * returned from memory. @@ -292,13 +926,34 @@ class LSQ { void recvTimingSnoopReq(PacketPtr pkt); + Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, + unsigned int size, Addr addr, Request::Flags flags, + uint64_t *res); + /** The CPU pointer. */ O3CPU *cpu; /** The IEW stage pointer. */ IEW *iewStage; + /** Is D-cache blocked? */ + bool cacheBlocked() const; + /** Set D-cache blocked status */ + void cacheBlocked(bool v); + /** Is any store port available to use? */ + bool storePortAvailable() const; + /** Another store port is in use */ + void storePortBusy(); + protected: + /** D-cache is blocked */ + bool _cacheBlocked; + /** The number of cache ports available each cycle (stores only). */ + int cacheStorePorts; + /** The number of used cache ports in this cycle by stores. */ + int usedStorePorts; + + /** The LSQ policy for SMT mode. */ SMTQueuePolicy lsqPolicy; @@ -307,8 +962,10 @@ class LSQ { * and threshold, this function calculates how many resources each thread * can occupy at most. 
*/ - static uint32_t maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries, - uint32_t numThreads, uint32_t SMTThreshold) { + static uint32_t + maxLSQAllocation(SMTQueuePolicy pol, uint32_t entries, + uint32_t numThreads, uint32_t SMTThreshold) + { if (pol == SMTQueuePolicy::Dynamic) { return entries; } else if (pol == SMTQueuePolicy::Partitioned) { @@ -346,24 +1003,20 @@ class LSQ { template <class Impl> Fault -LSQ<Impl>::read(const RequestPtr &req, - RequestPtr &sreqLow, RequestPtr &sreqHigh, - int load_idx) +LSQ<Impl>::read(LSQRequest* req, int load_idx) { - ThreadID tid = cpu->contextToThread(req->contextId()); + ThreadID tid = cpu->contextToThread(req->request()->contextId()); - return thread[tid].read(req, sreqLow, sreqHigh, load_idx); + return thread.at(tid).read(req, load_idx); } template <class Impl> Fault -LSQ<Impl>::write(const RequestPtr &req, - const RequestPtr &sreqLow, const RequestPtr &sreqHigh, - uint8_t *data, int store_idx) +LSQ<Impl>::write(LSQRequest* req, uint8_t *data, int store_idx) { - ThreadID tid = cpu->contextToThread(req->contextId()); + ThreadID tid = cpu->contextToThread(req->request()->contextId()); - return thread[tid].write(req, sreqLow, sreqHigh, data, store_idx); + return thread.at(tid).write(req, data, store_idx); } #endif // __CPU_O3_LSQ_HH__ diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index edc3f469b..8a221a8d5 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012, 2014 ARM Limited + * Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -61,6 +61,8 @@ using namespace std; template <class Impl> LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params) : cpu(cpu_ptr), iewStage(iew_ptr), + _cacheBlocked(false), + cacheStorePorts(params->cacheStorePorts), usedStorePorts(0), lsqPolicy(params->smtLSQPolicy), LQEntries(params->LQEntries), SQEntries(params->SQEntries), @@ -76,8 +78,8 @@ LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params) //************ Handle SMT Parameters ***********/ //**********************************************/ - //Figure out fetch policy - if (lsqPolicy == SMTQueuePolicy::Dynamic) { + /* Run SMT olicy checks. 
*/ + if (lsqPolicy == SMTQueuePolicy::Dynamic) { DPRINTF(LSQ, "LSQ sharing policy set to Dynamic\n"); } else if (lsqPolicy == SMTQueuePolicy::Partitioned) { DPRINTF(Fetch, "LSQ sharing policy set to Partitioned: " @@ -85,8 +87,8 @@ LSQ<Impl>::LSQ(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params) maxLQEntries,maxSQEntries); } else if (lsqPolicy == SMTQueuePolicy::Threshold) { - assert(params->smtLSQThreshold > LQEntries); - assert(params->smtLSQThreshold > SQEntries); + assert(params->smtLSQThreshold > params->LQEntries); + assert(params->smtLSQThreshold > params->SQEntries); DPRINTF(LSQ, "LSQ sharing policy set to Threshold: " "%i entries per LQ | %i entries per SQ\n", @@ -163,79 +165,41 @@ template <class Impl> void LSQ<Impl>::takeOverFrom() { + usedStorePorts = 0; + _cacheBlocked = false; + for (ThreadID tid = 0; tid < numThreads; tid++) { thread[tid].takeOverFrom(); } } -template <class Impl> -int -LSQ<Impl>::entryAmount(ThreadID num_threads) -{ - if (lsqPolicy == SMTQueuePolicy::Partitioned) { - return LQEntries / num_threads; - } else { - return 0; - } -} - -template <class Impl> -void -LSQ<Impl>::resetEntries() +template<class Impl> +bool +LSQ<Impl>::cacheBlocked() const { - if (lsqPolicy != SMTQueuePolicy::Dynamic || numThreads > 1) { - int active_threads = activeThreads->size(); - - int maxEntries; - - if (lsqPolicy == SMTQueuePolicy::Partitioned) { - maxEntries = LQEntries / active_threads; - } else if (lsqPolicy == SMTQueuePolicy::Threshold && - active_threads == 1) { - maxEntries = LQEntries; - } else { - maxEntries = LQEntries; - } - - list<ThreadID>::iterator threads = activeThreads->begin(); - list<ThreadID>::iterator end = activeThreads->end(); - - while (threads != end) { - ThreadID tid = *threads++; - - resizeEntries(maxEntries, tid); - } - } + return _cacheBlocked; } template<class Impl> void -LSQ<Impl>::removeEntries(ThreadID tid) +LSQ<Impl>::cacheBlocked(bool v) { - thread[tid].clearLQ(); - thread[tid].clearSQ(); + _cacheBlocked = v; } template<class Impl> -void -LSQ<Impl>::resizeEntries(unsigned size, ThreadID tid) +bool +LSQ<Impl>::storePortAvailable() const { - thread[tid].resizeLQ(size); - thread[tid].resizeSQ(size); + return usedStorePorts < cacheStorePorts; } template<class Impl> void -LSQ<Impl>::tick() +LSQ<Impl>::storePortBusy() { - list<ThreadID>::iterator threads = activeThreads->begin(); - list<ThreadID>::iterator end = activeThreads->end(); - - while (threads != end) { - ThreadID tid = *threads++; - - thread[tid].tick(); - } + usedStorePorts++; + assert(usedStorePorts <= cacheStorePorts); } template<class Impl> @@ -316,6 +280,7 @@ void LSQ<Impl>::recvReqRetry() { iewStage->cacheUnblocked(); + cacheBlocked(false); for (ThreadID tid : *activeThreads) { thread[tid].recvRetry(); @@ -323,6 +288,15 @@ LSQ<Impl>::recvReqRetry() } template <class Impl> +void +LSQ<Impl>::completeDataAccess(PacketPtr pkt) +{ + auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState); + thread[cpu->contextToThread(senderState->contextId())] + .completeDataAccess(pkt); +} + +template <class Impl> bool LSQ<Impl>::recvTimingResp(PacketPtr pkt) { @@ -330,8 +304,10 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt) DPRINTF(LSQ, "Got error packet back for address: %#X\n", pkt->getAddr()); - thread[cpu->contextToThread(pkt->req->contextId())] - .completeDataAccess(pkt); + auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState); + panic_if(!senderState, "Got packet back with unknown sender state\n"); + + 
thread[cpu->contextToThread(senderState->contextId())].recvTimingResp(pkt); if (pkt->isInvalidate()) { // This response also contains an invalidate; e.g. this can be the case @@ -352,8 +328,9 @@ LSQ<Impl>::recvTimingResp(PacketPtr pkt) thread[tid].checkSnoop(pkt); } } + // Update the LSQRequest state (this may delete the request) + senderState->request()->packetReplied(); - delete pkt; return true; } @@ -681,4 +658,442 @@ LSQ<Impl>::dumpInsts() const } } +static Addr +addrBlockOffset(Addr addr, unsigned int block_size) +{ + return addr & (block_size - 1); +} + +static Addr +addrBlockAlign(Addr addr, uint64_t block_size) +{ + return addr & ~(block_size - 1); +} + +static bool +transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size) +{ + return (addrBlockOffset(addr, block_size) + size) > block_size; +} + +template<class Impl> +Fault +LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, + unsigned int size, Addr addr, Request::Flags flags, + uint64_t *res) +{ + ThreadID tid = cpu->contextToThread(inst->contextId()); + auto cacheLineSize = cpu->cacheLineSize(); + bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize); + LSQRequest* req = nullptr; + + if (inst->translationStarted()) { + req = inst->savedReq; + assert(req); + } else { + if (needs_burst) { + req = new SplitDataRequest(&thread[tid], inst, isLoad, addr, + size, flags, data, res); + } else { + req = new SingleDataRequest(&thread[tid], inst, isLoad, addr, + size, flags, data, res); + } + assert(req); + inst->setRequest(); + req->taskId(cpu->taskId()); + + req->initiateTranslation(); + } + + /* This is the place were instructions get the effAddr. */ + if (req->isTranslationComplete()) { + if (inst->getFault() == NoFault) { + inst->effAddr = req->getVaddr(); + inst->effSize = size; + inst->effAddrValid(true); + + if (cpu->checker) { + inst->reqToVerify = std::make_shared<Request>(*req->request()); + } + if (isLoad) + inst->getFault() = cpu->read(req, inst->lqIdx); + else + inst->getFault() = cpu->write(req, data, inst->sqIdx); + } else if (isLoad) { + // Commit will have to clean up whatever happened. Set this + // instruction as executed. + inst->setExecuted(); + } + } + + if (inst->traceData) + inst->traceData->setMem(addr, size, flags); + + return inst->getFault(); +} + +template<class Impl> +void +LSQ<Impl>::SingleDataRequest::finish(const Fault &fault, const RequestPtr &req, + ThreadContext* tc, BaseTLB::Mode mode) +{ + _fault.push_back(fault); + numInTranslationFragments = 0; + numTranslatedFragments = 1; + /* If the instruction has been squahsed, let the request know + * as it may have to self-destruct. 
*/ + if (_inst->isSquashed()) { + this->squashTranslation(); + } else { + _inst->strictlyOrdered(req->isStrictlyOrdered()); + + flags.set(Flag::TranslationFinished); + if (fault == NoFault) { + _inst->physEffAddr = req->getPaddr(); + _inst->memReqFlags = req->getFlags(); + if (req->isCondSwap()) { + assert(_res); + req->setExtraData(*_res); + } + setState(State::Request); + } else { + setState(State::Fault); + } + + LSQRequest::_inst->fault = fault; + LSQRequest::_inst->translationCompleted(true); + } +} + +template<class Impl> +void +LSQ<Impl>::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req, + ThreadContext* tc, BaseTLB::Mode mode) +{ + _fault.push_back(fault); + assert(req == _requests[numTranslatedFragments] || this->isDelayed()); + + numInTranslationFragments--; + numTranslatedFragments++; + + mainReq->setFlags(req->getFlags()); + + if (numTranslatedFragments == _requests.size()) { + if (_inst->isSquashed()) { + this->squashTranslation(); + } else { + _inst->strictlyOrdered(mainReq->isStrictlyOrdered()); + flags.set(Flag::TranslationFinished); + auto fault_it = _fault.begin(); + /* Ffwd to the first NoFault. */ + while (fault_it != _fault.end() && *fault_it == NoFault) + fault_it++; + /* If none of the fragments faulted: */ + if (fault_it == _fault.end()) { + _inst->physEffAddr = request(0)->getPaddr(); + + _inst->memReqFlags = mainReq->getFlags(); + if (mainReq->isCondSwap()) { + assert(_res); + mainReq->setExtraData(*_res); + } + setState(State::Request); + _inst->fault = NoFault; + } else { + setState(State::Fault); + _inst->fault = *fault_it; + } + _inst->translationCompleted(true); + } + } +} + +template<class Impl> +void +LSQ<Impl>::SingleDataRequest::initiateTranslation() +{ + _inst->translationStarted(true); + setState(State::Translation); + flags.set(Flag::TranslationStarted); + + _inst->savedReq = this; + sendFragmentToTranslation(0); + + if (isTranslationComplete()) { + } +} + +template<class Impl> +PacketPtr +LSQ<Impl>::SplitDataRequest::mainPacket() +{ + return _mainPacket; +} + +template<class Impl> +RequestPtr +LSQ<Impl>::SplitDataRequest::mainRequest() +{ + return mainReq; +} + +template<class Impl> +void +LSQ<Impl>::SplitDataRequest::initiateTranslation() +{ + _inst->translationStarted(true); + setState(State::Translation); + flags.set(Flag::TranslationStarted); + + unsigned int cacheLineSize = _port.cacheLineSize(); + Addr base_addr = _addr; + Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize); + Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize); + uint32_t size_so_far = 0; + + mainReq = std::make_shared<Request>(_inst->getASID(), base_addr, + _size, _flags, _inst->masterId(), + _inst->instAddr(), _inst->contextId()); + + // Paddr is not used in mainReq. However, we will accumulate the flags + // from the sub requests into mainReq by calling setFlags() in finish(). + // setFlags() assumes that paddr is set so flip the paddr valid bit here to + // avoid a potential assert in setFlags() when we call it from finish(). + mainReq->setPaddr(0); + + /* Get the pre-fix, possibly unaligned. */ + _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr, + next_addr - base_addr, _flags, _inst->masterId(), + _inst->instAddr(), _inst->contextId())); + size_so_far = next_addr - base_addr; + + /* We are block aligned now, reading whole blocks. 
*/ + base_addr = next_addr; + while (base_addr != final_addr) { + _requests.push_back(std::make_shared<Request>(_inst->getASID(), + base_addr, cacheLineSize, _flags, _inst->masterId(), + _inst->instAddr(), _inst->contextId())); + size_so_far += cacheLineSize; + base_addr += cacheLineSize; + } + + /* Deal with the tail. */ + if (size_so_far < _size) { + _requests.push_back(std::make_shared<Request>(_inst->getASID(), + base_addr, _size - size_so_far, _flags, _inst->masterId(), + _inst->instAddr(), _inst->contextId())); + } + + /* Setup the requests and send them to translation. */ + for (auto& r: _requests) { + r->setReqInstSeqNum(_inst->seqNum); + r->taskId(_taskId); + } + this->_inst->savedReq = this; + numInTranslationFragments = 0; + numTranslatedFragments = 0; + + for (uint32_t i = 0; i < _requests.size(); i++) { + sendFragmentToTranslation(i); + } +} + +template<class Impl> +void +LSQ<Impl>::LSQRequest::sendFragmentToTranslation(int i) +{ + numInTranslationFragments++; + _port.dTLB()->translateTiming( + this->request(i), + this->_inst->thread->getTC(), this, + this->isLoad() ? BaseTLB::Read : BaseTLB::Write); +} + +template<class Impl> +bool +LSQ<Impl>::SingleDataRequest::recvTimingResp(PacketPtr pkt) +{ + assert(_numOutstandingPackets == 1); + auto state = dynamic_cast<LSQSenderState*>(pkt->senderState); + setState(State::Complete); + flags.set(Flag::Complete); + state->outstanding--; + assert(pkt == _packets.front()); + _port.completeDataAccess(pkt); + return true; +} + +template<class Impl> +bool +LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt) +{ + auto state = dynamic_cast<LSQSenderState*>(pkt->senderState); + uint32_t pktIdx = 0; + while (pktIdx < _packets.size() && pkt != _packets[pktIdx]) + pktIdx++; + assert(pktIdx < _packets.size()); + assert(pkt->req == _requests[pktIdx]); + assert(pkt == _packets[pktIdx]); + numReceivedPackets++; + state->outstanding--; + if (numReceivedPackets == _packets.size()) { + setState(State::Complete); + flags.set(Flag::Complete); + /* Assemble packets. */ + PacketPtr resp = isLoad() + ? Packet::createRead(mainReq) + : Packet::createWrite(mainReq); + if (isLoad()) + resp->dataStatic(_inst->memData); + else + resp->dataStatic(_data); + resp->senderState = _senderState; + _port.completeDataAccess(resp); + delete resp; + } + return true; +} + +template<class Impl> +void +LSQ<Impl>::SingleDataRequest::buildPackets() +{ + assert(_senderState); + /* Retries do not create new packets. */ + if (_packets.size() == 0) { + _packets.push_back( + isLoad() + ? Packet::createRead(request()) + : Packet::createWrite(request())); + _packets.back()->dataStatic(_inst->memData); + _packets.back()->senderState = _senderState; + } + assert(_packets.size() == 1); +} + +template<class Impl> +void +LSQ<Impl>::SplitDataRequest::buildPackets() +{ + /* Extra data?? */ + ptrdiff_t offset = 0; + if (_packets.size() == 0) { + /* New stuff */ + if (isLoad()) { + _mainPacket = Packet::createRead(mainReq); + _mainPacket->dataStatic(_inst->memData); + } + for (auto& r: _requests) { + PacketPtr pkt = isLoad() ? 
Packet::createRead(r) + : Packet::createWrite(r); + if (isLoad()) { + pkt->dataStatic(_inst->memData + offset); + } else { + uint8_t* req_data = new uint8_t[r->getSize()]; + std::memcpy(req_data, + _inst->memData + offset, + r->getSize()); + pkt->dataDynamic(req_data); + } + offset += r->getSize(); + pkt->senderState = _senderState; + _packets.push_back(pkt); + } + } + assert(_packets.size() == _requests.size()); +} + +template<class Impl> +void +LSQ<Impl>::SingleDataRequest::sendPacketToCache() +{ + assert(_numOutstandingPackets == 0); + if (lsqUnit()->trySendPacket(isLoad(), _packets.at(0))) + _numOutstandingPackets = 1; +} + +template<class Impl> +void +LSQ<Impl>::SplitDataRequest::sendPacketToCache() +{ + /* Try to send the packets. */ + while (numReceivedPackets + _numOutstandingPackets < _packets.size() && + lsqUnit()->trySendPacket(isLoad(), + _packets.at(numReceivedPackets + _numOutstandingPackets))) { + _numOutstandingPackets++; + } +} + +template<class Impl> +void +LSQ<Impl>::SingleDataRequest::handleIprWrite(ThreadContext *thread, + PacketPtr pkt) +{ + TheISA::handleIprWrite(thread, pkt); +} + +template<class Impl> +void +LSQ<Impl>::SplitDataRequest::handleIprWrite(ThreadContext *thread, + PacketPtr mainPkt) +{ + unsigned offset = 0; + for (auto r: _requests) { + PacketPtr pkt = new Packet(r, MemCmd::WriteReq); + pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset); + TheISA::handleIprWrite(thread, pkt); + offset += r->getSize(); + delete pkt; + } +} + +template<class Impl> +Cycles +LSQ<Impl>::SingleDataRequest::handleIprRead(ThreadContext *thread, + PacketPtr pkt) +{ + return TheISA::handleIprRead(thread, pkt); +} + +template<class Impl> +Cycles +LSQ<Impl>::SplitDataRequest::handleIprRead(ThreadContext *thread, + PacketPtr mainPkt) +{ + Cycles delay(0); + unsigned offset = 0; + + for (auto r: _requests) { + PacketPtr pkt = new Packet(r, MemCmd::ReadReq); + pkt->dataStatic(mainPkt->getPtr<uint8_t>() + offset); + Cycles d = TheISA::handleIprRead(thread, pkt); + if (d > delay) + delay = d; + offset += r->getSize(); + delete pkt; + } + return delay; +} + +template<class Impl> +bool +LSQ<Impl>::SingleDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask) +{ + return ( (LSQRequest::_requests[0]->getPaddr() & blockMask) == blockAddr); +} + +template<class Impl> +bool +LSQ<Impl>::SplitDataRequest::isCacheBlockHit(Addr blockAddr, Addr blockMask) +{ + bool is_hit = false; + for (auto &r: _requests) { + if ((r->getPaddr() & blockMask) == blockAddr) { + is_hit = true; + break; + } + } + return is_hit; +} + #endif//__CPU_O3_LSQ_IMPL_HH__ diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 48a06b386..5b90da4f5 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2014,2017 ARM Limited + * Copyright (c) 2012-2014,2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -62,6 +62,7 @@ #include "mem/port.hh" struct DerivO3CPUParams; +#include "base/circular_queue.hh" /** * Class that implements the actual LQ and SQ for each specific @@ -76,7 +77,8 @@ struct DerivO3CPUParams; * replayed. 
*/ template <class Impl> -class LSQUnit { +class LSQUnit +{ public: typedef typename Impl::O3CPU O3CPU; typedef typename Impl::DynInstPtr DynInstPtr; @@ -84,6 +86,130 @@ class LSQUnit { typedef typename Impl::CPUPol::LSQ LSQ; typedef typename Impl::CPUPol::IssueStruct IssueStruct; + using LSQSenderState = typename LSQ::LSQSenderState; + using LSQRequest = typename Impl::CPUPol::LSQ::LSQRequest; + private: + class LSQEntry + { + private: + /** The instruction. */ + DynInstPtr inst; + /** The request. */ + LSQRequest* req; + /** The size of the operation. */ + uint8_t _size; + /** Valid entry. */ + bool _valid; + public: + /** Constructs an empty store queue entry. */ + LSQEntry() + : inst(nullptr), req(nullptr), _size(0), _valid(false) + { + } + + ~LSQEntry() + { + inst = nullptr; + if (req != nullptr) { + req->freeLSQEntry(); + req = nullptr; + } + } + + void + clear() + { + inst = nullptr; + if (req != nullptr) { + req->freeLSQEntry(); + } + req = nullptr; + _valid = false; + _size = 0; + } + + void + set(const DynInstPtr& inst) + { + assert(!_valid); + this->inst = inst; + _valid = true; + _size = 0; + } + LSQRequest* request() { return req; } + void setRequest(LSQRequest* r) { req = r; } + bool hasRequest() { return req != nullptr; } + /** Member accessors. */ + /** @{ */ + bool valid() const { return _valid; } + uint8_t& size() { return _size; } + const uint8_t& size() const { return _size; } + const DynInstPtr& instruction() const { return inst; } + /** @} */ + }; + + class SQEntry : public LSQEntry + { + private: + /** The store data. */ + char _data[64]; // TODO: 64 should become a parameter + /** Whether or not the store can writeback. */ + bool _canWB; + /** Whether or not the store is committed. */ + bool _committed; + /** Whether or not the store is completed. */ + bool _completed; + /** Does this request write all zeros and thus doesn't + * have any data attached to it. Used for cache block zero + * style instructs (ARM DC ZVA; ALPHA WH64) + */ + bool _isAllZeros; + public: + static constexpr size_t DataSize = sizeof(_data); + /** Constructs an empty store queue entry. */ + SQEntry() + : _canWB(false), _committed(false), _completed(false), + _isAllZeros(false) + { + std::memset(_data, 0, DataSize); + } + + ~SQEntry() + { + } + + void + set(const DynInstPtr& inst) + { + LSQEntry::set(inst); + } + + void + clear() + { + LSQEntry::clear(); + _canWB = _completed = _committed = _isAllZeros = false; + } + /** Member accessors. */ + /** @{ */ + bool& canWB() { return _canWB; } + const bool& canWB() const { return _canWB; } + bool& completed() { return _completed; } + const bool& completed() const { return _completed; } + bool& committed() { return _committed; } + const bool& committed() const { return _committed; } + bool& isAllZeros() { return _isAllZeros; } + const bool& isAllZeros() const { return _isAllZeros; } + char* data() { return _data; } + const char* data() const { return _data; } + /** @} */ + }; + using LQEntry = LSQEntry; + + public: + using LoadQueue = CircularQueue<LQEntry>; + using StoreQueue = CircularQueue<SQEntry>; + public: /** Constructs an LSQ unit. init() must be called prior to use. */ LSQUnit(uint32_t lqEntries, uint32_t sqEntries); @@ -113,13 +239,6 @@ class LSQUnit { /** Takes over from another CPU's thread. */ void takeOverFrom(); - /** Ticks the LSQ unit, which in this case only resets the number of - * used cache ports. - * @todo: Move the number of used ports up to the LSQ level so it can - * be shared by all LSQ units. 
- */ - void tick() { usedStorePorts = 0; } - /** Inserts an instruction. */ void insert(const DynInstPtr &inst); /** Inserts a load instruction. */ @@ -133,7 +252,8 @@ class LSQUnit { * @param load_idx index to start checking at * @param inst the instruction to check */ - Fault checkViolations(int load_idx, const DynInstPtr &inst); + Fault checkViolations(typename LoadQueue::iterator& loadIt, + const DynInstPtr& inst); /** Check if an incoming invalidate hits in the lsq on a load * that might have issued out of order wrt another load beacuse @@ -163,18 +283,6 @@ class LSQUnit { * memory system. */ void completeDataAccess(PacketPtr pkt); - /** Clears all the entries in the LQ. */ - void clearLQ(); - - /** Clears all the entries in the SQ. */ - void clearSQ(); - - /** Resizes the LQ to a given size. */ - void resizeLQ(unsigned size); - - /** Resizes the SQ to a given size. */ - void resizeSQ(unsigned size); - /** Squashes all instructions younger than a specific sequence number. */ void squash(const InstSeqNum &squashed_num); @@ -205,10 +313,10 @@ class LSQUnit { bool isEmpty() const { return lqEmpty() && sqEmpty(); } /** Returns if the LQ is full. */ - bool lqFull() { return loads >= (LQEntries - 1); } + bool lqFull() { return loadQueue.full(); } /** Returns if the SQ is full. */ - bool sqFull() { return stores >= (SQEntries - 1); } + bool sqFull() { return storeQueue.full(); } /** Returns if the LQ is empty. */ bool lqEmpty() const { return loads == 0; } @@ -226,13 +334,20 @@ class LSQUnit { int numStoresToWB() { return storesToWB; } /** Returns if the LSQ unit will writeback on this cycle. */ - bool willWB() { return storeQueue[storeWBIdx].canWB && - !storeQueue[storeWBIdx].completed && - !isStoreBlocked; } + bool + willWB() + { + return storeWBIt.dereferenceable() && + storeWBIt->valid() && + storeWBIt->canWB() && + !storeWBIt->completed() && + !isStoreBlocked; + } /** Handles doing the retry. */ void recvRetry(); + unsigned int cacheLineSize(); private: /** Reset the LSQ state */ void resetState(); @@ -240,31 +355,31 @@ class LSQUnit { /** Writes back the instruction, sending it to IEW. */ void writeback(const DynInstPtr &inst, PacketPtr pkt); - /** Writes back a store that couldn't be completed the previous cycle. */ - void writebackPendingStore(); - - /** Handles completing the send of a store to memory. */ - void storePostSend(PacketPtr pkt); + /** Try to finish a previously blocked write back attempt */ + void writebackBlockedStore(); /** Completes the store at the specified index. */ - void completeStore(int store_idx); - - /** Attempts to send a store to the cache. */ - bool sendStore(PacketPtr data_pkt); + void completeStore(typename StoreQueue::iterator store_idx); - /** Increments the given store index (circular queue). */ - inline void incrStIdx(int &store_idx) const; - /** Decrements the given store index (circular queue). */ - inline void decrStIdx(int &store_idx) const; - /** Increments the given load index (circular queue). */ - inline void incrLdIdx(int &load_idx) const; - /** Decrements the given load index (circular queue). */ - inline void decrLdIdx(int &load_idx) const; + /** Handles completing the send of a store to memory. */ + void storePostSend(); public: + /** Attempts to send a packet to the cache. + * Check if there are ports available. Return true if + * there are, false if there are not. + */ + bool trySendPacket(bool isLoad, PacketPtr data_pkt); + + /** Debugging function to dump instructions in the LSQ. 
*/ void dumpInsts() const; + /** Schedule event for the cpu. */ + void schedule(Event& ev, Tick when) { cpu->schedule(ev, when); } + + BaseTLB* dTLB() { return cpu->dtb; } + private: /** Pointer to the CPU. */ O3CPU *cpu; @@ -278,44 +393,46 @@ class LSQUnit { /** Pointer to the dcache port. Used only for sending. */ MasterPort *dcachePort; - /** Derived class to hold any sender state the LSQ needs. */ - class LSQSenderState : public Packet::SenderState + /** Particularisation of the LSQSenderState to the LQ. */ + class LQSenderState : public LSQSenderState { + using LSQSenderState::alive; public: - /** Default constructor. */ - LSQSenderState() - : mainPkt(NULL), pendingPacket(NULL), idx(0), outstanding(1), - isLoad(false), noWB(false), isSplit(false), - pktToSend(false), cacheBlocked(false) - { } - - /** Instruction who initiated the access to memory. */ - DynInstPtr inst; - /** The main packet from a split load, used during writeback. */ - PacketPtr mainPkt; - /** A second packet from a split store that needs sending. */ - PacketPtr pendingPacket; - /** The LQ/SQ index of the instruction. */ - uint8_t idx; - /** Number of outstanding packets to complete. */ - uint8_t outstanding; - /** Whether or not it is a load. */ - bool isLoad; - /** Whether or not the instruction will need to writeback. */ - bool noWB; - /** Whether or not this access is split in two. */ - bool isSplit; - /** Whether or not there is a packet that needs sending. */ - bool pktToSend; - /** Whether or not the second packet of this split load was blocked */ - bool cacheBlocked; - - /** Completes a packet and returns whether the access is finished. */ - inline bool complete() { return --outstanding == 0; } + LQSenderState(typename LoadQueue::iterator idx_) + : LSQSenderState(idx_->request(), true), idx(idx_) { } + + /** The LQ index of the instruction. */ + typename LoadQueue::iterator idx; + //virtual LSQRequest* request() { return idx->request(); } + virtual void + complete() + { + //if (alive()) + // idx->request()->senderState(nullptr); + } + }; + + /** Particularisation of the LSQSenderState to the SQ. */ + class SQSenderState : public LSQSenderState + { + using LSQSenderState::alive; + public: + SQSenderState(typename StoreQueue::iterator idx_) + : LSQSenderState(idx_->request(), false), idx(idx_) { } + /** The SQ index of the instruction. */ + typename StoreQueue::iterator idx; + //virtual LSQRequest* request() { return idx->request(); } + virtual void + complete() + { + //if (alive()) + // idx->request()->senderState(nullptr); + } }; /** Writeback event, specifically for when stores forward data to loads. */ - class WritebackEvent : public Event { + class WritebackEvent : public Event + { public: /** Constructs a writeback event. */ WritebackEvent(const DynInstPtr &_inst, PacketPtr pkt, @@ -339,72 +456,25 @@ class LSQUnit { }; public: - struct SQEntry { - /** Constructs an empty store queue entry. */ - SQEntry() - : inst(NULL), req(NULL), size(0), - canWB(0), committed(0), completed(0) - { - std::memset(data, 0, sizeof(data)); - } - - ~SQEntry() - { - inst = NULL; - } - - /** Constructs a store queue entry for a given instruction. */ - SQEntry(const DynInstPtr &_inst) - : inst(_inst), req(NULL), sreqLow(NULL), sreqHigh(NULL), size(0), - isSplit(0), canWB(0), committed(0), completed(0), isAllZeros(0) - { - std::memset(data, 0, sizeof(data)); - } - /** The store data. */ - char data[16]; - /** The store instruction. */ - DynInstPtr inst; - /** The request for the store. 
*/ - RequestPtr req; - /** The split requests for the store. */ - RequestPtr sreqLow; - RequestPtr sreqHigh; - /** The size of the store. */ - uint8_t size; - /** Whether or not the store is split into two requests. */ - bool isSplit; - /** Whether or not the store can writeback. */ - bool canWB; - /** Whether or not the store is committed. */ - bool committed; - /** Whether or not the store is completed. */ - bool completed; - /** Does this request write all zeros and thus doesn't - * have any data attached to it. Used for cache block zero - * style instructs (ARM DC ZVA; ALPHA WH64) - */ - bool isAllZeros; - }; + /** + * Handles writing back and completing the load or store that has + * returned from memory. + * + * @param pkt Response packet from the memory sub-system + */ + bool recvTimingResp(PacketPtr pkt); private: /** The LSQUnit thread id. */ ThreadID lsqID; - + public: /** The store queue. */ - std::vector<SQEntry> storeQueue; + CircularQueue<SQEntry> storeQueue; /** The load queue. */ - std::vector<DynInstPtr> loadQueue; - - /** The number of LQ entries, plus a sentinel entry (circular queue). - * @todo: Consider having var that records the true number of LQ entries. - */ - unsigned LQEntries; - /** The number of SQ entries, plus a sentinel entry (circular queue). - * @todo: Consider having var that records the true number of SQ entries. - */ - unsigned SQEntries; + LoadQueue loadQueue; + private: /** The number of places to shift addresses in the LSQ before checking * for dependency violations */ @@ -420,28 +490,10 @@ class LSQUnit { /** The number of store instructions in the SQ waiting to writeback. */ int storesToWB; - /** The index of the head instruction in the LQ. */ - int loadHead; - /** The index of the tail instruction in the LQ. */ - int loadTail; - - /** The index of the head instruction in the SQ. */ - int storeHead; /** The index of the first instruction that may be ready to be * written back, and has not yet been written back. */ - int storeWBIdx; - /** The index of the tail instruction in the SQ. */ - int storeTail; - - /// @todo Consider moving to a more advanced model with write vs read ports - /** The number of cache ports available each cycle (stores only). */ - int cacheStorePorts; - - /** The number of used cache ports in this cycle by stores. */ - int usedStorePorts; - - //list<InstSeqNum> mshrSeqNums; + typename StoreQueue::iterator storeWBIt; /** Address Mask for a cache block (e.g. ~(cache_block_size-1)) */ Addr cacheBlockMask; @@ -472,10 +524,10 @@ class LSQUnit { /** Whether or not there is a packet that couldn't be sent because of * a lack of cache ports. */ - bool hasPendingPkt; + bool hasPendingRequest; /** The packet that is pending free cache ports. */ - PacketPtr pendingPkt; + LSQRequest* pendingRequest; /** Flag for memory model. */ bool needsTSO; @@ -516,53 +568,51 @@ class LSQUnit { public: /** Executes the load at the given index. */ - Fault read(const RequestPtr &req, - RequestPtr &sreqLow, RequestPtr &sreqHigh, - int load_idx); + Fault read(LSQRequest *req, int load_idx); /** Executes the store at the given index. */ - Fault write(const RequestPtr &req, - const RequestPtr &sreqLow, const RequestPtr &sreqHigh, - uint8_t *data, int store_idx); + Fault write(LSQRequest *req, uint8_t *data, int store_idx); /** Returns the index of the head load instruction. */ - int getLoadHead() { return loadHead; } + int getLoadHead() { return loadQueue.head(); } + /** Returns the sequence number of the head load instruction. 
*/ - InstSeqNum getLoadHeadSeqNum() + InstSeqNum + getLoadHeadSeqNum() { - if (loadQueue[loadHead]) { - return loadQueue[loadHead]->seqNum; - } else { - return 0; - } - + return loadQueue.front().valid() + ? loadQueue.front().instruction()->seqNum + : 0; } /** Returns the index of the head store instruction. */ - int getStoreHead() { return storeHead; } + int getStoreHead() { return storeQueue.head(); } /** Returns the sequence number of the head store instruction. */ - InstSeqNum getStoreHeadSeqNum() + InstSeqNum + getStoreHeadSeqNum() { - if (storeQueue[storeHead].inst) { - return storeQueue[storeHead].inst->seqNum; - } else { - return 0; - } - + return storeQueue.front().valid() + ? storeQueue.front().instruction()->seqNum + : 0; } /** Returns whether or not the LSQ unit is stalled. */ bool isStalled() { return stalled; } + public: + typedef typename CircularQueue<LQEntry>::iterator LQIterator; + typedef typename CircularQueue<SQEntry>::iterator SQIterator; + typedef CircularQueue<LQEntry> LQueue; + typedef CircularQueue<SQEntry> SQueue; }; template <class Impl> Fault -LSQUnit<Impl>::read(const RequestPtr &req, - RequestPtr &sreqLow, RequestPtr &sreqHigh, - int load_idx) +LSQUnit<Impl>::read(LSQRequest *req, int load_idx) { - DynInstPtr load_inst = loadQueue[load_idx]; + LQEntry& load_req = loadQueue[load_idx]; + const DynInstPtr& load_inst = load_req.instruction(); + load_req.setRequest(req); assert(load_inst); assert(!load_inst->isExecuted()); @@ -571,184 +621,188 @@ LSQUnit<Impl>::read(const RequestPtr &req, // A bit of a hackish way to get strictly ordered accesses to work // only if they're at the head of the LSQ and are ready to commit // (at the head of the ROB too). - if (req->isStrictlyOrdered() && - (load_idx != loadHead || !load_inst->isAtCommit())) { + + if (req->mainRequest()->isStrictlyOrdered() && + (load_idx != loadQueue.head() || !load_inst->isAtCommit())) { + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually iewStage->rescheduleMemInst(load_inst); + load_inst->clearIssued(); + load_inst->effAddrValid(false); ++lsqRescheduledLoads; DPRINTF(LSQUnit, "Strictly ordered load [sn:%lli] PC %s\n", load_inst->seqNum, load_inst->pcState()); + // Must delete request now that it wasn't handed off to + // memory. This is quite ugly. @todo: Figure out the proper + // place to really handle request deletes. + load_req.setRequest(nullptr); + req->discard(); return std::make_shared<GenericISA::M5PanicFault>( "Strictly ordered load [sn:%llx] PC %s\n", load_inst->seqNum, load_inst->pcState()); } - // Check the SQ for any previous stores that might lead to forwarding - int store_idx = load_inst->sqIdx; - - int store_size = 0; - DPRINTF(LSQUnit, "Read called, load idx: %i, store idx: %i, " "storeHead: %i addr: %#x%s\n", - load_idx, store_idx, storeHead, req->getPaddr(), - sreqLow ? " split" : ""); + load_idx - 1, load_inst->sqIt._idx, storeQueue.head() - 1, + req->mainRequest()->getPaddr(), req->isSplit() ? " split" : ""); - if (req->isLLSC()) { - assert(!sreqLow); + if (req->mainRequest()->isLLSC()) { // Disable recording the result temporarily. Writing to misc // regs normally updates the result, but this is not the // desired behavior when handling store conditionals. 
load_inst->recordResult(false); - TheISA::handleLockedRead(load_inst.get(), req); + TheISA::handleLockedRead(load_inst.get(), req->mainRequest()); load_inst->recordResult(true); } - if (req->isMmappedIpr()) { + if (req->mainRequest()->isMmappedIpr()) { assert(!load_inst->memData); load_inst->memData = new uint8_t[64]; ThreadContext *thread = cpu->tcBase(lsqID); - Cycles delay(0); - PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq); - - data_pkt->dataStatic(load_inst->memData); - if (!TheISA::HasUnalignedMemAcc || !sreqLow) { - delay = TheISA::handleIprRead(thread, data_pkt); - } else { - assert(sreqLow->isMmappedIpr() && sreqHigh->isMmappedIpr()); - PacketPtr fst_data_pkt = new Packet(sreqLow, MemCmd::ReadReq); - PacketPtr snd_data_pkt = new Packet(sreqHigh, MemCmd::ReadReq); - - fst_data_pkt->dataStatic(load_inst->memData); - snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize()); - - delay = TheISA::handleIprRead(thread, fst_data_pkt); - Cycles delay2 = TheISA::handleIprRead(thread, snd_data_pkt); - if (delay2 > delay) - delay = delay2; - - delete fst_data_pkt; - delete snd_data_pkt; - } - WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); + PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::ReadReq); + + Cycles delay = req->handleIprRead(thread, main_pkt); + + WritebackEvent *wb = new WritebackEvent(load_inst, main_pkt, this); cpu->schedule(wb, cpu->clockEdge(delay)); return NoFault; } - while (store_idx != -1) { - // End once we've reached the top of the LSQ - if (store_idx == storeWBIdx) { - break; - } - + // Check the SQ for any previous stores that might lead to forwarding + auto store_it = load_inst->sqIt; + assert (store_it >= storeWBIt); + // End once we've reached the top of the LSQ + while (store_it != storeWBIt) { // Move the index to one younger - if (--store_idx < 0) - store_idx += SQEntries; - - assert(storeQueue[store_idx].inst); - - store_size = storeQueue[store_idx].size; - - if (!store_size || storeQueue[store_idx].inst->strictlyOrdered() || - (storeQueue[store_idx].req && - storeQueue[store_idx].req->isCacheMaintenance())) { - // Cache maintenance instructions go down via the store - // path but they carry no data and they shouldn't be - // considered for forwarding - continue; - } - - assert(storeQueue[store_idx].inst->effAddrValid()); - - // Check if the store data is within the lower and upper bounds of - // addresses that the request needs. - bool store_has_lower_limit = - req->getVaddr() >= storeQueue[store_idx].inst->effAddr; - bool store_has_upper_limit = - (req->getVaddr() + req->getSize()) <= - (storeQueue[store_idx].inst->effAddr + store_size); - bool lower_load_has_store_part = - req->getVaddr() < (storeQueue[store_idx].inst->effAddr + - store_size); - bool upper_load_has_store_part = - (req->getVaddr() + req->getSize()) > - storeQueue[store_idx].inst->effAddr; - - // If the store's data has all of the data needed and the load isn't - // LLSC, we can forward. - if (store_has_lower_limit && store_has_upper_limit && !req->isLLSC()) { - // Get shift amount for offset into the store's data. - int shift_amt = req->getVaddr() - storeQueue[store_idx].inst->effAddr; - - // Allocate memory if this is the first time a load is issued. 
-            if (!load_inst->memData) {
-                load_inst->memData = new uint8_t[req->getSize()];
-            }
-            if (storeQueue[store_idx].isAllZeros)
-                memset(load_inst->memData, 0, req->getSize());
-            else
-                memcpy(load_inst->memData,
-                    storeQueue[store_idx].data + shift_amt, req->getSize());
-
-            DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
-                    "addr %#x\n", store_idx, req->getVaddr());
-
-            PacketPtr data_pkt = new Packet(req, MemCmd::ReadReq);
-            data_pkt->dataStatic(load_inst->memData);
-
-            WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this);
-
-            // We'll say this has a 1 cycle load-store forwarding latency
-            // for now.
-            // @todo: Need to make this a parameter.
-            cpu->schedule(wb, curTick());
-
-            ++lsqForwLoads;
-            return NoFault;
-        } else if (
-            (!req->isLLSC() &&
+        store_it--;
+        assert(store_it->valid());
+        assert(store_it->instruction()->seqNum < load_inst->seqNum);
+        int store_size = store_it->size();
+
+        // Cache maintenance instructions go down via the store
+        // path but they carry no data and they shouldn't be
+        // considered for forwarding
+        if (store_size != 0 && !store_it->instruction()->strictlyOrdered() &&
+            !(store_it->request()->mainRequest() &&
+              store_it->request()->mainRequest()->isCacheMaintenance())) {
+            assert(store_it->instruction()->effAddrValid());
+
+            // Check if the store data is within the lower and upper bounds of
+            // addresses that the request needs.
+            auto req_s = req->mainRequest()->getVaddr();
+            auto req_e = req_s + req->mainRequest()->getSize();
+            auto st_s = store_it->instruction()->effAddr;
+            auto st_e = st_s + store_size;
+
+            bool store_has_lower_limit = req_s >= st_s;
+            bool store_has_upper_limit = req_e <= st_e;
+            bool lower_load_has_store_part = req_s < st_e;
+            bool upper_load_has_store_part = req_e > st_s;
+
+            // If the store's data has all of the data needed and the load
+            // isn't LLSC, then we can forward.
+            if (store_has_lower_limit && store_has_upper_limit &&
+                !req->mainRequest()->isLLSC()) {
+
+                // Get shift amount for offset into the store's data.
+                int shift_amt = req->mainRequest()->getVaddr() -
+                    store_it->instruction()->effAddr;
+
+                // Allocate memory if this is the first time a load is issued.
+                if (!load_inst->memData) {
+                    load_inst->memData =
+                        new uint8_t[req->mainRequest()->getSize()];
+                }
+                if (store_it->isAllZeros())
+                    memset(load_inst->memData, 0,
+                           req->mainRequest()->getSize());
+                else
+                    memcpy(load_inst->memData,
+                        store_it->data() + shift_amt,
+                        req->mainRequest()->getSize());
+
+                DPRINTF(LSQUnit, "Forwarding from store idx %i to load to "
+                        "addr %#x\n", store_it._idx,
+                        req->mainRequest()->getVaddr());
+
+                PacketPtr data_pkt = new Packet(req->mainRequest(),
+                        MemCmd::ReadReq);
+                data_pkt->dataStatic(load_inst->memData);
+
+                if (req->isAnyOutstandingRequest()) {
+                    assert(req->_numOutstandingPackets > 0);
+                    // There are memory request packets in flight already.
+                    // This may happen if the store was not complete the
+                    // first time this load got executed. Signal the sender
+                    // state that response packets should be discarded.
+                    req->discardSenderState();
+                }
+
+                WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt,
+                        this);
+
+                // We'll say this has a 1 cycle load-store forwarding latency
+                // for now.
+                // @todo: Need to make this a parameter.
+                cpu->schedule(wb, curTick());
+
+                // Don't need to do anything special for split loads.
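For the non-LLSC case, the four predicates in the new code reduce to a standard interval-containment test. As a self-contained illustration (plain C++; Addr and the result enum are local stand-ins, not gem5 types):

#include <cassert>
#include <cstdint>

using Addr = std::uint64_t;

enum class Coverage { None, Full, Partial };

// Mirrors the store-to-load forwarding test above: [req_s, req_e) is the
// load's address range, [st_s, st_e) the older store's range.
Coverage
classify(Addr req_s, Addr req_e, Addr st_s, Addr st_e)
{
    assert(req_s < req_e && st_s < st_e);
    bool store_has_lower_limit = req_s >= st_s;  // store covers load's start
    bool store_has_upper_limit = req_e <= st_e;  // store covers load's end

    if (store_has_lower_limit && store_has_upper_limit)
        return Coverage::Full;     // forward the store's data to the load
    if (req_s < st_e && req_e > st_s)
        return Coverage::Partial;  // stall the load and replay it later
    return Coverage::None;         // independent; keep scanning the SQ
}

A store to [0x100, 0x108) fully covers a load of [0x104, 0x108) and can forward, while a load of [0x104, 0x10c) only partially overlaps it and must stall; LLSC loads are excluded from forwarding even in the full-coverage case.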
+ ++lsqForwLoads; + + return NoFault; + } else if ( + (!req->mainRequest()->isLLSC() && ((store_has_lower_limit && lower_load_has_store_part) || (store_has_upper_limit && upper_load_has_store_part) || (lower_load_has_store_part && upper_load_has_store_part))) || - (req->isLLSC() && + (req->mainRequest()->isLLSC() && ((store_has_lower_limit || upper_load_has_store_part) && (store_has_upper_limit || lower_load_has_store_part)))) { - // This is the partial store-load forwarding case where a store - // has only part of the load's data and the load isn't LLSC or - // the load is LLSC and the store has all or part of the load's - // data - - // If it's already been written back, then don't worry about - // stalling on it. - if (storeQueue[store_idx].completed) { - panic("Should not check one of these"); - continue; + // This is the partial store-load forwarding case where a store + // has only part of the load's data and the load isn't LLSC or + // the load is LLSC and the store has all or part of the load's + // data + + // If it's already been written back, then don't worry about + // stalling on it. + if (store_it->completed()) { + panic("Should not check one of these"); + continue; + } + + // Must stall load and force it to retry, so long as it's the + // oldest load that needs to do so. + if (!stalled || + (stalled && + load_inst->seqNum < + loadQueue[stallingLoadIdx].instruction()->seqNum)) { + stalled = true; + stallingStoreIsn = store_it->instruction()->seqNum; + stallingLoadIdx = load_idx; + } + + // Tell IQ/mem dep unit that this instruction will need to be + // rescheduled eventually + iewStage->rescheduleMemInst(load_inst); + load_inst->clearIssued(); + load_inst->effAddrValid(false); + ++lsqRescheduledLoads; + + // Do not generate a writeback event as this instruction is not + // complete. + DPRINTF(LSQUnit, "Load-store forwarding mis-match. " + "Store idx %i to load addr %#x\n", + store_it._idx, req->mainRequest()->getVaddr()); + + // Must discard the request. + req->discard(); + load_req.setRequest(nullptr); + return NoFault; } - - // Must stall load and force it to retry, so long as it's the oldest - // load that needs to do so. - if (!stalled || - (stalled && - load_inst->seqNum < - loadQueue[stallingLoadIdx]->seqNum)) { - stalled = true; - stallingStoreIsn = storeQueue[store_idx].inst->seqNum; - stallingLoadIdx = load_idx; - } - - // Tell IQ/mem dep unit that this instruction will need to be - // rescheduled eventually - iewStage->rescheduleMemInst(load_inst); - load_inst->clearIssued(); - ++lsqRescheduledLoads; - - // Do not generate a writeback event as this instruction is not - // complete. - DPRINTF(LSQUnit, "Load-store forwarding mis-match. " - "Store idx %i to load addr %#x\n", - store_idx, req->getVaddr()); - - return NoFault; } } @@ -758,40 +812,7 @@ LSQUnit<Impl>::read(const RequestPtr &req, // Allocate memory if this is the first time a load is issued. if (!load_inst->memData) { - load_inst->memData = new uint8_t[req->getSize()]; - } - - // if we the cache is not blocked, do cache access - bool completedFirst = false; - PacketPtr data_pkt = Packet::createRead(req); - PacketPtr fst_data_pkt = NULL; - PacketPtr snd_data_pkt = NULL; - - data_pkt->dataStatic(load_inst->memData); - - LSQSenderState *state = new LSQSenderState; - state->isLoad = true; - state->idx = load_idx; - state->inst = load_inst; - data_pkt->senderState = state; - - if (!TheISA::HasUnalignedMemAcc || !sreqLow) { - // Point the first packet at the main data packet. 
- fst_data_pkt = data_pkt; - } else { - // Create the split packets. - fst_data_pkt = Packet::createRead(sreqLow); - snd_data_pkt = Packet::createRead(sreqHigh); - - fst_data_pkt->dataStatic(load_inst->memData); - snd_data_pkt->dataStatic(load_inst->memData + sreqLow->getSize()); - - fst_data_pkt->senderState = state; - snd_data_pkt->senderState = state; - - state->isSplit = true; - state->outstanding = 2; - state->mainPkt = data_pkt; + load_inst->memData = new uint8_t[req->mainRequest()->getSize()]; } // For now, load throughput is constrained by the number of @@ -799,97 +820,46 @@ LSQUnit<Impl>::read(const RequestPtr &req, // stores do). // @todo We should account for cache port contention // and arbitrate between loads and stores. - bool successful_load = true; - if (!dcachePort->sendTimingReq(fst_data_pkt)) { - successful_load = false; - } else if (TheISA::HasUnalignedMemAcc && sreqLow) { - completedFirst = true; - - // The first packet was sent without problems, so send this one - // too. If there is a problem with this packet then the whole - // load will be squashed, so indicate this to the state object. - // The first packet will return in completeDataAccess and be - // handled there. - // @todo We should also account for cache port contention - // here. - if (!dcachePort->sendTimingReq(snd_data_pkt)) { - // The main packet will be deleted in completeDataAccess. - state->complete(); - // Signify to 1st half that the 2nd half was blocked via state - state->cacheBlocked = true; - successful_load = false; - } - } - - // If the cache was blocked, or has become blocked due to the access, - // handle it. - if (!successful_load) { - if (!sreqLow) { - // Packet wasn't split, just delete main packet info - delete state; - delete data_pkt; - } - - if (TheISA::HasUnalignedMemAcc && sreqLow) { - if (!completedFirst) { - // Split packet, but first failed. Delete all state. - delete state; - delete data_pkt; - delete fst_data_pkt; - delete snd_data_pkt; - sreqLow.reset(); - sreqHigh.reset(); - } else { - // Can't delete main packet data or state because first packet - // was sent to the memory system - delete data_pkt; - delete snd_data_pkt; - sreqHigh.reset(); - } - } - - ++lsqCacheBlocked; - - iewStage->blockMemInst(load_inst); - // No fault occurred, even though the interface is blocked. 
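This removed error path is exactly the split-packet bookkeeping the rewrite eliminates: the caller had to remember which half was sent and delete the rest by hand. Under the new scheme the request owns its packets and remembers how many the cache accepted. A minimal sketch of that ownership model (all types are simplified stand-ins, not gem5's LSQRequest):

#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

struct Pkt { std::uint64_t addr; unsigned size; };

class SplitRequest
{
    std::vector<std::unique_ptr<Pkt>> pkts;
    std::size_t sent = 0;

  public:
    // Build one packet, or two if the access crosses a line boundary.
    SplitRequest(std::uint64_t addr, unsigned size, unsigned line)
    {
        std::uint64_t split = (addr / line + 1) * line;
        if (addr + size <= split) {
            pkts.emplace_back(new Pkt{addr, size});
        } else {
            pkts.emplace_back(new Pkt{addr, unsigned(split - addr)});
            pkts.emplace_back(new Pkt{split, unsigned(addr + size - split)});
        }
    }

    // port(p) returns false when the cache is blocked. On a retry we resume
    // from the first unsent packet; nothing has to be deleted or rebuilt.
    void
    send(const std::function<bool (Pkt *)> &port)
    {
        while (sent < pkts.size() && port(pkts[sent].get()))
            ++sent;
    }

    bool isSent() const { return sent == pkts.size(); }
};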
-        return NoFault;
+    // If the cache is not blocked, do cache access
+    if (req->senderState() == nullptr) {
+        LQSenderState *state = new LQSenderState(
+                loadQueue.getIterator(load_idx));
+        state->isLoad = true;
+        state->inst = load_inst;
+        state->isSplit = req->isSplit();
+        req->senderState(state);
     }
+    req->buildPackets();
+    req->sendPacketToCache();
+    if (!req->isSent())
+        iewStage->blockMemInst(load_inst);

     return NoFault;
 }

 template <class Impl>
 Fault
-LSQUnit<Impl>::write(const RequestPtr &req,
-                     const RequestPtr &sreqLow, const RequestPtr &sreqHigh,
-                     uint8_t *data, int store_idx)
+LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx)
 {
-    assert(storeQueue[store_idx].inst);
-
-    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x"
-            " | storeHead:%i [sn:%i]\n",
-            store_idx, req->getPaddr(), storeHead,
-            storeQueue[store_idx].inst->seqNum);
-
-    storeQueue[store_idx].req = req;
-    storeQueue[store_idx].sreqLow = sreqLow;
-    storeQueue[store_idx].sreqHigh = sreqHigh;
-    unsigned size = req->getSize();
-    storeQueue[store_idx].size = size;
-    bool store_no_data = req->getFlags() & Request::STORE_NO_DATA;
-    storeQueue[store_idx].isAllZeros = store_no_data;
-    assert(size <= sizeof(storeQueue[store_idx].data) || store_no_data);
-
-    // Split stores can only occur in ISAs with unaligned memory accesses. If
-    // a store request has been split, sreqLow and sreqHigh will be non-null.
-    if (TheISA::HasUnalignedMemAcc && sreqLow) {
-        storeQueue[store_idx].isSplit = true;
-    }
-
-    if (!(req->getFlags() & Request::CACHE_BLOCK_ZERO) && \
-        !req->isCacheMaintenance())
-        memcpy(storeQueue[store_idx].data, data, size);
+    assert(storeQueue[store_idx].valid());
+
+    DPRINTF(LSQUnit, "Doing write to store idx %i, addr %#x | storeHead:%i "
+            "[sn:%i]\n",
+            store_idx - 1, req->request()->getPaddr(), storeQueue.head() - 1,
+            storeQueue[store_idx].instruction()->seqNum);
+
+    storeQueue[store_idx].setRequest(req);
+    unsigned size = req->_size;
+    storeQueue[store_idx].size() = size;
+    bool store_no_data =
+        req->mainRequest()->getFlags() & Request::STORE_NO_DATA;
+    storeQueue[store_idx].isAllZeros() = store_no_data;
+    assert(size <= SQEntry::DataSize || store_no_data);
+
+    if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) &&
+        !req->request()->isCacheMaintenance())
+        memcpy(storeQueue[store_idx].data(), data, size);

     // This function only writes the data to the store queue, so no fault
     // can happen here.
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 13b148768..9756a9ef1 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2010-2014, 2017 ARM Limited
+ * Copyright (c) 2010-2014, 2017-2018 ARM Limited
 * Copyright (c) 2013 Advanced Micro Devices, Inc.
* All rights reserved * @@ -66,6 +66,8 @@ LSQUnit<Impl>::WritebackEvent::WritebackEvent(const DynInstPtr &_inst, : Event(Default_Pri, AutoDelete), inst(_inst), pkt(_pkt), lsqPtr(lsq_ptr) { + assert(_inst->savedReq); + _inst->savedReq->writebackScheduled(); } template<class Impl> @@ -76,9 +78,8 @@ LSQUnit<Impl>::WritebackEvent::process() lsqPtr->writeback(inst, pkt); - if (pkt->senderState) - delete pkt->senderState; - + assert(inst->savedReq); + inst->savedReq->writebackDone(); delete pkt; } @@ -89,65 +90,61 @@ LSQUnit<Impl>::WritebackEvent::description() const return "Store writeback"; } +template <class Impl> +bool +LSQUnit<Impl>::recvTimingResp(PacketPtr pkt) +{ + auto senderState = dynamic_cast<LSQSenderState*>(pkt->senderState); + LSQRequest* req = senderState->request(); + assert(req != nullptr); + bool ret = true; + /* Check that the request is still alive before any further action. */ + if (senderState->alive()) { + ret = req->recvTimingResp(pkt); + } else { + senderState->outstanding--; + } + return ret; + +} + template<class Impl> void LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) { LSQSenderState *state = dynamic_cast<LSQSenderState *>(pkt->senderState); DynInstPtr inst = state->inst; - DPRINTF(IEW, "Writeback event [sn:%lli].\n", inst->seqNum); - DPRINTF(Activity, "Activity: Writeback event [sn:%lli].\n", inst->seqNum); - - if (state->cacheBlocked) { - // This is the first half of a previous split load, - // where the 2nd half blocked, ignore this response - DPRINTF(IEW, "[sn:%lli]: Response from first half of earlier " - "blocked split load recieved. Ignoring.\n", inst->seqNum); - delete state; - return; - } - // If this is a split access, wait until all packets are received. - if (TheISA::HasUnalignedMemAcc && !state->complete()) { - return; - } + cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt)); + + /* Notify the sender state that the access is complete (for ownership + * tracking). 
*/ + state->complete(); assert(!cpu->switchedOut()); if (!inst->isSquashed()) { - if (!state->noWB) { + if (state->needWB) { // Only loads and store conditionals perform the writeback // after receving the response from the memory assert(inst->isLoad() || inst->isStoreConditional()); - if (!TheISA::HasUnalignedMemAcc || !state->isSplit || - !state->isLoad) { - writeback(inst, pkt); - } else { - writeback(inst, state->mainPkt); + writeback(inst, state->request()->mainPacket()); + if (inst->isStore()) { + auto ss = dynamic_cast<SQSenderState*>(state); + ss->writebackDone(); + completeStore(ss->idx); } - } - - if (inst->isStore()) { - completeStore(state->idx); + } else if (inst->isStore()) { + completeStore(dynamic_cast<SQSenderState*>(state)->idx); } } - - if (TheISA::HasUnalignedMemAcc && state->isSplit && state->isLoad) { - delete state->mainPkt; - } - - pkt->req->setAccessLatency(); - cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt)); - - delete state; } template <class Impl> LSQUnit<Impl>::LSQUnit(uint32_t lqEntries, uint32_t sqEntries) : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1), - LQEntries(lqEntries+1), SQEntries(sqEntries+1), loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), - isStoreBlocked(false), storeInFlight(false), hasPendingPkt(false), - pendingPkt(nullptr) + isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false), + pendingRequest(nullptr) { } @@ -167,7 +164,6 @@ LSQUnit<Impl>::init(O3CPU *cpu_ptr, IEW *iew_ptr, DerivO3CPUParams *params, depCheckShift = params->LSQDepCheckShift; checkLoads = params->LSQCheckLoads; - cacheStorePorts = params->cacheStorePorts; needsTSO = params->needsTSO; resetState(); @@ -180,11 +176,8 @@ LSQUnit<Impl>::resetState() { loads = stores = storesToWB = 0; - loadHead = loadTail = 0; - - storeHead = storeWBIdx = storeTail = 0; - usedStorePorts = 0; + storeWBIt = storeQueue.begin(); retryPkt = NULL; memDepViolator = NULL; @@ -259,24 +252,10 @@ LSQUnit<Impl>::setDcachePort(MasterPort *dcache_port) template<class Impl> void -LSQUnit<Impl>::clearLQ() -{ - loadQueue.clear(); -} - -template<class Impl> -void -LSQUnit<Impl>::clearSQ() -{ - storeQueue.clear(); -} - -template<class Impl> -void LSQUnit<Impl>::drainSanityCheck() const { - for (int i = 0; i < loadQueue.size(); ++i) - assert(!loadQueue[i]); + for (int i = 0; i < loadQueue.capacity(); ++i) + assert(!loadQueue[i].valid()); assert(storesToWB == 0); assert(!retryPkt); @@ -289,44 +268,6 @@ LSQUnit<Impl>::takeOverFrom() resetState(); } -template<class Impl> -void -LSQUnit<Impl>::resizeLQ(unsigned size) -{ - unsigned size_plus_sentinel = size + 1; - assert(size_plus_sentinel >= LQEntries); - - if (size_plus_sentinel > LQEntries) { - while (size_plus_sentinel > loadQueue.size()) { - DynInstPtr dummy; - loadQueue.push_back(dummy); - LQEntries++; - } - } else { - LQEntries = size_plus_sentinel; - } - - assert(LQEntries <= 256); -} - -template<class Impl> -void -LSQUnit<Impl>::resizeSQ(unsigned size) -{ - unsigned size_plus_sentinel = size + 1; - if (size_plus_sentinel > SQEntries) { - while (size_plus_sentinel > storeQueue.size()) { - SQEntry dummy; - storeQueue.push_back(dummy); - SQEntries++; - } - } else { - SQEntries = size_plus_sentinel; - } - - assert(SQEntries <= 256); -} - template <class Impl> void LSQUnit<Impl>::insert(const DynInstPtr &inst) @@ -348,44 +289,42 @@ template <class Impl> void LSQUnit<Impl>::insertLoad(const DynInstPtr &load_inst) { - assert((loadTail + 1) % LQEntries != loadHead); - assert(loads < LQEntries); + 
assert(!loadQueue.full()); + assert(loads < loadQueue.capacity()); DPRINTF(LSQUnit, "Inserting load PC %s, idx:%i [sn:%lli]\n", - load_inst->pcState(), loadTail, load_inst->seqNum); + load_inst->pcState(), loadQueue.tail(), load_inst->seqNum); - load_inst->lqIdx = loadTail; + /* Grow the queue. */ + loadQueue.advance_tail(); - if (stores == 0) { - load_inst->sqIdx = -1; - } else { - load_inst->sqIdx = storeTail; - } + load_inst->sqIt = storeQueue.end(); - loadQueue[loadTail] = load_inst; - - incrLdIdx(loadTail); + assert(!loadQueue.back().valid()); + loadQueue.back().set(load_inst); + load_inst->lqIdx = loadQueue.tail(); + load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx); ++loads; } template <class Impl> void -LSQUnit<Impl>::insertStore(const DynInstPtr &store_inst) +LSQUnit<Impl>::insertStore(const DynInstPtr& store_inst) { // Make sure it is not full before inserting an instruction. - assert((storeTail + 1) % SQEntries != storeHead); - assert(stores < SQEntries); + assert(!storeQueue.full()); + assert(stores < storeQueue.capacity()); DPRINTF(LSQUnit, "Inserting store PC %s, idx:%i [sn:%lli]\n", - store_inst->pcState(), storeTail, store_inst->seqNum); - - store_inst->sqIdx = storeTail; - store_inst->lqIdx = loadTail; + store_inst->pcState(), storeQueue.tail(), store_inst->seqNum); + storeQueue.advance_tail(); - storeQueue[storeTail] = SQEntry(store_inst); + store_inst->sqIdx = storeQueue.tail(); + store_inst->lqIdx = loadQueue.moduloAdd(loadQueue.tail(), 1); + store_inst->lqIt = loadQueue.end(); - incrStIdx(storeTail); + storeQueue.back().set(store_inst); ++stores; } @@ -407,8 +346,9 @@ LSQUnit<Impl>::numFreeLoadEntries() { //LQ has an extra dummy entry to differentiate //empty/full conditions. Subtract 1 from the free entries. - DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n", LQEntries, loads); - return LQEntries - loads - 1; + DPRINTF(LSQUnit, "LQ size: %d, #loads occupied: %d\n", + 1 + loadQueue.capacity(), loads); + return loadQueue.capacity() - loads; } template <class Impl> @@ -417,8 +357,9 @@ LSQUnit<Impl>::numFreeStoreEntries() { //SQ has an extra dummy entry to differentiate //empty/full conditions. Subtract 1 from the free entries. 
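Both queues keep the extra sentinel slot mentioned in the comment above, but the arithmetic now lives inside the container: the old code reported LQEntries - loads - 1 free entries, whereas CircularQueue is built with lqEntries + 1 slots and its capacity() already excludes the sentinel. A minimal ring buffer showing the convention (hypothetical class, not gem5's CircularQueue):

#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
class Ring
{
    std::vector<T> buf;     // entries + 1 sentinel slot
    std::size_t head = 0;   // oldest valid entry
    std::size_t tail = 0;   // one past the youngest entry

  public:
    explicit Ring(std::size_t entries) : buf(entries + 1) {}

    // Usable slots; the sentinel is excluded here, not by the caller.
    std::size_t capacity() const { return buf.size() - 1; }
    std::size_t size() const
    { return (tail + buf.size() - head) % buf.size(); }
    bool empty() const { return head == tail; }
    bool full() const { return size() == capacity(); }

    void push_back(const T &v)
    { assert(!full()); buf[tail] = v; tail = (tail + 1) % buf.size(); }
    void pop_front()
    { assert(!empty()); head = (head + 1) % buf.size(); }
};

With Ring<int> lq(32) there are 33 allocated slots but capacity() is 32, so the free-entry count is simply capacity() - size(), matching the new numFreeLoadEntries().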
- DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n", SQEntries, stores); - return SQEntries - stores - 1; + DPRINTF(LSQUnit, "SQ size: %d, #stores occupied: %d\n", + 1 + storeQueue.capacity(), stores); + return storeQueue.capacity() - stores; } @@ -429,11 +370,8 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) // Should only ever get invalidations in here assert(pkt->isInvalidate()); - int load_idx = loadHead; DPRINTF(LSQUnit, "Got snoop for address %#x\n", pkt->getAddr()); - // Only Invalidate packet calls checkSnoop - assert(pkt->isInvalidate()); for (int x = 0; x < cpu->numContexts(); x++) { ThreadContext *tc = cpu->getContext(x); bool no_squash = cpu->thread[x]->noSquashFromTC; @@ -442,44 +380,37 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) cpu->thread[x]->noSquashFromTC = no_squash; } - Addr invalidate_addr = pkt->getAddr() & cacheBlockMask; + if (loadQueue.empty()) + return; - DynInstPtr ld_inst = loadQueue[load_idx]; - if (ld_inst) { - Addr load_addr_low = ld_inst->physEffAddrLow & cacheBlockMask; - Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask; + auto iter = loadQueue.begin(); - // Check that this snoop didn't just invalidate our lock flag - if (ld_inst->effAddrValid() && (load_addr_low == invalidate_addr - || load_addr_high == invalidate_addr) - && ld_inst->memReqFlags & Request::LLSC) - TheISA::handleLockedSnoopHit(ld_inst.get()); - } + Addr invalidate_addr = pkt->getAddr() & cacheBlockMask; - // If this is the only load in the LSQ we don't care - if (load_idx == loadTail) - return; + DynInstPtr ld_inst = iter->instruction(); + assert(ld_inst); + LSQRequest *req = iter->request(); - incrLdIdx(load_idx); + // Check that this snoop didn't just invalidate our lock flag + if (ld_inst->effAddrValid() && + req->isCacheBlockHit(invalidate_addr, cacheBlockMask) + && ld_inst->memReqFlags & Request::LLSC) + TheISA::handleLockedSnoopHit(ld_inst.get()); bool force_squash = false; - while (load_idx != loadTail) { - DynInstPtr ld_inst = loadQueue[load_idx]; - - if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { - incrLdIdx(load_idx); + while (++iter != loadQueue.end()) { + ld_inst = iter->instruction(); + assert(ld_inst); + req = iter->request(); + if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) continue; - } - - Addr load_addr_low = ld_inst->physEffAddrLow & cacheBlockMask; - Addr load_addr_high = ld_inst->physEffAddrHigh & cacheBlockMask; - DPRINTF(LSQUnit, "-- inst [sn:%lli] load_addr: %#x to pktAddr:%#x\n", - ld_inst->seqNum, load_addr_low, invalidate_addr); + DPRINTF(LSQUnit, "-- inst [sn:%lli] to pktAddr:%#x\n", + ld_inst->seqNum, invalidate_addr); - if ((load_addr_low == invalidate_addr - || load_addr_high == invalidate_addr) || force_squash) { + if (force_squash || + req->isCacheBlockHit(invalidate_addr, cacheBlockMask)) { if (needsTSO) { // If we have a TSO system, as all loads must be ordered with // all other loads, this load as well as *all* subsequent loads @@ -508,14 +439,14 @@ LSQUnit<Impl>::checkSnoop(PacketPtr pkt) ld_inst->hitExternalSnoop(true); } } - incrLdIdx(load_idx); } return; } template <class Impl> Fault -LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst) +LSQUnit<Impl>::checkViolations(typename LoadQueue::iterator& loadIt, + const DynInstPtr& inst) { Addr inst_eff_addr1 = inst->effAddr >> depCheckShift; Addr inst_eff_addr2 = (inst->effAddr + inst->effSize - 1) >> depCheckShift; @@ -525,10 +456,10 @@ LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst) * all instructions that will execute before 
the store writes back. Thus, * like the implementation that came before it, we're overly conservative. */ - while (load_idx != loadTail) { - DynInstPtr ld_inst = loadQueue[load_idx]; + while (loadIt != loadQueue.end()) { + DynInstPtr ld_inst = loadIt->instruction(); if (!ld_inst->effAddrValid() || ld_inst->strictlyOrdered()) { - incrLdIdx(load_idx); + ++loadIt; continue; } @@ -585,7 +516,7 @@ LSQUnit<Impl>::checkViolations(int load_idx, const DynInstPtr &inst) } } - incrLdIdx(load_idx); + ++loadIt; } return NoFault; } @@ -608,8 +539,7 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst) load_fault = inst->initiateAcc(); - if (inst->isTranslationDelayed() && - load_fault == NoFault) + if (inst->isTranslationDelayed() && load_fault == NoFault) return load_fault; // If the instruction faulted or predicated false, then we need to send it @@ -631,12 +561,13 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst) iewStage->instToCommit(inst); iewStage->activityThisCycle(); } else { - assert(inst->effAddrValid()); - int load_idx = inst->lqIdx; - incrLdIdx(load_idx); + if (inst->effAddrValid()) { + auto it = inst->lqIt; + ++it; - if (checkLoads) - return checkViolations(load_idx, inst); + if (checkLoads) + return checkViolations(it, inst); + } } return load_fault; @@ -659,7 +590,7 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst) // Check the recently completed loads to see if any match this store's // address. If so, then we have a memory ordering violation. - int load_idx = store_inst->lqIdx; + typename LoadQueue::iterator loadIt = store_inst->lqIt; Fault store_fault = store_inst->initiateAcc(); @@ -674,7 +605,7 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst) return store_fault; } - if (storeQueue[store_idx].size == 0) { + if (storeQueue[store_idx].size() == 0) { DPRINTF(LSQUnit,"Fault on Store PC %s, [sn:%lli], Size = 0\n", store_inst->pcState(), store_inst->seqNum); @@ -686,12 +617,12 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst) if (store_inst->isStoreConditional()) { // Store conditionals need to set themselves as able to // writeback if we haven't had a fault by here. - storeQueue[store_idx].canWB = true; + storeQueue[store_idx].canWB() = true; ++storesToWB; } - return checkViolations(load_idx, store_inst); + return checkViolations(loadIt, store_inst); } @@ -699,14 +630,13 @@ template <class Impl> void LSQUnit<Impl>::commitLoad() { - assert(loadQueue[loadHead]); + assert(loadQueue.front().valid()); DPRINTF(LSQUnit, "Committing head load instruction, PC %s\n", - loadQueue[loadHead]->pcState()); - - loadQueue[loadHead] = NULL; + loadQueue.front().instruction()->pcState()); - incrLdIdx(loadHead); + loadQueue.front().clear(); + loadQueue.pop_front(); --loads; } @@ -715,9 +645,10 @@ template <class Impl> void LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst) { - assert(loads == 0 || loadQueue[loadHead]); + assert(loads == 0 || loadQueue.front().valid()); - while (loads != 0 && loadQueue[loadHead]->seqNum <= youngest_inst) { + while (loads != 0 && loadQueue.front().instruction()->seqNum + <= youngest_inst) { commitLoad(); } } @@ -726,45 +657,37 @@ template <class Impl> void LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst) { - assert(stores == 0 || storeQueue[storeHead].inst); - - int store_idx = storeHead; + assert(stores == 0 || storeQueue.front().valid()); - while (store_idx != storeTail) { - assert(storeQueue[store_idx].inst); + /* Forward iterate the store queue (age order). 
+     */
+    for (auto& x : storeQueue) {
+        assert(x.valid());

         // Mark any stores that are now committed and have not yet
         // been marked as able to write back.
-        if (!storeQueue[store_idx].canWB) {
-            if (storeQueue[store_idx].inst->seqNum > youngest_inst) {
+        if (!x.canWB()) {
+            if (x.instruction()->seqNum > youngest_inst) {
                 break;
             }
             DPRINTF(LSQUnit, "Marking store as able to write back, PC "
                     "%s [sn:%lli]\n",
-                    storeQueue[store_idx].inst->pcState(),
-                    storeQueue[store_idx].inst->seqNum);
+                    x.instruction()->pcState(),
+                    x.instruction()->seqNum);

-            storeQueue[store_idx].canWB = true;
+            x.canWB() = true;

             ++storesToWB;
         }
-
-        incrStIdx(store_idx);
     }
 }

 template <class Impl>
 void
-LSQUnit<Impl>::writebackPendingStore()
+LSQUnit<Impl>::writebackBlockedStore()
 {
-    if (hasPendingPkt) {
-        assert(pendingPkt != NULL);
-
-        // If the cache is blocked, this will store the packet for retry.
-        if (sendStore(pendingPkt)) {
-            storePostSend(pendingPkt);
-        }
-        pendingPkt = NULL;
-        hasPendingPkt = false;
+    assert(isStoreBlocked);
+    storeWBIt->request()->sendPacketToCache();
+    if (storeWBIt->request()->isSent()) {
+        storePostSend();
     }
 }
@@ -772,18 +695,17 @@
 template <class Impl>
 void
 LSQUnit<Impl>::writebackStores()
 {
-    // First writeback the second packet from any split store that didn't
-    // complete last cycle because there weren't enough cache ports available.
-    if (TheISA::HasUnalignedMemAcc) {
-        writebackPendingStore();
+    if (isStoreBlocked) {
+        DPRINTF(LSQUnit, "Writing back blocked store\n");
+        writebackBlockedStore();
     }

     while (storesToWB > 0 &&
-           storeWBIdx != storeTail &&
-           storeQueue[storeWBIdx].inst &&
-           storeQueue[storeWBIdx].canWB &&
+           storeWBIt.dereferenceable() &&
+           storeWBIt->valid() &&
+           storeWBIt->canWB() &&
            ((!needsTSO) || (!storeInFlight)) &&
-           usedStorePorts < cacheStorePorts) {
+           lsq->storePortAvailable()) {

         if (isStoreBlocked) {
             DPRINTF(LSQUnit, "Unable to write back any more stores, cache"
@@ -793,188 +715,112 @@

         // Store didn't write any data so no need to write it back to
         // memory.
-        if (storeQueue[storeWBIdx].size == 0) {
-            completeStore(storeWBIdx);
-
-            incrStIdx(storeWBIdx);
-
+        if (storeWBIt->size() == 0) {
+            /* It is important that the preincrement happens at (or before)
+             * the call, as the code of completeStore checks
+             * storeWBIt.
*/ + completeStore(storeWBIt++); continue; } - ++usedStorePorts; - - if (storeQueue[storeWBIdx].inst->isDataPrefetch()) { - incrStIdx(storeWBIdx); - + if (storeWBIt->instruction()->isDataPrefetch()) { + storeWBIt++; continue; } - assert(storeQueue[storeWBIdx].req); - assert(!storeQueue[storeWBIdx].committed); - - if (TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit) { - assert(storeQueue[storeWBIdx].sreqLow); - assert(storeQueue[storeWBIdx].sreqHigh); - } - - DynInstPtr inst = storeQueue[storeWBIdx].inst; - - RequestPtr &req = storeQueue[storeWBIdx].req; - const RequestPtr &sreqLow = storeQueue[storeWBIdx].sreqLow; - const RequestPtr &sreqHigh = storeQueue[storeWBIdx].sreqHigh; + assert(storeWBIt->hasRequest()); + assert(!storeWBIt->committed()); - storeQueue[storeWBIdx].committed = true; + DynInstPtr inst = storeWBIt->instruction(); + LSQRequest* req = storeWBIt->request(); + storeWBIt->committed() = true; assert(!inst->memData); - inst->memData = new uint8_t[req->getSize()]; + inst->memData = new uint8_t[req->_size]; - if (storeQueue[storeWBIdx].isAllZeros) - memset(inst->memData, 0, req->getSize()); + if (storeWBIt->isAllZeros()) + memset(inst->memData, 0, req->_size); else - memcpy(inst->memData, storeQueue[storeWBIdx].data, req->getSize()); + memcpy(inst->memData, storeWBIt->data(), req->_size); - PacketPtr data_pkt; - PacketPtr snd_data_pkt = NULL; - LSQSenderState *state = new LSQSenderState; - state->isLoad = false; - state->idx = storeWBIdx; - state->inst = inst; + if (req->senderState() == nullptr) { + SQSenderState *state = new SQSenderState(storeWBIt); + state->isLoad = false; + state->needWB = false; + state->inst = inst; - if (!TheISA::HasUnalignedMemAcc || !storeQueue[storeWBIdx].isSplit) { - - // Build a single data packet if the store isn't split. - data_pkt = Packet::createWrite(req); - data_pkt->dataStatic(inst->memData); - data_pkt->senderState = state; - } else { - // Create two packets if the store is split in two. - data_pkt = Packet::createWrite(sreqLow); - snd_data_pkt = Packet::createWrite(sreqHigh); - - data_pkt->dataStatic(inst->memData); - snd_data_pkt->dataStatic(inst->memData + sreqLow->getSize()); - - data_pkt->senderState = state; - snd_data_pkt->senderState = state; - - state->isSplit = true; - state->outstanding = 2; - - // Can delete the main request now. - req = sreqLow; + req->senderState(state); + if (inst->isStoreConditional()) { + /* Only store conditionals need a writeback. */ + state->needWB = true; + } } + req->buildPackets(); DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%s " "to Addr:%#x, data:%#x [sn:%lli]\n", - storeWBIdx, inst->pcState(), - req->getPaddr(), (int)*(inst->memData), + storeWBIt.idx(), inst->pcState(), + req->request()->getPaddr(), (int)*(inst->memData), inst->seqNum); // @todo: Remove this SC hack once the memory system handles it. if (inst->isStoreConditional()) { - assert(!storeQueue[storeWBIdx].isSplit); // Disable recording the result temporarily. Writing to // misc regs normally updates the result, but this is not // the desired behavior when handling store conditionals. inst->recordResult(false); - bool success = TheISA::handleLockedWrite(inst.get(), req, cacheBlockMask); + bool success = TheISA::handleLockedWrite(inst.get(), + req->request(), cacheBlockMask); inst->recordResult(true); + req->packetSent(); if (!success) { + req->complete(); // Instantly complete this store. DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed. 
" "Instantly completing it.\n", inst->seqNum); - WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this); + PacketPtr new_pkt = new Packet(*req->packet()); + WritebackEvent *wb = new WritebackEvent(inst, + new_pkt, this); cpu->schedule(wb, curTick() + 1); - completeStore(storeWBIdx); - incrStIdx(storeWBIdx); + completeStore(storeWBIt); + if (!storeQueue.empty()) + storeWBIt++; + else + storeWBIt = storeQueue.end(); continue; } - } else { - // Non-store conditionals do not need a writeback. - state->noWB = true; } - bool split = - TheISA::HasUnalignedMemAcc && storeQueue[storeWBIdx].isSplit; - - ThreadContext *thread = cpu->tcBase(lsqID); - - if (req->isMmappedIpr()) { + if (req->request()->isMmappedIpr()) { assert(!inst->isStoreConditional()); - TheISA::handleIprWrite(thread, data_pkt); - delete data_pkt; - if (split) { - assert(snd_data_pkt->req->isMmappedIpr()); - TheISA::handleIprWrite(thread, snd_data_pkt); - delete snd_data_pkt; - } - delete state; - completeStore(storeWBIdx); - incrStIdx(storeWBIdx); - } else if (!sendStore(data_pkt)) { - DPRINTF(IEW, "D-Cache became blocked when writing [sn:%lli], will" - "retry later\n", - inst->seqNum); + ThreadContext *thread = cpu->tcBase(lsqID); + PacketPtr main_pkt = new Packet(req->mainRequest(), + MemCmd::WriteReq); + main_pkt->dataStatic(inst->memData); + req->handleIprWrite(thread, main_pkt); + delete main_pkt; + completeStore(storeWBIt); + storeWBIt++; + continue; + } + /* Send to cache */ + req->sendPacketToCache(); - // Need to store the second packet, if split. - if (split) { - state->pktToSend = true; - state->pendingPacket = snd_data_pkt; - } + /* If successful, do the post send */ + if (req->isSent()) { + storePostSend(); } else { - - // If split, try to send the second packet too - if (split) { - assert(snd_data_pkt); - - // Ensure there are enough ports to use. - if (usedStorePorts < cacheStorePorts) { - ++usedStorePorts; - if (sendStore(snd_data_pkt)) { - storePostSend(snd_data_pkt); - } else { - DPRINTF(IEW, "D-Cache became blocked when writing" - " [sn:%lli] second packet, will retry later\n", - inst->seqNum); - } - } else { - - // Store the packet for when there's free ports. - assert(pendingPkt == NULL); - pendingPkt = snd_data_pkt; - hasPendingPkt = true; - } - } else { - - // Not a split store. - storePostSend(data_pkt); - } + DPRINTF(LSQUnit, "D-Cache became blocked when writing [sn:%lli], " + "will retry later\n", + inst->seqNum); } } - - // Not sure this should set it to 0. - usedStorePorts = 0; - assert(stores >= 0 && storesToWB >= 0); } -/*template <class Impl> -void -LSQUnit<Impl>::removeMSHR(InstSeqNum seqNum) -{ - list<InstSeqNum>::iterator mshr_it = find(mshrSeqNums.begin(), - mshrSeqNums.end(), - seqNum); - - if (mshr_it != mshrSeqNums.end()) { - mshrSeqNums.erase(mshr_it); - DPRINTF(LSQUnit, "Removing MSHR. count = %i\n",mshrSeqNums.size()); - } -}*/ - template <class Impl> void LSQUnit<Impl>::squash(const InstSeqNum &squashed_num) @@ -982,30 +828,26 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num) DPRINTF(LSQUnit, "Squashing until [sn:%lli]!" 
"(Loads:%i Stores:%i)\n", squashed_num, loads, stores); - int load_idx = loadTail; - decrLdIdx(load_idx); - - while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) { + while (loads != 0 && + loadQueue.back().instruction()->seqNum > squashed_num) { DPRINTF(LSQUnit,"Load Instruction PC %s squashed, " "[sn:%lli]\n", - loadQueue[load_idx]->pcState(), - loadQueue[load_idx]->seqNum); + loadQueue.back().instruction()->pcState(), + loadQueue.back().instruction()->seqNum); - if (isStalled() && load_idx == stallingLoadIdx) { + if (isStalled() && loadQueue.tail() == stallingLoadIdx) { stalled = false; stallingStoreIsn = 0; stallingLoadIdx = 0; } // Clear the smart pointer to make sure it is decremented. - loadQueue[load_idx]->setSquashed(); - loadQueue[load_idx] = NULL; - --loads; + loadQueue.back().instruction()->setSquashed(); + loadQueue.back().clear(); - // Inefficient! - loadTail = load_idx; + --loads; - decrLdIdx(load_idx); + loadQueue.pop_back(); ++lsqSquashedLoads; } @@ -1013,76 +855,63 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num) memDepViolator = NULL; } - int store_idx = storeTail; - decrStIdx(store_idx); - while (stores != 0 && - storeQueue[store_idx].inst->seqNum > squashed_num) { + storeQueue.back().instruction()->seqNum > squashed_num) { // Instructions marked as can WB are already committed. - if (storeQueue[store_idx].canWB) { + if (storeQueue.back().canWB()) { break; } DPRINTF(LSQUnit,"Store Instruction PC %s squashed, " "idx:%i [sn:%lli]\n", - storeQueue[store_idx].inst->pcState(), - store_idx, storeQueue[store_idx].inst->seqNum); + storeQueue.back().instruction()->pcState(), + storeQueue.tail(), storeQueue.back().instruction()->seqNum); // I don't think this can happen. It should have been cleared // by the stalling load. if (isStalled() && - storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { + storeQueue.back().instruction()->seqNum == stallingStoreIsn) { panic("Is stalled should have been cleared by stalling load!\n"); stalled = false; stallingStoreIsn = 0; } // Clear the smart pointer to make sure it is decremented. - storeQueue[store_idx].inst->setSquashed(); - storeQueue[store_idx].inst = NULL; - storeQueue[store_idx].canWB = 0; + storeQueue.back().instruction()->setSquashed(); // Must delete request now that it wasn't handed off to // memory. This is quite ugly. @todo: Figure out the proper // place to really handle request deletes. - storeQueue[store_idx].req.reset(); - if (TheISA::HasUnalignedMemAcc && storeQueue[store_idx].isSplit) { - storeQueue[store_idx].sreqLow.reset(); - storeQueue[store_idx].sreqHigh.reset(); - } - + storeQueue.back().clear(); --stores; - // Inefficient! - storeTail = store_idx; - - decrStIdx(store_idx); + storeQueue.pop_back(); ++lsqSquashedStores; } } template <class Impl> void -LSQUnit<Impl>::storePostSend(PacketPtr pkt) +LSQUnit<Impl>::storePostSend() { if (isStalled() && - storeQueue[storeWBIdx].inst->seqNum == stallingStoreIsn) { + storeWBIt->instruction()->seqNum == stallingStoreIsn) { DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " "load idx:%i\n", stallingStoreIsn, stallingLoadIdx); stalled = false; stallingStoreIsn = 0; - iewStage->replayMemInst(loadQueue[stallingLoadIdx]); + iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction()); } - if (!storeQueue[storeWBIdx].inst->isStoreConditional()) { + if (!storeWBIt->instruction()->isStoreConditional()) { // The store is basically completed at this time. 
This // only works so long as the checker doesn't try to // verify the value in memory for stores. - storeQueue[storeWBIdx].inst->setCompleted(); + storeWBIt->instruction()->setCompleted(); if (cpu->checker) { - cpu->checker->verify(storeQueue[storeWBIdx].inst); + cpu->checker->verify(storeWBIt->instruction()); } } @@ -1090,7 +919,7 @@ LSQUnit<Impl>::storePostSend(PacketPtr pkt) storeInFlight = true; } - incrStIdx(storeWBIdx); + storeWBIt++; } template <class Impl> @@ -1136,10 +965,10 @@ LSQUnit<Impl>::writeback(const DynInstPtr &inst, PacketPtr pkt) template <class Impl> void -LSQUnit<Impl>::completeStore(int store_idx) +LSQUnit<Impl>::completeStore(typename StoreQueue::iterator store_idx) { - assert(storeQueue[store_idx].inst); - storeQueue[store_idx].completed = true; + assert(store_idx->valid()); + store_idx->completed() = true; --storesToWB; // A bit conservative because a store completion may not free up entries, // but hopefully avoids two store completions in one cycle from making @@ -1147,39 +976,42 @@ LSQUnit<Impl>::completeStore(int store_idx) cpu->wakeCPU(); cpu->activityThisCycle(); - if (store_idx == storeHead) { + /* We 'need' a copy here because we may clear the entry from the + * store queue. */ + DynInstPtr store_inst = store_idx->instruction(); + if (store_idx == storeQueue.begin()) { do { - incrStIdx(storeHead); - + storeQueue.front().clear(); + storeQueue.pop_front(); --stores; - } while (storeQueue[storeHead].completed && - storeHead != storeTail); + } while (storeQueue.front().completed() && + !storeQueue.empty()); iewStage->updateLSQNextCycle = true; } DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head " "idx:%i\n", - storeQueue[store_idx].inst->seqNum, store_idx, storeHead); + store_inst->seqNum, store_idx.idx() - 1, storeQueue.head() - 1); #if TRACING_ON if (DTRACE(O3PipeView)) { - storeQueue[store_idx].inst->storeTick = - curTick() - storeQueue[store_idx].inst->fetchTick; + store_idx->instruction()->storeTick = + curTick() - store_idx->instruction()->fetchTick; } #endif if (isStalled() && - storeQueue[store_idx].inst->seqNum == stallingStoreIsn) { + store_inst->seqNum == stallingStoreIsn) { DPRINTF(LSQUnit, "Unstalling, stalling store [sn:%lli] " "load idx:%i\n", stallingStoreIsn, stallingLoadIdx); stalled = false; stallingStoreIsn = 0; - iewStage->replayMemInst(loadQueue[stallingLoadIdx]); + iewStage->replayMemInst(loadQueue[stallingLoadIdx].instruction()); } - storeQueue[store_idx].inst->setCompleted(); + store_inst->setCompleted(); if (needsTSO) { storeInFlight = false; @@ -1188,95 +1020,62 @@ LSQUnit<Impl>::completeStore(int store_idx) // Tell the checker we've completed this instruction. Some stores // may get reported twice to the checker, but the checker can // handle that case. - // Store conditionals cannot be sent to the checker yet, they have // to update the misc registers first which should take place // when they commit - if (cpu->checker && !storeQueue[store_idx].inst->isStoreConditional()) { - cpu->checker->verify(storeQueue[store_idx].inst); + if (cpu->checker && !store_inst->isStoreConditional()) { + cpu->checker->verify(store_inst); } } template <class Impl> bool -LSQUnit<Impl>::sendStore(PacketPtr data_pkt) +LSQUnit<Impl>::trySendPacket(bool isLoad, PacketPtr data_pkt) { - if (!dcachePort->sendTimingReq(data_pkt)) { - // Need to handle becoming blocked on a store. 
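The replacement logic that follows gates every send on two conditions the old sendStore never saw: an LSQ-wide cache-blocked flag and, for stores only, a free store port; a refused send marks the cache blocked until recvRetry arrives. The decision table in isolation (a sketch with a stubbed port type, not gem5's interfaces):

struct Port { bool sendTimingReq() { return true; } };  // stub: cache accepts

struct LsqModel
{
    bool cacheBlocked = false;
    int storePortsFree = 1;

    bool
    trySend(bool isLoad, Port &port)
    {
        // Loads need only an unblocked cache; stores also need a port.
        if (cacheBlocked || (!isLoad && storePortsFree == 0))
            return false;           // try again in a later cycle
        if (!port.sendTimingReq()) {
            cacheBlocked = true;    // port refused: stay blocked until retry
            return false;
        }
        if (!isLoad)
            --storePortsFree;       // stores consume a store port
        return true;
    }
};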
- isStoreBlocked = true; - ++lsqCacheBlocked; - assert(retryPkt == NULL); - retryPkt = data_pkt; - return false; + bool ret = true; + bool cache_got_blocked = false; + + auto state = dynamic_cast<LSQSenderState*>(data_pkt->senderState); + + if (!lsq->cacheBlocked() && (isLoad || lsq->storePortAvailable())) { + if (!dcachePort->sendTimingReq(data_pkt)) { + ret = false; + cache_got_blocked = true; + } + } else { + ret = false; } - return true; -} -template <class Impl> -void -LSQUnit<Impl>::recvRetry() -{ - if (isStoreBlocked) { - DPRINTF(LSQUnit, "Receiving retry: store blocked\n"); - assert(retryPkt != NULL); - - LSQSenderState *state = - dynamic_cast<LSQSenderState *>(retryPkt->senderState); - - if (dcachePort->sendTimingReq(retryPkt)) { - // Don't finish the store unless this is the last packet. - if (!TheISA::HasUnalignedMemAcc || !state->pktToSend || - state->pendingPacket == retryPkt) { - state->pktToSend = false; - storePostSend(retryPkt); - } - retryPkt = NULL; + if (ret) { + if (!isLoad) { + lsq->storePortBusy(); isStoreBlocked = false; - - // Send any outstanding packet. - if (TheISA::HasUnalignedMemAcc && state->pktToSend) { - assert(state->pendingPacket); - if (sendStore(state->pendingPacket)) { - storePostSend(state->pendingPacket); - } - } - } else { - // Still blocked! + } + state->outstanding++; + state->request()->packetSent(); + } else { + if (cache_got_blocked) { + lsq->cacheBlocked(true); ++lsqCacheBlocked; } + if (!isLoad) { + assert(state->request() == storeWBIt->request()); + isStoreBlocked = true; + } + state->request()->packetNotSent(); } -} - -template <class Impl> -inline void -LSQUnit<Impl>::incrStIdx(int &store_idx) const -{ - if (++store_idx >= SQEntries) - store_idx = 0; -} - -template <class Impl> -inline void -LSQUnit<Impl>::decrStIdx(int &store_idx) const -{ - if (--store_idx < 0) - store_idx += SQEntries; -} -template <class Impl> -inline void -LSQUnit<Impl>::incrLdIdx(int &load_idx) const -{ - if (++load_idx >= LQEntries) - load_idx = 0; + return ret; } template <class Impl> -inline void -LSQUnit<Impl>::decrLdIdx(int &load_idx) const +void +LSQUnit<Impl>::recvRetry() { - if (--load_idx < 0) - load_idx += LQEntries; + if (isStoreBlocked) { + DPRINTF(LSQUnit, "Receiving retry: blocked store\n"); + writebackBlockedStore(); + } } template <class Impl> @@ -1287,29 +1086,28 @@ LSQUnit<Impl>::dumpInsts() const cprintf("Load queue size: %i\n", loads); cprintf("Load queue: "); - int load_idx = loadHead; - - while (load_idx != loadTail && loadQueue[load_idx]) { - const DynInstPtr &inst(loadQueue[load_idx]); + for (const auto& e: loadQueue) { + const DynInstPtr &inst(e.instruction()); cprintf("%s.[sn:%i] ", inst->pcState(), inst->seqNum); - - incrLdIdx(load_idx); } cprintf("\n"); cprintf("Store queue size: %i\n", stores); cprintf("Store queue: "); - int store_idx = storeHead; - - while (store_idx != storeTail && storeQueue[store_idx].inst) { - const DynInstPtr &inst(storeQueue[store_idx].inst); + for (const auto& e: storeQueue) { + const DynInstPtr &inst(e.instruction()); cprintf("%s.[sn:%i] ", inst->pcState(), inst->seqNum); - - incrStIdx(store_idx); } cprintf("\n"); } +template <class Impl> +unsigned int +LSQUnit<Impl>::cacheLineSize() +{ + return cpu->cacheLineSize(); +} + #endif//__CPU_O3_LSQ_UNIT_IMPL_HH__ diff --git a/src/cpu/o3/probe/elastic_trace.cc b/src/cpu/o3/probe/elastic_trace.cc index a4a201398..36d8297d1 100644 --- a/src/cpu/o3/probe/elastic_trace.cc +++ b/src/cpu/o3/probe/elastic_trace.cc @@ -409,7 +409,7 @@ ElasticTrace::addDepTraceRecord(const 
DynInstConstPtr& head_inst, new_record->reqFlags = head_inst->memReqFlags; new_record->virtAddr = head_inst->effAddr; new_record->asid = head_inst->asid; - new_record->physAddr = head_inst->physEffAddrLow; + new_record->physAddr = head_inst->physEffAddr; // Currently the tracing does not support split requests. new_record->size = head_inst->effSize; new_record->pc = head_inst->instAddr(); |
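Taken together, the patch funnels each access through a single request object that is built once, split if necessary, sent as ports allow, and completed only after every packet returns; recvTimingResp and the LSQSenderState classes above encode that last invariant. A toy model of it (names illustrative, not gem5's LSQRequest):

#include <cassert>

class ToyRequest
{
    int outstanding = 0;    // packets sent but not yet answered
    bool discarded = false; // owner squashed; late responses are dropped

  public:
    void packetSent() { ++outstanding; }
    void discard() { discarded = true; }

    // Returns true only when the whole access has finished usefully.
    bool
    recvResp()
    {
        assert(outstanding > 0);
        --outstanding;
        if (outstanding > 0)
            return false;       // still waiting for the other split half
        return !discarded;      // squashed requests complete silently
    }
};

int
main()
{
    ToyRequest r;
    r.packetSent();
    r.packetSent();          // a split access: two packets in flight
    assert(!r.recvResp());   // first half alone does not complete it
    assert(r.recvResp());    // the second half does
    return 0;
}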