diff options
-rw-r--r-- | src/cpu/base.hh | 4 | ||||
-rw-r--r-- | src/cpu/base_dyn_inst.hh | 17 | ||||
-rw-r--r-- | src/cpu/checker/cpu.cc | 120 | ||||
-rw-r--r-- | src/cpu/checker/cpu.hh | 29 | ||||
-rw-r--r-- | src/cpu/exec_context.hh | 10 | ||||
-rw-r--r-- | src/cpu/minor/dyn_inst.hh | 17 | ||||
-rw-r--r-- | src/cpu/minor/exec_context.hh | 22 | ||||
-rw-r--r-- | src/cpu/minor/execute.cc | 10 | ||||
-rw-r--r-- | src/cpu/minor/lsq.cc | 148 | ||||
-rw-r--r-- | src/cpu/minor/lsq.hh | 11 | ||||
-rw-r--r-- | src/cpu/o3/cpu.hh | 7 | ||||
-rw-r--r-- | src/cpu/o3/lsq.hh | 48 | ||||
-rw-r--r-- | src/cpu/o3/lsq_impl.hh | 129 | ||||
-rw-r--r-- | src/cpu/o3/lsq_unit_impl.hh | 3 | ||||
-rw-r--r-- | src/cpu/simple/atomic.cc | 138 | ||||
-rw-r--r-- | src/cpu/simple/atomic.hh | 29 | ||||
-rw-r--r-- | src/cpu/simple/base.cc | 6 | ||||
-rw-r--r-- | src/cpu/simple/base.hh | 14 | ||||
-rw-r--r-- | src/cpu/simple/exec_context.hh | 20 | ||||
-rw-r--r-- | src/cpu/simple/timing.cc | 18 | ||||
-rw-r--r-- | src/cpu/simple/timing.hh | 10 | ||||
-rw-r--r-- | src/cpu/simple_thread.cc | 7 | ||||
-rw-r--r-- | src/cpu/utils.hh | 96 | ||||
-rw-r--r-- | src/mem/abstract_mem.cc | 2 | ||||
-rw-r--r-- | src/mem/cache/cache.cc | 3 | ||||
-rw-r--r-- | src/mem/packet.hh | 35 | ||||
-rw-r--r-- | src/mem/request.hh | 27 |
27 files changed, 711 insertions, 269 deletions
diff --git a/src/cpu/base.hh b/src/cpu/base.hh index f013a3e02..3d679f172 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -175,9 +175,9 @@ class BaseCPU : public ClockedObject uint32_t socketId() const { return _socketId; } /** Reads this CPU's unique data requestor ID */ - MasterID dataMasterId() { return _dataMasterId; } + MasterID dataMasterId() const { return _dataMasterId; } /** Reads this CPU's unique instruction requestor ID */ - MasterID instMasterId() { return _instMasterId; } + MasterID instMasterId() const { return _instMasterId; } /** * Get a port on this CPU. All CPUs have a data and diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 4084241bd..22a32ec10 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -298,10 +298,12 @@ class BaseDynInst : public ExecContext, public RefCounted cpu->demapPage(vaddr, asn); } - Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags); + Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()); Fault writeMem(uint8_t *data, unsigned size, Addr addr, - Request::Flags flags, uint64_t *res); + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()); Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags, AtomicOpFunctor *amo_op); @@ -918,21 +920,24 @@ class BaseDynInst : public ExecContext, public RefCounted template<class Impl> Fault BaseDynInst<Impl>::initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable) { return cpu->pushRequest( dynamic_cast<typename DynInstPtr::PtrType>(this), - /* ld */ true, nullptr, size, addr, flags, nullptr); + /* ld */ true, nullptr, size, addr, flags, nullptr, nullptr, + byteEnable); } template<class Impl> Fault BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr, - Request::Flags flags, uint64_t *res) + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable) { return cpu->pushRequest( dynamic_cast<typename DynInstPtr::PtrType>(this), - /* st */ false, data, size, addr, flags, res); + /* st */ false, data, size, addr, flags, res, nullptr, byteEnable); } template<class Impl> diff --git a/src/cpu/checker/cpu.cc b/src/cpu/checker/cpu.cc index 7f8eada4c..cca6d6b12 100644 --- a/src/cpu/checker/cpu.cc +++ b/src/cpu/checker/cpu.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011,2013,2017 ARM Limited + * Copyright (c) 2011,2013,2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -52,6 +52,7 @@ #include "cpu/simple_thread.hh" #include "cpu/static_inst.hh" #include "cpu/thread_context.hh" +#include "cpu/utils.hh" #include "params/CheckerCPU.hh" #include "sim/full_system.hh" @@ -139,31 +140,68 @@ CheckerCPU::unserialize(CheckpointIn &cp) { } +RequestPtr +CheckerCPU::genMemFragmentRequest(Addr frag_addr, int size, + Request::Flags flags, + const std::vector<bool>& byte_enable, + int& frag_size, int& size_left) const +{ + frag_size = std::min( + cacheLineSize() - addrBlockOffset(frag_addr, cacheLineSize()), + (Addr) size_left); + size_left -= frag_size; + + RequestPtr mem_req; + + if (!byte_enable.empty()) { + // Set up byte-enable mask for the current fragment + auto it_start = byte_enable.cbegin() + (size - (frag_size + + size_left)); + auto it_end = byte_enable.cbegin() + (size - size_left); + if (isAnyActiveElement(it_start, it_end)) { + mem_req = std::make_shared<Request>(0, frag_addr, frag_size, + flags, masterId, thread->pcState().instAddr(), + tc->contextId()); + mem_req->setByteEnable(std::vector<bool>(it_start, it_end)); + } + } else { + mem_req = std::make_shared<Request>(0, frag_addr, frag_size, + flags, masterId, thread->pcState().instAddr(), + tc->contextId()); + } + + return mem_req; +} + Fault CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable) { Fault fault = NoFault; - int fullSize = size; - Addr secondAddr = roundDown(addr + size - 1, cacheLineSize()); bool checked_flags = false; bool flags_match = true; Addr pAddr = 0x0; - - if (secondAddr > addr) - size = secondAddr - addr; + Addr frag_addr = addr; + int frag_size = 0; + int size_left = size; + bool predicate; // Need to account for multiple accesses like the Atomic and TimingSimple while (1) { - auto mem_req = std::make_shared<Request>( - 0, addr, size, flags, masterId, - thread->pcState().instAddr(), tc->contextId()); + RequestPtr mem_req = genMemFragmentRequest(frag_addr, size, flags, + byteEnable, frag_size, + size_left); + + predicate = (mem_req != nullptr); // translate to physical address - fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Read); + if (predicate) { + fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Read); + } - if (!checked_flags && fault == NoFault && unverifiedReq) { + if (predicate && !checked_flags && fault == NoFault && unverifiedReq) { flags_match = checkFlags(unverifiedReq, mem_req->getVaddr(), mem_req->getPaddr(), mem_req->getFlags()); pAddr = mem_req->getPaddr(); @@ -171,7 +209,7 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size, } // Now do the access - if (fault == NoFault && + if (predicate && fault == NoFault && !mem_req->getFlags().isSet(Request::NO_ACCESS)) { PacketPtr pkt = Packet::createRead(mem_req); @@ -182,7 +220,7 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size, dcachePort->sendFunctional(pkt); } else { // Assume the data is correct if it's an uncached access - memcpy(data, unverifiedMemData, size); + memcpy(data, unverifiedMemData, frag_size); } delete pkt; @@ -196,22 +234,21 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size, } //If we don't need to access a second cache line, stop now. - if (secondAddr <= addr) + if (size_left == 0) { break; } // Setup for accessing next cache line - data += size; - unverifiedMemData += size; - size = addr + fullSize - secondAddr; - addr = secondAddr; + frag_addr += frag_size; + data += frag_size; + unverifiedMemData += frag_size; } if (!flags_match) { warn("%lli: Flags do not match CPU:%#x %#x %#x Checker:%#x %#x %#x\n", curTick(), unverifiedReq->getVaddr(), unverifiedReq->getPaddr(), - unverifiedReq->getFlags(), addr, pAddr, flags); + unverifiedReq->getFlags(), frag_addr, pAddr, flags); handleError(); } @@ -220,31 +257,35 @@ CheckerCPU::readMem(Addr addr, uint8_t *data, unsigned size, Fault CheckerCPU::writeMem(uint8_t *data, unsigned size, - Addr addr, Request::Flags flags, uint64_t *res) + Addr addr, Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable) { + assert(byteEnable.empty() || byteEnable.size() == size); + Fault fault = NoFault; bool checked_flags = false; bool flags_match = true; Addr pAddr = 0x0; static uint8_t zero_data[64] = {}; - int fullSize = size; - - Addr secondAddr = roundDown(addr + size - 1, cacheLineSize()); - - if (secondAddr > addr) - size = secondAddr - addr; + Addr frag_addr = addr; + int frag_size = 0; + int size_left = size; + bool predicate; // Need to account for a multiple access like Atomic and Timing CPUs while (1) { - auto mem_req = std::make_shared<Request>( - 0, addr, size, flags, masterId, - thread->pcState().instAddr(), tc->contextId()); + RequestPtr mem_req = genMemFragmentRequest(frag_addr, size, flags, + byteEnable, frag_size, + size_left); - // translate to physical address - fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Write); + predicate = (mem_req != nullptr); + + if (predicate) { + fault = dtb->translateFunctional(mem_req, tc, BaseTLB::Write); + } - if (!checked_flags && fault == NoFault && unverifiedReq) { + if (predicate && !checked_flags && fault == NoFault && unverifiedReq) { flags_match = checkFlags(unverifiedReq, mem_req->getVaddr(), mem_req->getPaddr(), mem_req->getFlags()); pAddr = mem_req->getPaddr(); @@ -261,7 +302,7 @@ CheckerCPU::writeMem(uint8_t *data, unsigned size, bool was_prefetch = mem_req->isPrefetch(); //If we don't need to access a second cache line, stop now. - if (fault != NoFault || secondAddr <= addr) + if (fault != NoFault || size_left == 0) { if (fault != NoFault && was_prefetch) { fault = NoFault; @@ -269,16 +310,13 @@ CheckerCPU::writeMem(uint8_t *data, unsigned size, break; } - //Update size and access address - size = addr + fullSize - secondAddr; - //And access the right address. - addr = secondAddr; + frag_addr += frag_size; } if (!flags_match) { warn("%lli: Flags do not match CPU:%#x %#x Checker:%#x %#x %#x\n", curTick(), unverifiedReq->getVaddr(), unverifiedReq->getPaddr(), - unverifiedReq->getFlags(), addr, pAddr, flags); + unverifiedReq->getFlags(), frag_addr, pAddr, flags); handleError(); } @@ -304,12 +342,12 @@ CheckerCPU::writeMem(uint8_t *data, unsigned size, // const set of zeros. if (flags & Request::STORE_NO_DATA) { assert(!data); - assert(sizeof(zero_data) <= fullSize); + assert(sizeof(zero_data) <= size); data = zero_data; } if (unverifiedReq && unverifiedMemData && - memcmp(data, unverifiedMemData, fullSize) && extraData) { + memcmp(data, unverifiedMemData, size) && extraData) { warn("%lli: Store value does not match value sent to memory! " "data: %#x inst_data: %#x", curTick(), data, unverifiedMemData); diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index 8c3000005..66632b720 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -531,11 +531,36 @@ class CheckerCPU : public BaseCPU, public ExecContext this->dtb->demapPage(vaddr, asn); } + /** + * Helper function used to generate the request for a single fragment of a + * memory access. + * + * Takes care of setting up the appropriate byte-enable mask for the + * fragment, given the mask for the entire memory access. + * + * @param frag_addr Start address of the fragment. + * @param size Total size of the memory access in bytes. + * @param flags Request flags. + * @param byte_enable Byte-enable mask for the entire memory access. + * @param[out] frag_size Fragment size. + * @param[in,out] size_left Size left to be processed in the memory access. + * @return Pointer to the allocated Request, nullptr if the byte-enable + * mask is all-false for the fragment. + */ + RequestPtr genMemFragmentRequest(Addr frag_addr, int size, + Request::Flags flags, + const std::vector<bool>& byte_enable, + int& frag_size, int& size_left) const; + Fault readMem(Addr addr, uint8_t *data, unsigned size, - Request::Flags flags) override; + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override; Fault writeMem(uint8_t *data, unsigned size, Addr addr, - Request::Flags flags, uint64_t *res) override; + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override; Fault amoMem(Addr addr, uint8_t* data, unsigned size, Request::Flags flags, AtomicOpFunctor *amo_op) override diff --git a/src/cpu/exec_context.hh b/src/cpu/exec_context.hh index 4cad9e3e1..b294387e2 100644 --- a/src/cpu/exec_context.hh +++ b/src/cpu/exec_context.hh @@ -235,7 +235,8 @@ class ExecContext { * should never be called). */ virtual Fault readMem(Addr addr, uint8_t *data, unsigned int size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) { panic("ExecContext::readMem() should be overridden\n"); } @@ -248,7 +249,8 @@ class ExecContext { * should never be called). */ virtual Fault initiateMemRead(Addr addr, unsigned int size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) { panic("ExecContext::initiateMemRead() should be overridden\n"); } @@ -258,7 +260,9 @@ class ExecContext { * For timing-mode contexts, initiate a timing memory write operation. */ virtual Fault writeMem(uint8_t *data, unsigned int size, Addr addr, - Request::Flags flags, uint64_t *res) = 0; + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = + std::vector<bool>()) = 0; /** * For atomic-mode contexts, perform an atomic AMO (a.k.a., Atomic diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh index b2decb39b..0a8ff8acf 100644 --- a/src/cpu/minor/dyn_inst.hh +++ b/src/cpu/minor/dyn_inst.hh @@ -202,6 +202,13 @@ class MinorDynInst : public RefCounted * to allow other instructions to fill the fetch delay */ bool canEarlyIssue; + /** Flag controlling conditional execution of the instruction */ + bool predicate; + + /** Flag controlling conditional execution of the memory access associated + * with the instruction (only meaningful for loads/stores) */ + bool memAccPredicate; + /** execSeqNum of the latest inst on which this inst depends. * This can be used as a sanity check for dependency ordering * where slightly out of order execution is required (notably @@ -227,7 +234,7 @@ class MinorDynInst : public RefCounted pc(TheISA::PCState(0)), fault(fault_), triedToPredict(false), predictedTaken(false), fuIndex(0), inLSQ(false), inStoreBuffer(false), - canEarlyIssue(false), + canEarlyIssue(false), predicate(true), memAccPredicate(true), instToWaitFor(0), extraCommitDelay(Cycles(0)), extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0)) { } @@ -266,6 +273,14 @@ class MinorDynInst : public RefCounted /** ReportIF interface */ void reportData(std::ostream &os) const; + bool readPredicate() const { return predicate; } + + void setPredicate(bool val) { predicate = val; } + + bool readMemAccPredicate() const { return memAccPredicate; } + + void setMemAccPredicate(bool val) { memAccPredicate = val; } + ~MinorDynInst(); }; diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh index b39bbac3f..9f6fce4cd 100644 --- a/src/cpu/minor/exec_context.hh +++ b/src/cpu/minor/exec_context.hh @@ -96,28 +96,40 @@ class ExecContext : public ::ExecContext { DPRINTF(MinorExecute, "ExecContext setting PC: %s\n", inst->pc); pcState(inst->pc); - setPredicate(true); + setPredicate(inst->readPredicate()); + setMemAccPredicate(inst->readMemAccPredicate()); thread.setIntReg(TheISA::ZeroReg, 0); #if THE_ISA == ALPHA_ISA thread.setFloatReg(TheISA::ZeroReg, 0); #endif } + ~ExecContext() + { + inst->setPredicate(readPredicate()); + inst->setMemAccPredicate(readMemAccPredicate()); + } + Fault initiateMemRead(Addr addr, unsigned int size, - Request::Flags flags) override + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { execute.getLSQ().pushRequest(inst, true /* load */, nullptr, - size, addr, flags, NULL, nullptr); + size, addr, flags, nullptr, nullptr, byteEnable); return NoFault; } Fault writeMem(uint8_t *data, unsigned int size, Addr addr, - Request::Flags flags, uint64_t *res) override + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { + assert(byteEnable.empty() || byteEnable.size() == size); execute.getLSQ().pushRequest(inst, false /* store */, data, - size, addr, flags, res, nullptr); + size, addr, flags, res, nullptr, byteEnable); return NoFault; } diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc index 47f3cbc68..527eb2bc0 100644 --- a/src/cpu/minor/execute.cc +++ b/src/cpu/minor/execute.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 ARM Limited + * Copyright (c) 2013-2014,2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -364,6 +364,8 @@ Execute::handleMemResponse(MinorDynInstPtr inst, DPRINTF(MinorMem, "Completing failed request inst: %s\n", *inst); use_context_predicate = false; + if (!context.readMemAccPredicate()) + inst->staticInst->completeAcc(nullptr, &context, inst->traceData); } else if (packet->isError()) { DPRINTF(MinorMem, "Trying to commit error response: %s\n", *inst); @@ -481,6 +483,10 @@ Execute::executeMemRefInst(MinorDynInstPtr inst, BranchData &branch, } else { /* Only set this if the instruction passed its * predicate */ + if (!context.readMemAccPredicate()) { + DPRINTF(MinorMem, "No memory access for inst: %s\n", *inst); + assert(context.readPredicate()); + } passed_predicate = context.readPredicate(); /* Set predicate in tracing */ @@ -928,7 +934,7 @@ Execute::commitInst(MinorDynInstPtr inst, bool early_memory_issue, * until it gets to the head of inFlightInsts */ inst->canEarlyIssue = false; /* Not completed as we'll come here again to pick up - * the fault when we get to the end of the FU */ + * the fault when we get to the end of the FU */ completed_inst = false; } else { DPRINTF(MinorExecute, "Fault in execute: %s\n", diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc index 6fe6c3738..1d9f17e8d 100644 --- a/src/cpu/minor/lsq.cc +++ b/src/cpu/minor/lsq.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014,2017 ARM Limited + * Copyright (c) 2013-2014,2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -49,27 +49,13 @@ #include "cpu/minor/exec_context.hh" #include "cpu/minor/execute.hh" #include "cpu/minor/pipeline.hh" +#include "cpu/utils.hh" #include "debug/Activity.hh" #include "debug/MinorMem.hh" namespace Minor { -/** Returns the offset of addr into an aligned a block of size block_size */ -static Addr -addrBlockOffset(Addr addr, unsigned int block_size) -{ - return addr & (block_size - 1); -} - -/** Returns true if the given [addr .. addr+size-1] transfer needs to be - * fragmented across a block size of block_size */ -static bool -transferNeedsBurst(Addr addr, unsigned int size, unsigned int block_size) -{ - return (addrBlockOffset(addr, block_size) + size) > block_size; -} - LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, PacketDataPtr data_, uint64_t *res_) : SenderState(), @@ -88,6 +74,13 @@ LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, request = std::make_shared<Request>(); } +void +LSQ::LSQRequest::disableMemAccess() +{ + port.cpu.threads[inst->id.threadId]->setMemAccPredicate(false); + DPRINTFS(MinorMem, (&port), "Disable mem access for inst:%s\n", *inst); +} + LSQ::AddrRangeCoverage LSQ::LSQRequest::containsAddrRangeOf( Addr req1_addr, unsigned int req1_size, @@ -256,16 +249,23 @@ LSQ::SingleDataRequest::startAddrTranslation() ThreadContext *thread = port.cpu.getContext( inst->id.threadId); - port.numAccessesInDTLB++; + const auto &byteEnable = request->getByteEnable(); + if (byteEnable.size() == 0 || + isAnyActiveElement(byteEnable.cbegin(), byteEnable.cend())) { + port.numAccessesInDTLB++; - setState(LSQ::LSQRequest::InTranslation); + setState(LSQ::LSQRequest::InTranslation); - DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n"); - /* Submit the translation request. The response will come through - * finish/markDelayed on the LSQRequest as it bears the Translation - * interface */ - thread->getDTBPtr()->translateTiming( - request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write)); + DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n"); + /* Submit the translation request. The response will come through + * finish/markDelayed on the LSQRequest as it bears the Translation + * interface */ + thread->getDTBPtr()->translateTiming( + request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write)); + } else { + disableMemAccess(); + setState(LSQ::LSQRequest::Complete); + } } void @@ -357,6 +357,8 @@ LSQ::SplitDataRequest::makeFragmentRequests() unsigned int fragment_size; Addr fragment_addr; + std::vector<bool> fragment_write_byte_en; + /* Assume that this transfer is across potentially many block snap * boundaries: * @@ -401,6 +403,9 @@ LSQ::SplitDataRequest::makeFragmentRequests() /* Just past the last address in the request */ Addr end_addr = base_addr + whole_size; + auto& byte_enable = request->getByteEnable(); + unsigned int num_disabled_fragments = 0; + for (unsigned int fragment_index = 0; fragment_index < numFragments; fragment_index++) { @@ -421,32 +426,58 @@ LSQ::SplitDataRequest::makeFragmentRequests() } RequestPtr fragment = std::make_shared<Request>(); + bool disabled_fragment = false; fragment->setContext(request->contextId()); - fragment->setVirt(0 /* asid */, - fragment_addr, fragment_size, request->getFlags(), - request->masterId(), - request->getPC()); + if (byte_enable.empty()) { + fragment->setVirt(0 /* asid */, + fragment_addr, fragment_size, request->getFlags(), + request->masterId(), + request->getPC()); + } else { + // Set up byte-enable mask for the current fragment + auto it_start = byte_enable.begin() + + (fragment_addr - base_addr); + auto it_end = byte_enable.begin() + + (fragment_addr - base_addr) + fragment_size; + if (isAnyActiveElement(it_start, it_end)) { + fragment->setVirt(0 /* asid */, + fragment_addr, fragment_size, request->getFlags(), + request->masterId(), + request->getPC()); + fragment->setByteEnable(std::vector<bool>(it_start, it_end)); + } else { + disabled_fragment = true; + } + } - DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x size: %d" - " (whole request addr: 0x%x size: %d) %s\n", - fragment_addr, fragment_size, base_addr, whole_size, - (is_last_fragment ? "last fragment" : "")); + if (!disabled_fragment) { + DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x" + " size: %d (whole request addr: 0x%x size: %d) %s\n", + fragment_addr, fragment_size, base_addr, whole_size, + (is_last_fragment ? "last fragment" : "")); - fragment_addr += fragment_size; + fragmentRequests.push_back(fragment); + } else { + num_disabled_fragments++; + } - fragmentRequests.push_back(fragment); + fragment_addr += fragment_size; } + assert(numFragments >= num_disabled_fragments); + numFragments -= num_disabled_fragments; } void LSQ::SplitDataRequest::makeFragmentPackets() { + assert(numTranslatedFragments > 0); Addr base_addr = request->getVaddr(); DPRINTFS(MinorMem, (&port), "Making packets for request: %s\n", *inst); - for (unsigned int fragment_index = 0; fragment_index < numFragments; + for (unsigned int fragment_index = 0; + fragment_index < numTranslatedFragments; fragment_index++) { RequestPtr fragment = fragmentRequests[fragment_index]; @@ -490,28 +521,32 @@ LSQ::SplitDataRequest::makeFragmentPackets() void LSQ::SplitDataRequest::startAddrTranslation() { - setState(LSQ::LSQRequest::InTranslation); - makeFragmentRequests(); - numInTranslationFragments = 0; - numTranslatedFragments = 0; + if (numFragments > 0) { + setState(LSQ::LSQRequest::InTranslation); + numInTranslationFragments = 0; + numTranslatedFragments = 0; - /* @todo, just do these in sequence for now with - * a loop of: - * do { - * sendNextFragmentToTranslation ; translateTiming ; finish - * } while (numTranslatedFragments != numFragments); - */ + /* @todo, just do these in sequence for now with + * a loop of: + * do { + * sendNextFragmentToTranslation ; translateTiming ; finish + * } while (numTranslatedFragments != numFragments); + */ - /* Do first translation */ - sendNextFragmentToTranslation(); + /* Do first translation */ + sendNextFragmentToTranslation(); + } else { + disableMemAccess(); + setState(LSQ::LSQRequest::Complete); + } } PacketPtr LSQ::SplitDataRequest::getHeadPacket() { - assert(numIssuedFragments < numFragments); + assert(numIssuedFragments < numTranslatedFragments); return fragmentPackets[numIssuedFragments]; } @@ -519,7 +554,7 @@ LSQ::SplitDataRequest::getHeadPacket() void LSQ::SplitDataRequest::stepToNextPacket() { - assert(numIssuedFragments < numFragments); + assert(numIssuedFragments < numTranslatedFragments); numIssuedFragments++; } @@ -527,14 +562,13 @@ LSQ::SplitDataRequest::stepToNextPacket() void LSQ::SplitDataRequest::retireResponse(PacketPtr response) { - assert(numRetiredFragments < numFragments); + assert(numRetiredFragments < numTranslatedFragments); DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d" - " offset: 0x%x (retired fragment num: %d) %s\n", + " offset: 0x%x (retired fragment num: %d)\n", response->req->getVaddr(), response->req->getSize(), request->getVaddr() - response->req->getVaddr(), - numRetiredFragments, - (fault == NoFault ? "" : fault->name())); + numRetiredFragments); numRetiredFragments++; @@ -573,7 +607,7 @@ LSQ::SplitDataRequest::retireResponse(PacketPtr response) packet->makeResponse(); } - if (numRetiredFragments == numFragments) + if (numRetiredFragments == numTranslatedFragments) setState(Complete); if (!skipped && isComplete()) { @@ -1477,7 +1511,8 @@ LSQ::needsToTick() void LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op) + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector<bool>& byteEnable) { bool needs_burst = transferNeedsBurst(addr, size, lineWidth); @@ -1533,6 +1568,9 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, addr, size, flags, cpu.dataMasterId(), /* I've no idea why we need the PC, but give it */ inst->pc.instAddr(), amo_op); + if (!byteEnable.empty()) { + request->request->setByteEnable(byteEnable); + } requests.push(request); request->startAddrTranslation(); diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh index 11fa8774f..23b47c53c 100644 --- a/src/cpu/minor/lsq.hh +++ b/src/cpu/minor/lsq.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 ARM Limited + * Copyright (c) 2013-2014, 2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -188,6 +188,8 @@ class LSQ : public Named /** BaseTLB::Translation interface */ void markDelayed() { } + void disableMemAccess(); + public: LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, PacketDataPtr data_ = NULL, uint64_t *res_ = NULL); @@ -441,7 +443,8 @@ class LSQ : public Named { return numIssuedFragments != numRetiredFragments; } /** Have we stepped past the end of fragmentPackets? */ - bool sentAllPackets() { return numIssuedFragments == numFragments; } + bool sentAllPackets() + { return numIssuedFragments == numTranslatedFragments; } /** For loads, paste the response data into the main * response packet */ @@ -700,7 +703,9 @@ class LSQ : public Named * the LSQ */ void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op); + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector<bool>& byteEnable = + std::vector<bool>()); /** Push a predicate failed-representing request into the queues just * to maintain commit order */ diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index c754fe8cf..db8fca20a 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -787,10 +787,13 @@ class FullO3CPU : public BaseO3CPU /** CPU pushRequest function, forwards request to LSQ. */ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op = nullptr) + uint64_t *res, AtomicOpFunctor *amo_op = nullptr, + const std::vector<bool>& byteEnable = + std::vector<bool>()) + { return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr, - flags, res, amo_op); + flags, res, amo_op, byteEnable); } /** CPU read function, forwards read to LSQ. */ diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index a6037b7f4..84f1411a5 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -50,6 +50,7 @@ #include "arch/generic/tlb.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/lsq_unit.hh" +#include "cpu/utils.hh" #include "enums/SMTQueuePolicy.hh" #include "mem/port.hh" #include "sim/sim_object.hh" @@ -251,6 +252,7 @@ class LSQ const Addr _addr; const uint32_t _size; const Request::Flags _flags; + std::vector<bool> _byteEnable; uint32_t _numOutstandingPackets; AtomicOpFunctor *_amo_op; protected: @@ -351,6 +353,28 @@ class LSQ } } + /** Helper function used to add a (sub)request, given its address + * `addr`, size `size` and byte-enable mask `byteEnable`. + * + * The request is only added if the mask is empty or if there is at + * least an active element in it. + */ + void + addRequest(Addr addr, unsigned size, + const std::vector<bool>& byteEnable) + { + if (byteEnable.empty() || + isAnyActiveElement(byteEnable.begin(), byteEnable.end())) { + auto request = std::make_shared<Request>(_inst->getASID(), + addr, size, _flags, _inst->masterId(), + _inst->instAddr(), _inst->contextId()); + if (!byteEnable.empty()) { + request->setByteEnable(byteEnable); + } + _requests.push_back(request); + } + } + /** Destructor. * The LSQRequest owns the request. If the packet has already been * sent, the sender state will be deleted upon receiving the reply. @@ -609,11 +633,17 @@ class LSQ * declaration of the names in the parent class. */ using Flag = typename LSQRequest::Flag; using State = typename LSQRequest::State; + using LSQRequest::_addr; using LSQRequest::_fault; + using LSQRequest::_flags; + using LSQRequest::_size; + using LSQRequest::_byteEnable; + using LSQRequest::_requests; using LSQRequest::_inst; using LSQRequest::_packets; using LSQRequest::_port; using LSQRequest::_res; + using LSQRequest::_taskId; using LSQRequest::_senderState; using LSQRequest::_state; using LSQRequest::flags; @@ -635,14 +665,8 @@ class LSQ uint64_t* res = nullptr, AtomicOpFunctor* amo_op = nullptr) : LSQRequest(port, inst, isLoad, addr, size, flags_, data, res, - amo_op) - { - LSQRequest::_requests.push_back( - std::make_shared<Request>(inst->getASID(), addr, size, - flags_, inst->masterId(), inst->instAddr(), - inst->contextId(), amo_op)); - LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum); - } + amo_op) {} + inline virtual ~SingleDataRequest() {} virtual void initiateTranslation(); virtual void finish(const Fault &fault, const RequestPtr &req, @@ -671,6 +695,7 @@ class LSQ using LSQRequest::_port; using LSQRequest::_requests; using LSQRequest::_res; + using LSQRequest::_byteEnable; using LSQRequest::_senderState; using LSQRequest::_size; using LSQRequest::_state; @@ -691,14 +716,14 @@ class LSQ RequestPtr mainReq; PacketPtr _mainPacket; - public: SplitDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags & flags_, PacketDataPtr data = nullptr, uint64_t* res = nullptr) : - LSQRequest(port, inst, isLoad, addr, size, flags_, data, res), + LSQRequest(port, inst, isLoad, addr, size, flags_, data, res, + nullptr), numFragments(0), numReceivedPackets(0), mainReq(nullptr), @@ -949,7 +974,8 @@ class LSQ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op); + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector<bool>& byteEnable); /** The CPU pointer. */ O3CPU *cpu; diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 732712029..70621a523 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -681,29 +681,12 @@ LSQ<Impl>::dumpInsts() const } } -static Addr -addrBlockOffset(Addr addr, unsigned int block_size) -{ - return addr & (block_size - 1); -} - -static Addr -addrBlockAlign(Addr addr, uint64_t block_size) -{ - return addr & ~(block_size - 1); -} - -static bool -transferNeedsBurst(Addr addr, uint64_t size, uint64_t block_size) -{ - return (addrBlockOffset(addr, block_size) + size) > block_size; -} - template<class Impl> Fault LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op) + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector<bool>& byteEnable) { // This comming request can be either load, store or atomic. // Atomic request has a corresponding pointer to its atomic memory @@ -735,6 +718,9 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, size, flags, data, res, amo_op); } assert(req); + if (!byteEnable.empty()) { + req->_byteEnable = byteEnable; + } inst->setRequest(); req->taskId(cpu->taskId()); @@ -756,6 +742,7 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, else inst->getFault() = cpu->write(req, data, inst->sqIdx); } else if (isLoad) { + inst->setMemAccPredicate(false); // Commit will have to clean up whatever happened. Set this // instruction as executed. inst->setExecuted(); @@ -848,14 +835,21 @@ template<class Impl> void LSQ<Impl>::SingleDataRequest::initiateTranslation() { - _inst->translationStarted(true); - setState(State::Translation); - flags.set(Flag::TranslationStarted); + assert(_requests.size() == 0); - _inst->savedReq = this; - sendFragmentToTranslation(0); + this->addRequest(_addr, _size, _byteEnable); - if (isTranslationComplete()) { + if (_requests.size() > 0) { + _requests.back()->setReqInstSeqNum(_inst->seqNum); + _requests.back()->taskId(_taskId); + _inst->translationStarted(true); + setState(State::Translation); + flags.set(Flag::TranslationStarted); + + _inst->savedReq = this; + sendFragmentToTranslation(0); + } else { + _inst->setMemAccPredicate(false); } } @@ -877,11 +871,7 @@ template<class Impl> void LSQ<Impl>::SplitDataRequest::initiateTranslation() { - _inst->translationStarted(true); - setState(State::Translation); - flags.set(Flag::TranslationStarted); - - unsigned int cacheLineSize = _port.cacheLineSize(); + auto cacheLineSize = _port.cacheLineSize(); Addr base_addr = _addr; Addr next_addr = addrBlockAlign(_addr + cacheLineSize, cacheLineSize); Addr final_addr = addrBlockAlign(_addr + _size, cacheLineSize); @@ -890,6 +880,9 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation() mainReq = std::make_shared<Request>(_inst->getASID(), base_addr, _size, _flags, _inst->masterId(), _inst->instAddr(), _inst->contextId()); + if (!_byteEnable.empty()) { + mainReq->setByteEnable(_byteEnable); + } // Paddr is not used in mainReq. However, we will accumulate the flags // from the sub requests into mainReq by calling setFlags() in finish(). @@ -898,39 +891,63 @@ LSQ<Impl>::SplitDataRequest::initiateTranslation() mainReq->setPaddr(0); /* Get the pre-fix, possibly unaligned. */ - _requests.push_back(std::make_shared<Request>(_inst->getASID(), base_addr, - next_addr - base_addr, _flags, _inst->masterId(), - _inst->instAddr(), _inst->contextId())); + if (_byteEnable.empty()) { + this->addRequest(base_addr, next_addr - base_addr, _byteEnable); + } else { + auto it_start = _byteEnable.begin(); + auto it_end = _byteEnable.begin() + (next_addr - base_addr); + this->addRequest(base_addr, next_addr - base_addr, + std::vector<bool>(it_start, it_end)); + } size_so_far = next_addr - base_addr; /* We are block aligned now, reading whole blocks. */ base_addr = next_addr; while (base_addr != final_addr) { - _requests.push_back(std::make_shared<Request>(_inst->getASID(), - base_addr, cacheLineSize, _flags, _inst->masterId(), - _inst->instAddr(), _inst->contextId())); + if (_byteEnable.empty()) { + this->addRequest(base_addr, cacheLineSize, _byteEnable); + } else { + auto it_start = _byteEnable.begin() + size_so_far; + auto it_end = _byteEnable.begin() + size_so_far + cacheLineSize; + this->addRequest(base_addr, cacheLineSize, + std::vector<bool>(it_start, it_end)); + } size_so_far += cacheLineSize; base_addr += cacheLineSize; } /* Deal with the tail. */ if (size_so_far < _size) { - _requests.push_back(std::make_shared<Request>(_inst->getASID(), - base_addr, _size - size_so_far, _flags, _inst->masterId(), - _inst->instAddr(), _inst->contextId())); + if (_byteEnable.empty()) { + this->addRequest(base_addr, _size - size_so_far, _byteEnable); + } else { + auto it_start = _byteEnable.begin() + size_so_far; + auto it_end = _byteEnable.end(); + this->addRequest(base_addr, _size - size_so_far, + std::vector<bool>(it_start, it_end)); + } } - /* Setup the requests and send them to translation. */ - for (auto& r: _requests) { - r->setReqInstSeqNum(_inst->seqNum); - r->taskId(_taskId); - } - this->_inst->savedReq = this; - numInTranslationFragments = 0; - numTranslatedFragments = 0; + if (_requests.size() > 0) { + /* Setup the requests and send them to translation. */ + for (auto& r: _requests) { + r->setReqInstSeqNum(_inst->seqNum); + r->taskId(_taskId); + } - for (uint32_t i = 0; i < _requests.size(); i++) { - sendFragmentToTranslation(i); + _inst->translationStarted(true); + setState(State::Translation); + flags.set(Flag::TranslationStarted); + this->_inst->savedReq = this; + numInTranslationFragments = 0; + numTranslatedFragments = 0; + _fault.resize(_requests.size()); + + for (uint32_t i = 0; i < _requests.size(); i++) { + sendFragmentToTranslation(i); + } + } else { + _inst->setMemAccPredicate(false); } } @@ -968,8 +985,6 @@ LSQ<Impl>::SplitDataRequest::recvTimingResp(PacketPtr pkt) while (pktIdx < _packets.size() && pkt != _packets[pktIdx]) pktIdx++; assert(pktIdx < _packets.size()); - assert(pkt->req == _requests[pktIdx]); - assert(pkt == _packets[pktIdx]); numReceivedPackets++; state->outstanding--; if (numReceivedPackets == _packets.size()) { @@ -1012,16 +1027,19 @@ void LSQ<Impl>::SplitDataRequest::buildPackets() { /* Extra data?? */ - ptrdiff_t offset = 0; + Addr base_address = _addr; + if (_packets.size() == 0) { /* New stuff */ if (isLoad()) { _mainPacket = Packet::createRead(mainReq); _mainPacket->dataStatic(_inst->memData); } - for (auto& r: _requests) { + for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) { + RequestPtr r = _requests[i]; PacketPtr pkt = isLoad() ? Packet::createRead(r) - : Packet::createWrite(r); + : Packet::createWrite(r); + ptrdiff_t offset = r->getVaddr() - base_address; if (isLoad()) { pkt->dataStatic(_inst->memData + offset); } else { @@ -1031,12 +1049,11 @@ LSQ<Impl>::SplitDataRequest::buildPackets() r->getSize()); pkt->dataDynamic(req_data); } - offset += r->getSize(); pkt->senderState = _senderState; _packets.push_back(pkt); } } - assert(_packets.size() == _requests.size()); + assert(_packets.size() > 0); } template<class Impl> diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 9323e8634..21bed99fa 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -542,8 +542,7 @@ LSQUnit<Impl>::executeLoad(const DynInstPtr &inst) load_fault = inst->initiateAcc(); - if (!inst->readMemAccPredicate()) { - assert(load_fault == NoFault); + if (load_fault == NoFault && !inst->readMemAccPredicate()) { assert(inst->readPredicate()); inst->setExecuted(); inst->completeAcc(nullptr); diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc index caf2427ef..c5b024532 100644 --- a/src/cpu/simple/atomic.cc +++ b/src/cpu/simple/atomic.cc @@ -49,6 +49,7 @@ #include "base/output.hh" #include "config/the_isa.hh" #include "cpu/exetrace.hh" +#include "cpu/utils.hh" #include "debug/Drain.hh" #include "debug/ExecFaulting.hh" #include "debug/SimpleCPU.hh" @@ -333,9 +334,43 @@ AtomicSimpleCPU::AtomicCPUDPort::recvFunctionalSnoop(PacketPtr pkt) } } +bool +AtomicSimpleCPU::genMemFragmentRequest(const RequestPtr& req, Addr frag_addr, + int size, Request::Flags flags, + const std::vector<bool>& byte_enable, + int& frag_size, int& size_left) const +{ + bool predicate = true; + Addr inst_addr = threadInfo[curThread]->thread->pcState().instAddr(); + + frag_size = std::min( + cacheLineSize() - addrBlockOffset(frag_addr, cacheLineSize()), + (Addr) size_left); + size_left -= frag_size; + + if (!byte_enable.empty()) { + // Set up byte-enable mask for the current fragment + auto it_start = byte_enable.begin() + (size - (frag_size + size_left)); + auto it_end = byte_enable.begin() + (size - size_left); + if (isAnyActiveElement(it_start, it_end)) { + req->setVirt(0, frag_addr, frag_size, flags, dataMasterId(), + inst_addr); + req->setByteEnable(std::vector<bool>(it_start, it_end)); + } else { + predicate = false; + } + } else { + req->setVirt(0, frag_addr, frag_size, flags, dataMasterId(), + inst_addr); + } + + return predicate; +} + Fault AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable) { SimpleExecContext& t_info = *threadInfo[curThread]; SimpleThread* thread = t_info.thread; @@ -346,28 +381,29 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size, if (traceData) traceData->setMem(addr, size, flags); - //The size of the data we're trying to read. - int fullSize = size; - - //The address of the second part of this access if it needs to be split - //across a cache line boundary. - Addr secondAddr = roundDown(addr + size - 1, cacheLineSize()); - - if (secondAddr > addr) - size = secondAddr - addr; - dcache_latency = 0; req->taskId(taskId()); + + Addr frag_addr = addr; + int frag_size = 0; + int size_left = size; + bool predicate; + Fault fault = NoFault; + while (1) { - req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr()); + predicate = genMemFragmentRequest(req, frag_addr, size, flags, + byteEnable, frag_size, size_left); // translate to physical address - Fault fault = thread->dtb->translateAtomic(req, thread->getTC(), - BaseTLB::Read); + if (predicate) { + fault = thread->dtb->translateAtomic(req, thread->getTC(), + BaseTLB::Read); + } // Now do the access. - if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) { + if (predicate && fault == NoFault && + !req->getFlags().isSet(Request::NO_ACCESS)) { Packet pkt(req, Packet::makeReadCmd(req)); pkt.dataStatic(data); @@ -394,33 +430,29 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size, } } - //If we don't need to access a second cache line, stop now. - if (secondAddr <= addr) - { + // If we don't need to access further cache lines, stop now. + if (size_left == 0) { if (req->isLockedRMW() && fault == NoFault) { assert(!locked); locked = true; } - return fault; } /* - * Set up for accessing the second cache line. + * Set up for accessing the next cache line. */ + frag_addr += frag_size; //Move the pointer we're reading into to the correct location. - data += size; - //Adjust the size to get the remaining bytes. - size = addr + fullSize - secondAddr; - //And access the right address. - addr = secondAddr; + data += frag_size; } } Fault AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, - Request::Flags flags, uint64_t *res) + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable) { SimpleExecContext& t_info = *threadInfo[curThread]; SimpleThread* thread = t_info.thread; @@ -439,32 +471,37 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, if (traceData) traceData->setMem(addr, size, flags); - //The size of the data we're trying to read. - int fullSize = size; - - //The address of the second part of this access if it needs to be split - //across a cache line boundary. - Addr secondAddr = roundDown(addr + size - 1, cacheLineSize()); - - if (secondAddr > addr) - size = secondAddr - addr; - dcache_latency = 0; req->taskId(taskId()); + + Addr frag_addr = addr; + int frag_size = 0; + int size_left = size; + int curr_frag_id = 0; + bool predicate; + Fault fault = NoFault; + while (1) { - req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr()); + predicate = genMemFragmentRequest(req, frag_addr, size, flags, + byteEnable, frag_size, size_left); // translate to physical address - Fault fault = thread->dtb->translateAtomic(req, thread->getTC(), BaseTLB::Write); + if (predicate) + fault = thread->dtb->translateAtomic(req, thread->getTC(), + BaseTLB::Write); // Now do the access. - if (fault == NoFault) { + if (predicate && fault == NoFault) { bool do_access = true; // flag to suppress cache access if (req->isLLSC()) { - do_access = TheISA::handleLockedWrite(thread, req, dcachePort.cacheBlockMask); + assert(curr_frag_id == 0); + do_access = + TheISA::handleLockedWrite(thread, req, + dcachePort.cacheBlockMask); } else if (req->isSwap()) { + assert(curr_frag_id == 0); if (req->isCondSwap()) { assert(res); req->setExtraData(*res); @@ -488,8 +525,8 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, assert(!pkt.isError()); if (req->isSwap()) { - assert(res); - memcpy(res, pkt.getConstPtr<uint8_t>(), fullSize); + assert(res && curr_frag_id == 0); + memcpy(res, pkt.getConstPtr<uint8_t>(), size); } } @@ -500,14 +537,14 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, //If there's a fault or we don't need to access a second cache line, //stop now. - if (fault != NoFault || secondAddr <= addr) + if (fault != NoFault || size_left == 0) { if (req->isLockedRMW() && fault == NoFault) { - assert(locked); + assert(byteEnable.empty()); + assert(locked && curr_frag_id == 0); locked = false; } - if (fault != NoFault && req->isPrefetch()) { return NoFault; } else { @@ -516,15 +553,14 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, } /* - * Set up for accessing the second cache line. + * Set up for accessing the next cache line. */ + frag_addr += frag_size; //Move the pointer we're reading into to the correct location. - data += size; - //Adjust the size to get the remaining bytes. - size = addr + fullSize - secondAddr; - //And access the right address. - addr = secondAddr; + data += frag_size; + + curr_frag_id++; } } diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh index 84f379121..100306981 100644 --- a/src/cpu/simple/atomic.hh +++ b/src/cpu/simple/atomic.hh @@ -195,11 +195,36 @@ class AtomicSimpleCPU : public BaseSimpleCPU void activateContext(ThreadID thread_num) override; void suspendContext(ThreadID thread_num) override; + /** + * Helper function used to set up the request for a single fragment of a + * memory access. + * + * Takes care of setting up the appropriate byte-enable mask for the + * fragment, given the mask for the entire memory access. + * + * @param req Pointer to the Request object to populate. + * @param frag_addr Start address of the fragment. + * @param size Total size of the memory access in bytes. + * @param flags Request flags. + * @param byte_enable Byte-enable mask for the entire memory access. + * @param[out] frag_size Fragment size. + * @param[in,out] size_left Size left to be processed in the memory access. + * @return True if the byte-enable mask for the fragment is not all-false. + */ + bool genMemFragmentRequest(const RequestPtr& req, Addr frag_addr, + int size, Request::Flags flags, + const std::vector<bool>& byte_enable, + int& frag_size, int& size_left) const; + Fault readMem(Addr addr, uint8_t *data, unsigned size, - Request::Flags flags) override; + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override; Fault writeMem(uint8_t *data, unsigned size, - Addr addr, Request::Flags flags, uint64_t *res) override; + Addr addr, Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override; Fault amoMem(Addr addr, uint8_t* data, unsigned size, Request::Flags flags, AtomicOpFunctor *amo_op) override; diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index 298ba9f9e..816add707 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012, 2015, 2017 ARM Limited + * Copyright (c) 2010-2012, 2015, 2017, 2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -494,6 +494,10 @@ BaseSimpleCPU::preExecute() thread->setFloatReg(ZeroReg, 0); #endif // ALPHA_ISA + // resets predicates + t_info.setPredicate(true); + t_info.setMemAccPredicate(true); + // check for instruction-count-based events comInstEventQueue[curThread]->serviceEvents(t_info.numInst); system->instEventQueue.serviceEvents(system->totalNumInsts); diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh index 8060b07ad..5404e5df8 100644 --- a/src/cpu/simple/base.hh +++ b/src/cpu/simple/base.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012,2015 ARM Limited + * Copyright (c) 2011-2012,2015,2018 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -143,15 +143,21 @@ class BaseSimpleCPU : public BaseCPU void startup() override; virtual Fault readMem(Addr addr, uint8_t* data, unsigned size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable = + std::vector<bool>()) { panic("readMem() is not implemented\n"); } virtual Fault initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable = + std::vector<bool>()) { panic("initiateMemRead() is not implemented\n"); } virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr, - Request::Flags flags, uint64_t* res) + Request::Flags flags, uint64_t* res, + const std::vector<bool>& byteEnable = + std::vector<bool>()) { panic("writeMem() is not implemented\n"); } virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size, diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh index be7a863c5..de98d6efd 100644 --- a/src/cpu/simple/exec_context.hh +++ b/src/cpu/simple/exec_context.hh @@ -434,26 +434,32 @@ class SimpleExecContext : public ExecContext { thread->pcState(val); } - Fault readMem(Addr addr, uint8_t *data, unsigned int size, - Request::Flags flags) override + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { - return cpu->readMem(addr, data, size, flags); + return cpu->readMem(addr, data, size, flags, byteEnable); } Fault initiateMemRead(Addr addr, unsigned int size, - Request::Flags flags) override + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { - return cpu->initiateMemRead(addr, size, flags); + return cpu->initiateMemRead(addr, size, flags, byteEnable); } Fault writeMem(uint8_t *data, unsigned int size, Addr addr, - Request::Flags flags, uint64_t *res) override + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { - return cpu->writeMem(data, size, addr, flags, res); + assert(byteEnable.empty() || byteEnable.size() == size); + return cpu->writeMem(data, size, addr, flags, res, byteEnable); } Fault amoMem(Addr addr, uint8_t *data, unsigned int size, diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc index 637308a96..454259099 100644 --- a/src/cpu/simple/timing.cc +++ b/src/cpu/simple/timing.cc @@ -1,6 +1,6 @@ /* * Copyright 2014 Google, Inc. - * Copyright (c) 2010-2013,2015,2017 ARM Limited + * Copyright (c) 2010-2013,2015,2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -417,7 +417,8 @@ TimingSimpleCPU::buildSplitPacket(PacketPtr &pkt1, PacketPtr &pkt2, Fault TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) + Request::Flags flags, + const std::vector<bool>& byteEnable) { SimpleExecContext &t_info = *threadInfo[curThread]; SimpleThread* thread = t_info.thread; @@ -434,6 +435,9 @@ TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size, RequestPtr req = std::make_shared<Request>( asid, addr, size, flags, dataMasterId(), pc, thread->contextId()); + if (!byteEnable.empty()) { + req->setByteEnable(byteEnable); + } req->taskId(taskId()); @@ -491,7 +495,8 @@ TimingSimpleCPU::handleWritePacket() Fault TimingSimpleCPU::writeMem(uint8_t *data, unsigned size, - Addr addr, Request::Flags flags, uint64_t *res) + Addr addr, Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable) { SimpleExecContext &t_info = *threadInfo[curThread]; SimpleThread* thread = t_info.thread; @@ -516,6 +521,9 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size, RequestPtr req = std::make_shared<Request>( asid, addr, size, flags, dataMasterId(), pc, thread->contextId()); + if (!byteEnable.empty()) { + req->setByteEnable(byteEnable); + } req->taskId(taskId()); @@ -523,6 +531,10 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size, assert(split_addr <= addr || split_addr - addr < block_size); _status = DTBWaitResponse; + + // TODO: TimingSimpleCPU doesn't support arbitrarily long multi-line mem. + // accesses yet + if (split_addr > addr) { RequestPtr req1, req2; assert(!req->isLLSC() && !req->isSwap()); diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh index ce0a4dbfc..a49822fc1 100644 --- a/src/cpu/simple/timing.hh +++ b/src/cpu/simple/timing.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013,2015 ARM Limited + * Copyright (c) 2012-2013,2015,2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -283,10 +283,14 @@ class TimingSimpleCPU : public BaseSimpleCPU void suspendContext(ThreadID thread_num) override; Fault initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) override; + Request::Flags flags, + const std::vector<bool>& byteEnable =std::vector<bool>()) + override; Fault writeMem(uint8_t *data, unsigned size, - Addr addr, Request::Flags flags, uint64_t *res) override; + Addr addr, Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override; Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags, AtomicOpFunctor *amo_op) override; diff --git a/src/cpu/simple_thread.cc b/src/cpu/simple_thread.cc index 9067e877b..0936e41ad 100644 --- a/src/cpu/simple_thread.cc +++ b/src/cpu/simple_thread.cc @@ -77,7 +77,7 @@ SimpleThread::SimpleThread(BaseCPU *_cpu, int _thread_num, System *_sys, Process *_process, BaseTLB *_itb, BaseTLB *_dtb, TheISA::ISA *_isa) : ThreadState(_cpu, _thread_num, _process), isa(_isa), - predicate(false), system(_sys), + predicate(true), memAccPredicate(true), system(_sys), itb(_itb), dtb(_dtb), decoder(TheISA::Decoder(_isa)) { clearArchRegs(); @@ -87,8 +87,9 @@ SimpleThread::SimpleThread(BaseCPU *_cpu, int _thread_num, System *_sys, SimpleThread::SimpleThread(BaseCPU *_cpu, int _thread_num, System *_sys, BaseTLB *_itb, BaseTLB *_dtb, TheISA::ISA *_isa, bool use_kernel_stats) - : ThreadState(_cpu, _thread_num, NULL), isa(_isa), system(_sys), itb(_itb), - dtb(_dtb), decoder(TheISA::Decoder(_isa)) + : ThreadState(_cpu, _thread_num, NULL), isa(_isa), + predicate(true), memAccPredicate(true), system(_sys), + itb(_itb), dtb(_dtb), decoder(TheISA::Decoder(_isa)) { quiesceEvent = new EndQuiesceEvent(this); diff --git a/src/cpu/utils.hh b/src/cpu/utils.hh new file mode 100644 index 000000000..4c1318174 --- /dev/null +++ b/src/cpu/utils.hh @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017-2018 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Andrew Bardsley + */ + +#ifndef __CPU_UTILS_HH__ +#define __CPU_UTILS_HH__ + +#include "base/types.hh" + +/** + * Calculates the offset of a given address wrt aligned fixed-size blocks. + * @param addr Input address. + * @param block_size Block size in bytes. + * @return Offset of the given address in bytes. + */ +inline Addr +addrBlockOffset(Addr addr, Addr block_size) +{ + return addr & (block_size - 1); +} + +/** + * Returns the address of the closest aligned fixed-size block to the given + * address. + * @param addr Input address. + * @param block_size Block size in bytes. + * @return Address of the closest aligned block. + */ +inline Addr +addrBlockAlign(Addr addr, Addr block_size) +{ + return addr & ~(block_size - 1); +} + +/** + * Returns true if the given memory access (address, size) needs to be + * fragmented across aligned fixed-size blocks. + * @param addr Address of the memory access. + * @param size Size of the memory access. + * @param block_size Block size in bytes. + * @return True if the memory access needs to be fragmented. + */ +inline bool +transferNeedsBurst(Addr addr, unsigned int size, unsigned int block_size) +{ + return (addrBlockOffset(addr, block_size) + size) > block_size; +} + +/** + * Test if there is any active element in an enablement range. + */ +inline bool +isAnyActiveElement(const std::vector<bool>::const_iterator& it_start, + const std::vector<bool>::const_iterator& it_end) +{ + auto it_tmp = it_start; + for (;it_tmp != it_end && !(*it_tmp); ++it_tmp); + return (it_tmp != it_end); +} + +#endif // __CPU_UTILS_HH__ diff --git a/src/mem/abstract_mem.cc b/src/mem/abstract_mem.cc index f7b02ce17..a998530fd 100644 --- a/src/mem/abstract_mem.cc +++ b/src/mem/abstract_mem.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012,2017 ARM Limited + * Copyright (c) 2010-2012,2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall diff --git a/src/mem/cache/cache.cc b/src/mem/cache/cache.cc index 494a998a5..b72ff4261 100644 --- a/src/mem/cache/cache.cc +++ b/src/mem/cache/cache.cc @@ -297,7 +297,8 @@ Cache::promoteWholeLineWrites(PacketPtr pkt) { // Cache line clearing instructions if (doFastWrites && (pkt->cmd == MemCmd::WriteReq) && - (pkt->getSize() == blkSize) && (pkt->getOffset(blkSize) == 0)) { + (pkt->getSize() == blkSize) && (pkt->getOffset(blkSize) == 0) && + !pkt->isMaskedWrite()) { pkt->cmd = MemCmd::WriteLineReq; DPRINTF(Cache, "packet promoted from Write to WriteLineReq\n"); } diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 93b3ad5de..130cc41ad 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1092,6 +1092,7 @@ class Packet : public Printable getPtr() { assert(flags.isSet(STATIC_DATA|DYNAMIC_DATA)); + assert(!isMaskedWrite()); return (T*)data; } @@ -1180,10 +1181,11 @@ class Packet : public Printable // same pointer from source to destination and back assert(p != getPtr<uint8_t>() || flags.isSet(STATIC_DATA)); - if (p != getPtr<uint8_t>()) + if (p != getPtr<uint8_t>()) { // for packet with allocated dynamic data, we copy data from // one to the other, e.g. a forwarded response to a response std::memcpy(getPtr<uint8_t>(), p, getSize()); + } } /** @@ -1203,7 +1205,19 @@ class Packet : public Printable void writeData(uint8_t *p) const { - std::memcpy(p, getConstPtr<uint8_t>(), getSize()); + if (!isMaskedWrite()) { + std::memcpy(p, getConstPtr<uint8_t>(), getSize()); + } else { + assert(req->getByteEnable().size() == getSize()); + // Write only the enabled bytes + const uint8_t *base = getConstPtr<uint8_t>(); + for (int i = 0; i < getSize(); i++) { + if (req->getByteEnable()[i]) { + p[i] = *(base + i); + } + // Disabled bytes stay untouched + } + } } /** @@ -1268,6 +1282,17 @@ class Packet : public Printable bool trySatisfyFunctional(PacketPtr other) { + if (other->isMaskedWrite()) { + // Do not forward data if overlapping with a masked write + if (_isSecure == other->isSecure() && + getAddr() <= (other->getAddr() + other->getSize() - 1) && + other->getAddr() <= (getAddr() + getSize() - 1)) { + warn("Trying to check against a masked write, skipping." + " (addr: 0x%x, other addr: 0x%x)", getAddr(), + other->getAddr()); + } + return false; + } // all packets that are carrying a payload should have a valid // data pointer return trySatisfyFunctional(other, other->getAddr(), other->isSecure(), @@ -1296,6 +1321,12 @@ class Packet : public Printable return cmd == MemCmd::CleanEvict || cmd == MemCmd::WritebackClean; } + bool + isMaskedWrite() const + { + return (cmd == MemCmd::WriteReq && !req->getByteEnable().empty()); + } + /** * Check a functional request against a memory value represented * by a base/size pair and an associated data array. If the diff --git a/src/mem/request.hh b/src/mem/request.hh index 2a53c21a4..324ae382e 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -320,6 +320,9 @@ class Request */ unsigned _size; + /** Byte-enable mask for writes. */ + std::vector<bool> _byteEnable; + /** The requestor ID which is unique in the system for all ports * that are capable of issuing a transaction */ @@ -567,6 +570,9 @@ class Request * Generate two requests as if this request had been split into two * pieces. The original request can't have been translated already. */ + // TODO: this function is still required by TimingSimpleCPU - should be + // removed once TimingSimpleCPU will support arbitrarily long multi-line + // mem. accesses void splitOnVaddr(Addr split_addr, RequestPtr &req1, RequestPtr &req2) { assert(privateFlags.isSet(VALID_VADDR)); @@ -577,6 +583,14 @@ class Request req1->_size = split_addr - _vaddr; req2->_vaddr = split_addr; req2->_size = _size - req1->_size; + if (!_byteEnable.empty()) { + req1->_byteEnable = std::vector<bool>( + _byteEnable.begin(), + _byteEnable.begin() + req1->_size); + req2->_byteEnable = std::vector<bool>( + _byteEnable.begin() + req1->_size, + _byteEnable.end()); + } } /** @@ -628,6 +642,19 @@ class Request return _size; } + const std::vector<bool>& + getByteEnable() const + { + return _byteEnable; + } + + void + setByteEnable(const std::vector<bool>& be) + { + assert(be.empty() || be.size() == _size); + _byteEnable = be; + } + /** Accessor for time. */ Tick time() const |