diff options
author | Giacomo Gabrielli <giacomo.gabrielli@arm.com> | 2017-07-07 14:13:11 +0100 |
---|---|---|
committer | Giacomo Gabrielli <giacomo.gabrielli@arm.com> | 2019-05-11 12:48:58 +0000 |
commit | c58cb8c9dbeef377da180f1fdaaa1c0eadf85550 (patch) | |
tree | 7591abeb888d8c8e645332749bcaea627628f9bf /src/cpu/minor | |
parent | d0e4cdc9c36466a3dbef8c9f9f509cce8f1a6c34 (diff) | |
download | gem5-c58cb8c9dbeef377da180f1fdaaa1c0eadf85550.tar.xz |
cpu,mem: Add support for partial loads/stores and wide mem. accesses
This changeset adds support for partial (or masked) loads/stores, i.e.
loads/stores that can disable accesses to individual bytes within the
target address range. In addition, this changeset extends the code to
crack memory accesses across most CPU models (TimingSimpleCPU still
TBD), so that arbitrarily wide memory accesses are supported. These
changes are required for supporting ISAs with wide vectors.
Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>
- Tiago Muck <tiago.muck@arm.com>
Change-Id: Ibad33541c258ad72925c0b1d5abc3e5e8bf92d92
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13518
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Diffstat (limited to 'src/cpu/minor')
-rw-r--r-- | src/cpu/minor/dyn_inst.hh | 17 | ||||
-rw-r--r-- | src/cpu/minor/exec_context.hh | 22 | ||||
-rw-r--r-- | src/cpu/minor/execute.cc | 10 | ||||
-rw-r--r-- | src/cpu/minor/lsq.cc | 148 | ||||
-rw-r--r-- | src/cpu/minor/lsq.hh | 11 |
5 files changed, 142 insertions, 66 deletions
diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh index b2decb39b..0a8ff8acf 100644 --- a/src/cpu/minor/dyn_inst.hh +++ b/src/cpu/minor/dyn_inst.hh @@ -202,6 +202,13 @@ class MinorDynInst : public RefCounted * to allow other instructions to fill the fetch delay */ bool canEarlyIssue; + /** Flag controlling conditional execution of the instruction */ + bool predicate; + + /** Flag controlling conditional execution of the memory access associated + * with the instruction (only meaningful for loads/stores) */ + bool memAccPredicate; + /** execSeqNum of the latest inst on which this inst depends. * This can be used as a sanity check for dependency ordering * where slightly out of order execution is required (notably @@ -227,7 +234,7 @@ class MinorDynInst : public RefCounted pc(TheISA::PCState(0)), fault(fault_), triedToPredict(false), predictedTaken(false), fuIndex(0), inLSQ(false), inStoreBuffer(false), - canEarlyIssue(false), + canEarlyIssue(false), predicate(true), memAccPredicate(true), instToWaitFor(0), extraCommitDelay(Cycles(0)), extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0)) { } @@ -266,6 +273,14 @@ class MinorDynInst : public RefCounted /** ReportIF interface */ void reportData(std::ostream &os) const; + bool readPredicate() const { return predicate; } + + void setPredicate(bool val) { predicate = val; } + + bool readMemAccPredicate() const { return memAccPredicate; } + + void setMemAccPredicate(bool val) { memAccPredicate = val; } + ~MinorDynInst(); }; diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh index b39bbac3f..9f6fce4cd 100644 --- a/src/cpu/minor/exec_context.hh +++ b/src/cpu/minor/exec_context.hh @@ -96,28 +96,40 @@ class ExecContext : public ::ExecContext { DPRINTF(MinorExecute, "ExecContext setting PC: %s\n", inst->pc); pcState(inst->pc); - setPredicate(true); + setPredicate(inst->readPredicate()); + setMemAccPredicate(inst->readMemAccPredicate()); thread.setIntReg(TheISA::ZeroReg, 0); #if THE_ISA == ALPHA_ISA thread.setFloatReg(TheISA::ZeroReg, 0); #endif } + ~ExecContext() + { + inst->setPredicate(readPredicate()); + inst->setMemAccPredicate(readMemAccPredicate()); + } + Fault initiateMemRead(Addr addr, unsigned int size, - Request::Flags flags) override + Request::Flags flags, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { execute.getLSQ().pushRequest(inst, true /* load */, nullptr, - size, addr, flags, NULL, nullptr); + size, addr, flags, nullptr, nullptr, byteEnable); return NoFault; } Fault writeMem(uint8_t *data, unsigned int size, Addr addr, - Request::Flags flags, uint64_t *res) override + Request::Flags flags, uint64_t *res, + const std::vector<bool>& byteEnable = std::vector<bool>()) + override { + assert(byteEnable.empty() || byteEnable.size() == size); execute.getLSQ().pushRequest(inst, false /* store */, data, - size, addr, flags, res, nullptr); + size, addr, flags, res, nullptr, byteEnable); return NoFault; } diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc index 47f3cbc68..527eb2bc0 100644 --- a/src/cpu/minor/execute.cc +++ b/src/cpu/minor/execute.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 ARM Limited + * Copyright (c) 2013-2014,2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -364,6 +364,8 @@ Execute::handleMemResponse(MinorDynInstPtr inst, DPRINTF(MinorMem, "Completing failed request inst: %s\n", *inst); use_context_predicate = false; + if (!context.readMemAccPredicate()) + inst->staticInst->completeAcc(nullptr, &context, inst->traceData); } else if (packet->isError()) { DPRINTF(MinorMem, "Trying to commit error response: %s\n", *inst); @@ -481,6 +483,10 @@ Execute::executeMemRefInst(MinorDynInstPtr inst, BranchData &branch, } else { /* Only set this if the instruction passed its * predicate */ + if (!context.readMemAccPredicate()) { + DPRINTF(MinorMem, "No memory access for inst: %s\n", *inst); + assert(context.readPredicate()); + } passed_predicate = context.readPredicate(); /* Set predicate in tracing */ @@ -928,7 +934,7 @@ Execute::commitInst(MinorDynInstPtr inst, bool early_memory_issue, * until it gets to the head of inFlightInsts */ inst->canEarlyIssue = false; /* Not completed as we'll come here again to pick up - * the fault when we get to the end of the FU */ + * the fault when we get to the end of the FU */ completed_inst = false; } else { DPRINTF(MinorExecute, "Fault in execute: %s\n", diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc index 6fe6c3738..1d9f17e8d 100644 --- a/src/cpu/minor/lsq.cc +++ b/src/cpu/minor/lsq.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014,2017 ARM Limited + * Copyright (c) 2013-2014,2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -49,27 +49,13 @@ #include "cpu/minor/exec_context.hh" #include "cpu/minor/execute.hh" #include "cpu/minor/pipeline.hh" +#include "cpu/utils.hh" #include "debug/Activity.hh" #include "debug/MinorMem.hh" namespace Minor { -/** Returns the offset of addr into an aligned a block of size block_size */ -static Addr -addrBlockOffset(Addr addr, unsigned int block_size) -{ - return addr & (block_size - 1); -} - -/** Returns true if the given [addr .. addr+size-1] transfer needs to be - * fragmented across a block size of block_size */ -static bool -transferNeedsBurst(Addr addr, unsigned int size, unsigned int block_size) -{ - return (addrBlockOffset(addr, block_size) + size) > block_size; -} - LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, PacketDataPtr data_, uint64_t *res_) : SenderState(), @@ -88,6 +74,13 @@ LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, request = std::make_shared<Request>(); } +void +LSQ::LSQRequest::disableMemAccess() +{ + port.cpu.threads[inst->id.threadId]->setMemAccPredicate(false); + DPRINTFS(MinorMem, (&port), "Disable mem access for inst:%s\n", *inst); +} + LSQ::AddrRangeCoverage LSQ::LSQRequest::containsAddrRangeOf( Addr req1_addr, unsigned int req1_size, @@ -256,16 +249,23 @@ LSQ::SingleDataRequest::startAddrTranslation() ThreadContext *thread = port.cpu.getContext( inst->id.threadId); - port.numAccessesInDTLB++; + const auto &byteEnable = request->getByteEnable(); + if (byteEnable.size() == 0 || + isAnyActiveElement(byteEnable.cbegin(), byteEnable.cend())) { + port.numAccessesInDTLB++; - setState(LSQ::LSQRequest::InTranslation); + setState(LSQ::LSQRequest::InTranslation); - DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n"); - /* Submit the translation request. The response will come through - * finish/markDelayed on the LSQRequest as it bears the Translation - * interface */ - thread->getDTBPtr()->translateTiming( - request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write)); + DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n"); + /* Submit the translation request. The response will come through + * finish/markDelayed on the LSQRequest as it bears the Translation + * interface */ + thread->getDTBPtr()->translateTiming( + request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write)); + } else { + disableMemAccess(); + setState(LSQ::LSQRequest::Complete); + } } void @@ -357,6 +357,8 @@ LSQ::SplitDataRequest::makeFragmentRequests() unsigned int fragment_size; Addr fragment_addr; + std::vector<bool> fragment_write_byte_en; + /* Assume that this transfer is across potentially many block snap * boundaries: * @@ -401,6 +403,9 @@ LSQ::SplitDataRequest::makeFragmentRequests() /* Just past the last address in the request */ Addr end_addr = base_addr + whole_size; + auto& byte_enable = request->getByteEnable(); + unsigned int num_disabled_fragments = 0; + for (unsigned int fragment_index = 0; fragment_index < numFragments; fragment_index++) { @@ -421,32 +426,58 @@ LSQ::SplitDataRequest::makeFragmentRequests() } RequestPtr fragment = std::make_shared<Request>(); + bool disabled_fragment = false; fragment->setContext(request->contextId()); - fragment->setVirt(0 /* asid */, - fragment_addr, fragment_size, request->getFlags(), - request->masterId(), - request->getPC()); + if (byte_enable.empty()) { + fragment->setVirt(0 /* asid */, + fragment_addr, fragment_size, request->getFlags(), + request->masterId(), + request->getPC()); + } else { + // Set up byte-enable mask for the current fragment + auto it_start = byte_enable.begin() + + (fragment_addr - base_addr); + auto it_end = byte_enable.begin() + + (fragment_addr - base_addr) + fragment_size; + if (isAnyActiveElement(it_start, it_end)) { + fragment->setVirt(0 /* asid */, + fragment_addr, fragment_size, request->getFlags(), + request->masterId(), + request->getPC()); + fragment->setByteEnable(std::vector<bool>(it_start, it_end)); + } else { + disabled_fragment = true; + } + } - DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x size: %d" - " (whole request addr: 0x%x size: %d) %s\n", - fragment_addr, fragment_size, base_addr, whole_size, - (is_last_fragment ? "last fragment" : "")); + if (!disabled_fragment) { + DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x" + " size: %d (whole request addr: 0x%x size: %d) %s\n", + fragment_addr, fragment_size, base_addr, whole_size, + (is_last_fragment ? "last fragment" : "")); - fragment_addr += fragment_size; + fragmentRequests.push_back(fragment); + } else { + num_disabled_fragments++; + } - fragmentRequests.push_back(fragment); + fragment_addr += fragment_size; } + assert(numFragments >= num_disabled_fragments); + numFragments -= num_disabled_fragments; } void LSQ::SplitDataRequest::makeFragmentPackets() { + assert(numTranslatedFragments > 0); Addr base_addr = request->getVaddr(); DPRINTFS(MinorMem, (&port), "Making packets for request: %s\n", *inst); - for (unsigned int fragment_index = 0; fragment_index < numFragments; + for (unsigned int fragment_index = 0; + fragment_index < numTranslatedFragments; fragment_index++) { RequestPtr fragment = fragmentRequests[fragment_index]; @@ -490,28 +521,32 @@ LSQ::SplitDataRequest::makeFragmentPackets() void LSQ::SplitDataRequest::startAddrTranslation() { - setState(LSQ::LSQRequest::InTranslation); - makeFragmentRequests(); - numInTranslationFragments = 0; - numTranslatedFragments = 0; + if (numFragments > 0) { + setState(LSQ::LSQRequest::InTranslation); + numInTranslationFragments = 0; + numTranslatedFragments = 0; - /* @todo, just do these in sequence for now with - * a loop of: - * do { - * sendNextFragmentToTranslation ; translateTiming ; finish - * } while (numTranslatedFragments != numFragments); - */ + /* @todo, just do these in sequence for now with + * a loop of: + * do { + * sendNextFragmentToTranslation ; translateTiming ; finish + * } while (numTranslatedFragments != numFragments); + */ - /* Do first translation */ - sendNextFragmentToTranslation(); + /* Do first translation */ + sendNextFragmentToTranslation(); + } else { + disableMemAccess(); + setState(LSQ::LSQRequest::Complete); + } } PacketPtr LSQ::SplitDataRequest::getHeadPacket() { - assert(numIssuedFragments < numFragments); + assert(numIssuedFragments < numTranslatedFragments); return fragmentPackets[numIssuedFragments]; } @@ -519,7 +554,7 @@ LSQ::SplitDataRequest::getHeadPacket() void LSQ::SplitDataRequest::stepToNextPacket() { - assert(numIssuedFragments < numFragments); + assert(numIssuedFragments < numTranslatedFragments); numIssuedFragments++; } @@ -527,14 +562,13 @@ LSQ::SplitDataRequest::stepToNextPacket() void LSQ::SplitDataRequest::retireResponse(PacketPtr response) { - assert(numRetiredFragments < numFragments); + assert(numRetiredFragments < numTranslatedFragments); DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d" - " offset: 0x%x (retired fragment num: %d) %s\n", + " offset: 0x%x (retired fragment num: %d)\n", response->req->getVaddr(), response->req->getSize(), request->getVaddr() - response->req->getVaddr(), - numRetiredFragments, - (fault == NoFault ? "" : fault->name())); + numRetiredFragments); numRetiredFragments++; @@ -573,7 +607,7 @@ LSQ::SplitDataRequest::retireResponse(PacketPtr response) packet->makeResponse(); } - if (numRetiredFragments == numFragments) + if (numRetiredFragments == numTranslatedFragments) setState(Complete); if (!skipped && isComplete()) { @@ -1477,7 +1511,8 @@ LSQ::needsToTick() void LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op) + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector<bool>& byteEnable) { bool needs_burst = transferNeedsBurst(addr, size, lineWidth); @@ -1533,6 +1568,9 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, addr, size, flags, cpu.dataMasterId(), /* I've no idea why we need the PC, but give it */ inst->pc.instAddr(), amo_op); + if (!byteEnable.empty()) { + request->request->setByteEnable(byteEnable); + } requests.push(request); request->startAddrTranslation(); diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh index 11fa8774f..23b47c53c 100644 --- a/src/cpu/minor/lsq.hh +++ b/src/cpu/minor/lsq.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 ARM Limited + * Copyright (c) 2013-2014, 2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -188,6 +188,8 @@ class LSQ : public Named /** BaseTLB::Translation interface */ void markDelayed() { } + void disableMemAccess(); + public: LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, PacketDataPtr data_ = NULL, uint64_t *res_ = NULL); @@ -441,7 +443,8 @@ class LSQ : public Named { return numIssuedFragments != numRetiredFragments; } /** Have we stepped past the end of fragmentPackets? */ - bool sentAllPackets() { return numIssuedFragments == numFragments; } + bool sentAllPackets() + { return numIssuedFragments == numTranslatedFragments; } /** For loads, paste the response data into the main * response packet */ @@ -700,7 +703,9 @@ class LSQ : public Named * the LSQ */ void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op); + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector<bool>& byteEnable = + std::vector<bool>()); /** Push a predicate failed-representing request into the queues just * to maintain commit order */ |