cpu,mem: Add support for partial loads/stores and wide mem. accesses

This changeset adds support for partial (or masked) loads/stores, i.e. loads/stores that can disable accesses to individual bytes within the target address range. In addition, this changeset extends the code to crack memory accesses across most CPU models (TimingSimpleCPU still TBD), so that arbitrarily wide memory accesses are supported. These changes are required for supporting ISAs with wide vectors. Additional authors: - Gabor Dozsa <gabor.dozsa@arm.com> - Tiago Muck <tiago.muck@arm.com> Change-Id: Ibad33541c258ad72925c0b1d5abc3e5e8bf92d92 Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com> Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13518 Tested-by: kokoro <noreply+kokoro@google.com> Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com> Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
author: Giacomo Gabrielli <giacomo.gabrielli@arm.com> 2017-07-07 14:13:11 +0100
committer: Giacomo Gabrielli <giacomo.gabrielli@arm.com> 2019-05-11 12:48:58 +0000
commit: c58cb8c9dbeef377da180f1fdaaa1c0eadf85550 (patch)
tree: 7591abeb888d8c8e645332749bcaea627628f9bf /src/cpu/minor
parent: d0e4cdc9c36466a3dbef8c9f9f509cce8f1a6c34 (diff)
download: gem5-c58cb8c9dbeef377da180f1fdaaa1c0eadf85550.tar.xz
5 files changed, 142 insertions, 66 deletions
diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh
index b2decb39b..0a8ff8acf 100644
--- a/src/cpu/minor/dyn_inst.hh
+++ b/src/cpu/minor/dyn_inst.hh
@@ -202,6 +202,13 @@ class MinorDynInst : public RefCounted
      *  to allow other instructions to fill the fetch delay */
     bool canEarlyIssue;
 
+    /** Flag controlling conditional execution of the instruction */
+    bool predicate;
+
+    /** Flag controlling conditional execution of the memory access associated
+     *  with the instruction (only meaningful for loads/stores) */
+    bool memAccPredicate;
+
     /** execSeqNum of the latest inst on which this inst depends.
      *  This can be used as a sanity check for dependency ordering
      *  where slightly out of order execution is required (notably
@@ -227,7 +234,7 @@ class MinorDynInst : public RefCounted
         pc(TheISA::PCState(0)), fault(fault_),
         triedToPredict(false), predictedTaken(false),
         fuIndex(0), inLSQ(false), inStoreBuffer(false),
-        canEarlyIssue(false),
+        canEarlyIssue(false), predicate(true), memAccPredicate(true),
         instToWaitFor(0), extraCommitDelay(Cycles(0)),
         extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0))
     { }
@@ -266,6 +273,14 @@ class MinorDynInst : public RefCounted
     /** ReportIF interface */
     void reportData(std::ostream &os) const;
 
+    bool readPredicate() const { return predicate; }
+
+    void setPredicate(bool val) { predicate = val; }
+
+    bool readMemAccPredicate() const { return memAccPredicate; }
+
+    void setMemAccPredicate(bool val) { memAccPredicate = val; }
+
     ~MinorDynInst();
 };
 
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index b39bbac3f..9f6fce4cd 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -96,28 +96,40 @@ class ExecContext : public ::ExecContext
     {
         DPRINTF(MinorExecute, "ExecContext setting PC: %s\n", inst->pc);
         pcState(inst->pc);
-        setPredicate(true);
+        setPredicate(inst->readPredicate());
+        setMemAccPredicate(inst->readMemAccPredicate());
         thread.setIntReg(TheISA::ZeroReg, 0);
 #if THE_ISA == ALPHA_ISA
         thread.setFloatReg(TheISA::ZeroReg, 0);
 #endif
     }
 
+    ~ExecContext()
+    {
+        inst->setPredicate(readPredicate());
+        inst->setMemAccPredicate(readMemAccPredicate());
+    }
+
     Fault
     initiateMemRead(Addr addr, unsigned int size,
-                    Request::Flags flags) override
+                    Request::Flags flags,
+                    const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
         execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
-            size, addr, flags, NULL, nullptr);
+            size, addr, flags, nullptr, nullptr, byteEnable);
         return NoFault;
     }
 
     Fault
     writeMem(uint8_t *data, unsigned int size, Addr addr,
-             Request::Flags flags, uint64_t *res) override
+             Request::Flags flags, uint64_t *res,
+             const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
+        assert(byteEnable.empty() || byteEnable.size() == size);
         execute.getLSQ().pushRequest(inst, false /* store */, data,
-            size, addr, flags, res, nullptr);
+            size, addr, flags, res, nullptr, byteEnable);
         return NoFault;
     }
 
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index 47f3cbc68..527eb2bc0 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014,2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -364,6 +364,8 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
         DPRINTF(MinorMem, "Completing failed request inst: %s\n",
             *inst);
         use_context_predicate = false;
+        if (!context.readMemAccPredicate())
+            inst->staticInst->completeAcc(nullptr, &context, inst->traceData);
     } else if (packet->isError()) {
         DPRINTF(MinorMem, "Trying to commit error response: %s\n",
             *inst);
@@ -481,6 +483,10 @@ Execute::executeMemRefInst(MinorDynInstPtr inst, BranchData &branch,
         } else {
             /* Only set this if the instruction passed its
              * predicate */
+            if (!context.readMemAccPredicate()) {
+                DPRINTF(MinorMem, "No memory access for inst: %s\n", *inst);
+                assert(context.readPredicate());
+            }
             passed_predicate = context.readPredicate();
 
             /* Set predicate in tracing */
@@ -928,7 +934,7 @@ Execute::commitInst(MinorDynInstPtr inst, bool early_memory_issue,
                  *  until it gets to the head of inFlightInsts */
                 inst->canEarlyIssue = false;
                 /* Not completed as we'll come here again to pick up
-                *  the fault when we get to the end of the FU */
+                 * the fault when we get to the end of the FU */
                 completed_inst = false;
             } else {
                 DPRINTF(MinorExecute, "Fault in execute: %s\n",
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index 6fe6c3738..1d9f17e8d 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014,2017 ARM Limited
+ * Copyright (c) 2013-2014,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -49,27 +49,13 @@
 #include "cpu/minor/exec_context.hh"
 #include "cpu/minor/execute.hh"
 #include "cpu/minor/pipeline.hh"
+#include "cpu/utils.hh"
 #include "debug/Activity.hh"
 #include "debug/MinorMem.hh"
 
 namespace Minor
 {
 
-/** Returns the offset of addr into an aligned a block of size block_size */
-static Addr
-addrBlockOffset(Addr addr, unsigned int block_size)
-{
-    return addr & (block_size - 1);
-}
-
-/** Returns true if the given [addr .. addr+size-1] transfer needs to be
- *  fragmented across a block size of block_size */
-static bool
-transferNeedsBurst(Addr addr, unsigned int size, unsigned int block_size)
-{
-    return (addrBlockOffset(addr, block_size) + size) > block_size;
-}
-
 LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
     PacketDataPtr data_, uint64_t *res_) :
     SenderState(),
@@ -88,6 +74,13 @@ LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
     request = std::make_shared<Request>();
 }
 
+void
+LSQ::LSQRequest::disableMemAccess()
+{
+    port.cpu.threads[inst->id.threadId]->setMemAccPredicate(false);
+    DPRINTFS(MinorMem, (&port), "Disable mem access for inst:%s\n", *inst);
+}
+
 LSQ::AddrRangeCoverage
 LSQ::LSQRequest::containsAddrRangeOf(
     Addr req1_addr, unsigned int req1_size,
@@ -256,16 +249,23 @@ LSQ::SingleDataRequest::startAddrTranslation()
     ThreadContext *thread = port.cpu.getContext(
         inst->id.threadId);
 
-    port.numAccessesInDTLB++;
+    const auto &byteEnable = request->getByteEnable();
+    if (byteEnable.size() == 0 ||
+        isAnyActiveElement(byteEnable.cbegin(), byteEnable.cend())) {
+        port.numAccessesInDTLB++;
 
-    setState(LSQ::LSQRequest::InTranslation);
+        setState(LSQ::LSQRequest::InTranslation);
 
-    DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n");
-    /* Submit the translation request.  The response will come through
-     *  finish/markDelayed on the LSQRequest as it bears the Translation
-     *  interface */
-    thread->getDTBPtr()->translateTiming(
-        request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write));
+        DPRINTFS(MinorMem, (&port), "Submitting DTLB request\n");
+        /* Submit the translation request.  The response will come through
+         *  finish/markDelayed on the LSQRequest as it bears the Translation
+         *  interface */
+        thread->getDTBPtr()->translateTiming(
+            request, thread, this, (isLoad ? BaseTLB::Read : BaseTLB::Write));
+    } else {
+        disableMemAccess();
+        setState(LSQ::LSQRequest::Complete);
+    }
 }
 
 void
@@ -357,6 +357,8 @@ LSQ::SplitDataRequest::makeFragmentRequests()
     unsigned int fragment_size;
     Addr fragment_addr;
 
+    std::vector<bool> fragment_write_byte_en;
+
     /* Assume that this transfer is across potentially many block snap
      * boundaries:
      *
@@ -401,6 +403,9 @@ LSQ::SplitDataRequest::makeFragmentRequests()
     /* Just past the last address in the request */
     Addr end_addr = base_addr + whole_size;
 
+    auto& byte_enable = request->getByteEnable();
+    unsigned int num_disabled_fragments = 0;
+
     for (unsigned int fragment_index = 0; fragment_index < numFragments;
          fragment_index++)
     {
@@ -421,32 +426,58 @@ LSQ::SplitDataRequest::makeFragmentRequests()
         }
 
         RequestPtr fragment = std::make_shared<Request>();
+        bool disabled_fragment = false;
 
         fragment->setContext(request->contextId());
-        fragment->setVirt(0 /* asid */,
-            fragment_addr, fragment_size, request->getFlags(),
-            request->masterId(),
-            request->getPC());
+        if (byte_enable.empty()) {
+            fragment->setVirt(0 /* asid */,
+                fragment_addr, fragment_size, request->getFlags(),
+                request->masterId(),
+                request->getPC());
+        } else {
+            // Set up byte-enable mask for the current fragment
+            auto it_start = byte_enable.begin() +
+                (fragment_addr - base_addr);
+            auto it_end = byte_enable.begin() +
+                (fragment_addr - base_addr) + fragment_size;
+            if (isAnyActiveElement(it_start, it_end)) {
+                fragment->setVirt(0 /* asid */,
+                    fragment_addr, fragment_size, request->getFlags(),
+                    request->masterId(),
+                    request->getPC());
+                fragment->setByteEnable(std::vector<bool>(it_start, it_end));
+            } else {
+                disabled_fragment = true;
+            }
+        }
 
-        DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x size: %d"
-            " (whole request addr: 0x%x size: %d) %s\n",
-            fragment_addr, fragment_size, base_addr, whole_size,
-            (is_last_fragment ? "last fragment" : ""));
+        if (!disabled_fragment) {
+            DPRINTFS(MinorMem, (&port), "Generating fragment addr: 0x%x"
+                " size: %d (whole request addr: 0x%x size: %d) %s\n",
+                fragment_addr, fragment_size, base_addr, whole_size,
+                (is_last_fragment ? "last fragment" : ""));
 
-        fragment_addr += fragment_size;
+            fragmentRequests.push_back(fragment);
+        } else {
+            num_disabled_fragments++;
+        }
 
-        fragmentRequests.push_back(fragment);
+        fragment_addr += fragment_size;
     }
+    assert(numFragments >= num_disabled_fragments);
+    numFragments -= num_disabled_fragments;
 }
 
 void
 LSQ::SplitDataRequest::makeFragmentPackets()
 {
+    assert(numTranslatedFragments > 0);
     Addr base_addr = request->getVaddr();
 
     DPRINTFS(MinorMem, (&port), "Making packets for request: %s\n", *inst);
 
-    for (unsigned int fragment_index = 0; fragment_index < numFragments;
+    for (unsigned int fragment_index = 0;
+         fragment_index < numTranslatedFragments;
          fragment_index++)
     {
         RequestPtr fragment = fragmentRequests[fragment_index];
@@ -490,28 +521,32 @@ LSQ::SplitDataRequest::makeFragmentPackets()
 void
 LSQ::SplitDataRequest::startAddrTranslation()
 {
-    setState(LSQ::LSQRequest::InTranslation);
-
     makeFragmentRequests();
 
-    numInTranslationFragments = 0;
-    numTranslatedFragments = 0;
+    if (numFragments > 0) {
+        setState(LSQ::LSQRequest::InTranslation);
+        numInTranslationFragments = 0;
+        numTranslatedFragments = 0;
 
-    /* @todo, just do these in sequence for now with
-     * a loop of:
-     * do {
-     *  sendNextFragmentToTranslation ; translateTiming ; finish
-     * } while (numTranslatedFragments != numFragments);
-     */
+        /* @todo, just do these in sequence for now with
+         * a loop of:
+         * do {
+         *  sendNextFragmentToTranslation ; translateTiming ; finish
+         * } while (numTranslatedFragments != numFragments);
+         */
 
-    /* Do first translation */
-    sendNextFragmentToTranslation();
+        /* Do first translation */
+        sendNextFragmentToTranslation();
+    } else {
+        disableMemAccess();
+        setState(LSQ::LSQRequest::Complete);
+    }
 }
 
 PacketPtr
 LSQ::SplitDataRequest::getHeadPacket()
 {
-    assert(numIssuedFragments < numFragments);
+    assert(numIssuedFragments < numTranslatedFragments);
 
     return fragmentPackets[numIssuedFragments];
 }
@@ -519,7 +554,7 @@ LSQ::SplitDataRequest::getHeadPacket()
 void
 LSQ::SplitDataRequest::stepToNextPacket()
 {
-    assert(numIssuedFragments < numFragments);
+    assert(numIssuedFragments < numTranslatedFragments);
 
     numIssuedFragments++;
 }
@@ -527,14 +562,13 @@ LSQ::SplitDataRequest::stepToNextPacket()
 void
 LSQ::SplitDataRequest::retireResponse(PacketPtr response)
 {
-    assert(numRetiredFragments < numFragments);
+    assert(numRetiredFragments < numTranslatedFragments);
 
     DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d"
-        " offset: 0x%x (retired fragment num: %d) %s\n",
+        " offset: 0x%x (retired fragment num: %d)\n",
         response->req->getVaddr(), response->req->getSize(),
         request->getVaddr() - response->req->getVaddr(),
-        numRetiredFragments,
-        (fault == NoFault ? "" : fault->name()));
+        numRetiredFragments);
 
     numRetiredFragments++;
 
@@ -573,7 +607,7 @@ LSQ::SplitDataRequest::retireResponse(PacketPtr response)
             packet->makeResponse();
     }
 
-    if (numRetiredFragments == numFragments)
+    if (numRetiredFragments == numTranslatedFragments)
         setState(Complete);
 
     if (!skipped && isComplete()) {
@@ -1477,7 +1511,8 @@ LSQ::needsToTick()
 void
 LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                  unsigned int size, Addr addr, Request::Flags flags,
-                 uint64_t *res, AtomicOpFunctor *amo_op)
+                 uint64_t *res, AtomicOpFunctor *amo_op,
+                 const std::vector<bool>& byteEnable)
 {
     bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
 
@@ -1533,6 +1568,9 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
         addr, size, flags, cpu.dataMasterId(),
         /* I've no idea why we need the PC, but give it */
         inst->pc.instAddr(), amo_op);
+    if (!byteEnable.empty()) {
+        request->request->setByteEnable(byteEnable);
+    }
 
     requests.push(request);
     request->startAddrTranslation();
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index 11fa8774f..23b47c53c 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014 ARM Limited
+ * Copyright (c) 2013-2014, 2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -188,6 +188,8 @@ class LSQ : public Named
         /** BaseTLB::Translation interface */
         void markDelayed() { }
 
+        void disableMemAccess();
+
       public:
         LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_,
             PacketDataPtr data_ = NULL, uint64_t *res_ = NULL);
@@ -441,7 +443,8 @@ class LSQ : public Named
         { return numIssuedFragments != numRetiredFragments; }
 
         /** Have we stepped past the end of fragmentPackets? */
-        bool sentAllPackets() { return numIssuedFragments == numFragments; }
+        bool sentAllPackets()
+        { return numIssuedFragments == numTranslatedFragments; }
 
         /** For loads, paste the response data into the main
          *  response packet */
@@ -700,7 +703,9 @@ class LSQ : public Named
      *  the LSQ */
     void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                      unsigned int size, Addr addr, Request::Flags flags,
-                     uint64_t *res, AtomicOpFunctor *amo_op);
+                     uint64_t *res, AtomicOpFunctor *amo_op,
+                     const std::vector<bool>& byteEnable =
+                         std::vector<bool>());
 
     /** Push a predicate failed-representing request into the queues just
      *  to maintain commit order */
author	Giacomo Gabrielli <giacomo.gabrielli@arm.com>	2017-07-07 14:13:11 +0100
committer	Giacomo Gabrielli <giacomo.gabrielli@arm.com>	2019-05-11 12:48:58 +0000
commit	c58cb8c9dbeef377da180f1fdaaa1c0eadf85550 (patch)
tree	7591abeb888d8c8e645332749bcaea627628f9bf /src/cpu/minor
parent	d0e4cdc9c36466a3dbef8c9f9f509cce8f1a6c34 (diff)
download	gem5-c58cb8c9dbeef377da180f1fdaaa1c0eadf85550.tar.xz