From c58cb8c9dbeef377da180f1fdaaa1c0eadf85550 Mon Sep 17 00:00:00 2001
From: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Date: Fri, 7 Jul 2017 14:13:11 +0100
Subject: cpu,mem: Add support for partial loads/stores and wide mem. accesses

This changeset adds support for partial (or masked) loads/stores, i.e.
loads/stores that can disable accesses to individual bytes within the
target address range.  In addition, this changeset extends the code to
crack memory accesses across most CPU models (TimingSimpleCPU still
TBD), so that arbitrarily wide memory accesses are supported.  These
changes are required for supporting ISAs with wide vectors.

Additional authors:
- Gabor Dozsa <gabor.dozsa@arm.com>
- Tiago Muck <tiago.muck@arm.com>

Change-Id: Ibad33541c258ad72925c0b1d5abc3e5e8bf92d92
Signed-off-by: Giacomo Gabrielli <giacomo.gabrielli@arm.com>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/13518
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
---
 src/cpu/simple/atomic.cc       | 138 ++++++++++++++++++++++++++---------------
 src/cpu/simple/atomic.hh       |  29 ++++++++-
 src/cpu/simple/base.cc         |   6 +-
 src/cpu/simple/base.hh         |  14 +++--
 src/cpu/simple/exec_context.hh |  20 +++---
 src/cpu/simple/timing.cc       |  18 +++++-
 src/cpu/simple/timing.hh       |  10 ++-
 7 files changed, 164 insertions(+), 71 deletions(-)

(limited to 'src/cpu/simple')
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index caf2427ef..c5b024532 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -49,6 +49,7 @@
 #include "base/output.hh"
 #include "config/the_isa.hh"
 #include "cpu/exetrace.hh"
+#include "cpu/utils.hh"
 #include "debug/Drain.hh"
 #include "debug/ExecFaulting.hh"
 #include "debug/SimpleCPU.hh"
@@ -333,9 +334,43 @@ AtomicSimpleCPU::AtomicCPUDPort::recvFunctionalSnoop(PacketPtr pkt)
     }
 }
 
+bool
+AtomicSimpleCPU::genMemFragmentRequest(const RequestPtr& req, Addr frag_addr,
+                                       int size, Request::Flags flags,
+                                       const std::vector<bool>& byte_enable,
+                                       int& frag_size, int& size_left) const
+{
+    bool predicate = true;
+    Addr inst_addr = threadInfo[curThread]->thread->pcState().instAddr();
+
+    frag_size = std::min(
+        cacheLineSize() - addrBlockOffset(frag_addr, cacheLineSize()),
+        (Addr) size_left);
+    size_left -= frag_size;
+
+    if (!byte_enable.empty()) {
+        // Set up byte-enable mask for the current fragment
+        auto it_start = byte_enable.begin() + (size - (frag_size + size_left));
+        auto it_end = byte_enable.begin() + (size - size_left);
+        if (isAnyActiveElement(it_start, it_end)) {
+            req->setVirt(0, frag_addr, frag_size, flags, dataMasterId(),
+                         inst_addr);
+            req->setByteEnable(std::vector<bool>(it_start, it_end));
+        } else {
+            predicate = false;
+        }
+    } else {
+        req->setVirt(0, frag_addr, frag_size, flags, dataMasterId(),
+                     inst_addr);
+    }
+
+    return predicate;
+}
+
 Fault
 AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
-                         Request::Flags flags)
+                         Request::Flags flags,
+                         const std::vector<bool>& byteEnable)
 {
     SimpleExecContext& t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -346,28 +381,29 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
     if (traceData)
         traceData->setMem(addr, size, flags);
 
-    //The size of the data we're trying to read.
-    int fullSize = size;
-
-    //The address of the second part of this access if it needs to be split
-    //across a cache line boundary.
-    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
-
-    if (secondAddr > addr)
-        size = secondAddr - addr;
-
     dcache_latency = 0;
 
     req->taskId(taskId());
+
+    Addr frag_addr = addr;
+    int frag_size = 0;
+    int size_left = size;
+    bool predicate;
+    Fault fault = NoFault;
+
     while (1) {
-        req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr());
+        predicate = genMemFragmentRequest(req, frag_addr, size, flags,
+                                          byteEnable, frag_size, size_left);
 
         // translate to physical address
-        Fault fault = thread->dtb->translateAtomic(req, thread->getTC(),
-                                                          BaseTLB::Read);
+        if (predicate) {
+            fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                 BaseTLB::Read);
+        }
 
         // Now do the access.
-        if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
+        if (predicate && fault == NoFault &&
+            !req->getFlags().isSet(Request::NO_ACCESS)) {
             Packet pkt(req, Packet::makeReadCmd(req));
             pkt.dataStatic(data);
 
@@ -394,33 +430,29 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
             }
         }
 
-        //If we don't need to access a second cache line, stop now.
-        if (secondAddr <= addr)
-        {
+        // If we don't need to access further cache lines, stop now.
+        if (size_left == 0) {
             if (req->isLockedRMW() && fault == NoFault) {
                 assert(!locked);
                 locked = true;
             }
-
             return fault;
         }
 
         /*
-         * Set up for accessing the second cache line.
+         * Set up for accessing the next cache line.
          */
+        frag_addr += frag_size;
 
         //Move the pointer we're reading into to the correct location.
-        data += size;
-        //Adjust the size to get the remaining bytes.
-        size = addr + fullSize - secondAddr;
-        //And access the right address.
-        addr = secondAddr;
+        data += frag_size;
     }
 }
 
 Fault
 AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
-                          Request::Flags flags, uint64_t *res)
+                          Request::Flags flags, uint64_t *res,
+                          const std::vector<bool>& byteEnable)
 {
     SimpleExecContext& t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -439,32 +471,37 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
     if (traceData)
         traceData->setMem(addr, size, flags);
 
-    //The size of the data we're trying to read.
-    int fullSize = size;
-
-    //The address of the second part of this access if it needs to be split
-    //across a cache line boundary.
-    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
-
-    if (secondAddr > addr)
-        size = secondAddr - addr;
-
     dcache_latency = 0;
 
     req->taskId(taskId());
+
+    Addr frag_addr = addr;
+    int frag_size = 0;
+    int size_left = size;
+    int curr_frag_id = 0;
+    bool predicate;
+    Fault fault = NoFault;
+
     while (1) {
-        req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr());
+        predicate = genMemFragmentRequest(req, frag_addr, size, flags,
+                                          byteEnable, frag_size, size_left);
 
         // translate to physical address
-        Fault fault = thread->dtb->translateAtomic(req, thread->getTC(), BaseTLB::Write);
+        if (predicate)
+            fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                 BaseTLB::Write);
 
         // Now do the access.
-        if (fault == NoFault) {
+        if (predicate && fault == NoFault) {
             bool do_access = true;  // flag to suppress cache access
 
             if (req->isLLSC()) {
-                do_access = TheISA::handleLockedWrite(thread, req, dcachePort.cacheBlockMask);
+                assert(curr_frag_id == 0);
+                do_access =
+                    TheISA::handleLockedWrite(thread, req,
+                                              dcachePort.cacheBlockMask);
             } else if (req->isSwap()) {
+                assert(curr_frag_id == 0);
                 if (req->isCondSwap()) {
                     assert(res);
                     req->setExtraData(*res);
@@ -488,8 +525,8 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
                 assert(!pkt.isError());
 
                 if (req->isSwap()) {
-                    assert(res);
-                    memcpy(res, pkt.getConstPtr<uint8_t>(), fullSize);
+                    assert(res && curr_frag_id == 0);
+                    memcpy(res, pkt.getConstPtr<uint8_t>(), size);
                 }
             }
 
@@ -500,14 +537,14 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
 
         //If there's a fault or we don't need to access a second cache line,
         //stop now.
-        if (fault != NoFault || secondAddr <= addr)
+        if (fault != NoFault || size_left == 0)
         {
             if (req->isLockedRMW() && fault == NoFault) {
-                assert(locked);
+                assert(byteEnable.empty());
+                assert(locked && curr_frag_id == 0);
                 locked = false;
             }
 
-
             if (fault != NoFault && req->isPrefetch()) {
                 return NoFault;
             } else {
@@ -516,15 +553,14 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
         }
 
         /*
-         * Set up for accessing the second cache line.
+         * Set up for accessing the next cache line.
          */
+        frag_addr += frag_size;
 
         //Move the pointer we're reading into to the correct location.
-        data += size;
-        //Adjust the size to get the remaining bytes.
-        size = addr + fullSize - secondAddr;
-        //And access the right address.
-        addr = secondAddr;
+        data += frag_size;
+
+        curr_frag_id++;
     }
 }
 
diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh
index 84f379121..100306981 100644
--- a/src/cpu/simple/atomic.hh
+++ b/src/cpu/simple/atomic.hh
@@ -195,11 +195,36 @@ class AtomicSimpleCPU : public BaseSimpleCPU
     void activateContext(ThreadID thread_num) override;
     void suspendContext(ThreadID thread_num) override;
 
+    /**
+     * Helper function used to set up the request for a single fragment of a
+     * memory access.
+     *
+     * Takes care of setting up the appropriate byte-enable mask for the
+     * fragment, given the mask for the entire memory access.
+     *
+     * @param req Pointer to the Request object to populate.
+     * @param frag_addr Start address of the fragment.
+     * @param size Total size of the memory access in bytes.
+     * @param flags Request flags.
+     * @param byte_enable Byte-enable mask for the entire memory access.
+     * @param[out] frag_size Fragment size.
+     * @param[in,out] size_left Size left to be processed in the memory access.
+     * @return True if the byte-enable mask for the fragment is not all-false.
+     */
+    bool genMemFragmentRequest(const RequestPtr& req, Addr frag_addr,
+                               int size, Request::Flags flags,
+                               const std::vector<bool>& byte_enable,
+                               int& frag_size, int& size_left) const;
+
     Fault readMem(Addr addr, uint8_t *data, unsigned size,
-                  Request::Flags flags) override;
+                  Request::Flags flags,
+                  const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault writeMem(uint8_t *data, unsigned size,
-                   Addr addr, Request::Flags flags, uint64_t *res) override;
+                   Addr addr, Request::Flags flags, uint64_t *res,
+                   const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault amoMem(Addr addr, uint8_t* data, unsigned size,
                  Request::Flags flags, AtomicOpFunctor *amo_op) override;
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index 298ba9f9e..816add707 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2012, 2015, 2017 ARM Limited
+ * Copyright (c) 2010-2012, 2015, 2017, 2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -494,6 +494,10 @@ BaseSimpleCPU::preExecute()
     thread->setFloatReg(ZeroReg, 0);
 #endif // ALPHA_ISA
 
+    // resets predicates
+    t_info.setPredicate(true);
+    t_info.setMemAccPredicate(true);
+
     // check for instruction-count-based events
     comInstEventQueue[curThread]->serviceEvents(t_info.numInst);
     system->instEventQueue.serviceEvents(system->totalNumInsts);
diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh
index 8060b07ad..5404e5df8 100644
--- a/src/cpu/simple/base.hh
+++ b/src/cpu/simple/base.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2012,2015 ARM Limited
+ * Copyright (c) 2011-2012,2015,2018 ARM Limited
  * Copyright (c) 2013 Advanced Micro Devices, Inc.
  * All rights reserved
  *
@@ -143,15 +143,21 @@ class BaseSimpleCPU : public BaseCPU
     void startup() override;
 
     virtual Fault readMem(Addr addr, uint8_t* data, unsigned size,
-                          Request::Flags flags)
+                          Request::Flags flags,
+                          const std::vector<bool>& byteEnable =
+                              std::vector<bool>())
     { panic("readMem() is not implemented\n"); }
 
     virtual Fault initiateMemRead(Addr addr, unsigned size,
-                                  Request::Flags flags)
+                                  Request::Flags flags,
+                                  const std::vector<bool>& byteEnable =
+                                      std::vector<bool>())
     { panic("initiateMemRead() is not implemented\n"); }
 
     virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr,
-                           Request::Flags flags, uint64_t* res)
+                           Request::Flags flags, uint64_t* res,
+                           const std::vector<bool>& byteEnable =
+                               std::vector<bool>())
     { panic("writeMem() is not implemented\n"); }
 
     virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size,
diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh
index be7a863c5..de98d6efd 100644
--- a/src/cpu/simple/exec_context.hh
+++ b/src/cpu/simple/exec_context.hh
@@ -434,26 +434,32 @@ class SimpleExecContext : public ExecContext {
         thread->pcState(val);
     }
 
-
     Fault
     readMem(Addr addr, uint8_t *data, unsigned int size,
-            Request::Flags flags) override
+            Request::Flags flags,
+            const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
-        return cpu->readMem(addr, data, size, flags);
+        return cpu->readMem(addr, data, size, flags, byteEnable);
     }
 
     Fault
     initiateMemRead(Addr addr, unsigned int size,
-                    Request::Flags flags) override
+                    Request::Flags flags,
+                    const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
-        return cpu->initiateMemRead(addr, size, flags);
+        return cpu->initiateMemRead(addr, size, flags, byteEnable);
     }
 
     Fault
     writeMem(uint8_t *data, unsigned int size, Addr addr,
-             Request::Flags flags, uint64_t *res) override
+             Request::Flags flags, uint64_t *res,
+             const std::vector<bool>& byteEnable = std::vector<bool>())
+        override
     {
-        return cpu->writeMem(data, size, addr, flags, res);
+        assert(byteEnable.empty() || byteEnable.size() == size);
+        return cpu->writeMem(data, size, addr, flags, res, byteEnable);
     }
 
     Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index 637308a96..454259099 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -1,6 +1,6 @@
 /*
  * Copyright 2014 Google, Inc.
- * Copyright (c) 2010-2013,2015,2017 ARM Limited
+ * Copyright (c) 2010-2013,2015,2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -417,7 +417,8 @@ TimingSimpleCPU::buildSplitPacket(PacketPtr &pkt1, PacketPtr &pkt2,
 
 Fault
 TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size,
-                                 Request::Flags flags)
+                                 Request::Flags flags,
+                                 const std::vector<bool>& byteEnable)
 {
     SimpleExecContext &t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -434,6 +435,9 @@ TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size,
     RequestPtr req = std::make_shared<Request>(
         asid, addr, size, flags, dataMasterId(), pc,
         thread->contextId());
+    if (!byteEnable.empty()) {
+        req->setByteEnable(byteEnable);
+    }
 
     req->taskId(taskId());
 
@@ -491,7 +495,8 @@ TimingSimpleCPU::handleWritePacket()
 
 Fault
 TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
-                          Addr addr, Request::Flags flags, uint64_t *res)
+                          Addr addr, Request::Flags flags, uint64_t *res,
+                          const std::vector<bool>& byteEnable)
 {
     SimpleExecContext &t_info = *threadInfo[curThread];
     SimpleThread* thread = t_info.thread;
@@ -516,6 +521,9 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
     RequestPtr req = std::make_shared<Request>(
         asid, addr, size, flags, dataMasterId(), pc,
         thread->contextId());
+    if (!byteEnable.empty()) {
+        req->setByteEnable(byteEnable);
+    }
 
     req->taskId(taskId());
 
@@ -523,6 +531,10 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
     assert(split_addr <= addr || split_addr - addr < block_size);
 
     _status = DTBWaitResponse;
+
+    // TODO: TimingSimpleCPU doesn't support arbitrarily long multi-line mem.
+    // accesses yet
+
     if (split_addr > addr) {
         RequestPtr req1, req2;
         assert(!req->isLLSC() && !req->isSwap());
diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh
index ce0a4dbfc..a49822fc1 100644
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013,2015 ARM Limited
+ * Copyright (c) 2012-2013,2015,2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -283,10 +283,14 @@ class TimingSimpleCPU : public BaseSimpleCPU
     void suspendContext(ThreadID thread_num) override;
 
     Fault initiateMemRead(Addr addr, unsigned size,
-                          Request::Flags flags) override;
+            Request::Flags flags,
+            const std::vector<bool>& byteEnable =std::vector<bool>())
+        override;
 
     Fault writeMem(uint8_t *data, unsigned size,
-                   Addr addr, Request::Flags flags, uint64_t *res) override;
+                   Addr addr, Request::Flags flags, uint64_t *res,
+                   const std::vector<bool>& byteEnable = std::vector<bool>())
+        override;
 
     Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
                          AtomicOpFunctor *amo_op) override;
-- 
cgit v1.2.3