author    Tuan Ta <qtt2@cornell.edu>    2018-01-22 13:12:50 -0500
committer Tuan Ta <qtt2@cornell.edu>    2019-02-08 15:27:04 +0000
commit    25dc765889d948693995cfa622f001aa94b5364b (patch)
tree      38a8e93881ad150a482020a1fd706d664ee0c061 /src/cpu/minor
parent    165a7dab558c8118622a387683521bea1ebf2e6c (diff)
download  gem5-25dc765889d948693995cfa622f001aa94b5364b.tar.xz
cpu: support atomic memory request type with AtomicOpFunctor
This patch enables all four CPU models (AtomicSimpleCPU, TimingSimpleCPU, MinorCPU and DerivO3CPU) to issue atomic memory (AMO) requests to the memory system. An atomic memory instruction is treated as a special store instruction in all CPU models.

In the simple CPUs, an AMO request with an associated AtomicOpFunctor is simply sent to the L1 dcache. In MinorCPU, an AMO request bypasses the store buffer and waits for any conflicting store request(s) currently in the store buffer to retire before it is sent to the cache. AMO requests are not buffered in the store buffer, so their effects appear immediately in the cache. In DerivO3CPU, an AMO request is inserted in the store buffer so that it is delivered to the cache only after all previous stores have been issued to the cache. Data forwarding between an outstanding AMO in the store buffer and a subsequent load is not allowed, since the AMO request does not hold valid data until it is executed in the cache.

This implementation assumes that a target ISA implementation must insert enough memory fences as micro-ops around an atomic instruction to enforce a correct order of memory instructions with respect to its memory consistency model. Without extra memory fences, this implementation can allow AMOs and other memory instructions that do not conflict (i.e., do not target the same address) to reorder.

This implementation also assumes that atomic instructions execute within a cache line boundary, since the cache is currently not able to execute an operation on two different cache lines in one single step. Therefore, ISAs like x86 that require multi-cache-line atomic instructions need to either use a pair of locking load and unlocking store or change the cache implementation to guarantee the atomicity of an atomic instruction.

Change-Id: Ib8a7c81868ac05b98d73afc7d16eb88486f8cf9a
Reviewed-on: https://gem5-review.googlesource.com/c/8188
Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com>
Maintainer: Jason Lowe-Power <jason@lowepower.com>
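For context, the sketch below shows how a target ISA implementation might drive the new interface: it defines an AtomicOpFunctor that the cache applies to the in-memory value, and hands it to ExecContext::initiateMemAMO(), which the Minor model forwards to LSQ::pushRequest() as shown in the diff further down. The functor type AtomicOpAdd, the helper initiateAtomicAdd() and the use of the Request::ATOMIC_RETURN_OP flag are illustrative assumptions, not part of this patch.

    #include <cstdint>
    #include <cstring>

    #include "base/amo.hh"        // AtomicOpFunctor: virtual void operator()(uint8_t *)
    #include "cpu/exec_context.hh"
    #include "mem/request.hh"

    // Hypothetical functor: the cache calls operator() on the current memory
    // value when it executes the AMO, here performing a 64-bit add in place.
    struct AtomicOpAdd : public AtomicOpFunctor
    {
        uint64_t operand;
        explicit AtomicOpAdd(uint64_t v) : operand(v) {}

        void
        operator()(uint8_t *mem) override
        {
            uint64_t val;
            std::memcpy(&val, mem, sizeof(val));
            val += operand;
            std::memcpy(mem, &val, sizeof(val));
        }
    };

    // Hypothetical call site, e.g. inside an AMO micro-op's initiateAcc():
    // the request carries no store data and no result pointer, only the functor.
    Fault
    initiateAtomicAdd(ExecContext *xc, Addr addr, uint64_t value)
    {
        Request::Flags flags = Request::ATOMIC_RETURN_OP;
        return xc->initiateMemAMO(addr, sizeof(uint64_t), flags,
                                  new AtomicOpAdd(value));
    }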
Diffstat (limited to 'src/cpu/minor')
-rw-r--r--  src/cpu/minor/exec_context.hh  14
-rw-r--r--  src/cpu/minor/execute.cc        3
-rw-r--r--  src/cpu/minor/fetch2.cc         7
-rw-r--r--  src/cpu/minor/fetch2.hh         1
-rw-r--r--  src/cpu/minor/lsq.cc           45
-rw-r--r--  src/cpu/minor/lsq.hh            4
6 files changed, 57 insertions, 17 deletions
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index 179883ecc..02b3dae1c 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -108,7 +108,7 @@ class ExecContext : public ::ExecContext
Request::Flags flags) override
{
execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
- size, addr, flags, NULL);
+ size, addr, flags, NULL, nullptr);
return NoFault;
}
@@ -117,7 +117,17 @@ class ExecContext : public ::ExecContext
Request::Flags flags, uint64_t *res) override
{
execute.getLSQ().pushRequest(inst, false /* store */, data,
- size, addr, flags, res);
+ size, addr, flags, res, nullptr);
+ return NoFault;
+ }
+
+ Fault
+ initiateMemAMO(Addr addr, unsigned int size, Request::Flags flags,
+ AtomicOpFunctor *amo_op) override
+ {
+ // AMO requests are pushed through the store path
+ execute.getLSQ().pushRequest(inst, false /* amo */, nullptr,
+ size, addr, flags, nullptr, amo_op);
return NoFault;
}
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index 234a233c2..6a418202f 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -337,6 +337,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
bool is_load = inst->staticInst->isLoad();
bool is_store = inst->staticInst->isStore();
+ bool is_atomic = inst->staticInst->isAtomic();
bool is_prefetch = inst->staticInst->isDataPrefetch();
/* If true, the trace's predicate value will be taken from the exec
@@ -368,7 +369,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
*inst);
fatal("Received error response packet for inst: %s\n", *inst);
- } else if (is_store || is_load || is_prefetch) {
+ } else if (is_store || is_load || is_prefetch || is_atomic) {
assert(packet);
DPRINTF(MinorMem, "Memory response inst: %s addr: 0x%x size: %d\n",
diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc
index 180890147..9347e4ccb 100644
--- a/src/cpu/minor/fetch2.cc
+++ b/src/cpu/minor/fetch2.cc
@@ -421,6 +421,8 @@ Fetch2::evaluate()
loadInstructions++;
else if (decoded_inst->isStore())
storeInstructions++;
+ else if (decoded_inst->isAtomic())
+ amoInstructions++;
else if (decoded_inst->isVector())
vecInstructions++;
else if (decoded_inst->isFloating())
@@ -636,6 +638,11 @@ Fetch2::regStats()
.name(name() + ".store_instructions")
.desc("Number of memory store instructions successfully decoded")
.flags(total);
+
+ amoInstructions
+ .name(name() + ".amo_instructions")
+ .desc("Number of memory atomic instructions successfully decoded")
+ .flags(total);
}
void
diff --git a/src/cpu/minor/fetch2.hh b/src/cpu/minor/fetch2.hh
index 2230560f1..114dec0f5 100644
--- a/src/cpu/minor/fetch2.hh
+++ b/src/cpu/minor/fetch2.hh
@@ -171,6 +171,7 @@ class Fetch2 : public Named
Stats::Scalar vecInstructions;
Stats::Scalar loadInstructions;
Stats::Scalar storeInstructions;
+ Stats::Scalar amoInstructions;
public:
/** Dump the whole contents of the input buffer. Useful after a
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index b836ed22d..6fe6c3738 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -676,9 +676,9 @@ LSQ::StoreBuffer::canForwardDataToLoad(LSQRequestPtr request,
while (ret == NoAddrRangeCoverage && i != slots.rend()) {
LSQRequestPtr slot = *i;
- /* Cache maintenance instructions go down via the store path *
- * but they carry no data and they shouldn't be considered for
- * forwarding */
+ /* Cache maintenance instructions go down via the store path but
+ * they carry no data and they shouldn't be considered
+ * for forwarding */
if (slot->packet &&
slot->inst->id.threadId == request->inst->id.threadId &&
!slot->packet->req->isCacheMaintenance()) {
@@ -931,8 +931,9 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request)
bool is_load = request->isLoad;
bool is_llsc = request->request->isLLSC();
bool is_swap = request->request->isSwap();
+ bool is_atomic = request->request->isAtomic();
bool bufferable = !(request->request->isStrictlyOrdered() ||
- is_llsc || is_swap);
+ is_llsc || is_swap || is_atomic);
if (is_load) {
if (numStoresInTransfers != 0) {
@@ -965,9 +966,16 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request)
if (storeBuffer.canForwardDataToLoad(request, forwarding_slot) !=
NoAddrRangeCoverage)
{
+ // There's at least another request that targets the same
+ // address and is staying in the storeBuffer. Since our
+ // request is non-bufferable (e.g., strictly ordered or atomic),
+ // we must wait for the other request in the storeBuffer to
+ // complete before we can issue this non-bufferable request.
+ // This is to make sure that the order they access the cache is
+ // correct.
DPRINTF(MinorMem, "Memory access can receive forwarded data"
- " from the store buffer, need to wait for store buffer to"
- " drain\n");
+ " from the store buffer, but need to wait for store buffer"
+ " to drain\n");
return;
}
}
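The comment added in the hunk above describes the ordering rule enforced in tryToSendToTransfers(). Condensed as a sketch, assuming that canForwardDataToLoad() returning anything other than NoAddrRangeCoverage means an older store to an overlapping address is still queued (this is not the verbatim control flow):

    // Non-bufferable requests (strictly ordered, LL/SC, swap, atomic) must
    // not overtake older stores to the same address that are still sitting
    // in the store buffer, so they are held back until the buffer drains.
    if (!bufferable) {
        unsigned int forwarding_slot = 0;
        if (storeBuffer.canForwardDataToLoad(request, forwarding_slot) !=
            NoAddrRangeCoverage)
        {
            /* An older, overlapping store has not reached the cache yet;
             * issuing the AMO now would reorder the two accesses. */
            return;
        }
    }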
@@ -1469,9 +1477,21 @@ LSQ::needsToTick()
void
LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
- uint64_t *res)
+ uint64_t *res, AtomicOpFunctor *amo_op)
{
bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
+
+ if (needs_burst && inst->staticInst->isAtomic()) {
+ // AMO requests that access across a cache line boundary are not
+ // allowed since the cache does not guarantee AMO ops to be executed
+ // atomically in two cache lines
+ // For ISAs such as x86 that requires AMO operations to work on
+ // accesses that cross cache-line boundaries, the cache needs to be
+ // modified to support locking both cache lines to guarantee the
+ // atomicity.
+ panic("Do not expect cross-cache-line atomic memory request\n");
+ }
+
LSQRequestPtr request;
/* Copy given data into the request. The request will pass this to the
@@ -1480,15 +1500,16 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
" 0x%x%s lineWidth : 0x%x\n",
- (isLoad ? "load" : "store"), addr, size, flags,
+ (isLoad ? "load" : "store/atomic"), addr, size, flags,
(needs_burst ? " (needs burst)" : ""), lineWidth);
if (!isLoad) {
- /* request_data becomes the property of a ...DataRequest (see below)
+ /* Request_data becomes the property of a ...DataRequest (see below)
* and destroyed by its destructor */
request_data = new uint8_t[size];
- if (flags & Request::STORE_NO_DATA) {
- /* For cache zeroing, just use zeroed data */
+ if (inst->staticInst->isAtomic() ||
+ (flags & Request::STORE_NO_DATA)) {
+ /* For atomic or store-no-data, just use zeroed data */
std::memset(request_data, 0, size);
} else {
std::memcpy(request_data, data, size);
@@ -1511,7 +1532,7 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
request->request->setVirt(0 /* asid */,
addr, size, flags, cpu.dataMasterId(),
/* I've no idea why we need the PC, but give it */
- inst->pc.instAddr());
+ inst->pc.instAddr(), amo_op);
requests.push(request);
request->startAddrTranslation();
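The cross-cache-line panic added to pushRequest() above hinges on transferNeedsBurst(); the sketch below shows the equivalent address arithmetic. The helper name spansCacheLines() is hypothetical and the check is an assumption about what "needs burst" means for a single access:

    #include <cstdint>

    using Addr = uint64_t;

    // True when [addr, addr + size) touches more than one cache line of
    // width line_width, i.e. the access would have to be split into a burst.
    // An atomic request in this situation hits the panic in pushRequest().
    static bool
    spansCacheLines(Addr addr, unsigned size, unsigned line_width)
    {
        return (addr / line_width) != ((addr + size - 1) / line_width);
    }

    // Example: with 64-byte lines, an 8-byte AMO at 0x3C spans lines 0 and 1
    // and would panic, while the same AMO at 0x38 stays within line 0.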
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index da873b4ac..11fa8774f 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -696,11 +696,11 @@ class LSQ : public Named
void completeMemBarrierInst(MinorDynInstPtr inst,
bool committed);
- /** Single interface for readMem/writeMem to issue requests into
+ /** Single interface for readMem/writeMem/amoMem to issue requests into
* the LSQ */
void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
unsigned int size, Addr addr, Request::Flags flags,
- uint64_t *res);
+ uint64_t *res, AtomicOpFunctor *amo_op);
/** Push a predicate failed-representing request into the queues just
* to maintain commit order */