6 files changed, 57 insertions, 17 deletions
diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh
index 179883ecc..02b3dae1c 100644
--- a/src/cpu/minor/exec_context.hh
+++ b/src/cpu/minor/exec_context.hh
@@ -108,7 +108,7 @@ class ExecContext : public ::ExecContext
                     Request::Flags flags) override
     {
         execute.getLSQ().pushRequest(inst, true /* load */, nullptr,
-            size, addr, flags, NULL);
+            size, addr, flags, NULL, nullptr);
         return NoFault;
     }
 
@@ -117,7 +117,17 @@ class ExecContext : public ::ExecContext
              Request::Flags flags, uint64_t *res) override
     {
         execute.getLSQ().pushRequest(inst, false /* store */, data,
-            size, addr, flags, res);
+            size, addr, flags, res, nullptr);
+        return NoFault;
+    }
+
+    Fault
+    initiateMemAMO(Addr addr, unsigned int size, Request::Flags flags,
+                   AtomicOpFunctor *amo_op) override
+    {
+        // AMO requests are pushed through the store path
+        execute.getLSQ().pushRequest(inst, false /* amo */, nullptr,
+            size, addr, flags, nullptr, amo_op);
         return NoFault;
     }
 
diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc
index 234a233c2..6a418202f 100644
--- a/src/cpu/minor/execute.cc
+++ b/src/cpu/minor/execute.cc
@@ -337,6 +337,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
 
     bool is_load = inst->staticInst->isLoad();
     bool is_store = inst->staticInst->isStore();
+    bool is_atomic = inst->staticInst->isAtomic();
     bool is_prefetch = inst->staticInst->isDataPrefetch();
 
     /* If true, the trace's predicate value will be taken from the exec
@@ -368,7 +369,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst,
             *inst);
 
         fatal("Received error response packet for inst: %s\n", *inst);
-    } else if (is_store || is_load || is_prefetch) {
+    } else if (is_store || is_load || is_prefetch || is_atomic) {
         assert(packet);
 
         DPRINTF(MinorMem, "Memory response inst: %s addr: 0x%x size: %d\n",
diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc
index 180890147..9347e4ccb 100644
--- a/src/cpu/minor/fetch2.cc
+++ b/src/cpu/minor/fetch2.cc
@@ -421,6 +421,8 @@ Fetch2::evaluate()
                         loadInstructions++;
                     else if (decoded_inst->isStore())
                         storeInstructions++;
+                    else if (decoded_inst->isAtomic())
+                        amoInstructions++;
                     else if (decoded_inst->isVector())
                         vecInstructions++;
                     else if (decoded_inst->isFloating())
@@ -636,6 +638,11 @@ Fetch2::regStats()
         .name(name() + ".store_instructions")
         .desc("Number of memory store instructions successfully decoded")
         .flags(total);
+
+    amoInstructions
+        .name(name() + ".amo_instructions")
+        .desc("Number of memory atomic instructions successfully decoded")
+        .flags(total);
 }
 
 void
diff --git a/src/cpu/minor/fetch2.hh b/src/cpu/minor/fetch2.hh
index 2230560f1..114dec0f5 100644
--- a/src/cpu/minor/fetch2.hh
+++ b/src/cpu/minor/fetch2.hh
@@ -171,6 +171,7 @@ class Fetch2 : public Named
     Stats::Scalar vecInstructions;
     Stats::Scalar loadInstructions;
     Stats::Scalar storeInstructions;
+    Stats::Scalar amoInstructions;
 
   public:
     /** Dump the whole contents of the input buffer.  Useful after a
diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc
index b836ed22d..6fe6c3738 100644
--- a/src/cpu/minor/lsq.cc
+++ b/src/cpu/minor/lsq.cc
@@ -676,9 +676,9 @@ LSQ::StoreBuffer::canForwardDataToLoad(LSQRequestPtr request,
     while (ret == NoAddrRangeCoverage && i != slots.rend()) {
         LSQRequestPtr slot = *i;
 
-        /* Cache maintenance instructions go down via the store path *
-         * but they carry no data and they shouldn't be considered for
-         * forwarding */
+        /* Cache maintenance instructions go down via the store path but
+         * they carry no data and they shouldn't be considered
+         * for forwarding */
         if (slot->packet &&
             slot->inst->id.threadId == request->inst->id.threadId &&
             !slot->packet->req->isCacheMaintenance()) {
@@ -931,8 +931,9 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request)
     bool is_load = request->isLoad;
     bool is_llsc = request->request->isLLSC();
     bool is_swap = request->request->isSwap();
+    bool is_atomic = request->request->isAtomic();
     bool bufferable = !(request->request->isStrictlyOrdered() ||
-        is_llsc || is_swap);
+                        is_llsc || is_swap || is_atomic);
 
     if (is_load) {
         if (numStoresInTransfers != 0) {
@@ -965,9 +966,16 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request)
         if (storeBuffer.canForwardDataToLoad(request, forwarding_slot) !=
             NoAddrRangeCoverage)
         {
+            // There's at least another request that targets the same
+            // address and is staying in the storeBuffer. Since our
+            // request is non-bufferable (e.g., strictly ordered or atomic),
+            // we must wait for the other request in the storeBuffer to
+            // complete before we can issue this non-bufferable request.
+            // This is to make sure that the order they access the cache is
+            // correct.
             DPRINTF(MinorMem, "Memory access can receive forwarded data"
-                " from the store buffer, need to wait for store buffer to"
-                " drain\n");
+                " from the store buffer, but need to wait for store buffer"
+                " to drain\n");
             return;
         }
     }
@@ -1469,9 +1477,21 @@ LSQ::needsToTick()
 void
 LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                  unsigned int size, Addr addr, Request::Flags flags,
-                 uint64_t *res)
+                 uint64_t *res, AtomicOpFunctor *amo_op)
 {
     bool needs_burst = transferNeedsBurst(addr, size, lineWidth);
+
+    if (needs_burst && inst->staticInst->isAtomic()) {
+        // AMO requests that access across a cache line boundary are not
+        // allowed since the cache does not guarantee AMO ops to be executed
+        // atomically in two cache lines
+        // For ISAs such as x86 that requires AMO operations to work on
+        // accesses that cross cache-line boundaries, the cache needs to be
+        // modified to support locking both cache lines to guarantee the
+        // atomicity.
+        panic("Do not expect cross-cache-line atomic memory request\n");
+    }
+
     LSQRequestPtr request;
 
     /* Copy given data into the request.  The request will pass this to the
@@ -1480,15 +1500,16 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
 
     DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:"
         " 0x%x%s lineWidth : 0x%x\n",
-        (isLoad ? "load" : "store"), addr, size, flags,
+        (isLoad ? "load" : "store/atomic"), addr, size, flags,
             (needs_burst ? " (needs burst)" : ""), lineWidth);
 
     if (!isLoad) {
-        /* request_data becomes the property of a ...DataRequest (see below)
+        /* Request_data becomes the property of a ...DataRequest (see below)
          *  and destroyed by its destructor */
         request_data = new uint8_t[size];
-        if (flags & Request::STORE_NO_DATA) {
-            /* For cache zeroing, just use zeroed data */
+        if (inst->staticInst->isAtomic() ||
+            (flags & Request::STORE_NO_DATA)) {
+            /* For atomic or store-no-data, just use zeroed data */
             std::memset(request_data, 0, size);
         } else {
             std::memcpy(request_data, data, size);
@@ -1511,7 +1532,7 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
     request->request->setVirt(0 /* asid */,
         addr, size, flags, cpu.dataMasterId(),
         /* I've no idea why we need the PC, but give it */
-        inst->pc.instAddr());
+        inst->pc.instAddr(), amo_op);
 
     requests.push(request);
     request->startAddrTranslation();
diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh
index da873b4ac..11fa8774f 100644
--- a/src/cpu/minor/lsq.hh
+++ b/src/cpu/minor/lsq.hh
@@ -696,11 +696,11 @@ class LSQ : public Named
     void completeMemBarrierInst(MinorDynInstPtr inst,
         bool committed);
 
-    /** Single interface for readMem/writeMem to issue requests into
+    /** Single interface for readMem/writeMem/amoMem to issue requests into
      *  the LSQ */
     void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data,
                      unsigned int size, Addr addr, Request::Flags flags,
-                     uint64_t *res);
+                     uint64_t *res, AtomicOpFunctor *amo_op);
 
     /** Push a predicate failed-representing request into the queues just
      *  to maintain commit order */