Diffstat (limited to 'src'): 28 files changed, 413 insertions, 101 deletions
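Conceptually, the change below threads an AtomicOpFunctor through every CPU model's memory interface so that the read-modify-write is performed at the memory side and the CPU only gets the old value back (the AtomicSimpleCPU hunk further down issues AMO accesses as SwapReq-style writes whose data buffer carries the returned old contents). The standalone sketch that follows is not gem5 source; AtomicOpFunctor, AmoAdd32, and amoAccess are simplified stand-ins meant only to illustrate that division of labour with a 32-bit fetch-and-add.

    // Standalone illustration (not gem5 source): an AtomicOpFunctor-style
    // callback that the memory system applies in place, so the CPU never has
    // to supply store data for an AMO request.
    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Hypothetical stand-in for gem5's AtomicOpFunctor interface.
    struct AtomicOpFunctor {
        virtual void operator()(uint8_t *mem) = 0;  // update memory in place
        virtual ~AtomicOpFunctor() = default;
    };

    // Example op: 32-bit fetch-and-add, as a RISC-V AMOADD.W might use.
    struct AmoAdd32 : AtomicOpFunctor {
        uint32_t operand;
        explicit AmoAdd32(uint32_t v) : operand(v) {}
        void operator()(uint8_t *mem) override {
            uint32_t old;
            std::memcpy(&old, mem, sizeof(old));
            uint32_t updated = old + operand;
            std::memcpy(mem, &updated, sizeof(updated));
        }
    };

    // Memory-side handling, sketched after the SwapReq-like behaviour the
    // patch relies on: hand the old value back to the CPU, then apply the
    // functor to the stored word.
    uint32_t amoAccess(uint8_t *mem_word, AtomicOpFunctor &op)
    {
        uint32_t old;
        std::memcpy(&old, mem_word, sizeof(old));  // value returned to the CPU
        op(mem_word);                              // atomic update "in memory"
        return old;
    }

    int main()
    {
        uint8_t word[4] = {5, 0, 0, 0};  // 32-bit value 5 (little-endian host)
        AmoAdd32 add(7);
        std::cout << "old=" << amoAccess(word, add) << "\n";  // old=5
        uint32_t now;
        std::memcpy(&now, word, sizeof(now));
        std::cout << "new=" << now << "\n";                   // new=12
        return 0;
    }

Carrying the functor instead of store data is also why BaseDynInst::initiateMemAMO() in the hunks below passes a nullptr data pointer to pushRequest().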
diff --git a/src/cpu/base.cc b/src/cpu/base.cc index 878e65551..30f6baf20 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -409,7 +409,7 @@ BaseCPU::probeInstCommit(const StaticInstPtr &inst) if (inst->isLoad()) ppRetiredLoads->notify(1); - if (inst->isStore()) + if (inst->isStore() || inst->isAtomic()) ppRetiredStores->notify(1); if (inst->isControl()) diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index c24517937..9a1ab062c 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -303,6 +303,9 @@ class BaseDynInst : public ExecContext, public RefCounted Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res); + Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags, + AtomicOpFunctor *amo_op); + /** True if the DTB address translation has started. */ bool translationStarted() const { return instFlags[TranslationStarted]; } void translationStarted(bool f) { instFlags[TranslationStarted] = f; } @@ -920,4 +923,20 @@ BaseDynInst<Impl>::writeMem(uint8_t *data, unsigned size, Addr addr, /* st */ false, data, size, addr, flags, res); } +template<class Impl> +Fault +BaseDynInst<Impl>::initiateMemAMO(Addr addr, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) +{ + // atomic memory instructions do not have data to be written to memory yet + // since the atomic operations will be executed directly in cache/memory. + // Therefore, its `data` field is nullptr. + // Atomic memory requests need to carry their `amo_op` fields to cache/ + // memory + return cpu->pushRequest( + dynamic_cast<typename DynInstPtr::PtrType>(this), + /* atomic */ false, nullptr, size, addr, flags, nullptr, amo_op); +} + #endif // __CPU_BASE_DYN_INST_HH__ diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index e32c015bf..5f830d7a9 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -536,9 +536,16 @@ class CheckerCPU : public BaseCPU, public ExecContext Fault readMem(Addr addr, uint8_t *data, unsigned size, Request::Flags flags) override; + Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) override; + Fault amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, AtomicOpFunctor *amo_op) override + { + panic("AMO is not supported yet in CPU checker\n"); + } + unsigned int readStCondFailures() const override { return thread->readStCondFailures(); diff --git a/src/cpu/exec_context.hh b/src/cpu/exec_context.hh index 1c1c8956a..d46cc1315 100644 --- a/src/cpu/exec_context.hh +++ b/src/cpu/exec_context.hh @@ -261,6 +261,28 @@ class ExecContext { Request::Flags flags, uint64_t *res) = 0; /** + * For atomic-mode contexts, perform an atomic AMO (a.k.a., Atomic + * Read-Modify-Write Memory Operation) + */ + virtual Fault amoMem(Addr addr, uint8_t *data, unsigned int size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { + panic("ExecContext::amoMem() should be overridden\n"); + } + + /** + * For timing-mode contexts, initiate an atomic AMO (atomic + * read-modify-write memory operation) + */ + virtual Fault initiateMemAMO(Addr addr, unsigned int size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { + panic("ExecContext::initiateMemAMO() should be overridden\n"); + } + + /** * Sets the number of consecutive store conditional failures. 
*/ virtual void setStCondFailures(unsigned int sc_failures) = 0; diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh index 179883ecc..02b3dae1c 100644 --- a/src/cpu/minor/exec_context.hh +++ b/src/cpu/minor/exec_context.hh @@ -108,7 +108,7 @@ class ExecContext : public ::ExecContext Request::Flags flags) override { execute.getLSQ().pushRequest(inst, true /* load */, nullptr, - size, addr, flags, NULL); + size, addr, flags, NULL, nullptr); return NoFault; } @@ -117,7 +117,17 @@ class ExecContext : public ::ExecContext Request::Flags flags, uint64_t *res) override { execute.getLSQ().pushRequest(inst, false /* store */, data, - size, addr, flags, res); + size, addr, flags, res, nullptr); + return NoFault; + } + + Fault + initiateMemAMO(Addr addr, unsigned int size, Request::Flags flags, + AtomicOpFunctor *amo_op) override + { + // AMO requests are pushed through the store path + execute.getLSQ().pushRequest(inst, false /* amo */, nullptr, + size, addr, flags, nullptr, amo_op); return NoFault; } diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc index 234a233c2..6a418202f 100644 --- a/src/cpu/minor/execute.cc +++ b/src/cpu/minor/execute.cc @@ -337,6 +337,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst, bool is_load = inst->staticInst->isLoad(); bool is_store = inst->staticInst->isStore(); + bool is_atomic = inst->staticInst->isAtomic(); bool is_prefetch = inst->staticInst->isDataPrefetch(); /* If true, the trace's predicate value will be taken from the exec @@ -368,7 +369,7 @@ Execute::handleMemResponse(MinorDynInstPtr inst, *inst); fatal("Received error response packet for inst: %s\n", *inst); - } else if (is_store || is_load || is_prefetch) { + } else if (is_store || is_load || is_prefetch || is_atomic) { assert(packet); DPRINTF(MinorMem, "Memory response inst: %s addr: 0x%x size: %d\n", diff --git a/src/cpu/minor/fetch2.cc b/src/cpu/minor/fetch2.cc index 180890147..9347e4ccb 100644 --- a/src/cpu/minor/fetch2.cc +++ b/src/cpu/minor/fetch2.cc @@ -421,6 +421,8 @@ Fetch2::evaluate() loadInstructions++; else if (decoded_inst->isStore()) storeInstructions++; + else if (decoded_inst->isAtomic()) + amoInstructions++; else if (decoded_inst->isVector()) vecInstructions++; else if (decoded_inst->isFloating()) @@ -636,6 +638,11 @@ Fetch2::regStats() .name(name() + ".store_instructions") .desc("Number of memory store instructions successfully decoded") .flags(total); + + amoInstructions + .name(name() + ".amo_instructions") + .desc("Number of memory atomic instructions successfully decoded") + .flags(total); } void diff --git a/src/cpu/minor/fetch2.hh b/src/cpu/minor/fetch2.hh index 2230560f1..114dec0f5 100644 --- a/src/cpu/minor/fetch2.hh +++ b/src/cpu/minor/fetch2.hh @@ -171,6 +171,7 @@ class Fetch2 : public Named Stats::Scalar vecInstructions; Stats::Scalar loadInstructions; Stats::Scalar storeInstructions; + Stats::Scalar amoInstructions; public: /** Dump the whole contents of the input buffer. 
Useful after a diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc index b836ed22d..6fe6c3738 100644 --- a/src/cpu/minor/lsq.cc +++ b/src/cpu/minor/lsq.cc @@ -676,9 +676,9 @@ LSQ::StoreBuffer::canForwardDataToLoad(LSQRequestPtr request, while (ret == NoAddrRangeCoverage && i != slots.rend()) { LSQRequestPtr slot = *i; - /* Cache maintenance instructions go down via the store path * - * but they carry no data and they shouldn't be considered for - * forwarding */ + /* Cache maintenance instructions go down via the store path but + * they carry no data and they shouldn't be considered + * for forwarding */ if (slot->packet && slot->inst->id.threadId == request->inst->id.threadId && !slot->packet->req->isCacheMaintenance()) { @@ -931,8 +931,9 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request) bool is_load = request->isLoad; bool is_llsc = request->request->isLLSC(); bool is_swap = request->request->isSwap(); + bool is_atomic = request->request->isAtomic(); bool bufferable = !(request->request->isStrictlyOrdered() || - is_llsc || is_swap); + is_llsc || is_swap || is_atomic); if (is_load) { if (numStoresInTransfers != 0) { @@ -965,9 +966,16 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request) if (storeBuffer.canForwardDataToLoad(request, forwarding_slot) != NoAddrRangeCoverage) { + // There's at least another request that targets the same + // address and is staying in the storeBuffer. Since our + // request is non-bufferable (e.g., strictly ordered or atomic), + // we must wait for the other request in the storeBuffer to + // complete before we can issue this non-bufferable request. + // This is to make sure that the order they access the cache is + // correct. DPRINTF(MinorMem, "Memory access can receive forwarded data" - " from the store buffer, need to wait for store buffer to" - " drain\n"); + " from the store buffer, but need to wait for store buffer" + " to drain\n"); return; } } @@ -1469,9 +1477,21 @@ LSQ::needsToTick() void LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res) + uint64_t *res, AtomicOpFunctor *amo_op) { bool needs_burst = transferNeedsBurst(addr, size, lineWidth); + + if (needs_burst && inst->staticInst->isAtomic()) { + // AMO requests that access across a cache line boundary are not + // allowed since the cache does not guarantee AMO ops to be executed + // atomically in two cache lines + // For ISAs such as x86 that requires AMO operations to work on + // accesses that cross cache-line boundaries, the cache needs to be + // modified to support locking both cache lines to guarantee the + // atomicity. + panic("Do not expect cross-cache-line atomic memory request\n"); + } + LSQRequestPtr request; /* Copy given data into the request. The request will pass this to the @@ -1480,15 +1500,16 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, DPRINTF(MinorMem, "Pushing request (%s) addr: 0x%x size: %d flags:" " 0x%x%s lineWidth : 0x%x\n", - (isLoad ? "load" : "store"), addr, size, flags, + (isLoad ? "load" : "store/atomic"), addr, size, flags, (needs_burst ? 
" (needs burst)" : ""), lineWidth); if (!isLoad) { - /* request_data becomes the property of a ...DataRequest (see below) + /* Request_data becomes the property of a ...DataRequest (see below) * and destroyed by its destructor */ request_data = new uint8_t[size]; - if (flags & Request::STORE_NO_DATA) { - /* For cache zeroing, just use zeroed data */ + if (inst->staticInst->isAtomic() || + (flags & Request::STORE_NO_DATA)) { + /* For atomic or store-no-data, just use zeroed data */ std::memset(request_data, 0, size); } else { std::memcpy(request_data, data, size); @@ -1511,7 +1532,7 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, request->request->setVirt(0 /* asid */, addr, size, flags, cpu.dataMasterId(), /* I've no idea why we need the PC, but give it */ - inst->pc.instAddr()); + inst->pc.instAddr(), amo_op); requests.push(request); request->startAddrTranslation(); diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh index da873b4ac..11fa8774f 100644 --- a/src/cpu/minor/lsq.hh +++ b/src/cpu/minor/lsq.hh @@ -696,11 +696,11 @@ class LSQ : public Named void completeMemBarrierInst(MinorDynInstPtr inst, bool committed); - /** Single interface for readMem/writeMem to issue requests into + /** Single interface for readMem/writeMem/amoMem to issue requests into * the LSQ */ void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res); + uint64_t *res, AtomicOpFunctor *amo_op); /** Push a predicate failed-representing request into the queues just * to maintain commit order */ diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 4e32f865d..e624557c8 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -498,6 +498,8 @@ class DefaultCommit Stats::Vector statComRefs; /** Stat for the total number of committed loads. */ Stats::Vector statComLoads; + /** Stat for the total number of committed atomics. */ + Stats::Vector statComAmos; /** Total number of committed memory barriers. */ Stats::Vector statComMembars; /** Total number of committed branches. */ diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 2891ce331..ec3d61050 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -208,6 +208,13 @@ DefaultCommit<Impl>::regStats() .flags(total) ; + statComAmos + .init(cpu->numThreads) + .name(name() + ".amos") + .desc("Number of atomic instructions committed") + .flags(total) + ; + statComMembars .init(cpu->numThreads) .name(name() + ".membars") @@ -1158,8 +1165,9 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num) // Make sure we are only trying to commit un-executed instructions we // think are possible. assert(head_inst->isNonSpeculative() || head_inst->isStoreConditional() - || head_inst->isMemBarrier() || head_inst->isWriteBarrier() || - (head_inst->isLoad() && head_inst->strictlyOrdered())); + || head_inst->isMemBarrier() || head_inst->isWriteBarrier() + || head_inst->isAtomic() + || (head_inst->isLoad() && head_inst->strictlyOrdered())); DPRINTF(Commit, "Encountered a barrier or non-speculative " "instruction [sn:%lli] at the head of the ROB, PC %s.\n", @@ -1306,7 +1314,7 @@ DefaultCommit<Impl>::commitHead(const DynInstPtr &head_inst, unsigned inst_num) #endif // If this was a store, record it for this cycle. - if (head_inst->isStore()) + if (head_inst->isStore() || head_inst->isAtomic()) committedStores[tid] = true; // Return true to indicate that we have committed an instruction. 
@@ -1399,6 +1407,10 @@ DefaultCommit<Impl>::updateComInstStats(const DynInstPtr &inst) if (inst->isLoad()) { statComLoads[tid]++; } + + if (inst->isAtomic()) { + statComAmos[tid]++; + } } if (inst->isMemBarrier()) { diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index ec6be657a..21cae444b 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -793,10 +793,10 @@ class FullO3CPU : public BaseO3CPU /** CPU pushRequest function, forwards request to LSQ. */ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res) + uint64_t *res, AtomicOpFunctor *amo_op = nullptr) { return iew.ldstQueue.pushRequest(inst, isLoad, data, size, addr, - flags, res); + flags, res, amo_op); } /** CPU read function, forwards read to LSQ. */ diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 251389631..6434ec8c3 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -479,7 +479,8 @@ DefaultIEW<Impl>::squash(ThreadID tid) if (skidBuffer[tid].front()->isLoad()) { toRename->iewInfo[tid].dispatchedToLQ++; } - if (skidBuffer[tid].front()->isStore()) { + if (skidBuffer[tid].front()->isStore() || + skidBuffer[tid].front()->isAtomic()) { toRename->iewInfo[tid].dispatchedToSQ++; } @@ -862,7 +863,8 @@ DefaultIEW<Impl>::emptyRenameInsts(ThreadID tid) if (insts[tid].front()->isLoad()) { toRename->iewInfo[tid].dispatchedToLQ++; } - if (insts[tid].front()->isStore()) { + if (insts[tid].front()->isStore() || + insts[tid].front()->isAtomic()) { toRename->iewInfo[tid].dispatchedToSQ++; } @@ -1004,7 +1006,7 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid) if (inst->isLoad()) { toRename->iewInfo[tid].dispatchedToLQ++; } - if (inst->isStore()) { + if (inst->isStore() || inst->isAtomic()) { toRename->iewInfo[tid].dispatchedToSQ++; } @@ -1030,7 +1032,8 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid) } // Check LSQ if inst is LD/ST - if ((inst->isLoad() && ldstQueue.lqFull(tid)) || + if ((inst->isAtomic() && ldstQueue.sqFull(tid)) || + (inst->isLoad() && ldstQueue.lqFull(tid)) || (inst->isStore() && ldstQueue.sqFull(tid))) { DPRINTF(IEW, "[tid:%i]: Issue: %s has become full.\n",tid, inst->isLoad() ? "LQ" : "SQ"); @@ -1048,7 +1051,25 @@ DefaultIEW<Impl>::dispatchInsts(ThreadID tid) } // Otherwise issue the instruction just fine. - if (inst->isLoad()) { + if (inst->isAtomic()) { + DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction " + "encountered, adding to LSQ.\n", tid); + + ldstQueue.insertStore(inst); + + ++iewDispStoreInsts; + + // AMOs need to be set as "canCommit()" + // so that commit can process them when they reach the + // head of commit. + inst->setCanCommit(); + instQueue.insertNonSpec(inst); + add_to_iq = false; + + ++iewDispNonSpecInsts; + + toRename->iewInfo[tid].dispatchedToSQ++; + } else if (inst->isLoad()) { DPRINTF(IEW, "[tid:%i]: Issue: Memory instruction " "encountered, adding to LSQ.\n", tid); @@ -1243,7 +1264,20 @@ DefaultIEW<Impl>::executeInsts() "reference.\n"); // Tell the LDSTQ to execute this instruction (if it is a load). - if (inst->isLoad()) { + if (inst->isAtomic()) { + // AMOs are treated like store requests + fault = ldstQueue.executeStore(inst); + + if (inst->isTranslationDelayed() && + fault == NoFault) { + // A hw page table walk is currently going on; the + // instruction must be deferred. 
+ DPRINTF(IEW, "Execute: Delayed translation, deferring " + "store.\n"); + instQueue.deferMemInst(inst); + continue; + } + } else if (inst->isLoad()) { // Loads will mark themselves as executed, and their writeback // event adds the instruction to the queue to commit fault = ldstQueue.executeLoad(inst); diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index ddd7b6d5f..aa12297d6 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -1251,13 +1251,15 @@ InstructionQueue<Impl>::doSquash(ThreadID tid) bool is_acq_rel = squashed_inst->isMemBarrier() && (squashed_inst->isLoad() || - (squashed_inst->isStore() && + squashed_inst->isAtomic() || + (squashed_inst->isStore() && !squashed_inst->isStoreConditional())); // Remove the instruction from the dependency list. if (is_acq_rel || (!squashed_inst->isNonSpeculative() && !squashed_inst->isStoreConditional() && + !squashed_inst->isAtomic() && !squashed_inst->isMemBarrier() && !squashed_inst->isWriteBarrier())) { diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 81b7c04a5..f576dd3f4 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -191,7 +191,7 @@ class LSQ enum Flag : FlagsStorage { IsLoad = 0x00000001, - /** True if this is a store that writes registers (SC). */ + /** True if this is a store/atomic that writes registers (SC). */ WbStore = 0x00000002, Delayed = 0x00000004, IsSplit = 0x00000008, @@ -211,7 +211,9 @@ class LSQ LSQEntryFreed = 0x00000800, /** Store written back. */ WritebackScheduled = 0x00001000, - WritebackDone = 0x00002000 + WritebackDone = 0x00002000, + /** True if this is an atomic request */ + IsAtomic = 0x00004000 }; FlagsType flags; @@ -250,32 +252,39 @@ class LSQ const uint32_t _size; const Request::Flags _flags; uint32_t _numOutstandingPackets; + AtomicOpFunctor *_amo_op; protected: LSQUnit* lsqUnit() { return &_port; } LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad) : _state(State::NotIssued), _senderState(nullptr), _port(*port), _inst(inst), _data(nullptr), _res(nullptr), _addr(0), _size(0), _flags(0), - _numOutstandingPackets(0) + _numOutstandingPackets(0), _amo_op(nullptr) { flags.set(Flag::IsLoad, isLoad); - flags.set(Flag::WbStore, _inst->isStoreConditional()); + flags.set(Flag::WbStore, + _inst->isStoreConditional() || _inst->isAtomic()); + flags.set(Flag::IsAtomic, _inst->isAtomic()); install(); } LSQRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, - PacketDataPtr data = nullptr, uint64_t* res = nullptr) + PacketDataPtr data = nullptr, uint64_t* res = nullptr, + AtomicOpFunctor* amo_op = nullptr) : _state(State::NotIssued), _senderState(nullptr), numTranslatedFragments(0), numInTranslationFragments(0), _port(*port), _inst(inst), _data(data), _res(res), _addr(addr), _size(size), _flags(flags_), - _numOutstandingPackets(0) + _numOutstandingPackets(0), + _amo_op(amo_op) { flags.set(Flag::IsLoad, isLoad); - flags.set(Flag::WbStore, _inst->isStoreConditional()); + flags.set(Flag::WbStore, + _inst->isStoreConditional() || _inst->isAtomic()); + flags.set(Flag::IsAtomic, _inst->isAtomic()); install(); } @@ -285,12 +294,20 @@ class LSQ return flags.isSet(Flag::IsLoad); } + bool + isAtomic() const + { + return flags.isSet(Flag::IsAtomic); + } + /** Install the request in the LQ/SQ. 
*/ void install() { if (isLoad()) { _port.loadQueue[_inst->lqIdx].setRequest(this); } else { + // Store, StoreConditional, and Atomic requests are pushed + // to this storeQueue _port.storeQueue[_inst->sqIdx].setRequest(this); } } @@ -609,17 +626,21 @@ class LSQ using LSQRequest::numInTranslationFragments; using LSQRequest::numTranslatedFragments; using LSQRequest::_numOutstandingPackets; + using LSQRequest::_amo_op; public: SingleDataRequest(LSQUnit* port, const DynInstPtr& inst, bool isLoad, const Addr& addr, const uint32_t& size, const Request::Flags& flags_, PacketDataPtr data = nullptr, - uint64_t* res = nullptr) : - LSQRequest(port, inst, isLoad, addr, size, flags_, data, res) + uint64_t* res = nullptr, + AtomicOpFunctor* amo_op = nullptr) : + LSQRequest(port, inst, isLoad, addr, size, flags_, data, res, + amo_op) { LSQRequest::_requests.push_back( - std::make_shared<Request>(inst->getASID(), addr, size, flags_, - inst->masterId(), inst->instAddr(), inst->contextId())); + std::make_shared<Request>(inst->getASID(), addr, size, + flags_, inst->masterId(), inst->instAddr(), + inst->contextId(), amo_op)); LSQRequest::_requests.back()->setReqInstSeqNum(inst->seqNum); } inline virtual ~SingleDataRequest() {} @@ -928,7 +949,7 @@ class LSQ Fault pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res); + uint64_t *res, AtomicOpFunctor *amo_op); /** The CPU pointer. */ O3CPU *cpu; diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 8a221a8d5..abe751c88 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -680,13 +680,26 @@ template<class Impl> Fault LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res) + uint64_t *res, AtomicOpFunctor *amo_op) { + // This comming request can be either load, store or atomic. + // Atomic request has a corresponding pointer to its atomic memory + // operation + bool isAtomic = !isLoad && amo_op; + ThreadID tid = cpu->contextToThread(inst->contextId()); auto cacheLineSize = cpu->cacheLineSize(); bool needs_burst = transferNeedsBurst(addr, size, cacheLineSize); LSQRequest* req = nullptr; + // Atomic requests that access data across cache line boundary are + // currently not allowed since the cache does not guarantee corresponding + // atomic memory operations to be executed atomically across a cache line. + // For ISAs such as x86 that supports cross-cache-line atomic instructions, + // the cache needs to be modified to perform atomic update to both cache + // lines. For now, such cross-line update is not supported. + assert(!isAtomic || (isAtomic && !needs_burst)); + if (inst->translationStarted()) { req = inst->savedReq; assert(req); @@ -696,7 +709,7 @@ LSQ<Impl>::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, size, flags, data, res); } else { req = new SingleDataRequest(&thread[tid], inst, isLoad, addr, - size, flags, data, res); + size, flags, data, res, amo_op); } assert(req); inst->setRequest(); diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 5b90da4f5..3be67bec4 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -702,10 +702,12 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx) bool lower_load_has_store_part = req_s < st_e; bool upper_load_has_store_part = req_e > st_s; - // If the store's data has all of the data needed and the load - // isn't LLSC then - // we can forward. 
- if (store_has_lower_limit && store_has_upper_limit && + // If the store entry is not atomic (atomic does not have valid + // data), the store has all of the data needed, and + // the load is not LLSC, then + // we can forward data from the store to the load + if (!store_it->instruction()->isAtomic() && + store_has_lower_limit && store_has_upper_limit && !req->mainRequest()->isLLSC()) { // Get shift amount for offset into the store's data. @@ -755,17 +757,22 @@ LSQUnit<Impl>::read(LSQRequest *req, int load_idx) return NoFault; } else if ( + // This is the partial store-load forwarding case where a store + // has only part of the load's data and the load isn't LLSC (!req->mainRequest()->isLLSC() && ((store_has_lower_limit && lower_load_has_store_part) || (store_has_upper_limit && upper_load_has_store_part) || (lower_load_has_store_part && upper_load_has_store_part))) || + // The load is LLSC, and the store has all or part of the + // load's data (req->mainRequest()->isLLSC() && ((store_has_lower_limit || upper_load_has_store_part) && - (store_has_upper_limit || lower_load_has_store_part)))) { - // This is the partial store-load forwarding case where a store - // has only part of the load's data and the load isn't LLSC or - // the load is LLSC and the store has all or part of the load's + (store_has_upper_limit || lower_load_has_store_part))) || + // The store entry is atomic and has all or part of the load's // data + (store_it->instruction()->isAtomic() && + ((store_has_lower_limit || upper_load_has_store_part) && + (store_has_upper_limit || lower_load_has_store_part)))) { // If it's already been written back, then don't worry about // stalling on it. @@ -857,8 +864,10 @@ LSQUnit<Impl>::write(LSQRequest *req, uint8_t *data, int store_idx) storeQueue[store_idx].isAllZeros() = store_no_data; assert(size <= SQEntry::DataSize || store_no_data); + // copy data into the storeQueue only if the store request has valid data if (!(req->request()->getFlags() & Request::CACHE_BLOCK_ZERO) && - !req->request()->isCacheMaintenance()) + !req->request()->isCacheMaintenance() && + !req->request()->isAtomic()) memcpy(storeQueue[store_idx].data(), data, size); // This function only writes the data to the store queue, so no fault diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 9756a9ef1..48179ceb8 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -124,16 +124,19 @@ LSQUnit<Impl>::completeDataAccess(PacketPtr pkt) assert(!cpu->switchedOut()); if (!inst->isSquashed()) { if (state->needWB) { - // Only loads and store conditionals perform the writeback + // Only loads, store conditionals and atomics perform the writeback // after receving the response from the memory - assert(inst->isLoad() || inst->isStoreConditional()); + assert(inst->isLoad() || inst->isStoreConditional() || + inst->isAtomic()); writeback(inst, state->request()->mainPacket()); - if (inst->isStore()) { + if (inst->isStore() || inst->isAtomic()) { auto ss = dynamic_cast<SQSenderState*>(state); ss->writebackDone(); completeStore(ss->idx); } } else if (inst->isStore()) { + // This is a regular store (i.e., not store conditionals and + // atomics), so it can complete without writing back completeStore(dynamic_cast<SQSenderState*>(state)->idx); } } @@ -274,7 +277,7 @@ LSQUnit<Impl>::insert(const DynInstPtr &inst) { assert(inst->isMemRef()); - assert(inst->isLoad() || inst->isStore()); + assert(inst->isLoad() || inst->isStore() || inst->isAtomic()); if (inst->isLoad()) { insertLoad(inst); @@ 
-614,8 +617,8 @@ LSQUnit<Impl>::executeStore(const DynInstPtr &store_inst) assert(store_fault == NoFault); - if (store_inst->isStoreConditional()) { - // Store conditionals need to set themselves as able to + if (store_inst->isStoreConditional() || store_inst->isAtomic()) { + // Store conditionals and Atomics need to set themselves as able to // writeback if we haven't had a fault by here. storeQueue[store_idx].canWB() = true; @@ -751,8 +754,8 @@ LSQUnit<Impl>::writebackStores() state->inst = inst; req->senderState(state); - if (inst->isStoreConditional()) { - /* Only store conditionals need a writeback. */ + if (inst->isStoreConditional() || inst->isAtomic()) { + /* Only store conditionals and atomics need a writeback. */ state->needWB = true; } } diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index 26c4b4d6e..f1d0e2313 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -191,11 +191,11 @@ MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst) // Check any barriers and the dependence predictor for any // producing memrefs/stores. InstSeqNum producing_store; - if (inst->isLoad() && loadBarrier) { + if ((inst->isLoad() || inst->isAtomic()) && loadBarrier) { DPRINTF(MemDepUnit, "Load barrier [sn:%lli] in flight\n", loadBarrierSN); producing_store = loadBarrierSN; - } else if (inst->isStore() && storeBarrier) { + } else if ((inst->isStore() || inst->isAtomic()) && storeBarrier) { DPRINTF(MemDepUnit, "Store barrier [sn:%lli] in flight\n", storeBarrierSN); producing_store = storeBarrierSN; @@ -252,8 +252,8 @@ MemDepUnit<MemDepPred, Impl>::insert(const DynInstPtr &inst) } } - if (inst->isStore()) { - DPRINTF(MemDepUnit, "Inserting store PC %s [sn:%lli].\n", + if (inst->isStore() || inst->isAtomic()) { + DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n", inst->pcState(), inst->seqNum); depPred.insertStore(inst->instAddr(), inst->seqNum, inst->threadNumber); @@ -288,8 +288,8 @@ MemDepUnit<MemDepPred, Impl>::insertNonSpec(const DynInstPtr &inst) // Might want to turn this part into an inline function or something. // It's shared between both insert functions. - if (inst->isStore()) { - DPRINTF(MemDepUnit, "Inserting store PC %s [sn:%lli].\n", + if (inst->isStore() || inst->isAtomic()) { + DPRINTF(MemDepUnit, "Inserting store/atomic PC %s [sn:%lli].\n", inst->pcState(), inst->seqNum); depPred.insertStore(inst->instAddr(), inst->seqNum, inst->threadNumber); @@ -451,8 +451,9 @@ template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::wakeDependents(const DynInstPtr &inst) { - // Only stores and barriers have dependents. - if (!inst->isStore() && !inst->isMemBarrier() && !inst->isWriteBarrier()) { + // Only stores, atomics and barriers have dependents. 
+ if (!inst->isStore() && !inst->isAtomic() && !inst->isMemBarrier() && + !inst->isWriteBarrier()) { return; } diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index fd9b09e20..c24a09711 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -647,7 +647,7 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) } } - if (inst->isStore()) { + if (inst->isStore() || inst->isAtomic()) { if (calcFreeSQEntries(tid) <= 0) { DPRINTF(Rename, "[tid:%u]: Cannot rename due to no free SQ\n"); source = SQ; @@ -741,12 +741,12 @@ DefaultRename<Impl>::renameInsts(ThreadID tid) renameDestRegs(inst, inst->threadNumber); - if (inst->isLoad()) { - loadsInProgress[tid]++; - } - if (inst->isStore()) { - storesInProgress[tid]++; + if (inst->isAtomic() || inst->isStore()) { + storesInProgress[tid]++; + } else if (inst->isLoad()) { + loadsInProgress[tid]++; } + ++renamed_insts; // Notify potential listeners that source and destination registers for // this instruction have been renamed. diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc index e91fafbcc..caf2427ef 100644 --- a/src/cpu/simple/atomic.cc +++ b/src/cpu/simple/atomic.cc @@ -72,6 +72,7 @@ AtomicSimpleCPU::init() ifetch_req->setContext(cid); data_read_req->setContext(cid); data_write_req->setContext(cid); + data_amo_req->setContext(cid); } AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p) @@ -90,6 +91,7 @@ AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p) ifetch_req = std::make_shared<Request>(); data_read_req = std::make_shared<Request>(); data_write_req = std::make_shared<Request>(); + data_amo_req = std::make_shared<Request>(); } @@ -417,14 +419,6 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size, } Fault -AtomicSimpleCPU::initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) -{ - panic("initiateMemRead() is for timing accesses, and should " - "never be called on AtomicSimpleCPU.\n"); -} - -Fault AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) { @@ -534,6 +528,70 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr, } } +Fault +AtomicSimpleCPU::amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, AtomicOpFunctor *amo_op) +{ + SimpleExecContext& t_info = *threadInfo[curThread]; + SimpleThread* thread = t_info.thread; + + // use the CPU's statically allocated amo request and packet objects + const RequestPtr &req = data_amo_req; + + if (traceData) + traceData->setMem(addr, size, flags); + + //The address of the second part of this access if it needs to be split + //across a cache line boundary. + Addr secondAddr = roundDown(addr + size - 1, cacheLineSize()); + + // AMO requests that access across a cache line boundary are not + // allowed since the cache does not guarantee AMO ops to be executed + // atomically in two cache lines + // For ISAs such as x86 that requires AMO operations to work on + // accesses that cross cache-line boundaries, the cache needs to be + // modified to support locking both cache lines to guarantee the + // atomicity. + if (secondAddr > addr) { + panic("AMO request should not access across a cache line boundary\n"); + } + + dcache_latency = 0; + + req->taskId(taskId()); + req->setVirt(0, addr, size, flags, dataMasterId(), + thread->pcState().instAddr(), amo_op); + + // translate to physical address + Fault fault = thread->dtb->translateAtomic(req, thread->getTC(), + BaseTLB::Write); + + // Now do the access. 
+ if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) { + // We treat AMO accesses as Write accesses with SwapReq command + // data will hold the return data of the AMO access + Packet pkt(req, Packet::makeWriteCmd(req)); + pkt.dataStatic(data); + + if (req->isMmappedIpr()) + dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt); + else { + dcache_latency += sendPacket(dcachePort, &pkt); + } + + dcache_access = true; + + assert(!pkt.isError()); + assert(!req->isLLSC()); + } + + if (fault != NoFault && req->isPrefetch()) { + return NoFault; + } + + //If there's a fault and we're not doing prefetch, return it + return fault; +} void AtomicSimpleCPU::tick() @@ -550,6 +608,7 @@ AtomicSimpleCPU::tick() ifetch_req->setContext(cid); data_read_req->setContext(cid); data_write_req->setContext(cid); + data_amo_req->setContext(cid); } SimpleExecContext& t_info = *threadInfo[curThread]; diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh index a5151aa18..84f379121 100644 --- a/src/cpu/simple/atomic.hh +++ b/src/cpu/simple/atomic.hh @@ -163,6 +163,7 @@ class AtomicSimpleCPU : public BaseSimpleCPU RequestPtr ifetch_req; RequestPtr data_read_req; RequestPtr data_write_req; + RequestPtr data_amo_req; bool dcache_access; Tick dcache_latency; @@ -197,12 +198,12 @@ class AtomicSimpleCPU : public BaseSimpleCPU Fault readMem(Addr addr, uint8_t *data, unsigned size, Request::Flags flags) override; - Fault initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) override; - Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) override; + Fault amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, AtomicOpFunctor *amo_op) override; + void regProbePoints() override; /** diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index f71277d1c..422c73298 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -644,7 +644,7 @@ BaseSimpleCPU::postExecute() t_info.numLoadInsts++; } - if (curStaticInst->isStore()){ + if (curStaticInst->isStore() || curStaticInst->isAtomic()){ t_info.numStoreInsts++; } /* End power model statistics */ diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh index e62fcf4d1..8060b07ad 100644 --- a/src/cpu/simple/base.hh +++ b/src/cpu/simple/base.hh @@ -143,13 +143,26 @@ class BaseSimpleCPU : public BaseCPU void startup() override; virtual Fault readMem(Addr addr, uint8_t* data, unsigned size, - Request::Flags flags) = 0; + Request::Flags flags) + { panic("readMem() is not implemented\n"); } virtual Fault initiateMemRead(Addr addr, unsigned size, - Request::Flags flags) = 0; + Request::Flags flags) + { panic("initiateMemRead() is not implemented\n"); } virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr, - Request::Flags flags, uint64_t* res) = 0; + Request::Flags flags, uint64_t* res) + { panic("writeMem() is not implemented\n"); } + + virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { panic("amoMem() is not implemented\n"); } + + virtual Fault initiateMemAMO(Addr addr, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) + { panic("initiateMemAMO() is not implemented\n"); } void countInst(); Counter totalInsts() const override; diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh index 0552dc0c6..de5cc7fd7 100644 --- a/src/cpu/simple/exec_context.hh +++ b/src/cpu/simple/exec_context.hh @@ -456,6 +456,19 @@ class SimpleExecContext : public ExecContext { 
return cpu->writeMem(data, size, addr, flags, res); } + Fault amoMem(Addr addr, uint8_t *data, unsigned int size, + Request::Flags flags, AtomicOpFunctor *amo_op) override + { + return cpu->amoMem(addr, data, size, flags, amo_op); + } + + Fault initiateMemAMO(Addr addr, unsigned int size, + Request::Flags flags, + AtomicOpFunctor *amo_op) override + { + return cpu->initiateMemAMO(addr, size, flags, amo_op); + } + /** * Sets the number of consecutive store conditional failures. */ diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc index b5450cf5f..637308a96 100644 --- a/src/cpu/simple/timing.cc +++ b/src/cpu/simple/timing.cc @@ -293,6 +293,7 @@ TimingSimpleCPU::sendData(const RequestPtr &req, uint8_t *data, uint64_t *res, PacketPtr pkt = buildPacket(req, read); pkt->dataDynamic<uint8_t>(data); + if (req->getFlags().isSet(Request::NO_ACCESS)) { assert(!dcache_pkt); pkt->makeResponse(); @@ -415,14 +416,6 @@ TimingSimpleCPU::buildSplitPacket(PacketPtr &pkt1, PacketPtr &pkt2, } Fault -TimingSimpleCPU::readMem(Addr addr, uint8_t *data, - unsigned size, Request::Flags flags) -{ - panic("readMem() is for atomic accesses, and should " - "never be called on TimingSimpleCPU.\n"); -} - -Fault TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size, Request::Flags flags) { @@ -556,6 +549,54 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size, return NoFault; } +Fault +TimingSimpleCPU::initiateMemAMO(Addr addr, unsigned size, + Request::Flags flags, + AtomicOpFunctor *amo_op) +{ + SimpleExecContext &t_info = *threadInfo[curThread]; + SimpleThread* thread = t_info.thread; + + Fault fault; + const int asid = 0; + const Addr pc = thread->instAddr(); + unsigned block_size = cacheLineSize(); + BaseTLB::Mode mode = BaseTLB::Write; + + if (traceData) + traceData->setMem(addr, size, flags); + + RequestPtr req = make_shared<Request>(asid, addr, size, flags, + dataMasterId(), pc, thread->contextId(), amo_op); + + assert(req->hasAtomicOpFunctor()); + + req->taskId(taskId()); + + Addr split_addr = roundDown(addr + size - 1, block_size); + + // AMO requests that access across a cache line boundary are not + // allowed since the cache does not guarantee AMO ops to be executed + // atomically in two cache lines + // For ISAs such as x86 that requires AMO operations to work on + // accesses that cross cache-line boundaries, the cache needs to be + // modified to support locking both cache lines to guarantee the + // atomicity. 
+ if (split_addr > addr) { + panic("AMO requests should not access across a cache line boundary\n"); + } + + _status = DTBWaitResponse; + + WholeTranslationState *state = + new WholeTranslationState(req, new uint8_t[size], NULL, mode); + DataTranslation<TimingSimpleCPU *> *translation + = new DataTranslation<TimingSimpleCPU *>(this, state); + thread->dtb->translateTiming(req, thread->getTC(), translation, mode); + + return NoFault; +} + void TimingSimpleCPU::threadSnoop(PacketPtr pkt, ThreadID sender) { diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh index 0300d38eb..ce0a4dbfc 100644 --- a/src/cpu/simple/timing.hh +++ b/src/cpu/simple/timing.hh @@ -282,15 +282,15 @@ class TimingSimpleCPU : public BaseSimpleCPU void activateContext(ThreadID thread_num) override; void suspendContext(ThreadID thread_num) override; - Fault readMem(Addr addr, uint8_t *data, unsigned size, - Request::Flags flags) override; - Fault initiateMemRead(Addr addr, unsigned size, Request::Flags flags) override; Fault writeMem(uint8_t *data, unsigned size, Addr addr, Request::Flags flags, uint64_t *res) override; + Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags, + AtomicOpFunctor *amo_op) override; + void fetch(); void sendFetch(const Fault &fault, const RequestPtr &req, ThreadContext *tc); |
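Taken together, the SimpleCPU changes above expose two call styles: atomic-mode amoMem() performs the whole access inline and returns the old value through the caller's data buffer, while timing-mode initiateMemAMO() only starts translation and the old value arrives later through the normal writeback path. The minimal standalone sketch below illustrates that contrast; FakeMemory, TimingCtx, and the callback shapes are assumptions for illustration, not gem5 classes.

    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <queue>

    using AmoOp = std::function<uint32_t(uint32_t)>;  // old value -> new value

    struct FakeMemory {
        uint32_t word = 10;
    };

    // Atomic-mode flavour: do everything now; *data receives the old value.
    void amoMem(FakeMemory &mem, uint32_t *data, const AmoOp &op)
    {
        uint32_t old = mem.word;
        mem.word = op(old);
        *data = old;  // the destination register gets the old value
    }

    // Timing-mode flavour: only queue the access; the "response" arrives on a
    // later tick and triggers the deferred writeback of the old value.
    struct TimingCtx {
        FakeMemory &mem;
        std::queue<std::function<void()>> pending;

        void initiateMemAMO(const AmoOp &op,
                            std::function<void(uint32_t)> writeback)
        {
            pending.push([this, op, writeback] {
                uint32_t old = mem.word;
                mem.word = op(old);
                writeback(old);
            });
        }

        void tick()
        {
            if (!pending.empty()) {
                pending.front()();
                pending.pop();
            }
        }
    };

    int main()
    {
        AmoOp amoadd = [](uint32_t old) { return old + 5; };

        FakeMemory m1;
        uint32_t rd = 0;
        amoMem(m1, &rd, amoadd);
        std::cout << "atomic-mode: rd=" << rd << " mem=" << m1.word << "\n";

        FakeMemory m2;
        TimingCtx ctx{m2};
        ctx.initiateMemAMO(amoadd, [](uint32_t old) {
            std::cout << "timing-mode writeback: old=" << old << "\n";
        });
        ctx.tick();
        std::cout << "timing-mode: mem=" << m2.word << "\n";
        return 0;
    }

The deferred shape on the timing side is also why the O3 changes above mark AMOs as needing a writeback (WbStore / needWB), exactly like store conditionals.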