cpu: support atomic memory request type with AtomicOpFunctor

This patch enables all 4 CPU models (AtomicSimpleCPU, TimingSimpleCPU, MinorCPU and DerivO3CPU) to issue atomic memory (AMO) requests to the memory system. An atomic memory instruction is treated as a special store instruction in all CPU models. In the simple CPUs, an AMO request with an associated AtomicOpFunctor is simply sent to the L1 dcache. In MinorCPU, an AMO request bypasses the store buffer and waits for any conflicting store request(s) currently in the store buffer to retire before the AMO request is sent to the cache. AMO requests are not buffered in the store buffer, so their effects appear immediately in the cache. In DerivO3CPU, an AMO request is inserted in the store buffer so that it is delivered to the cache only after all previous stores are issued to the cache. Data forwarding between an outstanding AMO in the store buffer and a subsequent load is not allowed since the AMO request does not hold valid data until it is executed in the cache. This implementation assumes that a target ISA implementation inserts enough memory fences as micro-ops around an atomic instruction to enforce a correct order of memory instructions with respect to its memory consistency model. Without extra memory fences, this implementation can allow AMOs and other memory instructions that do not conflict (i.e., do not target the same address) to reorder. This implementation also assumes that atomic instructions execute within a cache line boundary, since the cache is currently not able to execute an operation on two different cache lines in a single step. Therefore, ISAs like x86 that require multi-cache-line atomic instructions need to either use a pair of locking load and unlocking store or change the cache implementation to guarantee the atomicity of an atomic instruction. 
Change-Id: Ib8a7c81868ac05b98d73afc7d16eb88486f8cf9a Reviewed-on: https://gem5-review.googlesource.com/c/8188 Reviewed-by: Giacomo Travaglini <giacomo.travaglini@arm.com> Maintainer: Jason Lowe-Power <jason@lowepower.com>
author: Tuan Ta <qtt2@cornell.edu> 2018-01-22 13:12:50 -0500
committer: Tuan Ta <qtt2@cornell.edu> 2019-02-08 15:27:04 +0000
commit: 25dc765889d948693995cfa622f001aa94b5364b (patch)
tree: 38a8e93881ad150a482020a1fd706d664ee0c061 /src/cpu/simple
parent: 165a7dab558c8118622a387683521bea1ebf2e6c (diff)
download: gem5-25dc765889d948693995cfa622f001aa94b5364b.tar.xz
7 files changed, 153 insertions, 26 deletions
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index e91fafbcc..caf2427ef 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -72,6 +72,7 @@ AtomicSimpleCPU::init()
     ifetch_req->setContext(cid);
     data_read_req->setContext(cid);
     data_write_req->setContext(cid);
+    data_amo_req->setContext(cid);
 }
 
 AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
@@ -90,6 +91,7 @@ AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
     ifetch_req = std::make_shared<Request>();
     data_read_req = std::make_shared<Request>();
     data_write_req = std::make_shared<Request>();
+    data_amo_req = std::make_shared<Request>();
 }
 
 
@@ -417,14 +419,6 @@ AtomicSimpleCPU::readMem(Addr addr, uint8_t * data, unsigned size,
 }
 
 Fault
-AtomicSimpleCPU::initiateMemRead(Addr addr, unsigned size,
-                                 Request::Flags flags)
-{
-    panic("initiateMemRead() is for timing accesses, and should "
-          "never be called on AtomicSimpleCPU.\n");
-}
-
-Fault
 AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
                           Request::Flags flags, uint64_t *res)
 {
@@ -534,6 +528,70 @@ AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size, Addr addr,
     }
 }
 
+Fault
+AtomicSimpleCPU::amoMem(Addr addr, uint8_t* data, unsigned size,
+                        Request::Flags flags, AtomicOpFunctor *amo_op)
+{
+    SimpleExecContext& t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
+
+    // use the CPU's statically allocated amo request and packet objects
+    const RequestPtr &req = data_amo_req;
+
+    if (traceData)
+        traceData->setMem(addr, size, flags);
+
+    //The address of the second part of this access if it needs to be split
+    //across a cache line boundary.
+    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
+
+    // AMO requests that access across a cache line boundary are not
+    // allowed since the cache does not guarantee AMO ops to be executed
+    // atomically in two cache lines
+    // For ISAs such as x86 that requires AMO operations to work on
+    // accesses that cross cache-line boundaries, the cache needs to be
+    // modified to support locking both cache lines to guarantee the
+    // atomicity.
+    if (secondAddr > addr) {
+        panic("AMO request should not access across a cache line boundary\n");
+    }
+
+    dcache_latency = 0;
+
+    req->taskId(taskId());
+    req->setVirt(0, addr, size, flags, dataMasterId(),
+                 thread->pcState().instAddr(), amo_op);
+
+    // translate to physical address
+    Fault fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                      BaseTLB::Write);
+
+    // Now do the access.
+    if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
+        // We treat AMO accesses as Write accesses with SwapReq command
+        // data will hold the return data of the AMO access
+        Packet pkt(req, Packet::makeWriteCmd(req));
+        pkt.dataStatic(data);
+
+        if (req->isMmappedIpr())
+            dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt);
+        else {
+            dcache_latency += sendPacket(dcachePort, &pkt);
+        }
+
+        dcache_access = true;
+
+        assert(!pkt.isError());
+        assert(!req->isLLSC());
+    }
+
+    if (fault != NoFault && req->isPrefetch()) {
+        return NoFault;
+    }
+
+    //If there's a fault and we're not doing prefetch, return it
+    return fault;
+}
 
 void
 AtomicSimpleCPU::tick()
@@ -550,6 +608,7 @@ AtomicSimpleCPU::tick()
         ifetch_req->setContext(cid);
         data_read_req->setContext(cid);
         data_write_req->setContext(cid);
+        data_amo_req->setContext(cid);
     }
 
     SimpleExecContext& t_info = *threadInfo[curThread];
diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh
index a5151aa18..84f379121 100644
--- a/src/cpu/simple/atomic.hh
+++ b/src/cpu/simple/atomic.hh
@@ -163,6 +163,7 @@ class AtomicSimpleCPU : public BaseSimpleCPU
     RequestPtr ifetch_req;
     RequestPtr data_read_req;
     RequestPtr data_write_req;
+    RequestPtr data_amo_req;
 
     bool dcache_access;
     Tick dcache_latency;
@@ -197,12 +198,12 @@ class AtomicSimpleCPU : public BaseSimpleCPU
     Fault readMem(Addr addr, uint8_t *data, unsigned size,
                   Request::Flags flags) override;
 
-    Fault initiateMemRead(Addr addr, unsigned size,
-                          Request::Flags flags) override;
-
     Fault writeMem(uint8_t *data, unsigned size,
                    Addr addr, Request::Flags flags, uint64_t *res) override;
 
+    Fault amoMem(Addr addr, uint8_t* data, unsigned size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op) override;
+
     void regProbePoints() override;
 
     /**
diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc
index f71277d1c..422c73298 100644
--- a/src/cpu/simple/base.cc
+++ b/src/cpu/simple/base.cc
@@ -644,7 +644,7 @@ BaseSimpleCPU::postExecute()
         t_info.numLoadInsts++;
     }
 
-    if (curStaticInst->isStore()){
+    if (curStaticInst->isStore() || curStaticInst->isAtomic()){
         t_info.numStoreInsts++;
     }
     /* End power model statistics */
diff --git a/src/cpu/simple/base.hh b/src/cpu/simple/base.hh
index e62fcf4d1..8060b07ad 100644
--- a/src/cpu/simple/base.hh
+++ b/src/cpu/simple/base.hh
@@ -143,13 +143,26 @@ class BaseSimpleCPU : public BaseCPU
     void startup() override;
 
     virtual Fault readMem(Addr addr, uint8_t* data, unsigned size,
-                          Request::Flags flags) = 0;
+                          Request::Flags flags)
+    { panic("readMem() is not implemented\n"); }
 
     virtual Fault initiateMemRead(Addr addr, unsigned size,
-                                  Request::Flags flags) = 0;
+                                  Request::Flags flags)
+    { panic("initiateMemRead() is not implemented\n"); }
 
     virtual Fault writeMem(uint8_t* data, unsigned size, Addr addr,
-                           Request::Flags flags, uint64_t* res) = 0;
+                           Request::Flags flags, uint64_t* res)
+    { panic("writeMem() is not implemented\n"); }
+
+    virtual Fault amoMem(Addr addr, uint8_t* data, unsigned size,
+                         Request::Flags flags,
+                         AtomicOpFunctor *amo_op)
+    { panic("amoMem() is not implemented\n"); }
+
+    virtual Fault initiateMemAMO(Addr addr, unsigned size,
+                                 Request::Flags flags,
+                                 AtomicOpFunctor *amo_op)
+    { panic("initiateMemAMO() is not implemented\n"); }
 
     void countInst();
     Counter totalInsts() const override;
diff --git a/src/cpu/simple/exec_context.hh b/src/cpu/simple/exec_context.hh
index 0552dc0c6..de5cc7fd7 100644
--- a/src/cpu/simple/exec_context.hh
+++ b/src/cpu/simple/exec_context.hh
@@ -456,6 +456,19 @@ class SimpleExecContext : public ExecContext {
         return cpu->writeMem(data, size, addr, flags, res);
     }
 
+    Fault amoMem(Addr addr, uint8_t *data, unsigned int size,
+                 Request::Flags flags, AtomicOpFunctor *amo_op) override
+    {
+        return cpu->amoMem(addr, data, size, flags, amo_op);
+    }
+
+    Fault initiateMemAMO(Addr addr, unsigned int size,
+                         Request::Flags flags,
+                         AtomicOpFunctor *amo_op) override
+    {
+        return cpu->initiateMemAMO(addr, size, flags, amo_op);
+    }
+
     /**
      * Sets the number of consecutive store conditional failures.
      */
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index b5450cf5f..637308a96 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -293,6 +293,7 @@ TimingSimpleCPU::sendData(const RequestPtr &req, uint8_t *data, uint64_t *res,
 
     PacketPtr pkt = buildPacket(req, read);
     pkt->dataDynamic<uint8_t>(data);
+
     if (req->getFlags().isSet(Request::NO_ACCESS)) {
         assert(!dcache_pkt);
         pkt->makeResponse();
@@ -415,14 +416,6 @@ TimingSimpleCPU::buildSplitPacket(PacketPtr &pkt1, PacketPtr &pkt2,
 }
 
 Fault
-TimingSimpleCPU::readMem(Addr addr, uint8_t *data,
-                         unsigned size, Request::Flags flags)
-{
-    panic("readMem() is for atomic accesses, and should "
-          "never be called on TimingSimpleCPU.\n");
-}
-
-Fault
 TimingSimpleCPU::initiateMemRead(Addr addr, unsigned size,
                                  Request::Flags flags)
 {
@@ -556,6 +549,54 @@ TimingSimpleCPU::writeMem(uint8_t *data, unsigned size,
     return NoFault;
 }
 
+Fault
+TimingSimpleCPU::initiateMemAMO(Addr addr, unsigned size,
+                                Request::Flags flags,
+                                AtomicOpFunctor *amo_op)
+{
+    SimpleExecContext &t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
+
+    Fault fault;
+    const int asid = 0;
+    const Addr pc = thread->instAddr();
+    unsigned block_size = cacheLineSize();
+    BaseTLB::Mode mode = BaseTLB::Write;
+
+    if (traceData)
+        traceData->setMem(addr, size, flags);
+
+    RequestPtr req = make_shared<Request>(asid, addr, size, flags,
+                            dataMasterId(), pc, thread->contextId(), amo_op);
+
+    assert(req->hasAtomicOpFunctor());
+
+    req->taskId(taskId());
+
+    Addr split_addr = roundDown(addr + size - 1, block_size);
+
+    // AMO requests that access across a cache line boundary are not
+    // allowed since the cache does not guarantee AMO ops to be executed
+    // atomically in two cache lines
+    // For ISAs such as x86 that requires AMO operations to work on
+    // accesses that cross cache-line boundaries, the cache needs to be
+    // modified to support locking both cache lines to guarantee the
+    // atomicity.
+    if (split_addr > addr) {
+        panic("AMO requests should not access across a cache line boundary\n");
+    }
+
+    _status = DTBWaitResponse;
+
+    WholeTranslationState *state =
+        new WholeTranslationState(req, new uint8_t[size], NULL, mode);
+    DataTranslation<TimingSimpleCPU *> *translation
+        = new DataTranslation<TimingSimpleCPU *>(this, state);
+    thread->dtb->translateTiming(req, thread->getTC(), translation, mode);
+
+    return NoFault;
+}
+
 void
 TimingSimpleCPU::threadSnoop(PacketPtr pkt, ThreadID sender)
 {
diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh
index 0300d38eb..ce0a4dbfc 100644
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -282,15 +282,15 @@ class TimingSimpleCPU : public BaseSimpleCPU
     void activateContext(ThreadID thread_num) override;
     void suspendContext(ThreadID thread_num) override;
 
-    Fault readMem(Addr addr, uint8_t *data, unsigned size,
-                  Request::Flags flags) override;
-
     Fault initiateMemRead(Addr addr, unsigned size,
                           Request::Flags flags) override;
 
     Fault writeMem(uint8_t *data, unsigned size,
                    Addr addr, Request::Flags flags, uint64_t *res) override;
 
+    Fault initiateMemAMO(Addr addr, unsigned size, Request::Flags flags,
+                         AtomicOpFunctor *amo_op) override;
+
     void fetch();
     void sendFetch(const Fault &fault,
                    const RequestPtr &req, ThreadContext *tc);
author	Tuan Ta <qtt2@cornell.edu>	2018-01-22 13:12:50 -0500
committer	Tuan Ta <qtt2@cornell.edu>	2019-02-08 15:27:04 +0000
commit	25dc765889d948693995cfa622f001aa94b5364b (patch)
tree	38a8e93881ad150a482020a1fd706d664ee0c061 /src/cpu/simple
parent	165a7dab558c8118622a387683521bea1ebf2e6c (diff)
download	gem5-25dc765889d948693995cfa622f001aa94b5364b.tar.xz