34 files changed, 657 insertions, 360 deletions
diff --git a/src/cpu/SConscript b/src/cpu/SConscript
index 2bb9a2399..5771a7904 100644
--- a/src/cpu/SConscript
+++ b/src/cpu/SConscript
@@ -158,6 +158,7 @@ if 'O3CPU' in env['CPU_MODELS']:
         o3/scoreboard.cc
         o3/store_set.cc
         ''')
+    sources += Split('memtest/memtest.cc')
     if env['USE_CHECKER']:
         sources += Split('o3/checker_builder.cc')
     else:
diff --git a/src/cpu/base.cc b/src/cpu/base.cc
index 513dd7c55..ea4b03bf2 100644
--- a/src/cpu/base.cc
+++ b/src/cpu/base.cc
@@ -41,6 +41,7 @@
 #include "cpu/cpuevent.hh"
 #include "cpu/thread_context.hh"
 #include "cpu/profile.hh"
+#include "sim/sim_exit.hh"
 #include "sim/param.hh"
 #include "sim/process.hh"
 #include "sim/sim_events.hh"
@@ -125,8 +126,9 @@ BaseCPU::BaseCPU(Params *p)
     //
     if (p->max_insts_any_thread != 0)
         for (int i = 0; i < number_of_threads; ++i)
-            new SimLoopExitEvent(comInstEventQueue[i], p->max_insts_any_thread,
-                                 "a thread reached the max instruction count");
+            schedExitSimLoop("a thread reached the max instruction count",
+                             p->max_insts_any_thread, 0,
+                             comInstEventQueue[i]);
 
     if (p->max_insts_all_threads != 0) {
         // allocate & initialize shared downcounter: each event will
@@ -150,8 +152,9 @@ BaseCPU::BaseCPU(Params *p)
     //
     if (p->max_loads_any_thread != 0)
         for (int i = 0; i < number_of_threads; ++i)
-            new SimLoopExitEvent(comLoadEventQueue[i], p->max_loads_any_thread,
-                                 "a thread reached the max load count");
+            schedExitSimLoop("a thread reached the max load count",
+                             p->max_loads_any_thread, 0,
+                             comLoadEventQueue[i]);
 
     if (p->max_loads_all_threads != 0) {
         // allocate & initialize shared downcounter: each event will
diff --git a/src/cpu/base.hh b/src/cpu/base.hh
index e02527371..75e0d86af 100644
--- a/src/cpu/base.hh
+++ b/src/cpu/base.hh
@@ -140,8 +140,8 @@ class BaseCPU : public MemObject
         bool functionTrace;
         Tick functionTraceStart;
         System *system;
-#if FULL_SYSTEM
         int cpu_id;
+#if FULL_SYSTEM
         Tick profile;
 #endif
         Tick progress_interval;
diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh
index f2109e88d..d6cdff5c5 100644
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@@ -193,7 +193,7 @@ BaseDynInst<Impl>::prefetch(Addr addr, unsigned flags)
     // note this is a local, not BaseDynInst::fault
     Fault trans_fault = cpu->translateDataReadReq(req);
 
-    if (trans_fault == NoFault && !(req->flags & UNCACHEABLE)) {
+    if (trans_fault == NoFault && !(req->isUncacheable())) {
         // It's a valid address to cacheable space.  Record key MemReq
         // parameters so we can generate another one just like it for
         // the timing access without calling translate() again (which
diff --git a/src/cpu/checker/cpu.cc b/src/cpu/checker/cpu.cc
index 1540a6b94..f6d56eef6 100644
--- a/src/cpu/checker/cpu.cc
+++ b/src/cpu/checker/cpu.cc
@@ -175,7 +175,7 @@ CheckerCPU::read(Addr addr, T &data, unsigned flags)
 
     pkt->dataStatic(&data);
 
-    if (!(memReq->getFlags() & UNCACHEABLE)) {
+    if (!(memReq->isUncacheable())) {
         // Access memory to see if we have the same data
         dcachePort->sendFunctional(pkt);
     } else {
@@ -251,9 +251,9 @@ CheckerCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
     // This is because the LSQ would have to be snooped in the CPU to
     // verify this data.
     if (unverifiedReq &&
-        !(unverifiedReq->getFlags() & UNCACHEABLE) &&
-        (!(unverifiedReq->getFlags() & LOCKED) ||
-         ((unverifiedReq->getFlags() & LOCKED) &&
+        !(unverifiedReq->isUncacheable()) &&
+        (!(unverifiedReq->isLocked()) ||
+         ((unverifiedReq->isLocked()) &&
           unverifiedReq->getScResult() == 1))) {
         T inst_data;
 /*
diff --git a/src/cpu/checker/thread_context.hh b/src/cpu/checker/thread_context.hh
index 8c0186dae..b2806d40b 100644
--- a/src/cpu/checker/thread_context.hh
+++ b/src/cpu/checker/thread_context.hh
@@ -133,7 +133,7 @@ class CheckerThreadContext : public ThreadContext
     void takeOverFrom(ThreadContext *oldContext)
     {
         actualTC->takeOverFrom(oldContext);
-        checkerTC->takeOverFrom(oldContext);
+        checkerTC->copyState(oldContext);
     }
 
     void regStats(const std::string &name) { actualTC->regStats(name); }
diff --git a/src/cpu/memtest/memtest.cc b/src/cpu/memtest/memtest.cc
index 7ea9eaefc..f42f0f8e2 100644
--- a/src/cpu/memtest/memtest.cc
+++ b/src/cpu/memtest/memtest.cc
@@ -38,86 +38,158 @@
 
 #include "base/misc.hh"
 #include "base/statistics.hh"
-#include "cpu/simple_thread.hh"
+//#include "cpu/simple_thread.hh"
 #include "cpu/memtest/memtest.hh"
-#include "mem/cache/base_cache.hh"
+//#include "mem/cache/base_cache.hh"
+//#include "mem/physical.hh"
 #include "sim/builder.hh"
 #include "sim/sim_events.hh"
 #include "sim/stats.hh"
+#include "mem/packet.hh"
+#include "mem/request.hh"
+#include "mem/port.hh"
+#include "mem/mem_object.hh"
 
 using namespace std;
-using namespace TheISA;
 
 int TESTER_ALLOCATOR=0;
 
+bool
+MemTest::CpuPort::recvTiming(Packet *pkt)
+{
+    memtest->completeRequest(pkt);
+    return true;
+}
+
+Tick
+MemTest::CpuPort::recvAtomic(Packet *pkt)
+{
+    panic("MemTest doesn't expect recvAtomic callback!");
+    return curTick;
+}
+
+void
+MemTest::CpuPort::recvFunctional(Packet *pkt)
+{
+    //Do nothing if we see one come through
+    if (curTick != 0)//Supress warning durring initialization
+        warn("Functional Writes not implemented in MemTester\n");
+    //Need to find any response values that intersect and update
+    return;
+}
+
+void
+MemTest::CpuPort::recvStatusChange(Status status)
+{
+    if (status == RangeChange)
+        return;
+
+    panic("MemTest doesn't expect recvStatusChange callback!");
+}
+
+void
+MemTest::CpuPort::recvRetry()
+{
+    memtest->doRetry();
+}
+
+void
+MemTest::sendPkt(Packet *pkt) {
+    if (atomic) {
+        cachePort.sendAtomic(pkt);
+        pkt->makeAtomicResponse();
+        completeRequest(pkt);
+    }
+    else if (!cachePort.sendTiming(pkt)) {
+        accessRetry = true;
+        retryPkt = pkt;
+    }
+
+}
+
 MemTest::MemTest(const string &name,
-                 MemInterface *_cache_interface,
-                 FunctionalMemory *main_mem,
-                 FunctionalMemory *check_mem,
+//		 MemInterface *_cache_interface,
+//		 PhysicalMemory *main_mem,
+//		 PhysicalMemory *check_mem,
                  unsigned _memorySize,
                  unsigned _percentReads,
-                 unsigned _percentCopies,
+//		 unsigned _percentCopies,
                  unsigned _percentUncacheable,
                  unsigned _progressInterval,
                  unsigned _percentSourceUnaligned,
                  unsigned _percentDestUnaligned,
                  Addr _traceAddr,
-                 Counter _max_loads)
-    : SimObject(name),
+                 Counter _max_loads,
+                 bool _atomic)
+    : MemObject(name),
       tickEvent(this),
-      cacheInterface(_cache_interface),
-      mainMem(main_mem),
-      checkMem(check_mem),
+      cachePort("test", this),
+      funcPort("functional", this),
+      retryPkt(NULL),
+//      mainMem(main_mem),
+//      checkMem(check_mem),
       size(_memorySize),
       percentReads(_percentReads),
-      percentCopies(_percentCopies),
+//      percentCopies(_percentCopies),
       percentUncacheable(_percentUncacheable),
       progressInterval(_progressInterval),
       nextProgressMessage(_progressInterval),
       percentSourceUnaligned(_percentSourceUnaligned),
       percentDestUnaligned(percentDestUnaligned),
-      maxLoads(_max_loads)
+      maxLoads(_max_loads),
+      atomic(_atomic)
 {
     vector<string> cmd;
     cmd.push_back("/bin/ls");
     vector<string> null_vec;
-    thread = new SimpleThread(NULL, 0, mainMem, 0);
-
-    blockSize = cacheInterface->getBlockSize();
-    blockAddrMask = blockSize - 1;
-    traceBlockAddr = blockAddr(_traceAddr);
-
-    //setup data storage with interesting values
-    uint8_t *data1 = new uint8_t[size];
-    uint8_t *data2 = new uint8_t[size];
-    uint8_t *data3 = new uint8_t[size];
-    memset(data1, 1, size);
-    memset(data2, 2, size);
-    memset(data3, 3, size);
+    //  thread = new SimpleThread(NULL, 0, NULL, 0, mainMem);
     curTick = 0;
 
+    // Needs to be masked off once we know the block size.
+    traceBlockAddr = _traceAddr;
     baseAddr1 = 0x100000;
     baseAddr2 = 0x400000;
     uncacheAddr = 0x800000;
 
-    // set up intial memory contents here
-    mainMem->prot_write(baseAddr1, data1, size);
-    checkMem->prot_write(baseAddr1, data1, size);
-    mainMem->prot_write(baseAddr2, data2, size);
-    checkMem->prot_write(baseAddr2, data2, size);
-    mainMem->prot_write(uncacheAddr, data3, size);
-    checkMem->prot_write(uncacheAddr, data3, size);
-
-    delete [] data1;
-    delete [] data2;
-    delete [] data3;
-
     // set up counters
     noResponseCycles = 0;
     numReads = 0;
     tickEvent.schedule(0);
 
     id = TESTER_ALLOCATOR++;
+    if (TESTER_ALLOCATOR > 8)
+        panic("False sharing memtester only allows up to 8 testers");
+
+    accessRetry = false;
+}
+
+Port *
+MemTest::getPort(const std::string &if_name, int idx)
+{
+    if (if_name == "functional")
+        return &funcPort;
+    else if (if_name == "test")
+        return &cachePort;
+    else
+        panic("No Such Port\n");
+}
+
+void
+MemTest::init()
+{
+    // By the time init() is called, the ports should be hooked up.
+    blockSize = cachePort.peerBlockSize();
+    blockAddrMask = blockSize - 1;
+    traceBlockAddr = blockAddr(traceBlockAddr);
+
+    // set up intial memory contents here
+
+    cachePort.memsetBlob(baseAddr1, 1, size);
+    funcPort.memsetBlob(baseAddr1, 1, size);
+    cachePort.memsetBlob(baseAddr2, 2, size);
+    funcPort.memsetBlob(baseAddr2, 2, size);
+    cachePort.memsetBlob(uncacheAddr, 3, size);
+    funcPort.memsetBlob(uncacheAddr, 3, size);
 }
 
 static void
@@ -132,23 +204,31 @@ printData(ostream &os, uint8_t *data, int nbytes)
 }
 
 void
-MemTest::completeRequest(MemReqPtr &req, uint8_t *data)
+MemTest::completeRequest(Packet *pkt)
 {
+    MemTestSenderState *state =
+        dynamic_cast<MemTestSenderState *>(pkt->senderState);
+
+    uint8_t *data = state->data;
+    uint8_t *pkt_data = pkt->getPtr<uint8_t>();
+    Request *req = pkt->req;
+
     //Remove the address from the list of outstanding
-    std::set<unsigned>::iterator removeAddr = outstandingAddrs.find(req->paddr);
+    std::set<unsigned>::iterator removeAddr = outstandingAddrs.find(req->getPaddr());
     assert(removeAddr != outstandingAddrs.end());
     outstandingAddrs.erase(removeAddr);
 
-    switch (req->cmd) {
-      case Read:
-        if (memcmp(req->data, data, req->size) != 0) {
-            cerr << name() << ": on read of 0x" << hex << req->paddr
-                 << " (0x" << hex << blockAddr(req->paddr) << ")"
+    switch (pkt->cmd) {
+      case Packet::ReadResp:
+
+        if (memcmp(pkt_data, data, pkt->getSize()) != 0) {
+            cerr << name() << ": on read of 0x" << hex << req->getPaddr()
+                 << " (0x" << hex << blockAddr(req->getPaddr()) << ")"
                  << "@ cycle " << dec << curTick
                  << ", cache returns 0x";
-            printData(cerr, req->data, req->size);
+            printData(cerr, pkt_data, pkt->getSize());
             cerr << ", expected 0x";
-            printData(cerr, data, req->size);
+            printData(cerr, data, pkt->getSize());
             cerr << endl;
             fatal("");
         }
@@ -163,13 +243,13 @@ MemTest::completeRequest(MemReqPtr &req, uint8_t *data)
         }
 
         if (numReads >= maxLoads)
-            SimExit(curTick, "Maximum number of loads reached!");
+            exitSimLoop("Maximum number of loads reached!");
         break;
 
-      case Write:
+      case Packet::WriteResp:
         numWritesStat++;
         break;
-
+/*
       case Copy:
         //Also remove dest from outstanding list
         removeAddr = outstandingAddrs.find(req->dest);
@@ -177,36 +257,37 @@ MemTest::completeRequest(MemReqPtr &req, uint8_t *data)
         outstandingAddrs.erase(removeAddr);
         numCopiesStat++;
         break;
-
+*/
       default:
         panic("invalid command");
     }
 
-    if (blockAddr(req->paddr) == traceBlockAddr) {
+    if (blockAddr(req->getPaddr()) == traceBlockAddr) {
         cerr << name() << ": completed "
-             << (req->cmd.isWrite() ? "write" : "read")
+             << (pkt->isWrite() ? "write" : "read")
              << " access of "
-             << dec << req->size << " bytes at address 0x"
-             << hex << req->paddr
-             << " (0x" << hex << blockAddr(req->paddr) << ")"
+             << dec << pkt->getSize() << " bytes at address 0x"
+             << hex << req->getPaddr()
+             << " (0x" << hex << blockAddr(req->getPaddr()) << ")"
              << ", value = 0x";
-        printData(cerr, req->data, req->size);
+        printData(cerr, pkt_data, pkt->getSize());
         cerr << " @ cycle " << dec << curTick;
 
         cerr << endl;
     }
 
     noResponseCycles = 0;
+    delete state;
     delete [] data;
+    delete pkt->req;
+    delete pkt;
 }
 
-
 void
 MemTest::regStats()
 {
     using namespace Stats;
 
-
     numReadsStat
         .name(name() + ".num_reads")
         .desc("number of read accesses completed")
@@ -234,7 +315,7 @@ MemTest::tick()
         fatal("");
     }
 
-    if (cacheInterface->isBlocked()) {
+    if (accessRetry) {
         return;
     }
 
@@ -248,30 +329,30 @@ MemTest::tick()
 
     //If we aren't doing copies, use id as offset, and do a false sharing
     //mem tester
-    if (percentCopies == 0) {
-        //We can eliminate the lower bits of the offset, and then use the id
-        //to offset within the blks
-        offset &= ~63; //Not the low order bits
-        offset += id;
-        access_size = 0;
-    }
+    //We can eliminate the lower bits of the offset, and then use the id
+    //to offset within the blks
+    offset &= ~63; //Not the low order bits
+    offset += id;
+    access_size = 0;
 
-    MemReqPtr req = new MemReq();
+    Request *req = new Request();
+    uint32_t flags = 0;
+    Addr paddr;
 
     if (cacheable < percentUncacheable) {
-        req->flags |= UNCACHEABLE;
-        req->paddr = uncacheAddr + offset;
+        flags |= UNCACHEABLE;
+        paddr = uncacheAddr + offset;
     } else {
-        req->paddr = ((base) ? baseAddr1 : baseAddr2) + offset;
+        paddr = ((base) ? baseAddr1 : baseAddr2) + offset;
     }
-    // bool probe = (random() % 2 == 1) && !req->isUncacheable();
+    //bool probe = (random() % 2 == 1) && !req->isUncacheable();
     bool probe = false;
 
-    req->size = 1 << access_size;
-    req->data = new uint8_t[req->size];
-    req->paddr &= ~(req->size - 1);
-    req->time = curTick;
-    req->xc = thread->getProxy();
+    paddr &= ~((1 << access_size) - 1);
+    req->setPhys(paddr, 1 << access_size, flags);
+    req->setThreadContext(id,0);
+
+    uint8_t *result = new uint8_t[8];
 
     if (cmd < percentReads) {
         // read
@@ -279,60 +360,75 @@ MemTest::tick()
         //For now we only allow one outstanding request per addreess per tester
         //This means we assume CPU does write forwarding to reads that alias something
         //in the cpu store buffer.
-        if (outstandingAddrs.find(req->paddr) != outstandingAddrs.end()) return;
-        else outstandingAddrs.insert(req->paddr);
+        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) return;
+        else outstandingAddrs.insert(paddr);
+
+        // ***** NOTE FOR RON: I'm not sure how to access checkMem. - Kevin
+        funcPort.readBlob(req->getPaddr(), result, req->getSize());
 
-        req->cmd = Read;
-        uint8_t *result = new uint8_t[8];
-        checkMem->access(Read, req->paddr, result, req->size);
-        if (blockAddr(req->paddr) == traceBlockAddr) {
+        if (blockAddr(paddr) == traceBlockAddr) {
             cerr << name()
                  << ": initiating read "
                  << ((probe) ? "probe of " : "access of ")
-                 << dec << req->size << " bytes from addr 0x"
-                 << hex << req->paddr
-                 << " (0x" << hex << blockAddr(req->paddr) << ")"
+                 << dec << req->getSize() << " bytes from addr 0x"
+                 << hex << paddr
+                 << " (0x" << hex << blockAddr(paddr) << ")"
                  << " at cycle "
                  << dec << curTick << endl;
         }
+
+        Packet *pkt = new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        pkt->dataDynamicArray(new uint8_t[req->getSize()]);
+        MemTestSenderState *state = new MemTestSenderState(result);
+        pkt->senderState = state;
+
         if (probe) {
-            cacheInterface->probeAndUpdate(req);
-            completeRequest(req, result);
+            cachePort.sendFunctional(pkt);
+            completeRequest(pkt);
         } else {
-            req->completionEvent = new MemCompleteEvent(req, result, this);
-            cacheInterface->access(req);
+//	    req->completionEvent = new MemCompleteEvent(req, result, this);
+            sendPkt(pkt);
         }
-    } else if (cmd < (100 - percentCopies)){
+    } else {
         // write
 
         //For now we only allow one outstanding request per addreess per tester
         //This means we assume CPU does write forwarding to reads that alias something
         //in the cpu store buffer.
-        if (outstandingAddrs.find(req->paddr) != outstandingAddrs.end()) return;
-        else outstandingAddrs.insert(req->paddr);
+        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) return;
+        else outstandingAddrs.insert(paddr);
 
-        req->cmd = Write;
-        memcpy(req->data, &data, req->size);
-        checkMem->access(Write, req->paddr, req->data, req->size);
-        if (blockAddr(req->paddr) == traceBlockAddr) {
+/*
+        if (blockAddr(req->getPaddr()) == traceBlockAddr) {
             cerr << name() << ": initiating write "
                  << ((probe)?"probe of ":"access of ")
-                 << dec << req->size << " bytes (value = 0x";
-            printData(cerr, req->data, req->size);
+                 << dec << req->getSize() << " bytes (value = 0x";
+            printData(cerr, data_pkt->getPtr(), req->getSize());
             cerr << ") to addr 0x"
-                 << hex << req->paddr
-                 << " (0x" << hex << blockAddr(req->paddr) << ")"
+                 << hex << req->getPaddr()
+                 << " (0x" << hex << blockAddr(req->getPaddr()) << ")"
                  << " at cycle "
                  << dec << curTick << endl;
         }
+*/
+        Packet *pkt = new Packet(req, Packet::WriteReq, Packet::Broadcast);
+        uint8_t *pkt_data = new uint8_t[req->getSize()];
+        pkt->dataDynamicArray(pkt_data);
+        memcpy(pkt_data, &data, req->getSize());
+        MemTestSenderState *state = new MemTestSenderState(result);
+        pkt->senderState = state;
+
+        funcPort.writeBlob(req->getPaddr(), pkt_data, req->getSize());
+
         if (probe) {
-            cacheInterface->probeAndUpdate(req);
-            completeRequest(req, NULL);
+            cachePort.sendFunctional(pkt);
+            completeRequest(pkt);
         } else {
-            req->completionEvent = new MemCompleteEvent(req, NULL, this);
-            cacheInterface->access(req);
+//	    req->completionEvent = new MemCompleteEvent(req, NULL, this);
+            sendPkt(pkt);
         }
-    } else {
+    }
+/*    else {
         // copy
         unsigned source_align = random() % 100;
         unsigned dest_align = random() % 100;
@@ -369,56 +465,51 @@ MemTest::tick()
                  << " (0x" << hex << blockAddr(dest) << ")"
                  << " at cycle "
                  << dec << curTick << endl;
-        }
+        }*
         cacheInterface->access(req);
         uint8_t result[blockSize];
         checkMem->access(Read, source, &result, blockSize);
         checkMem->access(Write, dest, &result, blockSize);
     }
+*/
 }
 
-
 void
-MemCompleteEvent::process()
-{
-    tester->completeRequest(req, data);
-    delete this;
-}
-
-
-const char *
-MemCompleteEvent::description()
+MemTest::doRetry()
 {
-    return "memory access completion";
+    if (cachePort.sendTiming(retryPkt)) {
+        accessRetry = false;
+        retryPkt = NULL;
+    }
 }
 
-
 BEGIN_DECLARE_SIM_OBJECT_PARAMS(MemTest)
 
-    SimObjectParam<BaseCache *> cache;
-    SimObjectParam<FunctionalMemory *> main_mem;
-    SimObjectParam<FunctionalMemory *> check_mem;
+//    SimObjectParam<BaseCache *> cache;
+//    SimObjectParam<PhysicalMemory *> main_mem;
+//    SimObjectParam<PhysicalMemory *> check_mem;
     Param<unsigned> memory_size;
     Param<unsigned> percent_reads;
-    Param<unsigned> percent_copies;
+//    Param<unsigned> percent_copies;
     Param<unsigned> percent_uncacheable;
     Param<unsigned> progress_interval;
     Param<unsigned> percent_source_unaligned;
     Param<unsigned> percent_dest_unaligned;
     Param<Addr> trace_addr;
     Param<Counter> max_loads;
+    Param<bool> atomic;
 
 END_DECLARE_SIM_OBJECT_PARAMS(MemTest)
 
 
 BEGIN_INIT_SIM_OBJECT_PARAMS(MemTest)
 
-    INIT_PARAM(cache, "L1 cache"),
-    INIT_PARAM(main_mem, "hierarchical memory"),
-    INIT_PARAM(check_mem, "check memory"),
+//    INIT_PARAM(cache, "L1 cache"),
+//    INIT_PARAM(main_mem, "hierarchical memory"),
+//    INIT_PARAM(check_mem, "check memory"),
     INIT_PARAM(memory_size, "memory size"),
     INIT_PARAM(percent_reads, "target read percentage"),
-    INIT_PARAM(percent_copies, "target copy percentage"),
+//    INIT_PARAM(percent_copies, "target copy percentage"),
     INIT_PARAM(percent_uncacheable, "target uncacheable percentage"),
     INIT_PARAM(progress_interval, "progress report interval (in accesses)"),
     INIT_PARAM(percent_source_unaligned,
@@ -426,18 +517,19 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(MemTest)
     INIT_PARAM(percent_dest_unaligned,
                "percent of copy dest address that are unaligned"),
     INIT_PARAM(trace_addr, "address to trace"),
-    INIT_PARAM(max_loads, "terminate when we have reached this load count")
+                              INIT_PARAM(max_loads, "terminate when we have reached this load count"),
+    INIT_PARAM(atomic, "Is the tester testing atomic mode (or timing)")
 
 END_INIT_SIM_OBJECT_PARAMS(MemTest)
 
 
 CREATE_SIM_OBJECT(MemTest)
 {
-    return new MemTest(getInstanceName(), cache->getInterface(), main_mem,
-                       check_mem, memory_size, percent_reads, percent_copies,
+    return new MemTest(getInstanceName(), /*cache->getInterface(),*/ /*main_mem,*/
+                       /*check_mem,*/ memory_size, percent_reads, /*percent_copies,*/
                        percent_uncacheable, progress_interval,
                        percent_source_unaligned, percent_dest_unaligned,
-                       trace_addr, max_loads);
+                       trace_addr, max_loads, atomic);
 }
 
 REGISTER_SIM_OBJECT("MemTest", MemTest)
diff --git a/src/cpu/memtest/memtest.hh b/src/cpu/memtest/memtest.hh
index 42fb235db..5de41f0d8 100644
--- a/src/cpu/memtest/memtest.hh
+++ b/src/cpu/memtest/memtest.hh
@@ -35,31 +35,36 @@
 #include <set>
 
 #include "base/statistics.hh"
-#include "mem/functional/functional.hh"
-#include "mem/mem_interface.hh"
+//#include "mem/functional/functional.hh"
+//#include "mem/mem_interface.hh"
 #include "sim/eventq.hh"
 #include "sim/sim_exit.hh"
 #include "sim/sim_object.hh"
 #include "sim/stats.hh"
+#include "mem/mem_object.hh"
+#include "mem/port.hh"
 
-class ThreadContext;
-class MemTest : public SimObject
+class Packet;
+class MemTest : public MemObject
 {
   public:
 
     MemTest(const std::string &name,
-            MemInterface *_cache_interface,
-            FunctionalMemory *main_mem,
-            FunctionalMemory *check_mem,
+//	    MemInterface *_cache_interface,
+//	    PhysicalMemory *main_mem,
+//	    PhysicalMemory *check_mem,
             unsigned _memorySize,
             unsigned _percentReads,
-            unsigned _percentCopies,
+//	    unsigned _percentCopies,
             unsigned _percentUncacheable,
             unsigned _progressInterval,
             unsigned _percentSourceUnaligned,
             unsigned _percentDestUnaligned,
             Addr _traceAddr,
-            Counter _max_loads);
+            Counter _max_loads,
+            bool _atomic);
+
+    virtual void init();
 
     // register statistics
     virtual void regStats();
@@ -69,6 +74,8 @@ class MemTest : public SimObject
     // main simulation loop (one cycle)
     void tick();
 
+    virtual Port *getPort(const std::string &if_name, int idx = -1);
+
   protected:
     class TickEvent : public Event
     {
@@ -82,16 +89,62 @@ class MemTest : public SimObject
     };
 
     TickEvent tickEvent;
+    class CpuPort : public Port
+    {
+
+        MemTest *memtest;
+
+      public:
+
+        CpuPort(const std::string &_name, MemTest *_memtest)
+            : Port(_name), memtest(_memtest)
+        { }
+
+      protected:
+
+        virtual bool recvTiming(Packet *pkt);
 
-    MemInterface *cacheInterface;
-    FunctionalMemory *mainMem;
-    FunctionalMemory *checkMem;
-    SimpleThread *thread;
+        virtual Tick recvAtomic(Packet *pkt);
+
+        virtual void recvFunctional(Packet *pkt);
+
+        virtual void recvStatusChange(Status status);
+
+        virtual void recvRetry();
+
+        virtual void getDeviceAddressRanges(AddrRangeList &resp,
+            AddrRangeList &snoop)
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
+    };
+
+    CpuPort cachePort;
+    CpuPort funcPort;
+
+    class MemTestSenderState : public Packet::SenderState
+    {
+      public:
+        /** Constructor. */
+        MemTestSenderState(uint8_t *_data)
+            : data(_data)
+        { }
+
+        // Hold onto data pointer
+        uint8_t *data;
+    };
+
+//    Request *dataReq;
+    Packet  *retryPkt;
+//    MemInterface *cacheInterface;
+//    PhysicalMemory *mainMem;
+//    PhysicalMemory *checkMem;
+//    SimpleThread *thread;
+
+    bool accessRetry;
 
     unsigned size;		// size of testing memory region
 
     unsigned percentReads;	// target percentage of read accesses
-    unsigned percentCopies;	// target percentage of copy accesses
+//    unsigned percentCopies;	// target percentage of copy accesses
     unsigned percentUncacheable;
 
     int id;
@@ -123,34 +176,21 @@ class MemTest : public SimObject
 
     uint64_t numReads;
     uint64_t maxLoads;
+
+    bool atomic;
+
     Stats::Scalar<> numReadsStat;
     Stats::Scalar<> numWritesStat;
     Stats::Scalar<> numCopiesStat;
 
     // called by MemCompleteEvent::process()
-    void completeRequest(MemReqPtr &req, uint8_t *data);
+    void completeRequest(Packet *pkt);
 
-    friend class MemCompleteEvent;
-};
+    void sendPkt(Packet *pkt);
 
+    void doRetry();
 
-class MemCompleteEvent : public Event
-{
-    MemReqPtr req;
-    uint8_t *data;
-    MemTest *tester;
-
-  public:
-
-    MemCompleteEvent(MemReqPtr &_req, uint8_t *_data, MemTest *_tester)
-        : Event(&mainEventQueue),
-          req(_req), data(_data), tester(_tester)
-    {
-    }
-
-    void process();
-
-    virtual const char *description();
+    friend class MemCompleteEvent;
 };
 
 #endif // __CPU_MEMTEST_MEMTEST_HH__
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index c80e4d8c1..ecf6ed632 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -342,12 +342,6 @@ DefaultCommit<Impl>::drain()
 {
     drainPending = true;
 
-    // If it's already drained, return true.
-    if (rob->isEmpty() && !iewStage->hasStoresToWB()) {
-        cpu->signalDrained();
-        return true;
-    }
-
     return false;
 }
 
@@ -1218,16 +1212,16 @@ DefaultCommit<Impl>::skidInsert()
 
     for (int inst_num = 0; inst_num < fromRename->size; ++inst_num) {
         DynInstPtr inst = fromRename->insts[inst_num];
-        int tid = inst->threadNumber;
 
         if (!inst->isSquashed()) {
             DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ",
-                    "skidBuffer.\n", inst->readPC(), inst->seqNum, tid);
+                    "skidBuffer.\n", inst->readPC(), inst->seqNum,
+                    inst->threadNumber);
             skidBuffer.push(inst);
         } else {
             DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was "
                     "squashed, skipping.\n",
-                    inst->readPC(), inst->seqNum, tid);
+                    inst->readPC(), inst->seqNum, inst->threadNumber);
         }
     }
 }
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 7386dfadd..4c9a8e91f 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -88,7 +88,7 @@ FullO3CPU<Impl>::TickEvent::description()
 
 template <class Impl>
 FullO3CPU<Impl>::ActivateThreadEvent::ActivateThreadEvent()
-    : Event(&mainEventQueue, CPU_Tick_Pri)
+    : Event(&mainEventQueue, CPU_Switch_Pri)
 {
 }
 
@@ -135,7 +135,8 @@ void
 FullO3CPU<Impl>::DeallocateContextEvent::process()
 {
     cpu->deactivateThread(tid);
-    cpu->removeThread(tid);
+    if (remove)
+        cpu->removeThread(tid);
 }
 
 template <class Impl>
@@ -191,7 +192,11 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
       deferRegistration(params->deferRegistration),
       numThreads(number_of_threads)
 {
-    _status = Idle;
+    if (!deferRegistration) {
+        _status = Running;
+    } else {
+        _status = Idle;
+    }
 
     checker = NULL;
 
@@ -304,6 +309,9 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 
                             tid,
                             bindRegs);
+
+        activateThreadEvent[tid].init(tid, this);
+        deallocateContextEvent[tid].init(tid, this);
     }
 
     rename.setRenameMap(renameMap);
@@ -447,13 +455,16 @@ FullO3CPU<Impl>::tick()
     if (!tickEvent.scheduled()) {
         if (_status == SwitchedOut ||
             getState() == SimObject::Drained) {
+            DPRINTF(O3CPU, "Switched out!\n");
             // increment stat
             lastRunningCycle = curTick;
-        } else if (!activityRec.active()) {
+        } else if (!activityRec.active() || _status == Idle) {
+            DPRINTF(O3CPU, "Idle!\n");
             lastRunningCycle = curTick;
             timesIdled++;
         } else {
             tickEvent.schedule(curTick + cycles(1));
+            DPRINTF(O3CPU, "Scheduling next tick!\n");
         }
     }
 
@@ -512,6 +523,8 @@ FullO3CPU<Impl>::activateThread(unsigned tid)
     list<unsigned>::iterator isActive = find(
         activeThreads.begin(), activeThreads.end(), tid);
 
+    DPRINTF(O3CPU, "[tid:%i]: Calling activate thread.\n", tid);
+
     if (isActive == activeThreads.end()) {
         DPRINTF(O3CPU, "[tid:%i]: Adding to active threads list\n",
                 tid);
@@ -528,6 +541,8 @@ FullO3CPU<Impl>::deactivateThread(unsigned tid)
     list<unsigned>::iterator thread_it =
         find(activeThreads.begin(), activeThreads.end(), tid);
 
+    DPRINTF(O3CPU, "[tid:%i]: Calling deactivate thread.\n", tid);
+
     if (thread_it != activeThreads.end()) {
         DPRINTF(O3CPU,"[tid:%i]: Removing from active threads list\n",
                 tid);
@@ -548,7 +563,7 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
         activateThread(tid);
     }
 
-    if(lastActivatedCycle < curTick) {
+    if (lastActivatedCycle < curTick) {
         scheduleTickEvent(delay);
 
         // Be sure to signal that there's some activity so the CPU doesn't
@@ -563,17 +578,20 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
 }
 
 template <class Impl>
-void
-FullO3CPU<Impl>::deallocateContext(int tid, int delay)
+bool
+FullO3CPU<Impl>::deallocateContext(int tid, bool remove, int delay)
 {
     // Schedule removal of thread data from CPU
     if (delay){
         DPRINTF(O3CPU, "[tid:%i]: Scheduling thread context to deallocate "
                 "on cycle %d\n", tid, curTick + cycles(delay));
-        scheduleDeallocateContextEvent(tid, delay);
+        scheduleDeallocateContextEvent(tid, remove, delay);
+        return false;
     } else {
         deactivateThread(tid);
-        removeThread(tid);
+        if (remove)
+            removeThread(tid);
+        return true;
     }
 }
 
@@ -582,8 +600,9 @@ void
 FullO3CPU<Impl>::suspendContext(int tid)
 {
     DPRINTF(O3CPU,"[tid: %i]: Suspending Thread Context.\n", tid);
-    deactivateThread(tid);
-    if (activeThreads.size() == 0)
+    bool deallocated = deallocateContext(tid, false, 1);
+    // If this was the last thread then unschedule the tick event.
+    if ((activeThreads.size() == 1 && !deallocated) || activeThreads.size() == 0)
         unscheduleTickEvent();
     _status = Idle;
 }
@@ -594,7 +613,7 @@ FullO3CPU<Impl>::haltContext(int tid)
 {
     //For now, this is the same as deallocate
     DPRINTF(O3CPU,"[tid:%i]: Halt Context called. Deallocating", tid);
-    deallocateContext(tid, 1);
+    deallocateContext(tid, true, 1);
 }
 
 template <class Impl>
@@ -682,10 +701,17 @@ FullO3CPU<Impl>::removeThread(unsigned tid)
     assert(iew.ldstQueue.getCount(tid) == 0);
 
     // Reset ROB/IQ/LSQ Entries
+
+    // Commented out for now.  This should be possible to do by
+    // telling all the pipeline stages to drain first, and then
+    // checking until the drain completes.  Once the pipeline is
+    // drained, call resetEntries(). - 10-09-06 ktlim
+/*
     if (activeThreads.size() >= 1) {
         commit.rob->resetEntries();
         iew.resetEntries();
     }
+*/
 }
 
 
@@ -824,7 +850,9 @@ template <class Impl>
 void
 FullO3CPU<Impl>::resume()
 {
+#if FULL_SYSTEM
     assert(system->getMemoryMode() == System::Timing);
+#endif
     fetch.resume();
     decode.resume();
     rename.resume();
@@ -935,6 +963,25 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
     }
     if (!tickEvent.scheduled())
         tickEvent.schedule(curTick);
+
+    Port *peer;
+    Port *icachePort = fetch.getIcachePort();
+    if (icachePort->getPeer() == NULL) {
+        peer = oldCPU->getPort("icache_port")->getPeer();
+        icachePort->setPeer(peer);
+    } else {
+        peer = icachePort->getPeer();
+    }
+    peer->setPeer(icachePort);
+
+    Port *dcachePort = iew.getDcachePort();
+    if (dcachePort->getPeer() == NULL) {
+        peer = oldCPU->getPort("dcache_port")->getPeer();
+        dcachePort->setPeer(peer);
+    } else {
+        peer = dcachePort->getPeer();
+    }
+    peer->setPeer(dcachePort);
 }
 
 template <class Impl>
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index dcdcd1fe6..fe510519c 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -202,9 +202,12 @@ class FullO3CPU : public BaseO3CPU
     class DeallocateContextEvent : public Event
     {
       private:
-        /** Number of Thread to Activate */
+        /** Number of Thread to deactivate */
         int tid;
 
+        /** Should the thread be removed from the CPU? */
+        bool remove;
+
         /** Pointer to the CPU. */
         FullO3CPU<Impl> *cpu;
 
@@ -218,12 +221,15 @@ class FullO3CPU : public BaseO3CPU
         /** Processes the event, calling activateThread() on the CPU. */
         void process();
 
+        /** Sets whether the thread should also be removed from the CPU. */
+        void setRemove(bool _remove) { remove = _remove; }
+
         /** Returns the description of the event. */
         const char *description();
     };
 
     /** Schedule cpu to deallocate thread context.*/
-    void scheduleDeallocateContextEvent(int tid, int delay)
+    void scheduleDeallocateContextEvent(int tid, bool remove, int delay)
     {
         // Schedule thread to activate, regardless of its current state.
         if (deallocateContextEvent[tid].squashed())
@@ -296,9 +302,9 @@ class FullO3CPU : public BaseO3CPU
     void suspendContext(int tid);
 
     /** Remove Thread from Active Threads List &&
-     *  Remove Thread Context from CPU.
+     *  Possibly Remove Thread Context from CPU.
      */
-    void deallocateContext(int tid, int delay = 1);
+    bool deallocateContext(int tid, bool remove, int delay = 1);
 
     /** Remove Thread from Active Threads List &&
      *  Remove Thread Context from CPU.
@@ -626,11 +632,6 @@ class FullO3CPU : public BaseO3CPU
     /** Pointers to all of the threads in the CPU. */
     std::vector<Thread *> thread;
 
-    /** Pointer to the icache interface. */
-    MemInterface *icacheInterface;
-    /** Pointer to the dcache interface. */
-    MemInterface *dcacheInterface;
-
     /** Whether or not the CPU should defer its registration. */
     bool deferRegistration;
 
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh
index 1a2ca32a4..280bf0e71 100644
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -96,7 +96,7 @@ class DefaultFetch
         /** Returns the address ranges of this device. */
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
 
         /** Timing version of receive.  Handles setting fetch to the
          * proper status to start fetching. */
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index 3c47c39fa..072580af7 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -63,7 +63,7 @@ template<class Impl>
 void
 DefaultFetch<Impl>::IcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("DefaultFetch doesn't expect recvFunctional callback!");
+    warn("Default fetch doesn't update it's state from a functional call.");
 }
 
 template<class Impl>
@@ -599,7 +599,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
     if (fault == NoFault) {
 #if 0
         if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) ||
-            memReq[tid]->flags & UNCACHEABLE) {
+            memReq[tid]->isUncacheable()) {
             DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a "
                     "misspeculating path)!",
                     memReq[tid]->paddr);
@@ -623,6 +623,11 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
         // Now do the timing access to see whether or not the instruction
         // exists within the cache.
         if (!icachePort->sendTiming(data_pkt)) {
+            if (data_pkt->result == Packet::BadAddress) {
+                fault = TheISA::genMachineCheckFault();
+                delete mem_req;
+                memReq[tid] = NULL;
+            }
             assert(retryPkt == NULL);
             assert(retryTid == -1);
             DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index b2baae296..ba5260fe2 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -600,6 +600,11 @@ template<class Impl>
 void
 DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
 {
+    // This function should not be called after writebackInsts in a
+    // single cycle.  That will cause problems with an instruction
+    // being added to the queue to commit without being processed by
+    // writebackInsts prior to being sent to commit.
+
     // First check the time slot that this instruction will write
     // to.  If there are free write ports at the time, then go ahead
     // and write the instruction to that time.  If there are not,
@@ -1286,6 +1291,7 @@ DefaultIEW<Impl>::executeInsts()
                 } else if (fault != NoFault) {
                     // If the instruction faulted, then we need to send it along to commit
                     // without the instruction completing.
+                    DPRINTF(IEW, "Store has fault! [sn:%lli]\n", inst->seqNum);
 
                     // Send this instruction to commit, also make sure iew stage
                     // realizes there is activity.
diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh
index 190734dc2..6b12d75b4 100644
--- a/src/cpu/o3/lsq.hh
+++ b/src/cpu/o3/lsq.hh
@@ -311,7 +311,7 @@ class LSQ {
         /** Returns the address ranges of this device. */
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
 
         /** Timing version of receive.  Handles writing back and
          * completing the load or store that has returned from
diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh
index 2bbab71f0..7b7d1eb8e 100644
--- a/src/cpu/o3/lsq_impl.hh
+++ b/src/cpu/o3/lsq_impl.hh
@@ -46,7 +46,7 @@ template <class Impl>
 void
 LSQ<Impl>::DcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("O3CPU doesn't expect recvFunctional callback!");
+    warn("O3CPU doesn't update things on a recvFunctional.");
 }
 
 template <class Impl>
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 90d1a3d53..11a02e7c7 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -492,7 +492,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
     // A bit of a hackish way to get uncached accesses to work only if they're
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
-    if (req->getFlags() & UNCACHEABLE &&
+    if (req->isUncacheable() &&
         (load_idx != loadHead || !load_inst->isAtCommit())) {
         iewStage->rescheduleMemInst(load_inst);
         ++lsqRescheduledLoads;
@@ -509,7 +509,7 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
             load_idx, store_idx, storeHead, req->getPaddr());
 
 #if FULL_SYSTEM
-    if (req->getFlags() & LOCKED) {
+    if (req->isLocked()) {
         cpu->lockAddr = req->getPaddr();
         cpu->lockFlag = true;
     }
@@ -626,18 +626,30 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
 
     ++usedPorts;
 
-    PacketPtr data_pkt = new Packet(req, Packet::ReadReq, Packet::Broadcast);
-    data_pkt->dataStatic(load_inst->memData);
-
-    LSQSenderState *state = new LSQSenderState;
-    state->isLoad = true;
-    state->idx = load_idx;
-    state->inst = load_inst;
-    data_pkt->senderState = state;
-
     // if we the cache is not blocked, do cache access
     if (!lsq->cacheBlocked()) {
+        PacketPtr data_pkt =
+            new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        data_pkt->dataStatic(load_inst->memData);
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = true;
+        state->idx = load_idx;
+        state->inst = load_inst;
+        data_pkt->senderState = state;
+
         if (!dcachePort->sendTiming(data_pkt)) {
+            Packet::Result result = data_pkt->result;
+
+            // Delete state and data packet because a load retry
+            // initiates a pipeline restart; it does not retry.
+            delete state;
+            delete data_pkt;
+
+            if (result == Packet::BadAddress) {
+                return TheISA::genMachineCheckFault();
+            }
+
             // If the access didn't succeed, tell the LSQ by setting
             // the retry thread id.
             lsq->setRetryTid(lsqID);
@@ -664,16 +676,6 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
         return NoFault;
     }
 
-    if (data_pkt->result != Packet::Success) {
-        DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
-        DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
-                load_inst->seqNum);
-    } else {
-        DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
-        DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
-                load_inst->seqNum);
-    }
-
     return NoFault;
 }
 
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 98bea74fb..3f9db912f 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -416,7 +416,7 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
         // realizes there is activity.
         // Mark it as executed unless it is an uncached load that
         // needs to hit the head of commit.
-        if (!(inst->req->getFlags() & UNCACHEABLE) || inst->isAtCommit()) {
+        if (!(inst->req->isUncacheable()) || inst->isAtCommit()) {
             inst->setExecuted();
         }
         iewStage->instToCommit(inst);
@@ -608,21 +608,30 @@ LSQUnit<Impl>::writebackStores()
 
         DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x "
                 "to Addr:%#x, data:%#x [sn:%lli]\n",
-                storeWBIdx, storeQueue[storeWBIdx].inst->readPC(),
+                storeWBIdx, inst->readPC(),
                 req->getPaddr(), *(inst->memData),
-                storeQueue[storeWBIdx].inst->seqNum);
+                inst->seqNum);
 
         // @todo: Remove this SC hack once the memory system handles it.
-        if (req->getFlags() & LOCKED) {
-            if (req->getFlags() & UNCACHEABLE) {
+        if (req->isLocked()) {
+            if (req->isUncacheable()) {
                 req->setScResult(2);
             } else {
                 if (cpu->lockFlag) {
                     req->setScResult(1);
+                    DPRINTF(LSQUnit, "Store conditional [sn:%lli] succeeded.",
+                            inst->seqNum);
                 } else {
                     req->setScResult(0);
                     // Hack: Instantly complete this store.
-                    completeDataAccess(data_pkt);
+//                    completeDataAccess(data_pkt);
+                    DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed.  "
+                            "Instantly completing it.\n",
+                            inst->seqNum);
+                    WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
+                    wb->schedule(curTick + 1);
+                    delete state;
+                    completeStore(storeWBIdx);
                     incrStIdx(storeWBIdx);
                     continue;
                 }
@@ -633,7 +642,13 @@ LSQUnit<Impl>::writebackStores()
         }
 
         if (!dcachePort->sendTiming(data_pkt)) {
+            if (data_pkt->result == Packet::BadAddress) {
+                panic("LSQ sent out a bad address for a completed store!");
+            }
             // Need to handle becoming blocked on a store.
+            DPRINTF(IEW, "D-Cache became blcoked when writing [sn:%lli], will"
+                    "retry later\n",
+                    inst->seqNum);
             isStoreBlocked = true;
             ++lsqCacheBlocked;
             assert(retryPkt == NULL);
@@ -880,6 +895,9 @@ LSQUnit<Impl>::recvRetry()
         assert(retryPkt != NULL);
 
         if (dcachePort->sendTiming(retryPkt)) {
+            if (retryPkt->result == Packet::BadAddress) {
+                panic("LSQ sent out a bad address for a completed store!");
+            }
             storePostSend(retryPkt);
             retryPkt = NULL;
             isStoreBlocked = false;
diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh
index 25e1db21c..2bc194d53 100755
--- a/src/cpu/o3/thread_context_impl.hh
+++ b/src/cpu/o3/thread_context_impl.hh
@@ -165,14 +165,14 @@ template <class Impl>
 void
 O3ThreadContext<Impl>::deallocate(int delay)
 {
-    DPRINTF(O3CPU, "Calling deallocate on Thread Context %d\n",
-            getThreadNum());
+    DPRINTF(O3CPU, "Calling deallocate on Thread Context %d delay %d\n",
+            getThreadNum(), delay);
 
     if (thread->status() == ThreadContext::Unallocated)
         return;
 
     thread->setStatus(ThreadContext::Unallocated);
-    cpu->deallocateContext(thread->readTid(), delay);
+    cpu->deallocateContext(thread->readTid(), true, delay);
 }
 
 template <class Impl>
diff --git a/src/cpu/ozone/back_end.hh b/src/cpu/ozone/back_end.hh
index 9bab6a964..8debd277d 100644
--- a/src/cpu/ozone/back_end.hh
+++ b/src/cpu/ozone/back_end.hh
@@ -493,7 +493,7 @@ BackEnd<Impl>::read(RequestPtr req, T &data, int load_idx)
     }
 */
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
         recordEvent("Uncached Read");
 */
     return LSQ.read(req, data, load_idx);
@@ -534,7 +534,7 @@ BackEnd<Impl>::write(RequestPtr req, T &data, int store_idx)
         *res = memReq->result;
         */
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
         recordEvent("Uncached Write");
 */
     return LSQ.write(req, data, store_idx);
diff --git a/src/cpu/ozone/back_end_impl.hh b/src/cpu/ozone/back_end_impl.hh
index ac3218c02..4078699fe 100644
--- a/src/cpu/ozone/back_end_impl.hh
+++ b/src/cpu/ozone/back_end_impl.hh
@@ -1256,7 +1256,7 @@ BackEnd<Impl>::executeInsts()
 
 //                ++iewExecStoreInsts;
 
-                if (!(inst->req->flags & LOCKED)) {
+                if (!(inst->req->isLocked())) {
                     inst->setExecuted();
 
                     instToCommit(inst);
diff --git a/src/cpu/ozone/cpu.hh b/src/cpu/ozone/cpu.hh
index 8c5be9424..70ec1d101 100644
--- a/src/cpu/ozone/cpu.hh
+++ b/src/cpu/ozone/cpu.hh
@@ -455,12 +455,12 @@ class OzoneCPU : public BaseCPU
     {
 #if 0
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
             req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
             req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
         }
 #endif
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
             lockAddrList.insert(req->paddr);
             lockFlag = true;
         }
@@ -489,10 +489,10 @@ class OzoneCPU : public BaseCPU
         ExecContext *xc;
 
         // If this is a store conditional, act appropriately
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
             xc = req->xc;
 
-            if (req->flags & UNCACHEABLE) {
+            if (req->isUncacheable()) {
                 // Don't update result register (see stq_c in isa_desc)
                 req->result = 2;
                 xc->setStCondFailures(0);//Needed? [RGD]
@@ -532,8 +532,8 @@ class OzoneCPU : public BaseCPU
 
 #endif
 
-        if (req->flags & LOCKED) {
-            if (req->flags & UNCACHEABLE) {
+        if (req->isLocked()) {
+            if (req->isUncacheable()) {
                 req->result = 2;
             } else {
                 if (this->lockFlag) {
diff --git a/src/cpu/ozone/front_end.hh b/src/cpu/ozone/front_end.hh
index 5ffd3666e..59cf9785c 100644
--- a/src/cpu/ozone/front_end.hh
+++ b/src/cpu/ozone/front_end.hh
@@ -92,7 +92,7 @@ class FrontEnd
         /** Returns the address ranges of this device. */
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
 
         /** Timing version of receive.  Handles setting fetch to the
          * proper status to start fetching. */
diff --git a/src/cpu/ozone/front_end_impl.hh b/src/cpu/ozone/front_end_impl.hh
index 9a00aefbf..9eff8619d 100644
--- a/src/cpu/ozone/front_end_impl.hh
+++ b/src/cpu/ozone/front_end_impl.hh
@@ -59,7 +59,7 @@ template<class Impl>
 void
 FrontEnd<Impl>::IcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("FrontEnd doesn't expect recvFunctional callback!");
+    warn("FrontEnd doesn't update state from functional calls");
 }
 
 template<class Impl>
@@ -493,7 +493,7 @@ FrontEnd<Impl>::fetchCacheLine()
     if (fault == NoFault) {
 #if 0
         if (cpu->system->memctrl->badaddr(memReq->paddr) ||
-            memReq->flags & UNCACHEABLE) {
+            memReq->isUncacheable()) {
             DPRINTF(FE, "Fetch: Bad address %#x (hopefully on a "
                     "misspeculating path!",
                     memReq->paddr);
diff --git a/src/cpu/ozone/inorder_back_end.hh b/src/cpu/ozone/inorder_back_end.hh
index ffdba2f6c..76eef6fad 100644
--- a/src/cpu/ozone/inorder_back_end.hh
+++ b/src/cpu/ozone/inorder_back_end.hh
@@ -231,7 +231,7 @@ InorderBackEnd<Impl>::read(Addr addr, T &data, unsigned flags)
         }
     }
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
         recordEvent("Uncached Read");
 */
     return fault;
@@ -243,7 +243,7 @@ Fault
 InorderBackEnd<Impl>::read(MemReqPtr &req, T &data)
 {
 #if FULL_SYSTEM && defined(TARGET_ALPHA)
-    if (req->flags & LOCKED) {
+    if (req->isLocked()) {
         req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
         req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
     }
@@ -291,7 +291,7 @@ InorderBackEnd<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res)
     if (res && (fault == NoFault))
         *res = memReq->result;
 /*
-    if (!dcacheInterface && (memReq->flags & UNCACHEABLE))
+    if (!dcacheInterface && (memReq->isUncacheable()))
         recordEvent("Uncached Write");
 */
     return fault;
@@ -306,10 +306,10 @@ InorderBackEnd<Impl>::write(MemReqPtr &req, T &data)
     ExecContext *xc;
 
     // If this is a store conditional, act appropriately
-    if (req->flags & LOCKED) {
+    if (req->isLocked()) {
         xc = req->xc;
 
-        if (req->flags & UNCACHEABLE) {
+        if (req->isUncacheable()) {
             // Don't update result register (see stq_c in isa_desc)
             req->result = 2;
             xc->setStCondFailures(0);//Needed? [RGD]
@@ -391,7 +391,7 @@ InorderBackEnd<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     }
 
 /*
-    if (!dcacheInterface && (req->flags & UNCACHEABLE))
+    if (!dcacheInterface && (req->isUncacheable()))
         recordEvent("Uncached Read");
 */
     return NoFault;
@@ -455,8 +455,8 @@ InorderBackEnd<Impl>::write(MemReqPtr &req, T &data, int store_idx)
         }
     }
 /*
-    if (req->flags & LOCKED) {
-        if (req->flags & UNCACHEABLE) {
+    if (req->isLocked()) {
+        if (req->isUncacheable()) {
             // Don't update result register (see stq_c in isa_desc)
             req->result = 2;
         } else {
@@ -469,7 +469,7 @@ InorderBackEnd<Impl>::write(MemReqPtr &req, T &data, int store_idx)
         *res = req->result;
         */
 /*
-    if (!dcacheInterface && (req->flags & UNCACHEABLE))
+    if (!dcacheInterface && (req->isUncacheable()))
         recordEvent("Uncached Write");
 */
     return NoFault;
diff --git a/src/cpu/ozone/lsq_unit.hh b/src/cpu/ozone/lsq_unit.hh
index 38c1c09a2..056c79521 100644
--- a/src/cpu/ozone/lsq_unit.hh
+++ b/src/cpu/ozone/lsq_unit.hh
@@ -426,7 +426,7 @@ OzoneLSQ<Impl>::read(MemReqPtr &req, T &data, int load_idx)
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
     // @todo: Fix uncached accesses.
-    if (req->flags & UNCACHEABLE &&
+    if (req->isUncacheable() &&
         (load_idx != loadHead || !loadQueue[load_idx]->readyToCommit())) {
 
         return TheISA::genMachineCheckFault();
diff --git a/src/cpu/ozone/lsq_unit_impl.hh b/src/cpu/ozone/lsq_unit_impl.hh
index ee0804036..c46eb90be 100644
--- a/src/cpu/ozone/lsq_unit_impl.hh
+++ b/src/cpu/ozone/lsq_unit_impl.hh
@@ -577,7 +577,7 @@ OzoneLSQ<Impl>::writebackStores()
             MemAccessResult result = dcacheInterface->access(req);
 
             //@todo temp fix for LL/SC (works fine for 1 CPU)
-            if (req->flags & LOCKED) {
+            if (req->isLocked()) {
                 req->result=1;
                 panic("LL/SC! oh no no support!!!");
             }
@@ -596,7 +596,7 @@ OzoneLSQ<Impl>::writebackStores()
                 Event *wb = NULL;
 /*
                 typename IEW::LdWritebackEvent *wb = NULL;
-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                     // Stx_C does not generate a system port transaction.
                     req->result=0;
                     wb = new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
@@ -630,7 +630,7 @@ OzoneLSQ<Impl>::writebackStores()
 //                DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
 //                        storeQueue[storeWBIdx].inst->seqNum);
 
-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                     // Stx_C does not generate a system port transaction.
                     req->result=1;
                     typename BackEnd::LdWritebackEvent *wb =
diff --git a/src/cpu/ozone/lw_lsq.hh b/src/cpu/ozone/lw_lsq.hh
index 6640a9f34..9b93ce74f 100644
--- a/src/cpu/ozone/lw_lsq.hh
+++ b/src/cpu/ozone/lw_lsq.hh
@@ -260,7 +260,7 @@ class OzoneLWLSQ {
 
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
                                             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1); }
 
         virtual bool recvTiming(PacketPtr pkt);
 
@@ -507,7 +507,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
     // @todo: Fix uncached accesses.
-    if (req->getFlags() & UNCACHEABLE &&
+    if (req->isUncacheable() &&
         (inst != loadQueue.back() || !inst->isAtCommit())) {
         DPRINTF(OzoneLSQ, "[sn:%lli] Uncached load and not head of "
                 "commit/LSQ!\n",
@@ -659,7 +659,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx)
         return NoFault;
     }
 
-    if (req->getFlags() & LOCKED) {
+    if (req->isLocked()) {
         cpu->lockFlag = true;
     }
 
diff --git a/src/cpu/ozone/lw_lsq_impl.hh b/src/cpu/ozone/lw_lsq_impl.hh
index 4c96ad149..e523712da 100644
--- a/src/cpu/ozone/lw_lsq_impl.hh
+++ b/src/cpu/ozone/lw_lsq_impl.hh
@@ -72,7 +72,7 @@ template <class Impl>
 void
 OzoneLWLSQ<Impl>::DcachePort::recvFunctional(PacketPtr pkt)
 {
-    panic("O3CPU doesn't expect recvFunctional callback!");
+    warn("O3CPU doesn't update things on a recvFunctional");
 }
 
 template <class Impl>
@@ -394,7 +394,7 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst)
     // Actually probably want the oldest faulting load
     if (load_fault != NoFault) {
         DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum);
-        if (!(inst->req->getFlags() & UNCACHEABLE && !inst->isAtCommit())) {
+        if (!(inst->req->isUncacheable() && !inst->isAtCommit())) {
             inst->setExecuted();
         }
         // Maybe just set it as can commit here, although that might cause
@@ -605,8 +605,8 @@ OzoneLWLSQ<Impl>::writebackStores()
                 inst->seqNum);
 
         // @todo: Remove this SC hack once the memory system handles it.
-        if (req->getFlags() & LOCKED) {
-            if (req->getFlags() & UNCACHEABLE) {
+        if (req->isLocked()) {
+            if (req->isUncacheable()) {
                 req->setScResult(2);
             } else {
                 if (cpu->lockFlag) {
@@ -663,7 +663,7 @@ OzoneLWLSQ<Impl>::writebackStores()
             if (result != MA_HIT && dcacheInterface->doEvents()) {
                 store_event->miss = true;
                 typename BackEnd::LdWritebackEvent *wb = NULL;
-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                     wb = new typename BackEnd::LdWritebackEvent(inst,
                                                             be);
                     store_event->wbEvent = wb;
@@ -690,7 +690,7 @@ OzoneLWLSQ<Impl>::writebackStores()
 //                DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
 //                        inst->seqNum);
 
-                if (req->flags & LOCKED) {
+                if (req->isLocked()) {
                     // Stx_C does not generate a system port
                     // transaction in the 21264, but that might be
                     // hard to accomplish in this model.
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index 7ba1b7df1..490be20ae 100644
--- a/src/cpu/simple/atomic.cc
+++ b/src/cpu/simple/atomic.cc
@@ -28,6 +28,7 @@
  * Authors: Steve Reinhardt
  */
 
+#include "arch/locked_mem.hh"
 #include "arch/utility.hh"
 #include "cpu/exetrace.hh"
 #include "cpu/simple/atomic.hh"
@@ -93,7 +94,7 @@ AtomicSimpleCPU::init()
 bool
 AtomicSimpleCPU::CpuPort::recvTiming(Packet *pkt)
 {
-    panic("AtomicSimpleCPU doesn't expect recvAtomic callback!");
+    panic("AtomicSimpleCPU doesn't expect recvTiming callback!");
     return true;
 }
 
@@ -107,7 +108,8 @@ AtomicSimpleCPU::CpuPort::recvAtomic(Packet *pkt)
 void
 AtomicSimpleCPU::CpuPort::recvFunctional(Packet *pkt)
 {
-    panic("AtomicSimpleCPU doesn't expect recvFunctional callback!");
+    //No internal storage to update, just return
+    return;
 }
 
 void
@@ -133,20 +135,19 @@ AtomicSimpleCPU::AtomicSimpleCPU(Params *p)
 {
     _status = Idle;
 
-    // @todo fix me and get the real cpu id & thread number!!!
     ifetch_req = new Request();
-    ifetch_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    ifetch_req->setThreadContext(p->cpu_id, 0); // Add thread ID if we add MT
     ifetch_pkt = new Packet(ifetch_req, Packet::ReadReq, Packet::Broadcast);
     ifetch_pkt->dataStatic(&inst);
 
     data_read_req = new Request();
-    data_read_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    data_read_req->setThreadContext(p->cpu_id, 0); // Add thread ID here too
     data_read_pkt = new Packet(data_read_req, Packet::ReadReq,
                                Packet::Broadcast);
     data_read_pkt->dataStatic(&dataReg);
 
     data_write_req = new Request();
-    data_write_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    data_write_req->setThreadContext(p->cpu_id, 0); // Add thread ID here too
     data_write_pkt = new Packet(data_write_req, Packet::WriteReq,
                                 Packet::Broadcast);
 }
@@ -161,9 +162,11 @@ AtomicSimpleCPU::serialize(ostream &os)
 {
     SimObject::State so_state = SimObject::getState();
     SERIALIZE_ENUM(so_state);
+    Status _status = status();
+    SERIALIZE_ENUM(_status);
+    BaseSimpleCPU::serialize(os);
     nameOut(os, csprintf("%s.tickEvent", name()));
     tickEvent.serialize(os);
-    BaseSimpleCPU::serialize(os);
 }
 
 void
@@ -171,8 +174,9 @@ AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
 {
     SimObject::State so_state;
     UNSERIALIZE_ENUM(so_state);
-    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+    UNSERIALIZE_ENUM(_status);
     BaseSimpleCPU::unserialize(cp, section);
+    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
 }
 
 void
@@ -253,29 +257,36 @@ template <class T>
 Fault
 AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
 {
-    data_read_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    // use the CPU's statically allocated read request and packet objects
+    Request *req = data_read_req;
+    Packet  *pkt = data_read_pkt;
+
+    req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
 
     if (traceData) {
         traceData->setAddr(addr);
     }
 
     // translate to physical address
-    Fault fault = thread->translateDataReadReq(data_read_req);
+    Fault fault = thread->translateDataReadReq(req);
 
     // Now do the access.
     if (fault == NoFault) {
-        data_read_pkt->reinitFromRequest();
+        pkt->reinitFromRequest();
 
-        dcache_latency = dcachePort.sendAtomic(data_read_pkt);
+        dcache_latency = dcachePort.sendAtomic(pkt);
         dcache_access = true;
 
-        assert(data_read_pkt->result == Packet::Success);
-        data = data_read_pkt->get<T>();
+        assert(pkt->result == Packet::Success);
+        data = pkt->get<T>();
 
+        if (req->isLocked()) {
+            TheISA::handleLockedRead(thread, req);
+        }
     }
 
     // This will need a new way to tell if it has a dcache attached.
-    if (data_read_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
         recordEvent("Uncached Read");
 
     return fault;
@@ -328,33 +339,52 @@ template <class T>
 Fault
 AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
 {
-    data_write_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    // use the CPU's statically allocated write request and packet objects
+    Request *req = data_write_req;
+    Packet  *pkt = data_write_pkt;
+
+    req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
 
     if (traceData) {
         traceData->setAddr(addr);
     }
 
     // translate to physical address
-    Fault fault = thread->translateDataWriteReq(data_write_req);
+    Fault fault = thread->translateDataWriteReq(req);
 
     // Now do the access.
     if (fault == NoFault) {
-        data = htog(data);
-        data_write_pkt->reinitFromRequest();
-        data_write_pkt->dataStatic(&data);
+        bool do_access = true;  // flag to suppress cache access
 
-        dcache_latency = dcachePort.sendAtomic(data_write_pkt);
-        dcache_access = true;
+        if (req->isLocked()) {
+            do_access = TheISA::handleLockedWrite(thread, req);
+        }
+
+        if (do_access) {
+            data = htog(data);
+            pkt->reinitFromRequest();
+            pkt->dataStatic(&data);
 
-        assert(data_write_pkt->result == Packet::Success);
+            dcache_latency = dcachePort.sendAtomic(pkt);
+            dcache_access = true;
 
-        if (res && data_write_req->getFlags() & LOCKED) {
-            *res = data_write_req->getScResult();
+            assert(pkt->result == Packet::Success);
+        }
+
+        if (req->isLocked()) {
+            uint64_t scResult = req->getScResult();
+            if (scResult != 0) {
+                // clear failure counter
+                thread->setStCondFailures(0);
+            }
+            if (res) {
+                *res = req->getScResult();
+            }
         }
     }
 
     // This will need a new way to tell if it's hooked up to a cache or not.
-    if (data_write_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
         recordEvent("Uncached Write");
 
     // If the write needs to have a fault on the access, consider calling
@@ -467,11 +497,11 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
     Param<Tick> progress_interval;
     SimObjectParam<MemObject *> mem;
     SimObjectParam<System *> system;
+    Param<int> cpu_id;
 
 #if FULL_SYSTEM
     SimObjectParam<AlphaITB *> itb;
     SimObjectParam<AlphaDTB *> dtb;
-    Param<int> cpu_id;
     Param<Tick> profile;
 #else
     SimObjectParam<Process *> workload;
@@ -500,11 +530,11 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(AtomicSimpleCPU)
     INIT_PARAM(progress_interval, "Progress interval"),
     INIT_PARAM(mem, "memory"),
     INIT_PARAM(system, "system object"),
+    INIT_PARAM(cpu_id, "processor ID"),
 
 #if FULL_SYSTEM
     INIT_PARAM(itb, "Instruction TLB"),
     INIT_PARAM(dtb, "Data TLB"),
-    INIT_PARAM(cpu_id, "processor ID"),
     INIT_PARAM(profile, ""),
 #else
     INIT_PARAM(workload, "processes to run"),
@@ -538,11 +568,11 @@ CREATE_SIM_OBJECT(AtomicSimpleCPU)
     params->simulate_stalls = simulate_stalls;
     params->mem = mem;
     params->system = system;
+    params->cpu_id = cpu_id;
 
 #if FULL_SYSTEM
     params->itb = itb;
     params->dtb = dtb;
-    params->cpu_id = cpu_id;
     params->profile = profile;
 #else
     params->process = workload;
diff --git a/src/cpu/simple/atomic.hh b/src/cpu/simple/atomic.hh
index b602af558..52afd76ef 100644
--- a/src/cpu/simple/atomic.hh
+++ b/src/cpu/simple/atomic.hh
@@ -104,9 +104,9 @@ class AtomicSimpleCPU : public BaseSimpleCPU
 
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
-    };
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
 
+    };
     CpuPort icachePort;
     CpuPort dcachePort;
 
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index 03ee27e04..33f673cbc 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -28,6 +28,7 @@
  * Authors: Steve Reinhardt
  */
 
+#include "arch/locked_mem.hh"
 #include "arch/utility.hh"
 #include "cpu/exetrace.hh"
 #include "cpu/simple/timing.hh"
@@ -73,7 +74,8 @@ TimingSimpleCPU::CpuPort::recvAtomic(Packet *pkt)
 void
 TimingSimpleCPU::CpuPort::recvFunctional(Packet *pkt)
 {
-    panic("TimingSimpleCPU doesn't expect recvFunctional callback!");
+    //No internal storage to update, jusst return
+    return;
 }
 
 void
@@ -94,12 +96,14 @@ TimingSimpleCPU::CpuPort::TickEvent::schedule(Packet *_pkt, Tick t)
 }
 
 TimingSimpleCPU::TimingSimpleCPU(Params *p)
-    : BaseSimpleCPU(p), icachePort(this, p->clock), dcachePort(this, p->clock)
+    : BaseSimpleCPU(p), icachePort(this, p->clock), dcachePort(this, p->clock),
+      cpu_id(p->cpu_id)
 {
     _status = Idle;
     ifetch_pkt = dcache_pkt = NULL;
     drainEvent = NULL;
     fetchEvent = NULL;
+    previousTick = 0;
     changeState(SimObject::Running);
 }
 
@@ -158,6 +162,7 @@ TimingSimpleCPU::resume()
 
     assert(system->getMemoryMode() == System::Timing);
     changeState(SimObject::Running);
+    previousTick = curTick;
 }
 
 void
@@ -165,6 +170,7 @@ TimingSimpleCPU::switchOut()
 {
     assert(status() == Running || status() == Idle);
     _status = SwitchedOut;
+    numCycles += curTick - previousTick;
 
     // If we've been scheduled to resume but are then told to switch out,
     // we'll need to cancel it.
@@ -187,6 +193,27 @@ TimingSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
             break;
         }
     }
+
+    if (_status != Running) {
+        _status = Idle;
+    }
+
+    Port *peer;
+    if (icachePort.getPeer() == NULL) {
+        peer = oldCPU->getPort("icache_port")->getPeer();
+        icachePort.setPeer(peer);
+    } else {
+        peer = icachePort.getPeer();
+    }
+    peer->setPeer(&icachePort);
+
+    if (dcachePort.getPeer() == NULL) {
+        peer = oldCPU->getPort("dcache_port")->getPeer();
+        dcachePort.setPeer(peer);
+    } else {
+        peer = dcachePort.getPeer();
+    }
+    peer->setPeer(&dcachePort);
 }
 
 
@@ -227,35 +254,35 @@ template <class T>
 Fault
 TimingSimpleCPU::read(Addr addr, T &data, unsigned flags)
 {
-    // need to fill in CPU & thread IDs here
-    Request *data_read_req = new Request();
-    data_read_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
-    data_read_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    Request *req =
+        new Request(/* asid */ 0, addr, sizeof(T), flags, thread->readPC(),
+                    cpu_id, /* thread ID */ 0);
 
     if (traceData) {
-        traceData->setAddr(data_read_req->getVaddr());
+        traceData->setAddr(req->getVaddr());
     }
 
    // translate to physical address
-    Fault fault = thread->translateDataReadReq(data_read_req);
+    Fault fault = thread->translateDataReadReq(req);
 
     // Now do the access.
     if (fault == NoFault) {
-        Packet *data_read_pkt =
-            new Packet(data_read_req, Packet::ReadReq, Packet::Broadcast);
-        data_read_pkt->dataDynamic<T>(new T);
+        Packet *pkt =
+            new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        pkt->dataDynamic<T>(new T);
 
-        if (!dcachePort.sendTiming(data_read_pkt)) {
+        if (!dcachePort.sendTiming(pkt)) {
             _status = DcacheRetry;
-            dcache_pkt = data_read_pkt;
+            dcache_pkt = pkt;
         } else {
             _status = DcacheWaitResponse;
+            // memory system takes ownership of packet
             dcache_pkt = NULL;
         }
     }
 
     // This will need a new way to tell if it has a dcache attached.
-    if (data_read_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
         recordEvent("Uncached Read");
 
     return fault;
@@ -308,31 +335,39 @@ template <class T>
 Fault
 TimingSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
 {
-    // need to fill in CPU & thread IDs here
-    Request *data_write_req = new Request();
-    data_write_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
-    data_write_req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
+    Request *req =
+        new Request(/* asid */ 0, addr, sizeof(T), flags, thread->readPC(),
+                    cpu_id, /* thread ID */ 0);
 
     // translate to physical address
-    Fault fault = thread->translateDataWriteReq(data_write_req);
+    Fault fault = thread->translateDataWriteReq(req);
+
     // Now do the access.
     if (fault == NoFault) {
-        Packet *data_write_pkt =
-            new Packet(data_write_req, Packet::WriteReq, Packet::Broadcast);
-        data_write_pkt->allocate();
-        data_write_pkt->set(data);
+        assert(dcache_pkt == NULL);
+        dcache_pkt = new Packet(req, Packet::WriteReq, Packet::Broadcast);
+        dcache_pkt->allocate();
+        dcache_pkt->set(data);
 
-        if (!dcachePort.sendTiming(data_write_pkt)) {
-            _status = DcacheRetry;
-            dcache_pkt = data_write_pkt;
-        } else {
-            _status = DcacheWaitResponse;
-            dcache_pkt = NULL;
+        bool do_access = true;  // flag to suppress cache access
+
+        if (req->isLocked()) {
+            do_access = TheISA::handleLockedWrite(thread, req);
+        }
+
+        if (do_access) {
+            if (!dcachePort.sendTiming(dcache_pkt)) {
+                _status = DcacheRetry;
+            } else {
+                _status = DcacheWaitResponse;
+                // memory system takes ownership of packet
+                dcache_pkt = NULL;
+            }
         }
     }
 
     // This will need a new way to tell if it's hooked up to a cache or not.
-    if (data_write_req->getFlags() & UNCACHEABLE)
+    if (req->isUncacheable())
         recordEvent("Uncached Write");
 
     // If the write needs to have a fault on the access, consider calling
@@ -392,9 +427,8 @@ TimingSimpleCPU::fetch()
 {
     checkForInterrupts();
 
-    // need to fill in CPU & thread IDs here
     Request *ifetch_req = new Request();
-    ifetch_req->setThreadContext(0,0); //Need CPU/Thread IDS HERE
+    ifetch_req->setThreadContext(cpu_id, /* thread ID */ 0);
     Fault fault = setupFetchRequest(ifetch_req);
 
     ifetch_pkt = new Packet(ifetch_req, Packet::ReadReq, Packet::Broadcast);
@@ -414,6 +448,9 @@ TimingSimpleCPU::fetch()
         // fetch fault: advance directly to next instruction (fault handler)
         advanceInst(fault);
     }
+
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
 }
 
 
@@ -444,6 +481,9 @@ TimingSimpleCPU::completeIfetch(Packet *pkt)
     delete pkt->req;
     delete pkt;
 
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
+
     if (getState() == SimObject::Draining) {
         completeDrain();
         return;
@@ -453,12 +493,20 @@ TimingSimpleCPU::completeIfetch(Packet *pkt)
     if (curStaticInst->isMemRef() && !curStaticInst->isDataPrefetch()) {
         // load or store: just send to dcache
         Fault fault = curStaticInst->initiateAcc(this, traceData);
-        if (fault == NoFault) {
-            // successfully initiated access: instruction will
-            // complete in dcache response callback
-            assert(_status == DcacheWaitResponse);
+        if (_status != Running) {
+            // instruction will complete in dcache response callback
+            assert(_status == DcacheWaitResponse || _status == DcacheRetry);
+            assert(fault == NoFault);
         } else {
-            // fault: complete now to invoke fault handler
+            if (fault == NoFault) {
+                // early fail on store conditional: complete now
+                assert(dcache_pkt != NULL);
+                fault = curStaticInst->completeAcc(dcache_pkt, this,
+                                                   traceData);
+                delete dcache_pkt->req;
+                delete dcache_pkt;
+                dcache_pkt = NULL;
+            }
             postExecute();
             advanceInst(fault);
         }
@@ -479,8 +527,7 @@ TimingSimpleCPU::IcachePort::ITickEvent::process()
 bool
 TimingSimpleCPU::IcachePort::recvTiming(Packet *pkt)
 {
-    // These next few lines could be replaced with something faster
-    // who knows what though
+    // delay processing of returned data until next CPU clock edge
     Tick time = pkt->req->getTime();
     while (time < curTick)
         time += lat;
@@ -516,21 +563,27 @@ TimingSimpleCPU::completeDataAccess(Packet *pkt)
     assert(_status == DcacheWaitResponse);
     _status = Running;
 
-    if (getState() == SimObject::Draining) {
-        completeDrain();
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
 
-        delete pkt->req;
-        delete pkt;
+    Fault fault = curStaticInst->completeAcc(pkt, this, traceData);
 
-        return;
+    if (pkt->isRead() && pkt->req->isLocked()) {
+        TheISA::handleLockedRead(thread, pkt->req);
     }
 
-    Fault fault = curStaticInst->completeAcc(pkt, this, traceData);
-
     delete pkt->req;
     delete pkt;
 
     postExecute();
+
+    if (getState() == SimObject::Draining) {
+        advancePC(fault);
+        completeDrain();
+
+        return;
+    }
+
     advanceInst(fault);
 }
 
@@ -546,6 +599,7 @@ TimingSimpleCPU::completeDrain()
 bool
 TimingSimpleCPU::DcachePort::recvTiming(Packet *pkt)
 {
+    // delay processing of returned data until next CPU clock edge
     Tick time = pkt->req->getTime();
     while (time < curTick)
         time += lat;
@@ -574,6 +628,7 @@ TimingSimpleCPU::DcachePort::recvRetry()
     Packet *tmp = cpu->dcache_pkt;
     if (sendTiming(tmp)) {
         cpu->_status = DcacheWaitResponse;
+        // memory system takes ownership of packet
         cpu->dcache_pkt = NULL;
     }
 }
@@ -592,11 +647,11 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(TimingSimpleCPU)
     Param<Tick> progress_interval;
     SimObjectParam<MemObject *> mem;
     SimObjectParam<System *> system;
+    Param<int> cpu_id;
 
 #if FULL_SYSTEM
     SimObjectParam<AlphaITB *> itb;
     SimObjectParam<AlphaDTB *> dtb;
-    Param<int> cpu_id;
     Param<Tick> profile;
 #else
     SimObjectParam<Process *> workload;
@@ -625,11 +680,11 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(TimingSimpleCPU)
     INIT_PARAM(progress_interval, "Progress interval"),
     INIT_PARAM(mem, "memory"),
     INIT_PARAM(system, "system object"),
+    INIT_PARAM(cpu_id, "processor ID"),
 
 #if FULL_SYSTEM
     INIT_PARAM(itb, "Instruction TLB"),
     INIT_PARAM(dtb, "Data TLB"),
-    INIT_PARAM(cpu_id, "processor ID"),
     INIT_PARAM(profile, ""),
 #else
     INIT_PARAM(workload, "processes to run"),
@@ -661,11 +716,11 @@ CREATE_SIM_OBJECT(TimingSimpleCPU)
     params->functionTraceStart = function_trace_start;
     params->mem = mem;
     params->system = system;
+    params->cpu_id = cpu_id;
 
 #if FULL_SYSTEM
     params->itb = itb;
     params->dtb = dtb;
-    params->cpu_id = cpu_id;
     params->profile = profile;
 #else
     params->process = workload;
diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh
index d03fa4bc0..988ddeded 100644
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -92,7 +92,7 @@ class TimingSimpleCPU : public BaseSimpleCPU
 
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
 
         struct TickEvent : public Event
         {
@@ -166,6 +166,9 @@ class TimingSimpleCPU : public BaseSimpleCPU
     Packet *ifetch_pkt;
     Packet *dcache_pkt;
 
+    int cpu_id;
+    Tick previousTick;
+
   public:
 
     virtual Port *getPort(const std::string &if_name, int idx = -1);
diff --git a/src/cpu/simple_thread.hh b/src/cpu/simple_thread.hh
index 242cfd0e1..6fa6500bd 100644
--- a/src/cpu/simple_thread.hh
+++ b/src/cpu/simple_thread.hh
@@ -237,7 +237,7 @@ class SimpleThread : public ThreadState
     Fault read(RequestPtr &req, T &data)
     {
 #if FULL_SYSTEM && THE_ISA == ALPHA_ISA
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
             req->xc->setMiscReg(TheISA::Lock_Addr_DepTag, req->paddr);
             req->xc->setMiscReg(TheISA::Lock_Flag_DepTag, true);
         }
@@ -256,10 +256,10 @@ class SimpleThread : public ThreadState
         ExecContext *xc;
 
         // If this is a store conditional, act appropriately
-        if (req->flags & LOCKED) {
+        if (req->isLocked()) {
             xc = req->xc;
 
-            if (req->flags & UNCACHEABLE) {
+            if (req->isUncacheable()) {
                 // Don't update result register (see stq_c in isa_desc)
                 req->result = 2;
                 xc->setStCondFailures(0);//Needed? [RGD]