13 files changed, 389 insertions, 70 deletions
diff --git a/src/mem/bus.cc b/src/mem/bus.cc
index cf9e54e62..3c5283a77 100644
--- a/src/mem/bus.cc
+++ b/src/mem/bus.cc
@@ -252,6 +252,7 @@ Bus::recvFunctional(Packet *pkt)
     DPRINTF(Bus, "recvFunctional: packet src %d dest %d addr 0x%x cmd %s\n",
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
     assert(pkt->getDest() == Packet::Broadcast);
+    atomicSnoop(pkt);
     findPort(pkt->getAddr(), pkt->getSrc())->sendFunctional(pkt);
 }
 
diff --git a/src/mem/cache/base_cache.cc b/src/mem/cache/base_cache.cc
index a172847df..e6138e320 100644
--- a/src/mem/cache/base_cache.cc
+++ b/src/mem/cache/base_cache.cc
@@ -199,7 +199,9 @@ BaseCache::CacheEvent::process()
         return;
     }
     //Response
-    //Know the packet to send, no need to mark in service (must succed)
+    //Know the packet to send
+    pkt->result = Packet::Success;
+    pkt->makeTimingResponse();
     assert(cachePort->sendTiming(pkt));
 }
 
diff --git a/src/mem/cache/base_cache.hh b/src/mem/cache/base_cache.hh
index 069dbab58..7c16398aa 100644
--- a/src/mem/cache/base_cache.hh
+++ b/src/mem/cache/base_cache.hh
@@ -127,6 +127,8 @@ class BaseCache : public MemObject
     CachePort *cpuSidePort;
     CachePort *memSidePort;
 
+    bool snoopRangesSent;
+
   public:
     virtual Port *getPort(const std::string &if_name, int idx = -1);
 
@@ -149,17 +151,22 @@ class BaseCache : public MemObject
 
     void recvStatusChange(Port::Status status, bool isCpuSide)
     {
-        if (status == Port::RangeChange)
-        {
-            if (!isCpuSide)
-            {
+        if (status == Port::RangeChange){
+            if (!isCpuSide) {
                 cpuSidePort->sendStatusChange(Port::RangeChange);
+                if (topLevelCache && !snoopRangesSent) {
+                    snoopRangesSent = true;
+                    memSidePort->sendStatusChange(Port::RangeChange);
+                }
             }
-            else
-            {
+            else {
                 memSidePort->sendStatusChange(Port::RangeChange);
             }
         }
+        else if (status == Port::SnoopSquash) {
+            assert(snoopPhase2);
+            snoopPhase2 = false;
+        }
     }
 
     virtual Packet *getPacket()
@@ -205,6 +212,10 @@ class BaseCache : public MemObject
     /** True if this cache is connected to the CPU. */
     bool topLevelCache;
 
+
+    /** True if we are now in phase 2 of the snoop process. */
+    bool snoopPhase2;
+
     /** Stores time the cache blocked for statistics. */
     Tick blockedCycle;
 
@@ -332,6 +343,7 @@ class BaseCache : public MemObject
         //Start ports at null if more than one is created we should panic
         cpuSidePort = NULL;
         memSidePort = NULL;
+        snoopRangesSent = false;
     }
 
     virtual void init();
@@ -382,9 +394,12 @@ class BaseCache : public MemObject
             blocked_causes[cause]++;
             blockedCycle = curTick;
         }
-        blocked |= flag;
-        DPRINTF(Cache,"Blocking for cause %s\n", cause);
-        cpuSidePort->setBlocked();
+        if (!(blocked & flag)) {
+            //Wasn't already blocked for this cause
+            blocked |= flag;
+            DPRINTF(Cache,"Blocking for cause %s\n", cause);
+            cpuSidePort->setBlocked();
+        }
     }
 
     /**
@@ -395,8 +410,11 @@ class BaseCache : public MemObject
     void setBlockedForSnoop(BlockedCause cause)
     {
         uint8_t flag = 1 << cause;
-        blockedSnoop |= flag;
-        memSidePort->setBlocked();
+        if (!(blockedSnoop & flag)) {
+            //Not yet snoop-blocked for this cause (check blockedSnoop, not the CPU-side 'blocked' mask)
+            blockedSnoop |= flag;
+            memSidePort->setBlocked();
+        }
     }
 
     /**
@@ -503,8 +521,6 @@ class BaseCache : public MemObject
      */
     void respond(Packet *pkt, Tick time)
     {
-        pkt->makeTimingResponse();
-        pkt->result = Packet::Success;
         CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
         reqCpu->schedule(time);
     }
@@ -517,10 +533,8 @@ class BaseCache : public MemObject
     void respondToMiss(Packet *pkt, Tick time)
     {
         if (!pkt->req->isUncacheable()) {
-            missLatency[pkt->cmdToIndex()][pkt->req->getThreadNum()] += time - pkt->time;
+            missLatency[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] += time - pkt->time;
         }
-        pkt->makeTimingResponse();
-        pkt->result = Packet::Success;
         CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
         reqCpu->schedule(time);
     }
@@ -529,10 +543,12 @@ class BaseCache : public MemObject
      * Suppliess the data if cache to cache transfers are enabled.
      * @param pkt The bus transaction to fulfill.
      */
-    void respondToSnoop(Packet *pkt)
+    void respondToSnoop(Packet *pkt, Tick time)
     {
-        assert("Implement\n" && 0);
+//        assert("Implement\n" && 0);
 //	mi->respond(pkt,curTick + hitLatency);
+        CacheEvent *reqMem = new CacheEvent(memSidePort, pkt);
+        reqMem->schedule(time);
     }
 
     /**
@@ -551,6 +567,16 @@ class BaseCache : public MemObject
         else
         {
             //This is where snoops get updated
+            AddrRangeList dummy;
+            if (!topLevelCache)
+            {
+                cpuSidePort->getPeerAddressRanges(dummy, snoop);
+            }
+            else
+            {
+                snoop.push_back(RangeSize(0,-1));
+            }
+
             return;
         }
     }
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index 989b8743e..4b8870c95 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -251,7 +251,7 @@ class Cache : public BaseCache
      * request.
      * @return The estimated completion time.
      */
-    Tick probe(Packet * &pkt, bool update);
+    Tick probe(Packet * &pkt, bool update, CachePort * otherSidePort);
 
     /**
      * Snoop for the provided request in the cache and return the estimated
@@ -262,7 +262,7 @@ class Cache : public BaseCache
      * request.
      * @return The estimated completion time.
      */
-    Tick snoopProbe(Packet * &pkt, bool update);
+    Tick snoopProbe(Packet * &pkt);
 };
 
 #endif // __CACHE_HH__
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index 593dbecf3..00fecc2b7 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -63,14 +63,26 @@ doTimingAccess(Packet *pkt, CachePort *cachePort, bool isCpuSide)
         if (pkt->isWrite() && (pkt->req->getFlags() & LOCKED)) {
             pkt->req->setScResult(1);
         }
-        access(pkt);
+        if (!(pkt->flags & SATISFIED)) {
+            access(pkt);
+        }
     }
     else
     {
         if (pkt->isResponse())
             handleResponse(pkt);
-        else
-            snoop(pkt);
+        else {
+            //Phase 1 of the snoop: only record that phase 2 comes next
+            if (!snoopPhase2) {
+                snoopPhase2 = true;
+            }
+            else {
+                //Phase 2: snoop only if the SNOOP_COMMIT bit is set (bitwise test, not logical &&)
+                if (pkt->flags & SNOOP_COMMIT)
+                    snoop(pkt);
+                snoopPhase2 = false;
+            }
+        }
     }
     return true;
 }
@@ -87,7 +99,7 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
             pkt->req->setScResult(1);
         }
 
-        probe(pkt, true);
+        probe(pkt, true, NULL);
         //TEMP ALWAYS SUCCES FOR NOW
         pkt->result = Packet::Success;
     }
@@ -96,7 +108,7 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
         if (pkt->isResponse())
             handleResponse(pkt);
         else
-            snoopProbe(pkt, true);
+            snoopProbe(pkt);
     }
     //Fix this timing info
     return hitLatency;
@@ -117,16 +129,13 @@ doFunctionalAccess(Packet *pkt, bool isCpuSide)
             assert("Can't handle LL/SC on functional path\n");
         }
 
-        probe(pkt, true);
+        probe(pkt, false, memSidePort);
         //TEMP ALWAYS SUCCESFUL FOR NOW
         pkt->result = Packet::Success;
     }
     else
     {
-        if (pkt->isResponse())
-            handleResponse(pkt);
-        else
-            snoopProbe(pkt, true);
+            probe(pkt, false, cpuSidePort);
     }
 }
 
@@ -239,7 +248,7 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
             pkt->getAddr() & ~((Addr)blkSize - 1), pkt->req->getPC());
     if (blk) {
         // Hit
-        hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         // clear dirty bit if write through
         if (pkt->needsResponse())
             respond(pkt, curTick+lat);
@@ -249,7 +258,7 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
 
     // Miss
     if (!pkt->req->isUncacheable()) {
-        misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         /** @todo Move miss count code into BaseCache */
         if (missCount) {
             --missCount;
@@ -270,7 +279,7 @@ Cache<TagStore,Buffering,Coherence>::getPacket()
     Packet * pkt = missQueue->getPacket();
     if (pkt) {
         if (!pkt->req->isUncacheable()) {
-            if (pkt->cmd == Packet::HardPFReq) misses[Packet::HardPFReq][pkt->req->getThreadNum()]++;
+            if (pkt->cmd == Packet::HardPFReq) misses[Packet::HardPFReq][0/*pkt->req->getThreadNum()*/]++;
             BlkType *blk = tags->findBlock(pkt);
             Packet::Command cmd = coherence->getBusCmd(pkt->cmd,
                                               (blk)? blk->status : 0);
@@ -314,7 +323,7 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
             PacketList writebacks;
             blk = tags->handleFill(blk, (MSHR*)pkt->senderState,
                                    coherence->getNewState(pkt,old_state),
-                                   writebacks);
+                                   writebacks, pkt);
             while (!writebacks.empty()) {
                     missQueue->doWriteback(writebacks.front());
             }
@@ -372,7 +381,6 @@ template<class TagStore, class Buffering, class Coherence>
 void
 Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
 {
-
     Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
     BlkType *blk = tags->findBlock(pkt);
     MSHR *mshr = missQueue->findMSHR(blk_addr);
@@ -385,7 +393,10 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     //If the outstanding request was an invalidate (upgrade,readex,..)
                     //Then we need to ACK the request until we get the data
                     //Also NACK if the outstanding request is not a cachefill (writeback)
+                    pkt->flags |= SATISFIED;
                     pkt->flags |= NACKED_LINE;
+                    assert("Don't detect these on the other side yet\n");
+                    respondToSnoop(pkt, curTick + hitLatency);
                     return;
                 }
                 else {
@@ -398,6 +409,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     //@todo Make it so that a read to a pending read can't be exclusive now.
 
                     //Set the address so find match works
+                    assert("Don't have invalidates yet\n");
                     invalidatePkt->addrOverride(pkt->getAddr());
 
                     //Append the invalidate on
@@ -433,7 +445,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                         assert(offset + pkt->getSize() <=blkSize);
                         memcpy(pkt->getPtr<uint8_t>(), mshr->pkt->getPtr<uint8_t>() + offset, pkt->getSize());
 
-                        respondToSnoop(pkt);
+                        respondToSnoop(pkt, curTick + hitLatency);
                     }
 
                     if (pkt->isInvalidate()) {
@@ -449,7 +461,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
     bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
     if (satisfy) {
         tags->handleSnoop(blk, new_state, pkt);
-        respondToSnoop(pkt);
+        respondToSnoop(pkt, curTick + hitLatency);
         return;
     }
     tags->handleSnoop(blk, new_state);
@@ -486,7 +498,7 @@ Cache<TagStore,Buffering,Coherence>::invalidateBlk(Addr addr)
  */
 template<class TagStore, class Buffering, class Coherence>
 Tick
-Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
+Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort* otherSidePort)
 {
 //    MemDebug::cacheProbe(pkt);
     if (!pkt->req->isUncacheable()) {
@@ -517,7 +529,8 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
         missQueue->findWrites(blk_addr, writes);
 
         if (!update) {
-            memSidePort->sendFunctional(pkt);
+                otherSidePort->sendFunctional(pkt);
+
             // Check for data in MSHR and writebuffer.
             if (mshr) {
                 warn("Found outstanding miss on an non-update probe");
@@ -612,12 +625,15 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
 
                 lat = memSidePort->sendAtomic(busPkt);
 
+                //Be sure to flip the request to a response for coherence
+                busPkt->makeAtomicResponse();
+
 /*		if (!(busPkt->flags & SATISFIED)) {
                     // blocked at a higher level, just return
                     return 0;
                 }
 
-*/		misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+*/		misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
 
                 CacheBlk::State old_state = (blk) ? blk->status : 0;
                 tags->handleFill(blk, busPkt,
@@ -642,10 +658,10 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
         }
 
         if (update) {
-            hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         } else if (pkt->isWrite()) {
             // Still need to change data in all locations.
-            return memSidePort->sendAtomic(pkt);
+            return otherSidePort->sendAtomic(pkt);
         }
         return curTick + lat;
     }
@@ -655,18 +671,18 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update)
 
 template<class TagStore, class Buffering, class Coherence>
 Tick
-Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt, bool update)
+Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt)
 {
-    Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
-    BlkType *blk = tags->findBlock(pkt);
-    MSHR *mshr = missQueue->findMSHR(blk_addr);
-    CacheBlk::State new_state = 0;
-    bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
-    if (satisfy) {
-        tags->handleSnoop(blk, new_state, pkt);
-        return hitLatency;
-    }
-    tags->handleSnoop(blk, new_state);
-    return 0;
+        Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
+        BlkType *blk = tags->findBlock(pkt);
+        MSHR *mshr = missQueue->findMSHR(blk_addr);
+        CacheBlk::State new_state = 0;
+        bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
+        if (satisfy) {
+            tags->handleSnoop(blk, new_state, pkt);
+            return hitLatency;
+        }
+        tags->handleSnoop(blk, new_state);
+        return 0;
 }
 
diff --git a/src/mem/cache/miss/blocking_buffer.cc b/src/mem/cache/miss/blocking_buffer.cc
index 67fc7ae56..7a6ea9133 100644
--- a/src/mem/cache/miss/blocking_buffer.cc
+++ b/src/mem/cache/miss/blocking_buffer.cc
@@ -189,7 +189,7 @@ BlockingBuffer::squash(int threadNum)
     if (miss.threadNum == threadNum) {
         Packet * target = miss.getTarget();
         miss.popTarget();
-        assert(target->req->getThreadNum() == threadNum);
+        assert(0/*target->req->getThreadNum()*/ == threadNum);
         target = NULL;
         assert(!miss.hasTargets());
         miss.ntargets=0;
@@ -218,7 +218,7 @@ BlockingBuffer::doWriteback(Addr addr,
     }
 
     ///All writebacks charged to same thread @todo figure this out
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
 
     wb.allocateAsBuffer(pkt);
     cache->setMasterRequest(Request_WB, curTick);
@@ -230,7 +230,7 @@ BlockingBuffer::doWriteback(Addr addr,
 void
 BlockingBuffer::doWriteback(Packet * &pkt)
 {
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
 
     wb.allocateAsBuffer(pkt);
 
diff --git a/src/mem/cache/miss/miss_queue.cc b/src/mem/cache/miss/miss_queue.cc
index 76fb25716..273b6587f 100644
--- a/src/mem/cache/miss/miss_queue.cc
+++ b/src/mem/cache/miss/miss_queue.cc
@@ -413,8 +413,8 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
         mshr = mq.findMatch(blkAddr);
         if (mshr) {
             //@todo remove hw_pf here
-            mshr_hits[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
-            if (mshr->threadNum != pkt->req->getThreadNum()) {
+            mshr_hits[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
+            if (mshr->threadNum != 0/*pkt->req->getThreadNum()*/) {
                 mshr->threadNum = -1;
             }
             mq.allocateTarget(mshr, pkt);
@@ -434,11 +434,11 @@ MissQueue::handleMiss(Packet * &pkt, int blkSize, Tick time)
             mshr_no_allocate_misses++;
         }
         else {
-            mshr_misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         }
     } else {
         //Count uncacheable accesses
-        mshr_uncacheable[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+        mshr_uncacheable[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
         size = pkt->getSize();
     }
     if (pkt->isWrite() && (pkt->req->isUncacheable() || !writeAllocate ||
@@ -499,7 +499,7 @@ MissQueue::getPacket()
         pkt = prefetcher->getPacket();
         if (pkt) {
             //Update statistic on number of prefetches issued (hwpf_mshr_misses)
-            mshr_misses[pkt->cmdToIndex()][pkt->req->getThreadNum()]++;
+            mshr_misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
             //It will request the bus for the future, but should clear that immedieatley
             allocateMiss(pkt, pkt->getSize(), curTick);
             pkt = mq.getReq();
@@ -592,7 +592,7 @@ MissQueue::handleResponse(Packet * &pkt, Tick time)
     BlockedCause cause = NUM_BLOCKED_CAUSES;
 
     if (pkt->isCacheFill() && !pkt->isNoAllocate()) {
-        mshr_miss_latency[mshr->originalCmd][pkt->req->getThreadNum()] +=
+        mshr_miss_latency[mshr->originalCmd][0/*pkt->req->getThreadNum()*/] +=
             curTick - pkt->time;
         // targets were handled in the cache tags
         if (mshr == noTargetMSHR) {
@@ -619,7 +619,7 @@ MissQueue::handleResponse(Packet * &pkt, Tick time)
         }
     } else {
         if (pkt->req->isUncacheable()) {
-            mshr_uncacheable_lat[pkt->cmd][pkt->req->getThreadNum()] +=
+            mshr_uncacheable_lat[pkt->cmd][0/*pkt->req->getThreadNum()*/] +=
                 curTick - pkt->time;
         }
         if (mshr->hasTargets() && pkt->req->isUncacheable()) {
@@ -725,7 +725,7 @@ MissQueue::doWriteback(Addr addr,
     }
 
     ///All writebacks charged to same thread @todo figure this out
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
 
     allocateWrite(pkt, 0, curTick);
 }
@@ -734,7 +734,7 @@ MissQueue::doWriteback(Addr addr,
 void
 MissQueue::doWriteback(Packet * &pkt)
 {
-    writebacks[pkt->req->getThreadNum()]++;
+    writebacks[0/*pkt->req->getThreadNum()*/]++;
     allocateWrite(pkt, 0, curTick);
 }
 
diff --git a/src/mem/cache/miss/mshr.cc b/src/mem/cache/miss/mshr.cc
index 519ec5ebd..f36032672 100644
--- a/src/mem/cache/miss/mshr.cc
+++ b/src/mem/cache/miss/mshr.cc
@@ -88,7 +88,7 @@ void
 MSHR::allocateAsBuffer(Packet * &target)
 {
     addr = target->getAddr();
-    threadNum = target->req->getThreadNum();
+    threadNum = 0/*target->req->getThreadNum()*/;
     pkt = new Packet(target->req, target->cmd, -1);
     pkt->allocate();
     pkt->senderState = (Packet::SenderState*)this;
diff --git a/src/mem/cache/miss/mshr_queue.cc b/src/mem/cache/miss/mshr_queue.cc
index 97a56119f..e54f7aa08 100644
--- a/src/mem/cache/miss/mshr_queue.cc
+++ b/src/mem/cache/miss/mshr_queue.cc
@@ -251,7 +251,7 @@ MSHRQueue::squash(int threadNum)
                 Packet * target = mshr->getTarget();
                 mshr->popTarget();
 
-                assert(target->req->getThreadNum() == threadNum);
+                assert(0/*target->req->getThreadNum()*/ == threadNum);
                 target = NULL;
             }
             assert(!mshr->hasTargets());
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index c7d28010c..be9bf5f57 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -312,7 +312,7 @@ class Packet
      *   for returning as a response to that request.  Used for timing
      *   accesses only.  For atomic and functional accesses, the
      *   request packet is always implicitly passed back *without*
-     *   modifying the command or destination fields, so this function
+     *   modifying the destination fields, so this function
      *   should not be called. */
     void makeTimingResponse() {
         assert(needsResponse());
@@ -325,6 +325,18 @@ class Packet
         srcValid = false;
     }
 
+    /** Take a request packet and modify it in place to be suitable
+     *   for returning as a response to that request.
+     */
+    void makeAtomicResponse() {
+        assert(needsResponse());
+        assert(isRequest());
+        int icmd = (int)cmd;
+        icmd &= ~(IsRequest);
+        icmd |= IsResponse;
+        cmd = (Command)icmd;
+    }
+
     /** Take a request packet that has been returned as NACKED and modify it so
      * that it can be sent out again. Only packets that need a response can be
      * NACKED, so verify that that is true. */
diff --git a/tests/configs/o3-timing-mp.py b/tests/configs/o3-timing-mp.py
new file mode 100644
index 000000000..881c23156
--- /dev/null
+++ b/tests/configs/o3-timing-mp.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+m5.AddToPath('../configs/common')
+from FullO3Config import *
+
+# --------------------
+# Base L1 Cache
+# ====================
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 4
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 100
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+nb_cores = 4
+cpus = [ DetailedO3CPU() for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, physmem = PhysicalMemory(), membus =
+Bus())
+
+# l2cache & bus
+system.toL2Bus = Bus()
+system.l2c = L2(size='4MB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+# add L1 caches
+for cpu in cpus:
+    cpu.addPrivateSplitL1Caches(L1(size = '32kB', assoc = 1),
+                                L1(size = '32kB', assoc = 4))
+    cpu.mem = cpu.dcache
+    # connect cpu level-1 caches to shared level-2 cache
+    cpu.connectMemPorts(system.toL2Bus)
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'timing'
+root.trace.flags="Bus Cache"
+#root.trace.flags = "BusAddrRanges"
diff --git a/tests/configs/simple-atomic-mp.py b/tests/configs/simple-atomic-mp.py
new file mode 100644
index 000000000..cc1a36dda
--- /dev/null
+++ b/tests/configs/simple-atomic-mp.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+
+# --------------------
+# Base L1 Cache
+# ====================
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 4
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 100
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+nb_cores = 4
+cpus = [ AtomicSimpleCPU() for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, physmem = PhysicalMemory(), membus =
+Bus())
+
+# l2cache & bus
+system.toL2Bus = Bus()
+system.l2c = L2(size='4MB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+# add L1 caches
+for cpu in cpus:
+    cpu.addPrivateSplitL1Caches(L1(size = '32kB', assoc = 1),
+                                L1(size = '32kB', assoc = 4))
+    cpu.mem = cpu.dcache
+    # connect cpu level-1 caches to shared level-2 cache
+    cpu.connectMemPorts(system.toL2Bus)
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'atomic'
diff --git a/tests/configs/simple-timing-mp.py b/tests/configs/simple-timing-mp.py
new file mode 100644
index 000000000..9fc5f3874
--- /dev/null
+++ b/tests/configs/simple-timing-mp.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2006 The Regents of The University of Michigan
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Ron Dreslinski
+
+import m5
+from m5.objects import *
+
+# --------------------
+# Base L1 Cache
+# ====================
+
+class L1(BaseCache):
+    latency = 1
+    block_size = 64
+    mshrs = 4
+    tgts_per_mshr = 8
+    protocol = CoherenceProtocol(protocol='moesi')
+
+# ----------------------
+# Base L2 Cache
+# ----------------------
+
+class L2(BaseCache):
+    block_size = 64
+    latency = 100
+    mshrs = 92
+    tgts_per_mshr = 16
+    write_buffers = 8
+
+nb_cores = 4
+cpus = [ TimingSimpleCPU() for i in xrange(nb_cores) ]
+
+# system simulated
+system = System(cpu = cpus, physmem = PhysicalMemory(), membus =
+Bus())
+
+# l2cache & bus
+system.toL2Bus = Bus()
+system.l2c = L2(size='4MB', assoc=8)
+system.l2c.cpu_side = system.toL2Bus.port
+
+# connect l2c to membus
+system.l2c.mem_side = system.membus.port
+
+# add L1 caches
+for cpu in cpus:
+    cpu.addPrivateSplitL1Caches(L1(size = '32kB', assoc = 1),
+                                L1(size = '32kB', assoc = 4))
+    cpu.mem = cpu.dcache
+    # connect cpu level-1 caches to shared level-2 cache
+    cpu.connectMemPorts(system.toL2Bus)
+
+# connect memory to membus
+system.physmem.port = system.membus.port
+
+
+# -----------------------
+# run simulation
+# -----------------------
+
+root = Root( system = system )
+root.system.mem_mode = 'timing'