34 files changed, 1013 insertions, 301 deletions
diff --git a/src/base/traceflags.py b/src/base/traceflags.py
index 274407be5..757c9e7b7 100644
--- a/src/base/traceflags.py
+++ b/src/base/traceflags.py
@@ -58,6 +58,7 @@ baseFlags = [
     'BusAddrRanges',
     'BusBridge',
     'Cache',
+    'CachePort',
     'Chains',
     'Checker',
     'Clock',
@@ -93,6 +94,7 @@ baseFlags = [
     'Flow',
     'FreeList',
     'FullCPU',
+    'FunctionalAccess',
     'GDBAcc',
     'GDBExtra',
     'GDBMisc',
@@ -121,6 +123,7 @@ baseFlags = [
     'MSHR',
     'Mbox',
     'MemDepUnit',
+    'MemoryAccess',
     'O3CPU',
     'OzoneCPU',
     'OzoneLSQ',
diff --git a/src/cpu/checker/thread_context.hh b/src/cpu/checker/thread_context.hh
index 8c0186dae..b2806d40b 100644
--- a/src/cpu/checker/thread_context.hh
+++ b/src/cpu/checker/thread_context.hh
@@ -133,7 +133,7 @@ class CheckerThreadContext : public ThreadContext
     void takeOverFrom(ThreadContext *oldContext)
     {
         actualTC->takeOverFrom(oldContext);
-        checkerTC->takeOverFrom(oldContext);
+        checkerTC->copyState(oldContext);
     }
 
     void regStats(const std::string &name) { actualTC->regStats(name); }
diff --git a/src/cpu/memtest/memtest.cc b/src/cpu/memtest/memtest.cc
index 609a07a8e..024cd7e41 100644
--- a/src/cpu/memtest/memtest.cc
+++ b/src/cpu/memtest/memtest.cc
@@ -71,7 +71,11 @@ MemTest::CpuPort::recvAtomic(Packet *pkt)
 void
 MemTest::CpuPort::recvFunctional(Packet *pkt)
 {
-    memtest->completeRequest(pkt);
+    // Functional accesses are not handled by the tester; the packet is ignored
+    if (curTick != 0) // Suppress the warning during initialization (tick 0)
+        warn("Functional Writes not implemented in MemTester\n");
+    // TODO: find any response values that intersect and update them
+    return;
 }
 
 void
@@ -89,6 +93,20 @@ MemTest::CpuPort::recvRetry()
     memtest->doRetry();
 }
 
+void
+MemTest::sendPkt(Packet *pkt) { // Issue pkt on cachePort, atomic or timing
+    if (atomic) { // atomic mode: the access finishes inside sendAtomic()
+        cachePort.sendAtomic(pkt);
+        pkt->makeAtomicResponse();
+        completeRequest(pkt); // complete immediately, no response event
+    }
+    else if (!cachePort.sendTiming(pkt)) { // timing send was refused
+        accessRetry = true;
+        retryPkt = pkt; // remember the packet so it can be resent later
+    }
+
+}
+
 MemTest::MemTest(const string &name,
 //		 MemInterface *_cache_interface,
 //		 PhysicalMemory *main_mem,
@@ -101,7 +119,8 @@ MemTest::MemTest(const string &name,
                  unsigned _percentSourceUnaligned,
                  unsigned _percentDestUnaligned,
                  Addr _traceAddr,
-                 Counter _max_loads)
+                 Counter _max_loads,
+                 bool _atomic)
     : MemObject(name),
       tickEvent(this),
       cachePort("test", this),
@@ -117,7 +136,8 @@ MemTest::MemTest(const string &name,
       nextProgressMessage(_progressInterval),
       percentSourceUnaligned(_percentSourceUnaligned),
       percentDestUnaligned(percentDestUnaligned),
-      maxLoads(_max_loads)
+      maxLoads(_max_loads),
+      atomic(_atomic)
 {
     vector<string> cmd;
     cmd.push_back("/bin/ls");
@@ -325,7 +345,7 @@ MemTest::tick()
     } else {
         paddr = ((base) ? baseAddr1 : baseAddr2) + offset;
     }
-    // bool probe = (random() % 2 == 1) && !req->isUncacheable();
+    //bool probe = (random() % 2 == 1) && !req->isUncacheable();
     bool probe = false;
 
     paddr &= ~((1 << access_size) - 1);
@@ -340,7 +360,11 @@ MemTest::tick()
         //For now we only allow one outstanding request per addreess per tester
         //This means we assume CPU does write forwarding to reads that alias something
         //in the cpu store buffer.
-        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) return;
+        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) {
+            delete result;
+            delete req;
+            return;
+        }
         else outstandingAddrs.insert(paddr);
 
         // ***** NOTE FOR RON: I'm not sure how to access checkMem. - Kevin
@@ -364,13 +388,10 @@ MemTest::tick()
 
         if (probe) {
             cachePort.sendFunctional(pkt);
-//	    completeRequest(pkt, result);
+            completeRequest(pkt);
         } else {
 //	    req->completionEvent = new MemCompleteEvent(req, result, this);
-            if (!cachePort.sendTiming(pkt)) {
-                accessRetry = true;
-                retryPkt = pkt;
-            }
+            sendPkt(pkt);
         }
     } else {
         // write
@@ -378,7 +399,12 @@ MemTest::tick()
         //For now we only allow one outstanding request per addreess per tester
         //This means we assume CPU does write forwarding to reads that alias something
         //in the cpu store buffer.
-        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) return;
+        if (outstandingAddrs.find(paddr) != outstandingAddrs.end()) {
+            delete [] result;
+            delete req;
+            return;
+        }
+
         else outstandingAddrs.insert(paddr);
 
 /*
@@ -405,13 +431,10 @@ MemTest::tick()
 
         if (probe) {
             cachePort.sendFunctional(pkt);
-//	    completeRequest(req, NULL);
+            completeRequest(pkt);
         } else {
 //	    req->completionEvent = new MemCompleteEvent(req, NULL, this);
-            if (!cachePort.sendTiming(pkt)) {
-                accessRetry = true;
-                retryPkt = pkt;
-            }
+            sendPkt(pkt);
         }
     }
 /*    else {
@@ -483,6 +506,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(MemTest)
     Param<unsigned> percent_dest_unaligned;
     Param<Addr> trace_addr;
     Param<Counter> max_loads;
+    Param<bool> atomic;
 
 END_DECLARE_SIM_OBJECT_PARAMS(MemTest)
 
@@ -502,7 +526,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(MemTest)
     INIT_PARAM(percent_dest_unaligned,
                "percent of copy dest address that are unaligned"),
     INIT_PARAM(trace_addr, "address to trace"),
-    INIT_PARAM(max_loads, "terminate when we have reached this load count")
+                              INIT_PARAM(max_loads, "terminate when we have reached this load count"),
+    INIT_PARAM(atomic, "Is the tester testing atomic mode (or timing)")
 
 END_INIT_SIM_OBJECT_PARAMS(MemTest)
 
@@ -513,7 +538,7 @@ CREATE_SIM_OBJECT(MemTest)
                        /*check_mem,*/ memory_size, percent_reads, /*percent_copies,*/
                        percent_uncacheable, progress_interval,
                        percent_source_unaligned, percent_dest_unaligned,
-                       trace_addr, max_loads);
+                       trace_addr, max_loads, atomic);
 }
 
 REGISTER_SIM_OBJECT("MemTest", MemTest)
diff --git a/src/cpu/memtest/memtest.hh b/src/cpu/memtest/memtest.hh
index 278012eba..5de41f0d8 100644
--- a/src/cpu/memtest/memtest.hh
+++ b/src/cpu/memtest/memtest.hh
@@ -61,7 +61,8 @@ class MemTest : public MemObject
             unsigned _percentSourceUnaligned,
             unsigned _percentDestUnaligned,
             Addr _traceAddr,
-            Counter _max_loads);
+            Counter _max_loads,
+            bool _atomic);
 
     virtual void init();
 
@@ -113,7 +114,7 @@ class MemTest : public MemObject
 
         virtual void getDeviceAddressRanges(AddrRangeList &resp,
             AddrRangeList &snoop)
-        { resp.clear(); snoop.clear(); }
+        { resp.clear(); snoop.clear(); snoop.push_back(RangeSize(0,-1)); }
     };
 
     CpuPort cachePort;
@@ -175,6 +176,9 @@ class MemTest : public MemObject
 
     uint64_t numReads;
     uint64_t maxLoads;
+
+    bool atomic;
+
     Stats::Scalar<> numReadsStat;
     Stats::Scalar<> numWritesStat;
     Stats::Scalar<> numCopiesStat;
@@ -182,6 +186,8 @@ class MemTest : public MemObject
     // called by MemCompleteEvent::process()
     void completeRequest(Packet *pkt);
 
+    void sendPkt(Packet *pkt);
+
     void doRetry();
 
     friend class MemCompleteEvent;
diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh
index c80e4d8c1..ecf6ed632 100644
--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@@ -342,12 +342,6 @@ DefaultCommit<Impl>::drain()
 {
     drainPending = true;
 
-    // If it's already drained, return true.
-    if (rob->isEmpty() && !iewStage->hasStoresToWB()) {
-        cpu->signalDrained();
-        return true;
-    }
-
     return false;
 }
 
@@ -1218,16 +1212,16 @@ DefaultCommit<Impl>::skidInsert()
 
     for (int inst_num = 0; inst_num < fromRename->size; ++inst_num) {
         DynInstPtr inst = fromRename->insts[inst_num];
-        int tid = inst->threadNumber;
 
         if (!inst->isSquashed()) {
             DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ",
-                    "skidBuffer.\n", inst->readPC(), inst->seqNum, tid);
+                    "skidBuffer.\n", inst->readPC(), inst->seqNum,
+                    inst->threadNumber);
             skidBuffer.push(inst);
         } else {
             DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was "
                     "squashed, skipping.\n",
-                    inst->readPC(), inst->seqNum, tid);
+                    inst->readPC(), inst->seqNum, inst->threadNumber);
         }
     }
 }
diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc
index 7386dfadd..4c9a8e91f 100644
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@@ -88,7 +88,7 @@ FullO3CPU<Impl>::TickEvent::description()
 
 template <class Impl>
 FullO3CPU<Impl>::ActivateThreadEvent::ActivateThreadEvent()
-    : Event(&mainEventQueue, CPU_Tick_Pri)
+    : Event(&mainEventQueue, CPU_Switch_Pri)
 {
 }
 
@@ -135,7 +135,8 @@ void
 FullO3CPU<Impl>::DeallocateContextEvent::process()
 {
     cpu->deactivateThread(tid);
-    cpu->removeThread(tid);
+    if (remove)
+        cpu->removeThread(tid);
 }
 
 template <class Impl>
@@ -191,7 +192,11 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
       deferRegistration(params->deferRegistration),
       numThreads(number_of_threads)
 {
-    _status = Idle;
+    if (!deferRegistration) {
+        _status = Running;
+    } else {
+        _status = Idle;
+    }
 
     checker = NULL;
 
@@ -304,6 +309,9 @@ FullO3CPU<Impl>::FullO3CPU(Params *params)
 
                             tid,
                             bindRegs);
+
+        activateThreadEvent[tid].init(tid, this);
+        deallocateContextEvent[tid].init(tid, this);
     }
 
     rename.setRenameMap(renameMap);
@@ -447,13 +455,16 @@ FullO3CPU<Impl>::tick()
     if (!tickEvent.scheduled()) {
         if (_status == SwitchedOut ||
             getState() == SimObject::Drained) {
+            DPRINTF(O3CPU, "Switched out!\n");
             // increment stat
             lastRunningCycle = curTick;
-        } else if (!activityRec.active()) {
+        } else if (!activityRec.active() || _status == Idle) {
+            DPRINTF(O3CPU, "Idle!\n");
             lastRunningCycle = curTick;
             timesIdled++;
         } else {
             tickEvent.schedule(curTick + cycles(1));
+            DPRINTF(O3CPU, "Scheduling next tick!\n");
         }
     }
 
@@ -512,6 +523,8 @@ FullO3CPU<Impl>::activateThread(unsigned tid)
     list<unsigned>::iterator isActive = find(
         activeThreads.begin(), activeThreads.end(), tid);
 
+    DPRINTF(O3CPU, "[tid:%i]: Calling activate thread.\n", tid);
+
     if (isActive == activeThreads.end()) {
         DPRINTF(O3CPU, "[tid:%i]: Adding to active threads list\n",
                 tid);
@@ -528,6 +541,8 @@ FullO3CPU<Impl>::deactivateThread(unsigned tid)
     list<unsigned>::iterator thread_it =
         find(activeThreads.begin(), activeThreads.end(), tid);
 
+    DPRINTF(O3CPU, "[tid:%i]: Calling deactivate thread.\n", tid);
+
     if (thread_it != activeThreads.end()) {
         DPRINTF(O3CPU,"[tid:%i]: Removing from active threads list\n",
                 tid);
@@ -548,7 +563,7 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
         activateThread(tid);
     }
 
-    if(lastActivatedCycle < curTick) {
+    if (lastActivatedCycle < curTick) {
         scheduleTickEvent(delay);
 
         // Be sure to signal that there's some activity so the CPU doesn't
@@ -563,17 +578,20 @@ FullO3CPU<Impl>::activateContext(int tid, int delay)
 }
 
 template <class Impl>
-void
-FullO3CPU<Impl>::deallocateContext(int tid, int delay)
+bool
+FullO3CPU<Impl>::deallocateContext(int tid, bool remove, int delay)
 {
     // Schedule removal of thread data from CPU
     if (delay){
         DPRINTF(O3CPU, "[tid:%i]: Scheduling thread context to deallocate "
                 "on cycle %d\n", tid, curTick + cycles(delay));
-        scheduleDeallocateContextEvent(tid, delay);
+        scheduleDeallocateContextEvent(tid, remove, delay);
+        return false;
     } else {
         deactivateThread(tid);
-        removeThread(tid);
+        if (remove)
+            removeThread(tid);
+        return true;
     }
 }
 
@@ -582,8 +600,9 @@ void
 FullO3CPU<Impl>::suspendContext(int tid)
 {
     DPRINTF(O3CPU,"[tid: %i]: Suspending Thread Context.\n", tid);
-    deactivateThread(tid);
-    if (activeThreads.size() == 0)
+    bool deallocated = deallocateContext(tid, false, 1);
+    // If this was the last thread then unschedule the tick event.
+    if ((activeThreads.size() == 1 && !deallocated) || activeThreads.size() == 0)
         unscheduleTickEvent();
     _status = Idle;
 }
@@ -594,7 +613,7 @@ FullO3CPU<Impl>::haltContext(int tid)
 {
     //For now, this is the same as deallocate
     DPRINTF(O3CPU,"[tid:%i]: Halt Context called. Deallocating", tid);
-    deallocateContext(tid, 1);
+    deallocateContext(tid, true, 1);
 }
 
 template <class Impl>
@@ -682,10 +701,17 @@ FullO3CPU<Impl>::removeThread(unsigned tid)
     assert(iew.ldstQueue.getCount(tid) == 0);
 
     // Reset ROB/IQ/LSQ Entries
+
+    // Commented out for now.  This should be possible to do by
+    // telling all the pipeline stages to drain first, and then
+    // checking until the drain completes.  Once the pipeline is
+    // drained, call resetEntries(). - 10-09-06 ktlim
+/*
     if (activeThreads.size() >= 1) {
         commit.rob->resetEntries();
         iew.resetEntries();
     }
+*/
 }
 
 
@@ -824,7 +850,9 @@ template <class Impl>
 void
 FullO3CPU<Impl>::resume()
 {
+#if FULL_SYSTEM
     assert(system->getMemoryMode() == System::Timing);
+#endif
     fetch.resume();
     decode.resume();
     rename.resume();
@@ -935,6 +963,25 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU)
     }
     if (!tickEvent.scheduled())
         tickEvent.schedule(curTick);
+
+    Port *peer;
+    Port *icachePort = fetch.getIcachePort();
+    if (icachePort->getPeer() == NULL) {
+        peer = oldCPU->getPort("icache_port")->getPeer();
+        icachePort->setPeer(peer);
+    } else {
+        peer = icachePort->getPeer();
+    }
+    peer->setPeer(icachePort);
+
+    Port *dcachePort = iew.getDcachePort();
+    if (dcachePort->getPeer() == NULL) {
+        peer = oldCPU->getPort("dcache_port")->getPeer();
+        dcachePort->setPeer(peer);
+    } else {
+        peer = dcachePort->getPeer();
+    }
+    peer->setPeer(dcachePort);
 }
 
 template <class Impl>
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh
index dcdcd1fe6..fe510519c 100644
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@@ -202,9 +202,12 @@ class FullO3CPU : public BaseO3CPU
     class DeallocateContextEvent : public Event
     {
       private:
-        /** Number of Thread to Activate */
+        /** Number of Thread to deactivate */
         int tid;
 
+        /** Should the thread be removed from the CPU? */
+        bool remove;
+
         /** Pointer to the CPU. */
         FullO3CPU<Impl> *cpu;
 
@@ -218,12 +221,15 @@ class FullO3CPU : public BaseO3CPU
         /** Processes the event, calling activateThread() on the CPU. */
         void process();
 
+        /** Sets whether the thread should also be removed from the CPU. */
+        void setRemove(bool _remove) { remove = _remove; }
+
         /** Returns the description of the event. */
         const char *description();
     };
 
     /** Schedule cpu to deallocate thread context.*/
-    void scheduleDeallocateContextEvent(int tid, int delay)
+    void scheduleDeallocateContextEvent(int tid, bool remove, int delay)
     {
         // Schedule thread to activate, regardless of its current state.
         if (deallocateContextEvent[tid].squashed())
@@ -296,9 +302,9 @@ class FullO3CPU : public BaseO3CPU
     void suspendContext(int tid);
 
     /** Remove Thread from Active Threads List &&
-     *  Remove Thread Context from CPU.
+     *  Possibly Remove Thread Context from CPU.
      */
-    void deallocateContext(int tid, int delay = 1);
+    bool deallocateContext(int tid, bool remove, int delay = 1);
 
     /** Remove Thread from Active Threads List &&
      *  Remove Thread Context from CPU.
@@ -626,11 +632,6 @@ class FullO3CPU : public BaseO3CPU
     /** Pointers to all of the threads in the CPU. */
     std::vector<Thread *> thread;
 
-    /** Pointer to the icache interface. */
-    MemInterface *icacheInterface;
-    /** Pointer to the dcache interface. */
-    MemInterface *dcacheInterface;
-
     /** Whether or not the CPU should defer its registration. */
     bool deferRegistration;
 
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh
index b3c3caaad..32210f1cd 100644
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -623,6 +623,11 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
         // Now do the timing access to see whether or not the instruction
         // exists within the cache.
         if (!icachePort->sendTiming(data_pkt)) {
+            if (data_pkt->result == Packet::BadAddress) {
+                fault = TheISA::genMachineCheckFault();
+                delete mem_req;
+                memReq[tid] = NULL;
+            }
             assert(retryPkt == NULL);
             assert(retryTid == -1);
             DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid);
diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh
index b2baae296..ba5260fe2 100644
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@@ -600,6 +600,11 @@ template<class Impl>
 void
 DefaultIEW<Impl>::instToCommit(DynInstPtr &inst)
 {
+    // This function should not be called after writebackInsts in a
+    // single cycle.  That will cause problems with an instruction
+    // being added to the queue to commit without being processed by
+    // writebackInsts prior to being sent to commit.
+
     // First check the time slot that this instruction will write
     // to.  If there are free write ports at the time, then go ahead
     // and write the instruction to that time.  If there are not,
@@ -1286,6 +1291,7 @@ DefaultIEW<Impl>::executeInsts()
                 } else if (fault != NoFault) {
                     // If the instruction faulted, then we need to send it along to commit
                     // without the instruction completing.
+                    DPRINTF(IEW, "Store has fault! [sn:%lli]\n", inst->seqNum);
 
                     // Send this instruction to commit, also make sure iew stage
                     // realizes there is activity.
diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh
index 58945f04e..11a02e7c7 100644
--- a/src/cpu/o3/lsq_unit.hh
+++ b/src/cpu/o3/lsq_unit.hh
@@ -626,18 +626,30 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
 
     ++usedPorts;
 
-    PacketPtr data_pkt = new Packet(req, Packet::ReadReq, Packet::Broadcast);
-    data_pkt->dataStatic(load_inst->memData);
-
-    LSQSenderState *state = new LSQSenderState;
-    state->isLoad = true;
-    state->idx = load_idx;
-    state->inst = load_inst;
-    data_pkt->senderState = state;
-
     // if we the cache is not blocked, do cache access
     if (!lsq->cacheBlocked()) {
+        PacketPtr data_pkt =
+            new Packet(req, Packet::ReadReq, Packet::Broadcast);
+        data_pkt->dataStatic(load_inst->memData);
+
+        LSQSenderState *state = new LSQSenderState;
+        state->isLoad = true;
+        state->idx = load_idx;
+        state->inst = load_inst;
+        data_pkt->senderState = state;
+
         if (!dcachePort->sendTiming(data_pkt)) {
+            Packet::Result result = data_pkt->result;
+
+            // Delete state and data packet because a load retry
+            // initiates a pipeline restart; it does not retry.
+            delete state;
+            delete data_pkt;
+
+            if (result == Packet::BadAddress) {
+                return TheISA::genMachineCheckFault();
+            }
+
             // If the access didn't succeed, tell the LSQ by setting
             // the retry thread id.
             lsq->setRetryTid(lsqID);
@@ -664,16 +676,6 @@ LSQUnit<Impl>::read(Request *req, T &data, int load_idx)
         return NoFault;
     }
 
-    if (data_pkt->result != Packet::Success) {
-        DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
-        DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
-                load_inst->seqNum);
-    } else {
-        DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
-        DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
-                load_inst->seqNum);
-    }
-
     return NoFault;
 }
 
diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh
index 63ffcece1..3f9db912f 100644
--- a/src/cpu/o3/lsq_unit_impl.hh
+++ b/src/cpu/o3/lsq_unit_impl.hh
@@ -608,9 +608,9 @@ LSQUnit<Impl>::writebackStores()
 
         DPRINTF(LSQUnit, "D-Cache: Writing back store idx:%i PC:%#x "
                 "to Addr:%#x, data:%#x [sn:%lli]\n",
-                storeWBIdx, storeQueue[storeWBIdx].inst->readPC(),
+                storeWBIdx, inst->readPC(),
                 req->getPaddr(), *(inst->memData),
-                storeQueue[storeWBIdx].inst->seqNum);
+                inst->seqNum);
 
         // @todo: Remove this SC hack once the memory system handles it.
         if (req->isLocked()) {
@@ -619,10 +619,19 @@ LSQUnit<Impl>::writebackStores()
             } else {
                 if (cpu->lockFlag) {
                     req->setScResult(1);
+                    DPRINTF(LSQUnit, "Store conditional [sn:%lli] succeeded.",
+                            inst->seqNum);
                 } else {
                     req->setScResult(0);
                     // Hack: Instantly complete this store.
-                    completeDataAccess(data_pkt);
+//                    completeDataAccess(data_pkt);
+                    DPRINTF(LSQUnit, "Store conditional [sn:%lli] failed.  "
+                            "Instantly completing it.\n",
+                            inst->seqNum);
+                    WritebackEvent *wb = new WritebackEvent(inst, data_pkt, this);
+                    wb->schedule(curTick + 1);
+                    delete state;
+                    completeStore(storeWBIdx);
                     incrStIdx(storeWBIdx);
                     continue;
                 }
@@ -633,7 +642,13 @@ LSQUnit<Impl>::writebackStores()
         }
 
         if (!dcachePort->sendTiming(data_pkt)) {
+            if (data_pkt->result == Packet::BadAddress) {
+                panic("LSQ sent out a bad address for a completed store!");
+            }
             // Need to handle becoming blocked on a store.
+            DPRINTF(IEW, "D-Cache became blcoked when writing [sn:%lli], will"
+                    "retry later\n",
+                    inst->seqNum);
             isStoreBlocked = true;
             ++lsqCacheBlocked;
             assert(retryPkt == NULL);
@@ -880,6 +895,9 @@ LSQUnit<Impl>::recvRetry()
         assert(retryPkt != NULL);
 
         if (dcachePort->sendTiming(retryPkt)) {
+            if (retryPkt->result == Packet::BadAddress) {
+                panic("LSQ sent out a bad address for a completed store!");
+            }
             storePostSend(retryPkt);
             retryPkt = NULL;
             isStoreBlocked = false;
diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh
index 25e1db21c..2bc194d53 100755
--- a/src/cpu/o3/thread_context_impl.hh
+++ b/src/cpu/o3/thread_context_impl.hh
@@ -165,14 +165,14 @@ template <class Impl>
 void
 O3ThreadContext<Impl>::deallocate(int delay)
 {
-    DPRINTF(O3CPU, "Calling deallocate on Thread Context %d\n",
-            getThreadNum());
+    DPRINTF(O3CPU, "Calling deallocate on Thread Context %d delay %d\n",
+            getThreadNum(), delay);
 
     if (thread->status() == ThreadContext::Unallocated)
         return;
 
     thread->setStatus(ThreadContext::Unallocated);
-    cpu->deallocateContext(thread->readTid(), delay);
+    cpu->deallocateContext(thread->readTid(), true, delay);
 }
 
 template <class Impl>
diff --git a/src/cpu/simple/timing.cc b/src/cpu/simple/timing.cc
index 88aa882e3..ad5c0e5d6 100644
--- a/src/cpu/simple/timing.cc
+++ b/src/cpu/simple/timing.cc
@@ -103,6 +103,7 @@ TimingSimpleCPU::TimingSimpleCPU(Params *p)
     ifetch_pkt = dcache_pkt = NULL;
     drainEvent = NULL;
     fetchEvent = NULL;
+    previousTick = 0;
     changeState(SimObject::Running);
 }
 
@@ -162,6 +163,7 @@ TimingSimpleCPU::resume()
     }
 
     changeState(SimObject::Running);
+    previousTick = curTick;
 }
 
 void
@@ -169,6 +171,7 @@ TimingSimpleCPU::switchOut()
 {
     assert(status() == Running || status() == Idle);
     _status = SwitchedOut;
+    numCycles += curTick - previousTick;
 
     // If we've been scheduled to resume but are then told to switch out,
     // we'll need to cancel it.
@@ -195,6 +198,23 @@ TimingSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
     if (_status != Running) {
         _status = Idle;
     }
+
+    Port *peer;
+    if (icachePort.getPeer() == NULL) {
+        peer = oldCPU->getPort("icache_port")->getPeer();
+        icachePort.setPeer(peer);
+    } else {
+        peer = icachePort.getPeer();
+    }
+    peer->setPeer(&icachePort);
+
+    if (dcachePort.getPeer() == NULL) {
+        peer = oldCPU->getPort("dcache_port")->getPeer();
+        dcachePort.setPeer(peer);
+    } else {
+        peer = dcachePort.getPeer();
+    }
+    peer->setPeer(&dcachePort);
 }
 
 
@@ -429,6 +449,9 @@ TimingSimpleCPU::fetch()
         // fetch fault: advance directly to next instruction (fault handler)
         advanceInst(fault);
     }
+
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
 }
 
 
@@ -459,6 +482,9 @@ TimingSimpleCPU::completeIfetch(Packet *pkt)
     delete pkt->req;
     delete pkt;
 
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
+
     if (getState() == SimObject::Draining) {
         completeDrain();
         return;
@@ -538,6 +564,9 @@ TimingSimpleCPU::completeDataAccess(Packet *pkt)
     assert(_status == DcacheWaitResponse);
     _status = Running;
 
+    numCycles += curTick - previousTick;
+    previousTick = curTick;
+
     Fault fault = curStaticInst->completeAcc(pkt, this, traceData);
 
     if (pkt->isRead() && pkt->req->isLocked()) {
@@ -547,6 +576,8 @@ TimingSimpleCPU::completeDataAccess(Packet *pkt)
     delete pkt->req;
     delete pkt;
 
+    postExecute();
+
     if (getState() == SimObject::Draining) {
         advancePC(fault);
         completeDrain();
@@ -554,7 +585,6 @@ TimingSimpleCPU::completeDataAccess(Packet *pkt)
         return;
     }
 
-    postExecute();
     advanceInst(fault);
 }
 
diff --git a/src/cpu/simple/timing.hh b/src/cpu/simple/timing.hh
index 18e13aeb2..988ddeded 100644
--- a/src/cpu/simple/timing.hh
+++ b/src/cpu/simple/timing.hh
@@ -167,6 +167,7 @@ class TimingSimpleCPU : public BaseSimpleCPU
     Packet *dcache_pkt;
 
     int cpu_id;
+    Tick previousTick;
 
   public:
 
diff --git a/src/mem/bus.cc b/src/mem/bus.cc
index 1646cbd57..b11b6de58 100644
--- a/src/mem/bus.cc
+++ b/src/mem/bus.cc
@@ -61,12 +61,79 @@ Bus::getPort(const std::string &if_name, int idx)
 void
 Bus::init()
 {
-    std::vector<Port*>::iterator intIter;
+    std::vector<BusPort*>::iterator intIter;
 
     for (intIter = interfaces.begin(); intIter != interfaces.end(); intIter++)
         (*intIter)->sendStatusChange(Port::RangeChange);
 }
 
+Bus::BusFreeEvent::BusFreeEvent(Bus *_bus) : Event(&mainEventQueue), bus(_bus)
+{}
+
+void Bus::BusFreeEvent::process()
+{
+    bus->recvRetry(-1); // id -1: presumably "no specific port" -- confirm
+}
+
+const char * Bus::BusFreeEvent::description()
+{
+    return "bus became available";
+}
+
+void Bus::occupyBus(PacketPtr pkt)
+{
+    // Reserve the bus for pkt: advance tickNextIdle past the transfer,
+    // (re)schedule busIdle, and set pkt->firstWordTime / pkt->finishTime.
+    //
+    // First bring tickNextIdle up to the present tick. Using a < allows
+    // things which happen exactly on a cycle boundary to take up only the
+    // following cycle. Anything later must "wait" for the end of that cycle.
+    while (tickNextIdle < curTick)
+        tickNextIdle += clock;
+
+    // The packet will be sent. Figure out how long it occupies the bus, and
+    // how much of that time is for the first "word", aka bus width.
+    int numCycles = 0;
+    // Requests need one cycle to send an address
+    if (pkt->isRequest())
+        numCycles++;
+    else if (pkt->isResponse() || pkt->hasData()) {
+        // If a packet has data, it needs ceil(size/width) cycles to send it
+        // We're using the "adding instead of dividing" trick again here
+        if (pkt->hasData()) {
+            int dataSize = pkt->getSize();
+            for (int transmitted = 0; transmitted < dataSize;
+                    transmitted += width) {
+                numCycles++;
+            }
+        } else {
+            // If the packet didn't have data, it must have been a response.
+            // Those use the bus for one cycle to send their data.
+            numCycles++;
+        }
+    }
+
+    // The first word arrives after the bus goes idle, the address cycle (for
+    // requests only), and one data cycle. ?: binds looser than +, so wrap it.
+    pkt->firstWordTime =
+        tickNextIdle +
+        (pkt->isRequest() ? clock : 0) +
+        clock;
+
+    //Advance it numCycles bus cycles.
+    //XXX Should this use the repeated addition trick as well?
+    tickNextIdle += (numCycles * clock);
+    if (!busIdle.scheduled()) {
+        busIdle.schedule(tickNextIdle);
+    } else {
+        busIdle.reschedule(tickNextIdle);
+    }
+    DPRINTF(Bus, "The bus is now occupied from tick %d to %d\n",
+            curTick, tickNextIdle);
+
+    // The bus will become idle once the current packet is delivered.
+    pkt->finishTime = tickNextIdle;
+}
 
 /** Function called by the port when the bus is receiving a Timing
  * transaction.*/
@@ -77,23 +144,40 @@ Bus::recvTiming(Packet *pkt)
     DPRINTF(Bus, "recvTiming: packet src %d dest %d addr 0x%x cmd %s\n",
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
 
+    BusPort *pktPort;
+    if (pkt->getSrc() == defaultId)
+        pktPort = defaultPort;
+    else pktPort = interfaces[pkt->getSrc()];
+
+    // If the bus is busy, or other devices are in line ahead of the current
+    // one, put this device on the retry list.
+    if (tickNextIdle > curTick ||
+            (retryList.size() && (!inRetry || pktPort != retryList.front()))) {
+        addToRetryList(pktPort);
+        return false;
+    }
+
     short dest = pkt->getDest();
     if (dest == Packet::Broadcast) {
-        if (timingSnoop(pkt))
-        {
+        if (timingSnoop(pkt)) {
             pkt->flags |= SNOOP_COMMIT;
             bool success = timingSnoop(pkt);
             assert(success);
             if (pkt->flags & SATISFIED) {
                 //Cache-Cache transfer occuring
+                if (inRetry) {
+                    retryList.front()->onRetryList(false);
+                    retryList.pop_front();
+                    inRetry = false;
+                }
+                occupyBus(pkt);
                 return true;
             }
             port = findPort(pkt->getAddr(), pkt->getSrc());
-        }
-        else
-        {
+        } else {
             //Snoop didn't succeed
-            retryList.push_back(interfaces[pkt->getSrc()]);
+            DPRINTF(Bus, "Adding a retry to RETRY list %i\n", pktPort);
+            addToRetryList(pktPort);
             return false;
         }
     } else {
@@ -101,35 +185,60 @@ Bus::recvTiming(Packet *pkt)
         assert(dest != pkt->getSrc()); // catch infinite loops
         port = interfaces[dest];
     }
+
+    occupyBus(pkt);
+
     if (port->sendTiming(pkt))  {
-        // packet was successfully sent, just return true.
+        // Packet was successfully sent. Return true.
+        // Also take care of retries
+        if (inRetry) {
+            DPRINTF(Bus, "Remove retry from list %i\n", retryList.front());
+            retryList.front()->onRetryList(false);
+            retryList.pop_front();
+            inRetry = false;
+        }
         return true;
     }
 
-    // packet not successfully sent
-    retryList.push_back(interfaces[pkt->getSrc()]);
+    // Packet not successfully sent. Leave or put it on the retry list.
+    DPRINTF(Bus, "Adding a retry to RETRY list %i\n", pktPort);
+    addToRetryList(pktPort);
     return false;
 }
 
 void
 Bus::recvRetry(int id)
 {
-    // Go through all the elements on the list calling sendRetry on each
-    // This is not very efficient at all but it works. Ultimately we should end
-    // up with something that is more intelligent.
-    int initialSize = retryList.size();
-    int i;
-    Port *p;
-
-    for (i = 0; i < initialSize; i++) {
-        assert(retryList.size() > 0);
-        p = retryList.front();
-        retryList.pop_front();
-        p->sendRetry();
+    DPRINTF(Bus, "Received a retry\n");
+    // Grant a retry only if a port is queued and the bus is no longer occupied
+    if (retryList.size() && curTick >= tickNextIdle) {
+        //retryingPort = retryList.front();
+        inRetry = true;  // cleared again in recvTiming()/addToRetryList() when the head port re-sends
+        DPRINTF(Bus, "Sending a retry\n");
+        retryList.front()->sendRetry();
+        // If inRetry is still true, sendTiming wasn't called; drop the head port
+        if (inRetry)
+        {
+            retryList.front()->onRetryList(false);
+            retryList.pop_front();
+            inRetry = false;
+
+            // Advance tickNextIdle in steps of clock until it is not in the past
+            while (tickNextIdle < curTick)
+                tickNextIdle += clock;
+
+            //Burn a cycle for the missed grant.
+            tickNextIdle += clock;
+            // Make the busIdle event fire at the updated tickNextIdle
+            if (!busIdle.scheduled()) {
+                busIdle.schedule(tickNextIdle);
+            } else {
+                busIdle.reschedule(tickNextIdle);
+            }
+        }
     }
 }
 
-
 Port *
 Bus::findPort(Addr addr, int id)
 {
@@ -180,24 +289,30 @@ Bus::findSnoopPorts(Addr addr, int id)
             //Careful  to not overlap ranges
             //or snoop will be called more than once on the port
             ports.push_back(portSnoopList[i].portId);
-            DPRINTF(Bus, "  found snoop addr %#llx on device%d\n", addr,
-                    portSnoopList[i].portId);
+//            DPRINTF(Bus, "  found snoop addr %#llx on device%d\n", addr,
+//                    portSnoopList[i].portId);
         }
         i++;
     }
     return ports;
 }
 
-void
+Tick
 Bus::atomicSnoop(Packet *pkt)
 {
     std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
+    Tick response_time = 0;  // stays 0 unless a snooper returns a nonzero time
 
     while (!ports.empty())
     {
-        interfaces[ports.back()]->sendAtomic(pkt);
+        Tick response = interfaces[ports.back()]->sendAtomic(pkt);
+        if (response) {
+            assert(!response_time);  // at most one snooper may respond (multiple responders)
+            response_time = response;
+        }
         ports.pop_back();
     }
+    return response_time;  // recvAtomic treats a nonzero value as "snoop satisfied the request"
 }
 
 void
@@ -205,7 +320,7 @@ Bus::functionalSnoop(Packet *pkt)
 {
     std::vector<int> ports = findSnoopPorts(pkt->getAddr(), pkt->getSrc());
 
-    while (!ports.empty())
+    while (!ports.empty() && pkt->result != Packet::Success)
     {
         interfaces[ports.back()]->sendFunctional(pkt);
         ports.pop_back();
@@ -236,8 +351,11 @@ Bus::recvAtomic(Packet *pkt)
     DPRINTF(Bus, "recvAtomic: packet src %d dest %d addr 0x%x cmd %s\n",
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
     assert(pkt->getDest() == Packet::Broadcast);
-    atomicSnoop(pkt);
-    return findPort(pkt->getAddr(), pkt->getSrc())->sendAtomic(pkt);
+    Tick snoopTime = atomicSnoop(pkt);
+    if (snoopTime)
+        return snoopTime;  //Snoop satisfies it
+    else
+        return findPort(pkt->getAddr(), pkt->getSrc())->sendAtomic(pkt);
 }
 
 /** Function called by the port when the bus is receiving a Functional
@@ -249,7 +367,10 @@ Bus::recvFunctional(Packet *pkt)
             pkt->getSrc(), pkt->getDest(), pkt->getAddr(), pkt->cmdString());
     assert(pkt->getDest() == Packet::Broadcast);
     functionalSnoop(pkt);
-    findPort(pkt->getAddr(), pkt->getSrc())->sendFunctional(pkt);
+
+    // If the snooping found what we were looking for, we're done.
+    if (pkt->result != Packet::Success)
+        findPort(pkt->getAddr(), pkt->getSrc())->sendFunctional(pkt);
 }
 
 /** Function called by the port when the bus is receiving a status change.*/
@@ -277,7 +398,7 @@ Bus::recvStatusChange(Port::Status status, int id)
         }
     } else {
 
-        assert((id < interfaces.size() && id >= 0) || id == -1);
+        assert((id < interfaces.size() && id >= 0) || id == defaultId);
         Port *port = interfaces[id];
         std::vector<DevMap>::iterator portIter;
         std::vector<DevMap>::iterator snoopIter;
@@ -377,16 +498,20 @@ Bus::addressRanges(AddrRangeList &resp, AddrRangeList &snoop, int id)
 BEGIN_DECLARE_SIM_OBJECT_PARAMS(Bus)
 
     Param<int> bus_id;
+    Param<int> clock;
+    Param<int> width;
 
 END_DECLARE_SIM_OBJECT_PARAMS(Bus)
 
 BEGIN_INIT_SIM_OBJECT_PARAMS(Bus)
-    INIT_PARAM(bus_id, "a globally unique bus id")
+    INIT_PARAM(bus_id, "a globally unique bus id"),
+    INIT_PARAM(clock, "bus clock speed"),
+    INIT_PARAM(width, "width of the bus (bits)")
 END_INIT_SIM_OBJECT_PARAMS(Bus)
 
 CREATE_SIM_OBJECT(Bus)
 {
-    return new Bus(getInstanceName(), bus_id);
+    return new Bus(getInstanceName(), bus_id, clock, width);
 }
 
 REGISTER_SIM_OBJECT("Bus", Bus)
diff --git a/src/mem/bus.hh b/src/mem/bus.hh
index ff4ec9c8c..509b8cf9b 100644
--- a/src/mem/bus.hh
+++ b/src/mem/bus.hh
@@ -46,13 +46,20 @@
 #include "mem/packet.hh"
 #include "mem/port.hh"
 #include "mem/request.hh"
+#include "sim/eventq.hh"
 
 class Bus : public MemObject
 {
     /** a globally unique id for this bus. */
     int busId;
+    /** the clock speed for the bus */
+    int clock;
+    /** the width of the bus in bytes */
+    int width;
+    /** the next tick at which the bus will be idle */
+    Tick tickNextIdle;
 
-    static const int defaultId = -1;
+    static const int defaultId = -3; //Make it unique from Broadcast
 
     struct DevMap {
         int portId;
@@ -100,7 +107,7 @@ class Bus : public MemObject
     std::vector<int> findSnoopPorts(Addr addr, int id);
 
     /** Snoop all relevant ports atomicly. */
-    void atomicSnoop(Packet *pkt);
+    Tick atomicSnoop(Packet *pkt);
 
     /** Snoop all relevant ports functionally. */
     void functionalSnoop(Packet *pkt);
@@ -118,11 +125,15 @@ class Bus : public MemObject
      */
     void addressRanges(AddrRangeList &resp, AddrRangeList &snoop, int id);
 
+    /** Occupy the bus with transmitting the packet pkt */
+    void occupyBus(PacketPtr pkt);
 
     /** Declaration of the buses port type, one will be instantiated for each
         of the interfaces connecting to the bus. */
     class BusPort : public Port
     {
+        bool _onRetryList;
+
         /** A pointer to the bus to which this port belongs. */
         Bus *bus;
 
@@ -133,9 +144,15 @@ class Bus : public MemObject
 
         /** Constructor for the BusPort.*/
         BusPort(const std::string &_name, Bus *_bus, int _id)
-            : Port(_name), bus(_bus), id(_id)
+            : Port(_name), _onRetryList(false), bus(_bus), id(_id)
         { }
 
+        bool onRetryList()
+        { return _onRetryList; }
+
+        void onRetryList(bool newVal)
+        { _onRetryList = newVal; }
+
       protected:
 
         /** When reciving a timing request from the peer port (at id),
@@ -176,16 +193,52 @@ class Bus : public MemObject
 
     };
 
+    class BusFreeEvent : public Event
+    {
+        Bus * bus;
+
+      public:
+        BusFreeEvent(Bus * _bus);
+        void process();
+        const char *description();
+    };
+
+    BusFreeEvent busIdle;
+
+    bool inRetry;
+
     /** An array of pointers to the peer port interfaces
         connected to this bus.*/
-    std::vector<Port*> interfaces;
+    std::vector<BusPort*> interfaces;
 
     /** An array of pointers to ports that retry should be called on because the
      * original send failed for whatever reason.*/
-    std::list<Port*> retryList;
+    std::list<BusPort*> retryList;
+
+    void addToRetryList(BusPort * port)  // queue port for a retry, or keep it at the head if its retry failed
+    {
+        if (!inRetry) {
+            // No retry is being serviced: the port must not be queued yet
+            // (asserted below), so append it to the tail of the list.
+            assert(!port->onRetryList());
+            port->onRetryList(true);
+            retryList.push_back(port);
+        } else {
+            if (port->onRetryList()) {
+                // The device was retrying a packet. It didn't work, so we'll leave
+                // it at the head of the retry list.
+                assert(port == retryList.front());
+                inRetry = false;  // attempt is over; recvRetry() will then not pop the port
+            }
+            else {
+                port->onRetryList(true);  // a port not yet queued sent while a retry was in flight
+                retryList.push_back(port);
+            }
+        }
+    }
 
     /** Port that handles requests that don't match any of the interfaces.*/
-    Port *defaultPort;
+    BusPort *defaultPort;
 
   public:
 
@@ -194,8 +247,16 @@ class Bus : public MemObject
 
     virtual void init();
 
-    Bus(const std::string &n, int bus_id)
-        : MemObject(n), busId(bus_id), defaultPort(NULL)  {}
+    Bus(const std::string &n, int bus_id, int _clock, int _width)
+        : MemObject(n), busId(bus_id), clock(_clock), width(_width),
+        tickNextIdle(0), busIdle(this), inRetry(false), defaultPort(NULL)
+    {
+        //Both must be positive; tickNextIdle is advanced in steps of clock, so clock <= 0 would never reach curTick
+        if (width <= 0)
+            fatal("Bus width must be positive\n");
+        if (clock <= 0)
+            fatal("Bus clock period must be positive\n");
+    }
 
 };
 
diff --git a/src/mem/cache/base_cache.cc b/src/mem/cache/base_cache.cc
index 1a0f63d17..3f7a52fab 100644
--- a/src/mem/cache/base_cache.cc
+++ b/src/mem/cache/base_cache.cc
@@ -44,6 +44,8 @@ BaseCache::CachePort::CachePort(const std::string &_name, BaseCache *_cache,
     : Port(_name), cache(_cache), isCpuSide(_isCpuSide)
 {
     blocked = false;
+    cshrRetry = NULL;
+    waitingOnRetry = false;
     //Start ports at null if more than one is created we should panic
     //cpuSidePort = NULL;
     //memSidePort = NULL;
@@ -71,6 +73,22 @@ BaseCache::CachePort::deviceBlockSize()
 bool
 BaseCache::CachePort::recvTiming(Packet *pkt)
 {
+    if (isCpuSide
+        && !pkt->req->isUncacheable()
+        && pkt->isInvalidate()
+        && !pkt->isRead() && !pkt->isWrite()) {
+        //Upgrade or Invalidate
+        //Look into what happens if two slave caches on bus
+        DPRINTF(Cache, "%s %x ? blk_addr: %x\n", pkt->cmdString(),
+                pkt->getAddr() & (((ULL(1))<<48)-1),
+                pkt->getAddr() & ~((Addr)cache->blkSize - 1));
+
+        assert(!(pkt->flags & SATISFIED));
+        pkt->flags |= SATISFIED;
+        //Invalidates/Upgrades need no response if they get the bus
+        return true;
+    }
+
     if (pkt->isRequest() && blocked)
     {
         DPRINTF(Cache,"Scheduling a retry while blocked\n");
@@ -89,6 +107,42 @@ BaseCache::CachePort::recvAtomic(Packet *pkt)
 void
 BaseCache::CachePort::recvFunctional(Packet *pkt)
 {
+    // Check the responses still queued in drainList before probing the cache
+    list<Packet *>::iterator i = drainList.begin();
+    list<Packet *>::iterator end = drainList.end();
+    for (; i != end; ++i) {
+        Packet * target = *i;
+        // If the target contains data, and it overlaps the
+        // probed request, the shared bytes need to be updated
+        if (target->intersect(pkt)) {
+            uint8_t* pkt_data;      // first shared byte inside pkt's buffer
+            uint8_t* target_data;   // first shared byte inside target's buffer
+            int data_size;          // number of shared bytes
+            if (target->getAddr() < pkt->getAddr()) {
+                int offset = pkt->getAddr() - target->getAddr();
+                pkt_data = pkt->getPtr<uint8_t>();
+                target_data = target->getPtr<uint8_t>() + offset;
+                data_size = target->getSize() - offset;
+                assert(data_size > 0);
+                if (data_size > pkt->getSize())
+                    data_size = pkt->getSize();
+            } else {
+                int offset = target->getAddr() - pkt->getAddr();
+                pkt_data = pkt->getPtr<uint8_t>() + offset;
+                target_data = target->getPtr<uint8_t>();
+                data_size = pkt->getSize() - offset;
+                assert(data_size > 0);  // was "> pkt->getSize()", which can never hold
+                if (data_size > target->getSize())
+                    data_size = target->getSize();
+            }
+            // memcpy(dst, src, n): a write updates the queued packet, a read is filled from it
+            if (pkt->isWrite()) {
+                memcpy(target_data, pkt_data, data_size);
+            } else {
+                memcpy(pkt_data, target_data, data_size);
+            }
+        }
+    }
     cache->doFunctionalAccess(pkt, isCpuSide);
 }
 
@@ -96,47 +150,69 @@ void
 BaseCache::CachePort::recvRetry()
 {
     Packet *pkt;
+    assert(waitingOnRetry);  // set whenever one of this port's sendTiming calls was refused
     if (!drainList.empty()) {
+        DPRINTF(CachePort, "%s attempting to send a retry for response\n", name());
         //We have some responses to drain first
-        bool result = true;
-        while (result && !drainList.empty()) {
-            result = sendTiming(drainList.front());
-            if (result)
-                drainList.pop_front();
+        if (sendTiming(drainList.front())) {  // one response per retry; on failure waitingOnRetry stays set
+            DPRINTF(CachePort, "%s sucessful in sending a retry for response\n", name());
+            drainList.pop_front();
+            if (!drainList.empty() ||
+                !isCpuSide && cache->doMasterRequest() ||
+                isCpuSide && cache->doSlaveRequest()) {
+                // More to send: hand it to a CacheEvent scheduled for the next tick
+                DPRINTF(CachePort, "%s has more responses/requests\n", name());
+                BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+                reqCpu->schedule(curTick + 1);
+            }
+            waitingOnRetry = false;
         }
     }
-
-    if (!isCpuSide)
+    else if (!isCpuSide)
     {
+        DPRINTF(CachePort, "%s attempting to send a retry for MSHR\n", name());
+        if (!cache->doMasterRequest()) {
+            //This can happen if I am the owner of a block and see an upgrade
+            //while the block was in my WB Buffers.  I just remove the
+            //wb and de-assert the masterRequest
+            waitingOnRetry = false;
+            return;
+        }
         pkt = cache->getPacket();
         MSHR* mshr = (MSHR*)pkt->senderState;
         bool success = sendTiming(pkt);
         DPRINTF(Cache, "Address %x was %s in sending the timing request\n",
                 pkt->getAddr(), success ? "succesful" : "unsuccesful");
         cache->sendResult(pkt, mshr, success);
+        waitingOnRetry = !success;  // keep waiting for another retry if the send was refused again
         if (success && cache->doMasterRequest())
         {
+            DPRINTF(CachePort, "%s has more requests\n", name());
             //Still more to issue, rerequest in 1 cycle
-            pkt = NULL;
             BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
             reqCpu->schedule(curTick + 1);
         }
     }
     else
     {
+        assert(cshrRetry);  // CacheEvent::process saves the refused coherence packet here
         //pkt = cache->getCoherencePacket();
         //We save the packet, no reordering on CSHRS
         pkt = cshrRetry;
         bool success = sendTiming(pkt);
-        if (success && cache->doSlaveRequest())
+        waitingOnRetry = !success;
+        if (success)
         {
-            //Still more to issue, rerequest in 1 cycle
-            pkt = NULL;
-            BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
-            reqCpu->schedule(curTick + 1);
+            if (cache->doSlaveRequest()) {
+                //Still more to issue, rerequest in 1 cycle
+                BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(this);
+                reqCpu->schedule(curTick + 1);
+            }
+            cshrRetry = NULL;  // the saved packet went out; nothing left to resend
         }
-
     }
+    if (waitingOnRetry) DPRINTF(CachePort, "%s STILL Waiting on retry\n", name());
+    else DPRINTF(CachePort, "%s no longer waiting on retry\n", name());
     return;
 }
 void
@@ -181,17 +257,47 @@ BaseCache::CacheEvent::process()
 {
     if (!pkt)
     {
-        if (!cachePort->isCpuSide)
-        {
-            //MSHR
+        if (cachePort->waitingOnRetry) return;
+       //We have some responses to drain first
+        if (!cachePort->drainList.empty()) {
+            DPRINTF(CachePort, "%s trying to drain a response\n", cachePort->name());
+            if (cachePort->sendTiming(cachePort->drainList.front())) {
+                DPRINTF(CachePort, "%s drains a response succesfully\n", cachePort->name());
+                cachePort->drainList.pop_front();
+                if (!cachePort->drainList.empty() ||
+                    !cachePort->isCpuSide && cachePort->cache->doMasterRequest() ||
+                    cachePort->isCpuSide && cachePort->cache->doSlaveRequest()) {
+
+                    DPRINTF(CachePort, "%s still has outstanding bus reqs\n", cachePort->name());
+                    this->schedule(curTick + 1);
+                }
+            }
+            else {
+                cachePort->waitingOnRetry = true;
+                DPRINTF(CachePort, "%s now waiting on a retry\n", cachePort->name());
+            }
+        }
+        else if (!cachePort->isCpuSide)
+        {            //MSHR
+            DPRINTF(CachePort, "%s trying to send a MSHR request\n", cachePort->name());
+            if (!cachePort->cache->doMasterRequest()) {
+                //This can happen if I am the owner of a block and see an upgrade
+                //while the block was in my WB Buffers.  I just remove the
+                //wb and de-assert the masterRequest
+                return;
+            }
+
             pkt = cachePort->cache->getPacket();
             MSHR* mshr = (MSHR*) pkt->senderState;
             bool success = cachePort->sendTiming(pkt);
             DPRINTF(Cache, "Address %x was %s in sending the timing request\n",
                     pkt->getAddr(), success ? "succesful" : "unsuccesful");
             cachePort->cache->sendResult(pkt, mshr, success);
+            cachePort->waitingOnRetry = !success;
+            if (cachePort->waitingOnRetry) DPRINTF(CachePort, "%s now waiting on a retry\n", cachePort->name());
             if (success && cachePort->cache->doMasterRequest())
             {
+                DPRINTF(CachePort, "%s still more MSHR requests to send\n", cachePort->name());
                 //Still more to issue, rerequest in 1 cycle
                 pkt = NULL;
                 this->schedule(curTick+1);
@@ -200,32 +306,49 @@ BaseCache::CacheEvent::process()
         else
         {
             //CSHR
-            pkt = cachePort->cache->getCoherencePacket();
+            if (!cachePort->cshrRetry) {
+                assert(cachePort->cache->doSlaveRequest());
+                pkt = cachePort->cache->getCoherencePacket();
+            }
+            else {
+                pkt = cachePort->cshrRetry;
+            }
             bool success = cachePort->sendTiming(pkt);
             if (!success) {
                 //Need to send on a retry
                 cachePort->cshrRetry = pkt;
+                cachePort->waitingOnRetry = true;
             }
-            else if (cachePort->cache->doSlaveRequest())
+            else
             {
-                //Still more to issue, rerequest in 1 cycle
-                pkt = NULL;
-                this->schedule(curTick+1);
+                cachePort->cshrRetry = NULL;
+                if (cachePort->cache->doSlaveRequest()) {
+                    //Still more to issue, rerequest in 1 cycle
+                    pkt = NULL;
+                    this->schedule(curTick+1);
+                }
             }
         }
         return;
     }
     //Response
     //Know the packet to send
-    pkt->result = Packet::Success;
+    if (pkt->flags & NACKED_LINE)
+        pkt->result = Packet::Nacked;
+    else
+        pkt->result = Packet::Success;
     pkt->makeTimingResponse();
-    if (!cachePort->drainList.empty()) {
-        //Already blocked waiting for bus, just append
+    DPRINTF(CachePort, "%s attempting to send a response\n", cachePort->name());
+    if (!cachePort->drainList.empty() || cachePort->waitingOnRetry) {
+        //Already have a list, just append
         cachePort->drainList.push_back(pkt);
+        DPRINTF(CachePort, "%s appending response onto drain list\n", cachePort->name());
     }
     else if (!cachePort->sendTiming(pkt)) {
         //It failed, save it to list of drain events
+        DPRINTF(CachePort, "%s now waiting for a retry\n", cachePort->name());
         cachePort->drainList.push_back(pkt);
+        cachePort->waitingOnRetry = true;
     }
 }
 
diff --git a/src/mem/cache/base_cache.hh b/src/mem/cache/base_cache.hh
index c45f3b71b..455e13d9c 100644
--- a/src/mem/cache/base_cache.hh
+++ b/src/mem/cache/base_cache.hh
@@ -112,6 +112,8 @@ class BaseCache : public MemObject
 
         bool isCpuSide;
 
+        bool waitingOnRetry;
+
         std::list<Packet *> drainList;
 
         Packet *cshrRetry;
@@ -210,10 +212,6 @@ class BaseCache : public MemObject
 
   protected:
 
-    /** True if this cache is connected to the CPU. */
-    bool topLevelCache;
-
-
     /** Stores time the cache blocked for statistics. */
     Tick blockedCycle;
 
@@ -335,7 +333,7 @@ class BaseCache : public MemObject
      */
     BaseCache(const std::string &name, Params &params)
         : MemObject(name), blocked(0), blockedSnoop(0), masterRequests(0),
-          slaveRequests(0), topLevelCache(false),  blkSize(params.blkSize),
+          slaveRequests(0), blkSize(params.blkSize),
           missCount(params.maxMisses)
     {
         //Start ports at null if more than one is created we should panic
@@ -356,15 +354,6 @@ class BaseCache : public MemObject
     }
 
     /**
-     * Returns true if this cache is connect to the CPU.
-     * @return True if this is a L1 cache.
-     */
-    bool isTopLevel()
-    {
-        return topLevelCache;
-    }
-
-    /**
      * Returns true if the cache is blocked for accesses.
      */
     bool isBlocked()
@@ -392,11 +381,13 @@ class BaseCache : public MemObject
             blocked_causes[cause]++;
             blockedCycle = curTick;
         }
+        int old_state = blocked;
         if (!(blocked & flag)) {
             //Wasn't already blocked for this cause
             blocked |= flag;
             DPRINTF(Cache,"Blocking for cause %s\n", cause);
-            cpuSidePort->setBlocked();
+            if (!old_state)
+                cpuSidePort->setBlocked();
         }
     }
 
@@ -408,10 +399,12 @@ class BaseCache : public MemObject
     void setBlockedForSnoop(BlockedCause cause)
     {
         uint8_t flag = 1 << cause;
-        if (!(blocked & flag)) {
+        uint8_t old_state = blockedSnoop;
+        if (!(blockedSnoop & flag)) {
             //Wasn't already blocked for this cause
             blockedSnoop |= flag;
-            memSidePort->setBlocked();
+            if (!old_state)
+                memSidePort->setBlocked();
         }
     }
 
@@ -461,7 +454,7 @@ class BaseCache : public MemObject
      */
     void setMasterRequest(RequestCause cause, Tick time)
     {
-        if (!doMasterRequest())
+        if (!doMasterRequest() && !memSidePort->waitingOnRetry)
         {
             BaseCache::CacheEvent * reqCpu = new BaseCache::CacheEvent(memSidePort);
             reqCpu->schedule(time);
@@ -523,6 +516,10 @@ class BaseCache : public MemObject
             CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
             reqCpu->schedule(time);
         }
+        else {
+            if (pkt->cmd == Packet::Writeback) delete pkt->req;
+            delete pkt;
+        }
     }
 
     /**
@@ -539,6 +536,10 @@ class BaseCache : public MemObject
             CacheEvent *reqCpu = new CacheEvent(cpuSidePort, pkt);
             reqCpu->schedule(time);
         }
+        else {
+            if (pkt->cmd == Packet::Writeback) delete pkt->req;
+            delete pkt;
+        }
     }
 
     /**
@@ -547,8 +548,6 @@ class BaseCache : public MemObject
      */
     void respondToSnoop(Packet *pkt, Tick time)
     {
-//        assert("Implement\n" && 0);
-//	mi->respond(pkt,curTick + hitLatency);
         assert (pkt->needsResponse());
         CacheEvent *reqMem = new CacheEvent(memSidePort, pkt);
         reqMem->schedule(time);
@@ -571,15 +570,7 @@ class BaseCache : public MemObject
         {
             //This is where snoops get updated
             AddrRangeList dummy;
-//            if (!topLevelCache)
-//            {
-                cpuSidePort->getPeerAddressRanges(dummy, snoop);
-//            }
-//            else
-//            {
-//                snoop.push_back(RangeSize(0,-1));
-//            }
-
+            cpuSidePort->getPeerAddressRanges(dummy, snoop);
             return;
         }
     }
diff --git a/src/mem/cache/cache.hh b/src/mem/cache/cache.hh
index 923bf8255..41b270030 100644
--- a/src/mem/cache/cache.hh
+++ b/src/mem/cache/cache.hh
@@ -103,6 +103,7 @@ class Cache : public BaseCache
       * Used to append to target list, to cause an invalidation.
       */
     Packet * invalidatePkt;
+    Request *invalidateReq;
 
     /**
      * Temporarily move a block into a MSHR.
diff --git a/src/mem/cache/cache_impl.hh b/src/mem/cache/cache_impl.hh
index bde7ac04b..9db79b843 100644
--- a/src/mem/cache/cache_impl.hh
+++ b/src/mem/cache/cache_impl.hh
@@ -63,9 +63,8 @@ doTimingAccess(Packet *pkt, CachePort *cachePort, bool isCpuSide)
         if (pkt->isWrite() && (pkt->req->isLocked())) {
             pkt->req->setScResult(1);
         }
-        if (!(pkt->flags & SATISFIED)) {
-            access(pkt);
-        }
+        access(pkt);
+
     }
     else
     {
@@ -101,7 +100,7 @@ doAtomicAccess(Packet *pkt, bool isCpuSide)
         if (pkt->isResponse())
             handleResponse(pkt);
         else
-            snoopProbe(pkt);
+            return snoopProbe(pkt);
     }
     //Fix this timing info
     return hitLatency;
@@ -149,14 +148,10 @@ Cache(const std::string &_name,
       prefetchAccess(params.prefetchAccess),
       tags(params.tags), missQueue(params.missQueue),
       coherence(params.coherence), prefetcher(params.prefetcher),
-      doCopy(params.doCopy), blockOnCopy(params.blockOnCopy)
+      doCopy(params.doCopy), blockOnCopy(params.blockOnCopy),
+      hitLatency(params.hitLatency)
 {
-//FIX BUS POINTERS
-//    if (params.in == NULL) {
-        topLevelCache = true;
-//    }
-//PLEASE FIX THIS, BUS SIZES NOT BEING USED
-        tags->setCache(this, blkSize, 1/*params.out->width, params.out->clockRate*/);
+    tags->setCache(this);
     tags->setPrefetcher(prefetcher);
     missQueue->setCache(this);
     missQueue->setPrefetcher(prefetcher);
@@ -164,10 +159,8 @@ Cache(const std::string &_name,
     prefetcher->setCache(this);
     prefetcher->setTags(tags);
     prefetcher->setBuffer(missQueue);
-#if 0
-    invalidatePkt = new Packet;
-    invalidatePkt->cmd = Packet::InvalidateReq;
-#endif
+    invalidateReq = new Request((Addr) NULL, blkSize, 0);
+    invalidatePkt = new Packet(invalidateReq, Packet::InvalidateReq, 0);
 }
 
 template<class TagStore, class Buffering, class Coherence>
@@ -196,20 +189,6 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
         prefetcher->handleMiss(pkt, curTick);
     }
     if (!pkt->req->isUncacheable()) {
-        if (pkt->isInvalidate() && !pkt->isRead()
-            && !pkt->isWrite()) {
-            //Upgrade or Invalidate
-            //Look into what happens if two slave caches on bus
-            DPRINTF(Cache, "%s %x ? blk_addr: %x\n", pkt->cmdString(),
-                    pkt->getAddr() & (((ULL(1))<<48)-1),
-                    pkt->getAddr() & ~((Addr)blkSize - 1));
-
-            //@todo Should this return latency have the hit latency in it?
-//	    respond(pkt,curTick+lat);
-            pkt->flags |= SATISFIED;
-//            return MA_HIT; //@todo, return values
-            return true;
-        }
         blk = tags->handleAccess(pkt, lat, writebacks);
     } else {
         size = pkt->getSize();
@@ -245,7 +224,10 @@ Cache<TagStore,Buffering,Coherence>::access(PacketPtr &pkt)
         // clear dirty bit if write through
         if (pkt->needsResponse())
             respond(pkt, curTick+lat);
-//	return MA_HIT;
+        if (pkt->cmd == Packet::Writeback) {
+            //Signal that you can kill the pkt/req
+            pkt->flags |= SATISFIED;
+        }
         return true;
     }
 
@@ -269,6 +251,7 @@ template<class TagStore, class Buffering, class Coherence>
 Packet *
 Cache<TagStore,Buffering,Coherence>::getPacket()
 {
+    assert(missQueue->havePending());
     Packet * pkt = missQueue->getPacket();
     if (pkt) {
         if (!pkt->req->isUncacheable()) {
@@ -289,13 +272,28 @@ template<class TagStore, class Buffering, class Coherence>
 void
 Cache<TagStore,Buffering,Coherence>::sendResult(PacketPtr &pkt, MSHR* mshr, bool success)
 {
-    if (success) {
-              missQueue->markInService(pkt, mshr);
-          //Temp Hack for UPGRADES
-          if (pkt->cmd == Packet::UpgradeReq) {
-              handleResponse(pkt);
-          }
+    if (success && !(pkt->flags & NACKED_LINE)) {
+        missQueue->markInService(pkt, mshr);
+        //Temp Hack for UPGRADES: the fill and the miss-queue response are completed right here
+        if (pkt->cmd == Packet::UpgradeReq) {
+            pkt->flags &= ~CACHE_LINE_FILL;
+            BlkType *blk = tags->findBlock(pkt);
+            CacheBlk::State old_state = (blk) ? blk->status : 0;
+            CacheBlk::State new_state = coherence->getNewState(pkt,old_state);
+            if (old_state != new_state)
+                DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                        pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+            //Copy the block's current data into the packet, then set the state on the upgrade
+            memcpy(pkt->getPtr<uint8_t>(), blk->data, blkSize);  // NOTE(review): blk is dereferenced unchecked although old_state above allows blk == NULL -- confirm an upgrade always finds its block
+            PacketList writebacks;
+            tags->handleFill(blk, mshr, new_state, writebacks, pkt);
+            assert(writebacks.empty());
+            missQueue->handleResponse(pkt, curTick + hitLatency);
+        }
     } else if (pkt && !pkt->req->isUncacheable()) {
+        pkt->flags &= ~NACKED_LINE;  // drop the snoop-result flags; presumably so a later resend starts clean -- TODO confirm
+        pkt->flags &= ~SATISFIED;
+        pkt->flags &= ~SNOOP_COMMIT;
         missQueue->restoreOrigCmd(pkt);
     }
 }
@@ -306,6 +304,14 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
 {
     BlkType *blk = NULL;
     if (pkt->senderState) {
+        if (pkt->result == Packet::Nacked) {
+            //pkt->reinitFromRequest();
+            warn("NACKs from devices not connected to the same bus not implemented\n");
+            return;
+        }
+        if (pkt->result == Packet::BadAddress) {
+            //Make the response a Bad address and send it
+        }
 //	MemDebug::cacheResponse(pkt);
         DPRINTF(Cache, "Handling reponse to %x, blk addr: %x\n",pkt->getAddr(),
                 pkt->getAddr() & (((ULL(1))<<48)-1));
@@ -315,8 +321,9 @@ Cache<TagStore,Buffering,Coherence>::handleResponse(Packet * &pkt)
             CacheBlk::State old_state = (blk) ? blk->status : 0;
             PacketList writebacks;
             CacheBlk::State new_state = coherence->getNewState(pkt,old_state);
-            DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
-                    pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+            if (old_state != new_state)
+                DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                        pkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
             blk = tags->handleFill(blk, (MSHR*)pkt->senderState,
                                    new_state, writebacks, pkt);
             while (!writebacks.empty()) {
@@ -377,10 +384,15 @@ template<class TagStore, class Buffering, class Coherence>
 void
 Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
 {
+    if (pkt->req->isUncacheable()) {
+        //Can't get a hit on an uncacheable address
+        //Revisit this for multi level coherence
+        return;
+    }
     Addr blk_addr = pkt->getAddr() & ~(Addr(blkSize-1));
     BlkType *blk = tags->findBlock(pkt);
     MSHR *mshr = missQueue->findMSHR(blk_addr);
-    if (isTopLevel() && coherence->hasProtocol()) { //@todo Move this into handle bus req
+    if (coherence->hasProtocol()) { //@todo Move this into handle bus req
         //If we find an mshr, and it is in service, we need to NACK or invalidate
         if (mshr) {
             if (mshr->inService) {
@@ -392,8 +404,9 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     assert(!(pkt->flags & SATISFIED));
                     pkt->flags |= SATISFIED;
                     pkt->flags |= NACKED_LINE;
-                    assert("Don't detect these on the other side yet\n");
-                    respondToSnoop(pkt, curTick + hitLatency);
+                    ///@todo NACK's from other levels
+                    //warn("NACKs from devices not connected to the same bus not implemented\n");
+                    //respondToSnoop(pkt, curTick + hitLatency);
                     return;
                 }
                 else {
@@ -406,7 +419,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                     //@todo Make it so that a read to a pending read can't be exclusive now.
 
                     //Set the address so find match works
-                    assert("Don't have invalidates yet\n");
+                    //panic("Don't have invalidates yet\n");
                     invalidatePkt->addrOverride(pkt->getAddr());
 
                     //Append the invalidate on
@@ -437,7 +450,7 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
                         pkt->flags |= SHARED_LINE;
 
                         assert(pkt->isRead());
-                        Addr offset = pkt->getAddr() & ~(blkSize - 1);
+                        Addr offset = pkt->getAddr() & (blkSize - 1);
                         assert(offset < blkSize);
                         assert(pkt->getSize() <= blkSize);
                         assert(offset + pkt->getSize() <=blkSize);
@@ -458,16 +471,16 @@ Cache<TagStore,Buffering,Coherence>::snoop(Packet * &pkt)
     CacheBlk::State new_state;
     bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
     if (satisfy) {
-        DPRINTF(Cache, "Cache snooped a %s request and now supplying data,"
+        DPRINTF(Cache, "Cache snooped a %s request for addr %x and now supplying data,"
                 "new state is %i\n",
-                pkt->cmdString(), new_state);
+                pkt->cmdString(), blk_addr, new_state);
 
         tags->handleSnoop(blk, new_state, pkt);
         respondToSnoop(pkt, curTick + hitLatency);
         return;
     }
-    if (blk) DPRINTF(Cache, "Cache snooped a %s request, new state is %i\n",
-                     pkt->cmdString(), new_state);
+    if (blk) DPRINTF(Cache, "Cache snooped a %s request for addr %x, new state is %i\n",
+                     pkt->cmdString(), blk_addr, new_state);
     tags->handleSnoop(blk, new_state);
 }
 
@@ -521,6 +534,10 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort
     int lat;
     BlkType *blk = tags->handleAccess(pkt, lat, writebacks, update);
 
+    DPRINTF(Cache, "%s %x %s blk_addr: %x\n", pkt->cmdString(),
+            pkt->getAddr() & (((ULL(1))<<48)-1), (blk) ? "hit" : "miss",
+            pkt->getAddr() & ~((Addr)blkSize - 1));
+
     if (!blk) {
         // Need to check for outstanding misses and writes
         Addr blk_addr = pkt->getAddr() & ~(blkSize - 1);
@@ -627,6 +644,11 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort
 
                 busPkt->time = curTick;
 
+                DPRINTF(Cache, "Sending a atomic %s for %x blk_addr: %x\n",
+                        busPkt->cmdString(),
+                        busPkt->getAddr() & (((ULL(1))<<48)-1),
+                        busPkt->getAddr() & ~((Addr)blkSize - 1));
+
                 lat = memSidePort->sendAtomic(busPkt);
 
                 //Be sure to flip the response to a request for coherence
@@ -642,13 +664,26 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort
 */		misses[pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/]++;
 
                 CacheBlk::State old_state = (blk) ? blk->status : 0;
+                CacheBlk::State new_state = coherence->getNewState(busPkt, old_state);
+                    DPRINTF(Cache, "Receive response:%s for blk addr %x in state %i\n",
+                            busPkt->cmdString(),
+                            busPkt->getAddr() & (((ULL(1))<<48)-1), old_state);
+                if (old_state != new_state)
+                    DPRINTF(Cache, "Block for blk addr %x moving from state %i to %i\n",
+                            busPkt->getAddr() & (((ULL(1))<<48)-1), old_state, new_state);
+
                 tags->handleFill(blk, busPkt,
-                                 coherence->getNewState(busPkt, old_state),
+                                 new_state,
                                  writebacks, pkt);
+                //Free the packet
+                delete busPkt;
+
                 // Handle writebacks if needed
                 while (!writebacks.empty()){
-                    memSidePort->sendAtomic(writebacks.front());
+                    Packet *wbPkt = writebacks.front();
+                    memSidePort->sendAtomic(wbPkt);
                     writebacks.pop_front();
+                    delete wbPkt;
                 }
                 return lat + hitLatency;
             } else {
@@ -669,7 +704,7 @@ Cache<TagStore,Buffering,Coherence>::probe(Packet * &pkt, bool update, CachePort
             // Still need to change data in all locations.
             otherSidePort->sendFunctional(pkt);
         }
-        return curTick + lat;
+        return hitLatency;
     }
     fatal("Probe not handled.\n");
     return 0;
@@ -685,15 +720,15 @@ Cache<TagStore,Buffering,Coherence>::snoopProbe(PacketPtr &pkt)
         CacheBlk::State new_state = 0;
         bool satisfy = coherence->handleBusRequest(pkt,blk,mshr, new_state);
         if (satisfy) {
-            DPRINTF(Cache, "Cache snooped a %s request and now supplying data,"
+            DPRINTF(Cache, "Cache snooped a %s request for addr %x and now supplying data,"
                     "new state is %i\n",
-                    pkt->cmdString(), new_state);
+                    pkt->cmdString(), blk_addr, new_state);
 
             tags->handleSnoop(blk, new_state, pkt);
             return hitLatency;
         }
-        if (blk) DPRINTF(Cache, "Cache snooped a %s request, new state is %i\n",
-                     pkt->cmdString(), new_state);
+        if (blk) DPRINTF(Cache, "Cache snooped a %s request for addr %x, new state is %i\n",
+                     pkt->cmdString(), blk_addr, new_state);
         tags->handleSnoop(blk, new_state);
         return 0;
 }
diff --git a/src/mem/cache/coherence/coherence_protocol.cc b/src/mem/cache/coherence/coherence_protocol.cc
index bcf3ce9c5..e28dda3dc 100644
--- a/src/mem/cache/coherence/coherence_protocol.cc
+++ b/src/mem/cache/coherence/coherence_protocol.cc
@@ -271,7 +271,7 @@ CoherenceProtocol::CoherenceProtocol(const string &name,
     }
 
     Packet::Command writeToSharedCmd = doUpgrades ? Packet::UpgradeReq : Packet::ReadExReq;
-    Packet::Command writeToSharedResp = doUpgrades ? Packet::UpgradeResp : Packet::ReadExResp;
+    Packet::Command writeToSharedResp = doUpgrades ? Packet::UpgradeReq : Packet::ReadExResp;
 
 //@todo add in hardware prefetch to this list
     if (protocol == "msi") {
diff --git a/src/mem/cache/coherence/uni_coherence.cc b/src/mem/cache/coherence/uni_coherence.cc
index 5ab706269..0efe393f9 100644
--- a/src/mem/cache/coherence/uni_coherence.cc
+++ b/src/mem/cache/coherence/uni_coherence.cc
@@ -68,14 +68,12 @@ UniCoherence::handleBusRequest(Packet * &pkt, CacheBlk *blk, MSHR *mshr,
     if (pkt->isInvalidate()) {
         DPRINTF(Cache, "snoop inval on blk %x (blk ptr %x)\n",
                 pkt->getAddr(), blk);
-        if (!cache->isTopLevel()) {
-            // Forward to other caches
-            Packet * tmp = new Packet(pkt->req, Packet::InvalidateReq, -1);
-            cshrs.allocate(tmp);
-            cache->setSlaveRequest(Request_Coherence, curTick);
-            if (cshrs.isFull()) {
-                cache->setBlockedForSnoop(Blocked_Coherence);
-            }
+        // Forward to other caches
+        Packet * tmp = new Packet(pkt->req, Packet::InvalidateReq, -1);
+        cshrs.allocate(tmp);
+        cache->setSlaveRequest(Request_Coherence, curTick);
+        if (cshrs.isFull()) {
+            cache->setBlockedForSnoop(Blocked_Coherence);
         }
     } else {
         if (blk) {
diff --git a/src/mem/cache/miss/miss_queue.cc b/src/mem/cache/miss/miss_queue.cc
index bdb7a39c8..c23b542f5 100644
--- a/src/mem/cache/miss/miss_queue.cc
+++ b/src/mem/cache/miss/miss_queue.cc
@@ -352,7 +352,7 @@ MissQueue::setPrefetcher(BasePrefetcher *_prefetcher)
 MSHR*
 MissQueue::allocateMiss(Packet * &pkt, int size, Tick time)
 {
-    MSHR* mshr = mq.allocate(pkt, blkSize);
+    MSHR* mshr = mq.allocate(pkt, size);
     mshr->order = order++;
     if (!pkt->req->isUncacheable() ){//&& !pkt->isNoAllocate()) {
         // Mark this as a cache line fill
@@ -515,6 +515,14 @@ MissQueue::setBusCmd(Packet * &pkt, Packet::Command cmd)
     assert(pkt->senderState != 0);
     MSHR * mshr = (MSHR*)pkt->senderState;
     mshr->originalCmd = pkt->cmd;
+    if (cmd == Packet::UpgradeReq || cmd == Packet::InvalidateReq) {
+        pkt->flags |= NO_ALLOCATE;
+        pkt->flags &= ~CACHE_LINE_FILL;
+    }
+    else if (!pkt->req->isUncacheable() && !pkt->isNoAllocate() &&
+             (cmd & (1 << 6)/*NeedsResponse*/)) {
+        pkt->flags |= CACHE_LINE_FILL;
+    }
     if (pkt->isCacheFill() || pkt->isNoAllocate())
         pkt->cmd = cmd;
 }
diff --git a/src/mem/cache/miss/mshr.cc b/src/mem/cache/miss/mshr.cc
index f36032672..455798f15 100644
--- a/src/mem/cache/miss/mshr.cc
+++ b/src/mem/cache/miss/mshr.cc
@@ -100,6 +100,7 @@ MSHR::deallocate()
 {
     assert(targets.empty());
     assert(ntargets == 0);
+    delete pkt;
     pkt = NULL;
     inService = false;
     //allocIter = NULL;
diff --git a/src/mem/cache/miss/mshr_queue.cc b/src/mem/cache/miss/mshr_queue.cc
index bd9667529..1876a8987 100644
--- a/src/mem/cache/miss/mshr_queue.cc
+++ b/src/mem/cache/miss/mshr_queue.cc
@@ -213,8 +213,13 @@ void
 MSHRQueue::markInService(MSHR* mshr)
 {
     //assert(mshr == pendingList.front());
-    if (!mshr->pkt->needsResponse()) {
+    if (!(mshr->pkt->needsResponse() || mshr->pkt->cmd == Packet::UpgradeReq)) {
         assert(mshr->getNumTargets() == 0);
+        if ((mshr->pkt->flags & SATISFIED) && (mshr->pkt->cmd == Packet::Writeback)) {
+            //Writeback hit, so delete it
+            //otherwise the consumer will delete it
+            delete mshr->pkt->req;
+        }
         deallocate(mshr);
         return;
     }
diff --git a/src/mem/packet.cc b/src/mem/packet.cc
index 91298df8c..64c65dcca 100644
--- a/src/mem/packet.cc
+++ b/src/mem/packet.cc
@@ -34,14 +34,26 @@
  * Definition of the Packet Class, a packet is a transaction occuring
  * between a single level of the memory heirarchy (ie L1->L2).
  */
+
+#include <iostream>
 #include "base/misc.hh"
 #include "mem/packet.hh"
+#include "base/trace.hh"
 
 static const std::string ReadReqString("ReadReq");
 static const std::string WriteReqString("WriteReq");
-static const std::string WriteReqNoAckString("WriteReqNoAck");
+static const std::string WriteReqNoAckString("WriteReqNoAck|Writeback");
 static const std::string ReadRespString("ReadResp");
 static const std::string WriteRespString("WriteResp");
+static const std::string SoftPFReqString("SoftPFReq");
+static const std::string SoftPFRespString("SoftPFResp");
+static const std::string HardPFReqString("HardPFReq");
+static const std::string HardPFRespString("HardPFResp");
+static const std::string InvalidateReqString("InvalidateReq");
+static const std::string WriteInvalidateReqString("WriteInvalidateReq");
+static const std::string UpgradeReqString("UpgradeReq");
+static const std::string ReadExReqString("ReadExReq");
+static const std::string ReadExRespString("ReadExResp");
 static const std::string OtherCmdString("<other>");
 
 const std::string &
@@ -53,6 +65,15 @@ Packet::cmdString() const
       case WriteReqNoAck:   return WriteReqNoAckString;
       case ReadResp:        return ReadRespString;
       case WriteResp:       return WriteRespString;
+      case SoftPFReq:       return SoftPFReqString;
+      case SoftPFResp:      return SoftPFRespString;
+      case HardPFReq:       return HardPFReqString;
+      case HardPFResp:      return HardPFRespString;
+      case InvalidateReq:   return InvalidateReqString;
+      case WriteInvalidateReq:return WriteInvalidateReqString;
+      case UpgradeReq:      return UpgradeReqString;
+      case ReadExReq:       return ReadExReqString;
+      case ReadExResp:      return ReadExRespString;
       default:              return OtherCmdString;
     }
 }
@@ -66,6 +87,15 @@ Packet::cmdIdxToString(Packet::Command idx)
       case WriteReqNoAck:   return WriteReqNoAckString;
       case ReadResp:        return ReadRespString;
       case WriteResp:       return WriteRespString;
+      case SoftPFReq:       return SoftPFReqString;
+      case SoftPFResp:      return SoftPFRespString;
+      case HardPFReq:       return HardPFReqString;
+      case HardPFResp:      return HardPFRespString;
+      case InvalidateReq:   return InvalidateReqString;
+      case WriteInvalidateReq:return WriteInvalidateReqString;
+      case UpgradeReq:      return UpgradeReqString;
+      case ReadExReq:       return ReadExReqString;
+      case ReadExResp:      return ReadExRespString;
       default:              return OtherCmdString;
     }
 }
@@ -102,19 +132,103 @@ bool
 Packet::intersect(Packet *p)
 {
     Addr s1 = getAddr();
-    Addr e1 = getAddr() + getSize();
+    Addr e1 = getAddr() + getSize() - 1;
     Addr s2 = p->getAddr();
-    Addr e2 = p->getAddr() + p->getSize();
+    Addr e2 = p->getAddr() + p->getSize() - 1;
 
-    if (s1 >= s2 && s1 < e2)
-        return true;
-    if (e1 >= s2 && e1 < e2)
-        return true;
-    return false;
+    return !(s1 > e2 || e1 < s2);
 }
 
 bool
 fixPacket(Packet *func, Packet *timing)
 {
-    panic("Need to implement!");
+    Addr funcStart      = func->getAddr();
+    Addr funcEnd        = func->getAddr() + func->getSize() - 1;
+    Addr timingStart    = timing->getAddr();
+    Addr timingEnd      = timing->getAddr() + timing->getSize() - 1;
+    // Start/end are inclusive; callers must only pass overlapping packets.
+    assert(!(funcStart > timingEnd || timingStart > funcEnd));
+
+    if (DTRACE(FunctionalAccess)) {
+       DebugOut() << *func;
+       DebugOut() << *timing;
+    }
+
+    // this packet can't solve our problem, continue on
+    if (!timing->hasData())
+        return true;
+
+    if (func->isRead()) {
+        if (funcStart >= timingStart && funcEnd <= timingEnd) {
+            func->allocate();
+            memcpy(func->getPtr<uint8_t>(), timing->getPtr<uint8_t>() +
+                    funcStart - timingStart, func->getSize());
+            func->result = Packet::Success;
+            return false;
+        } else {
+            // In this case the timing packet only partially satisfies the
+            // request, so we would need more information to make this work.
+            // Like bytes valid in the packet or something, so the request could
+            // continue and get this bit of possibly newer data along with the
+            // older data not written to yet.
+            panic("Timing packet only partially satisfies the functional "
+                    "request. Now what?");
+        }
+    } else if (func->isWrite()) {
+        // Copy just the overlapping bytes; ends are inclusive, hence the +1.
+        if (funcStart >= timingStart) {
+            memcpy(timing->getPtr<uint8_t>() + (funcStart - timingStart),
+                   func->getPtr<uint8_t>(),
+                   std::min(funcEnd, timingEnd) - funcStart + 1);
+        } else { // timingStart > funcStart
+            memcpy(timing->getPtr<uint8_t>(),
+                   func->getPtr<uint8_t>() + (timingStart - funcStart),
+                   std::min(funcEnd, timingEnd) - timingStart + 1);
+        }
+        // we always want to keep going with a write
+        return true;
+    } else
+        panic("Don't know how to handle command type %#x\n",
+                func->cmdToIndex());
+}
+
+
+// Print a one-line summary of a packet: address range, result, attributes.
+std::ostream &
+operator<<(std::ostream &o, const Packet &p)
+{
+    // setf(hex, showbase) only clears showbase and never selects hex; the
+    // base has to be set through basefield.  Restore the flags afterwards.
+    std::ios_base::fmtflags savedFlags = o.flags();
+    o.setf(std::ios_base::hex, std::ios_base::basefield);
+    o.unsetf(std::ios_base::showbase);
+    o << "[0x" << p.getAddr() << ":0x";
+    o << p.getAddr() + p.getSize() - 1 << "] ";
+    o.flags(savedFlags);
+
+    if (p.result == Packet::Success)
+        o << "Successful ";
+    if (p.result == Packet::BadAddress)
+        o << "BadAddress ";
+    if (p.result == Packet::Nacked)
+        o << "Nacked ";
+    if (p.result == Packet::Unknown)
+        o << "Inflight ";
+
+    if (p.isRead())
+        o << "Read ";
+    if (p.isWrite())
+        o << "Write ";
+    if (p.isInvalidate())
+        o << "Invalidate ";
+    if (p.isRequest())
+        o << "Request ";
+    if (p.isResponse())
+        o << "Response ";
+    if (p.hasData())
+        o << "w/Data ";
+
+    o << std::endl;
+    return o;
+}
+
diff --git a/src/mem/packet.hh b/src/mem/packet.hh
index be9bf5f57..48b32ec47 100644
--- a/src/mem/packet.hh
+++ b/src/mem/packet.hh
@@ -58,10 +58,8 @@ typedef std::list<PacketPtr> PacketList;
 #define NO_ALLOCATE 1 << 5
 #define SNOOP_COMMIT 1 << 6
 
-//For statistics we need max number of commands, hard code it at
-//20 for now.  @todo fix later
-#define NUM_MEM_CMDS 1 << 9
-
+//Max number of commands (needed for statistics), hard coded.  @todo fix later
+#define NUM_MEM_CMDS 1 << 11
 /**
  * A Packet is used to encapsulate a transfer between two objects in
  * the memory system (e.g., the L1 and L2 cache).  (In contrast, a
@@ -94,7 +92,6 @@ class Packet
      *   be called on it rather than simply delete.*/
     bool arrayData;
 
-
     /** The address of the request.  This address could be virtual or
      *   physical, depending on the system configuration. */
     Addr addr;
@@ -126,6 +123,12 @@ class Packet
     /** Used to calculate latencies for each packet.*/
     Tick time;
 
+    /** The time at which the packet will be fully transmitted */
+    Tick finishTime;
+
+    /** The time at which the first chunk of the packet will be transmitted */
+    Tick firstWordTime;
+
     /** The special destination address indicating that the packet
      *   should be routed based on its address. */
     static const short Broadcast = -1;
@@ -164,17 +167,21 @@ class Packet
 
   private:
     /** List of command attributes. */
+    // If you add a new CommandAttribute, make sure to increase NUM_MEM_CMDS
+    // as well.
     enum CommandAttribute
     {
-        IsRead		= 1 << 0,
-        IsWrite		= 1 << 1,
-        IsPrefetch	= 1 << 2,
-        IsInvalidate	= 1 << 3,
-        IsRequest	= 1 << 4,
-        IsResponse 	= 1 << 5,
-        NeedsResponse	= 1 << 6,
+        IsRead                = 1 << 0,
+        IsWrite                = 1 << 1,
+        IsPrefetch        = 1 << 2,
+        IsInvalidate        = 1 << 3,
+        IsRequest        = 1 << 4,
+        IsResponse         = 1 << 5,
+        NeedsResponse        = 1 << 6,
         IsSWPrefetch    = 1 << 7,
-        IsHWPrefetch    = 1 << 8
+        IsHWPrefetch    = 1 << 8,
+        IsUpgrade       = 1 << 9,
+        HasData                = 1 << 10
     };
 
   public:
@@ -182,22 +189,24 @@ class Packet
     enum Command
     {
         InvalidCmd      = 0,
-        ReadReq		= IsRead  | IsRequest | NeedsResponse,
-        WriteReq	= IsWrite | IsRequest | NeedsResponse,
-        WriteReqNoAck	= IsWrite | IsRequest,
-        ReadResp	= IsRead  | IsResponse | NeedsResponse,
-        WriteResp	= IsWrite | IsResponse | NeedsResponse,
-        Writeback       = IsWrite | IsRequest,
+        ReadReq                = IsRead  | IsRequest | NeedsResponse,
+        WriteReq        = IsWrite | IsRequest | NeedsResponse | HasData,
+        WriteReqNoAck        = IsWrite | IsRequest | HasData,
+        ReadResp        = IsRead  | IsResponse | NeedsResponse | HasData,
+        WriteResp        = IsWrite | IsResponse | NeedsResponse,
+        Writeback       = IsWrite | IsRequest | HasData,
         SoftPFReq       = IsRead  | IsRequest | IsSWPrefetch | NeedsResponse,
         HardPFReq       = IsRead  | IsRequest | IsHWPrefetch | NeedsResponse,
-        SoftPFResp      = IsRead  | IsResponse | IsSWPrefetch | NeedsResponse,
-        HardPFResp      = IsRead  | IsResponse | IsHWPrefetch | NeedsResponse,
+        SoftPFResp      = IsRead  | IsResponse | IsSWPrefetch
+                                | NeedsResponse | HasData,
+        HardPFResp      = IsRead  | IsResponse | IsHWPrefetch
+                                    | NeedsResponse | HasData,
         InvalidateReq   = IsInvalidate | IsRequest,
-        WriteInvalidateReq = IsWrite | IsInvalidate | IsRequest,
-        UpgradeReq      = IsInvalidate | IsRequest | NeedsResponse,
-        UpgradeResp     = IsInvalidate | IsResponse | NeedsResponse,
+        WriteInvalidateReq = IsWrite | IsInvalidate | IsRequest | HasData,
+        UpgradeReq      = IsInvalidate | IsRequest | IsUpgrade,
         ReadExReq       = IsRead | IsInvalidate | IsRequest | NeedsResponse,
-        ReadExResp      = IsRead | IsInvalidate | IsResponse | NeedsResponse
+        ReadExResp      = IsRead | IsInvalidate | IsResponse
+                                | NeedsResponse | HasData
     };
 
     /** Return the string name of the cmd field (for debugging and
@@ -213,16 +222,17 @@ class Packet
     /** The command field of the packet. */
     Command cmd;
 
-    bool isRead() 	 { return (cmd & IsRead)  != 0; }
-    bool isWrite()       { return (cmd & IsWrite) != 0; }
-    bool isRequest()	 { return (cmd & IsRequest)  != 0; }
-    bool isResponse()	 { return (cmd & IsResponse) != 0; }
-    bool needsResponse() { return (cmd & NeedsResponse) != 0; }
-    bool isInvalidate()  { return (cmd & IsInvalidate) != 0; }
+    bool isRead() const         { return (cmd & IsRead)  != 0; }
+    bool isWrite()  const       { return (cmd & IsWrite) != 0; }
+    bool isRequest() const      { return (cmd & IsRequest)  != 0; }
+    bool isResponse() const     { return (cmd & IsResponse) != 0; }
+    bool needsResponse() const  { return (cmd & NeedsResponse) != 0; }
+    bool isInvalidate() const   { return (cmd & IsInvalidate) != 0; }
+    bool hasData() const        { return (cmd & HasData) != 0; }
 
-    bool isCacheFill() { return (flags & CACHE_LINE_FILL) != 0; }
-    bool isNoAllocate() { return (flags & NO_ALLOCATE) != 0; }
-    bool isCompressed() { return (flags & COMPRESSED) != 0; }
+    bool isCacheFill() const    { return (flags & CACHE_LINE_FILL) != 0; }
+    bool isNoAllocate() const   { return (flags & NO_ALLOCATE) != 0; }
+    bool isCompressed() const   { return (flags & COMPRESSED) != 0; }
 
     bool nic_pkt() { assert("Unimplemented\n" && 0); return false; }
 
@@ -320,6 +330,10 @@ class Packet
         int icmd = (int)cmd;
         icmd &= ~(IsRequest);
         icmd |= IsResponse;
+        if (isRead())
+            icmd |= HasData;
+        if (isWrite())
+            icmd &= ~HasData;
         cmd = (Command)icmd;
         dest = src;
         srcValid = false;
@@ -334,6 +348,10 @@ class Packet
         int icmd = (int)cmd;
         icmd &= ~(IsRequest);
         icmd |= IsResponse;
+        if (isRead())
+            icmd |= HasData;
+        if (isWrite())
+            icmd &= ~HasData;
         cmd = (Command)icmd;
     }
 
@@ -383,5 +401,14 @@ class Packet
     bool intersect(Packet *p);
 };
 
+
+/** Given a functional packet and an overlapping timing packet, either satisfy
+ * the functional read from the timing packet's data, or apply the functional
+ * write to the timing packet's data. Returns true if the functional packet
+ * should continue to traverse the memory hierarchy, false if it is satisfied.
+ */
 bool fixPacket(Packet *func, Packet *timing);
+
+std::ostream & operator<<(std::ostream &o, const Packet &p);
+
 #endif //__MEM_PACKET_HH
diff --git a/src/mem/physical.cc b/src/mem/physical.cc
index 96d78bd99..f5a0ade15 100644
--- a/src/mem/physical.cc
+++ b/src/mem/physical.cc
@@ -195,18 +195,22 @@ PhysicalMemory::checkLockedAddrList(Request *req)
 void
 PhysicalMemory::doFunctionalAccess(Packet *pkt)
 {
-    assert(pkt->getAddr() + pkt->getSize() < params()->addrRange.size());
+    assert(pkt->getAddr() + pkt->getSize() <= params()->addrRange.size());
 
     if (pkt->isRead()) {
         if (pkt->req->isLocked()) {
             trackLoadLocked(pkt->req);
         }
+        DPRINTF(MemoryAccess, "Performing Read of size %i on address 0x%x\n",
+                pkt->getSize(), pkt->getAddr());
         memcpy(pkt->getPtr<uint8_t>(),
                pmemAddr + pkt->getAddr() - params()->addrRange.start,
                pkt->getSize());
     }
     else if (pkt->isWrite()) {
         if (writeOK(pkt->req)) {
+            DPRINTF(MemoryAccess, "Performing Write of size %i on address 0x%x\n",
+                    pkt->getSize(), pkt->getAddr());
             memcpy(pmemAddr + pkt->getAddr() - params()->addrRange.start,
                    pkt->getPtr<uint8_t>(), pkt->getSize());
         }
diff --git a/src/mem/tport.cc b/src/mem/tport.cc
index cef7a2a5b..21907c0ca 100644
--- a/src/mem/tport.cc
+++ b/src/mem/tport.cc
@@ -33,8 +33,22 @@
 void
 SimpleTimingPort::recvFunctional(Packet *pkt)
 {
-    // just do an atomic access and throw away the returned latency
-    recvAtomic(pkt);
+    //First check the packets queued for transmission on this port
+    std::list<Packet *>::iterator i = transmitList.begin();
+    std::list<Packet *>::iterator end = transmitList.end();
+    bool cont = true;
+
+    while (i != end && cont) {
+        Packet * target = *i;
+        // If the target overlaps the probed request, let fixPacket update
+        // the data; it returns false once the access is fully satisfied
+        if (target->intersect(pkt))
+            cont = fixPacket(pkt, target);
+        ++i; // advance, otherwise a non-empty list loops forever
+    }
+    //Then just do an atomic access and throw away the returned latency
+    if (cont)
+        recvAtomic(pkt);
 }
 
 bool
@@ -58,13 +72,17 @@ SimpleTimingPort::recvTiming(Packet *pkt)
 void
 SimpleTimingPort::recvRetry()
 {
-    bool result = true;
-    while (result && transmitList.size()) {
-        result = sendTiming(transmitList.front());
-        if (result)
-            transmitList.pop_front();
+    assert(outTiming > 0);
+    assert(!transmitList.empty());
+    if (sendTiming(transmitList.front())) {
+        transmitList.pop_front();
+        outTiming--;
+        DPRINTF(Bus, "No Longer waiting on retry\n");
+        if (!transmitList.empty())
+            sendTimingLater(transmitList.front(), 1);
     }
-    if (transmitList.size() == 0 && drainEvent) {
+
+    if (transmitList.empty() && drainEvent) {
         drainEvent->process();
         drainEvent = NULL;
     }
@@ -73,18 +91,28 @@ SimpleTimingPort::recvRetry()
 void
 SimpleTimingPort::SendEvent::process()
 {
-    port->outTiming--;
-    assert(port->outTiming >= 0);
-    if (port->sendTiming(packet)) {
-        // send successfule
-        if (port->transmitList.size() == 0 && port->drainEvent) {
+    assert(port->outTiming > 0);
+    if (!port->transmitList.empty() && port->transmitList.front() != packet) {
+        //We are not the head of the list
+        port->transmitList.push_back(packet);
+    } else if (port->sendTiming(packet)) {
+        // send successful
+        if (port->transmitList.size()) {
+            port->transmitList.pop_front();
+            port->outTiming--;
+           if (!port->transmitList.empty())
+                port->sendTimingLater(port->transmitList.front(), 1);
+        }
+        if (port->transmitList.empty() && port->drainEvent) {
             port->drainEvent->process();
             port->drainEvent = NULL;
         }
     } else {
         // send unsuccessful (due to flow control).  Will get retry
-        // callback later; save for then.
-        port->transmitList.push_back(packet);
+        // callback later; save for then if not already
+        DPRINTF(Bus, "Waiting on retry\n");
+        if (!(port->transmitList.front() == packet))
+            port->transmitList.push_back(packet);
     }
 }
 
diff --git a/src/python/m5/objects/Bus.py b/src/python/m5/objects/Bus.py
index f6828a0d5..6710111e5 100644
--- a/src/python/m5/objects/Bus.py
+++ b/src/python/m5/objects/Bus.py
@@ -6,3 +6,5 @@ class Bus(MemObject):
     port = VectorPort("vector port for connecting devices")
     default = Port("Default port for requests that aren't handeled by a device.")
     bus_id = Param.Int(0, "blah")
+    clock = Param.Clock("1GHz", "bus clock speed")
+    width = Param.Int(64, "bus width (bytes)")
diff --git a/src/python/m5/objects/FUPool.py b/src/python/m5/objects/FUPool.py
index 4b4be79a6..916183bd7 100644
--- a/src/python/m5/objects/FUPool.py
+++ b/src/python/m5/objects/FUPool.py
@@ -1,6 +1,12 @@
 from m5.SimObject import SimObject
 from m5.params import *
+from FuncUnit import *
+from FuncUnitConfig import *
 
 class FUPool(SimObject):
     type = 'FUPool'
     FUList = VectorParam.FUDesc("list of FU's for this pool")
+
+class DefaultFUPool(FUPool):
+    FUList = [ IntALU(), IntMultDiv(), FP_ALU(), FP_MultDiv(), ReadPort(),
+               WritePort(), RdWrPort(), IprPort() ]
diff --git a/src/python/m5/objects/FuncUnitConfig.py b/src/python/m5/objects/FuncUnitConfig.py
new file mode 100644
index 000000000..43d7a4bb7
--- /dev/null
+++ b/src/python/m5/objects/FuncUnitConfig.py
@@ -0,0 +1,41 @@
+from m5.SimObject import SimObject
+from m5.params import *
+from FuncUnit import *
+
+class IntALU(FUDesc):
+    opList = [ OpDesc(opClass='IntAlu') ]
+    count = 6
+
+class IntMultDiv(FUDesc):
+    opList = [ OpDesc(opClass='IntMult', opLat=3),
+               OpDesc(opClass='IntDiv', opLat=20, issueLat=19) ]
+    count=2
+
+class FP_ALU(FUDesc):
+    opList = [ OpDesc(opClass='FloatAdd', opLat=2),
+               OpDesc(opClass='FloatCmp', opLat=2),
+               OpDesc(opClass='FloatCvt', opLat=2) ]
+    count = 4
+
+class FP_MultDiv(FUDesc):
+    opList = [ OpDesc(opClass='FloatMult', opLat=4),
+               OpDesc(opClass='FloatDiv', opLat=12, issueLat=12),
+               OpDesc(opClass='FloatSqrt', opLat=24, issueLat=24) ]
+    count = 2
+
+class ReadPort(FUDesc):
+    opList = [ OpDesc(opClass='MemRead') ]
+    count = 0
+
+class WritePort(FUDesc):
+    opList = [ OpDesc(opClass='MemWrite') ]
+    count = 0
+
+class RdWrPort(FUDesc):
+    opList = [ OpDesc(opClass='MemRead'), OpDesc(opClass='MemWrite') ]
+    count = 4
+
+class IprPort(FUDesc):
+    opList = [ OpDesc(opClass='IprAccess', opLat = 3, issueLat = 3) ]
+    count = 1
+
diff --git a/src/python/m5/objects/MemTest.py b/src/python/m5/objects/MemTest.py
index 18aff03f4..83399be80 100644
--- a/src/python/m5/objects/MemTest.py
+++ b/src/python/m5/objects/MemTest.py
@@ -6,6 +6,7 @@ from m5 import build_env
 class MemTest(SimObject):
     type = 'MemTest'
     max_loads = Param.Counter("number of loads to execute")
+    atomic = Param.Bool(False, "Execute tester in atomic mode? (or timing)\n")
     memory_size = Param.Int(65536, "memory size")
     percent_dest_unaligned = Param.Percent(50,
         "percent of copy dest address that are unaligned")
diff --git a/src/python/m5/objects/O3CPU.py b/src/python/m5/objects/O3CPU.py
index 59b40c6e8..20eef383f 100644
--- a/src/python/m5/objects/O3CPU.py
+++ b/src/python/m5/objects/O3CPU.py
@@ -3,6 +3,7 @@ from m5.proxy import *
 from m5 import build_env
 from BaseCPU import BaseCPU
 from Checker import O3Checker
+from FUPool import *
 
 class DerivO3CPU(BaseCPU):
     type = 'DerivO3CPU'
@@ -14,11 +15,13 @@ class DerivO3CPU(BaseCPU):
     if build_env['USE_CHECKER']:
         if not build_env['FULL_SYSTEM']:
             checker = Param.BaseCPU(O3Checker(workload=Parent.workload,
-                                              exitOnError=True,
+                                              exitOnError=False,
+                                              updateOnError=True,
                                               warnOnlyOnLoadError=False),
                                     "checker")
         else:
-            checker = Param.BaseCPU(O3Checker(exitOnError=True, warnOnlyOnLoadError=False), "checker")
+            checker = Param.BaseCPU(O3Checker(exitOnError=False, updateOnError=True,
+                                              warnOnlyOnLoadError=False), "checker")
             checker.itb = Parent.itb
             checker.dtb = Parent.dtb
 
@@ -57,7 +60,7 @@ class DerivO3CPU(BaseCPU):
     issueWidth = Param.Unsigned(8, "Issue width")
     wbWidth = Param.Unsigned(8, "Writeback width")
     wbDepth = Param.Unsigned(1, "Writeback depth")
-    fuPool = Param.FUPool("Functional Unit pool")
+    fuPool = Param.FUPool(DefaultFUPool(), "Functional Unit pool")
 
     iewToCommitDelay = Param.Unsigned(1, "Issue/Execute/Writeback to commit "
                "delay")
@@ -77,7 +80,7 @@ class DerivO3CPU(BaseCPU):
     localHistoryBits = Param.Unsigned(11, "Bits for the local history")
     globalPredictorSize = Param.Unsigned(8192, "Size of global predictor")
     globalCtrBits = Param.Unsigned(2, "Bits per counter")
-    globalHistoryBits = Param.Unsigned(4096, "Bits of history")
+    globalHistoryBits = Param.Unsigned(13, "Bits of history")
     choicePredictorSize = Param.Unsigned(8192, "Size of choice predictor")
     choiceCtrBits = Param.Unsigned(2, "Bits of choice counters")