8 files changed, 609 insertions, 94 deletions
diff --git a/configs/ruby/MOESI_CMP_token.py b/configs/ruby/MOESI_CMP_token.py
index ef110d682..ba61c727a 100644
--- a/configs/ruby/MOESI_CMP_token.py
+++ b/configs/ruby/MOESI_CMP_token.py
@@ -81,6 +81,7 @@ def create_system(options, system, piobus, dma_devices):
     # Must create the individual controllers before the network to ensure the
     # controller constructors are called before the network constructor
     #
+    l2_bits = int(math.log(options.num_l2caches, 2))
     
     for i in xrange(options.num_cpus):
         #
@@ -104,9 +105,7 @@ def create_system(options, system, piobus, dma_devices):
                                       sequencer = cpu_seq,
                                       L1IcacheMemory = l1i_cache,
                                       L1DcacheMemory = l1d_cache,
-                                      l2_select_num_bits = \
-                                        math.log(options.num_l2caches,
-                                                 2),
+                                      l2_select_num_bits = l2_bits,
                                       N_tokens = n_tokens,
                                       retry_threshold = \
                                         options.l1_retries,
@@ -129,7 +128,8 @@ def create_system(options, system, piobus, dma_devices):
         # First create the Ruby objects associated with this cpu
         #
         l2_cache = L2Cache(size = options.l2_size,
-                           assoc = options.l2_assoc)
+                           assoc = options.l2_assoc,
+                           start_index_bit = l2_bits)
 
         l2_cntrl = L2Cache_Controller(version = i,
                                       L2cacheMemory = l2_cache,
diff --git a/configs/ruby/MOESI_hammer.py b/configs/ruby/MOESI_hammer.py
index 02d958b09..00908ae8b 100644
--- a/configs/ruby/MOESI_hammer.py
+++ b/configs/ruby/MOESI_hammer.py
@@ -27,6 +27,7 @@
 #
 # Authors: Brad Beckmann
 
+import math
 import m5
 from m5.objects import *
 from m5.defines import buildEnv
@@ -43,10 +44,18 @@ class L1Cache(RubyCache):
 class L2Cache(RubyCache):
     latency = 10
 
+#
+# Probe filter is a cache, latency is not used
+#
+class ProbeFilter(RubyCache):
+    latency = 1
+
 def define_options(parser):
     parser.add_option("--allow-atomic-migration", action="store_true",
           help="allow migratory sharing for atomic only accessed blocks")
-
+    parser.add_option("--pf-on", action="store_true",
+          help="Hammer: enable Probe Filter")
+    
 def create_system(options, system, piobus, dma_devices):
     
     if buildEnv['PROTOCOL'] != 'MOESI_hammer':
@@ -107,6 +116,29 @@ def create_system(options, system, piobus, dma_devices):
                       long(system.physmem.range.first) + 1
     mem_module_size = phys_mem_size / options.num_dirs
 
+    #
+    # determine size and index bits for probe filter
+    # By default, the probe filter size is configured to be twice the
+    # size of the L2 cache.
+    #
+    pf_size = MemorySize(options.l2_size)
+    pf_size.value = pf_size.value * 2
+    dir_bits = int(math.log(options.num_dirs, 2))
+    pf_bits = int(math.log(pf_size.value, 2))
+    if options.numa_high_bit:
+        if options.numa_high_bit > 0:
+            # if numa high bit explicitly set, make sure it does not overlap
+            # with the probe filter index
+            assert(options.numa_high_bit - dir_bits > pf_bits)
+
+        # set the probe filter start bit to just above the block offset
+        pf_start_bit = 6
+    else:
+        if dir_bits > 0:
+            pf_start_bit = dir_bits + 5
+        else:
+            pf_start_bit = 6
+
     for i in xrange(options.num_dirs):
         #
         # Create the Ruby objects associated with the directory controller
@@ -117,6 +149,8 @@ def create_system(options, system, piobus, dma_devices):
         dir_size = MemorySize('0B')
         dir_size.value = mem_module_size
 
+        pf = ProbeFilter(size=pf_size, assoc=4, start_index_bit=pf_start_bit)
+
         dir_cntrl = Directory_Controller(version = i,
                                          directory = \
                                          RubyDirectoryMemory( \
@@ -125,7 +159,10 @@ def create_system(options, system, piobus, dma_devices):
                                                     use_map = options.use_map,
                                                     map_levels = \
                                                     options.map_levels),
-                                         memBuffer = mem_cntrl)
+                                         probeFilter = pf,
+                                         memBuffer = mem_cntrl,
+                                         probe_filter_enabled = \
+                                           options.pf_on)
 
         exec("system.dir_cntrl%d = dir_cntrl" % i)
         dir_cntrl_nodes.append(dir_cntrl)
diff --git a/src/mem/protocol/MOESI_hammer-cache.sm b/src/mem/protocol/MOESI_hammer-cache.sm
index 7b49c075c..06ce69624 100644
--- a/src/mem/protocol/MOESI_hammer-cache.sm
+++ b/src/mem/protocol/MOESI_hammer-cache.sm
@@ -96,6 +96,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     Other_GETX,      desc="A GetX from another processor";
     Other_GETS,      desc="A GetS from another processor";
     Other_GETS_No_Mig, desc="A GetS from another processor";
+    Invalidate,      desc="Invalidate block";
 
     // Responses
     Ack,             desc="Received an ack message";
@@ -292,6 +293,8 @@ machine(L1Cache, "AMD Hammer-like protocol")
           } else {
             trigger(Event:Other_GETS, in_msg.Address);
           }
+        } else if (in_msg.Type == CoherenceRequestType:INV) {
+          trigger(Event:Invalidate, in_msg.Address);
         } else if (in_msg.Type == CoherenceRequestType:WB_ACK) {
           trigger(Event:Writeback_Ack, in_msg.Address);
         } else if (in_msg.Type == CoherenceRequestType:WB_NACK) {
@@ -445,7 +448,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Destination.add(in_msg.Requestor);
         out_msg.DataBlk := getCacheEntry(address).DataBlk;
         out_msg.Dirty := getCacheEntry(address).Dirty;
-        out_msg.Acks := 2;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
         out_msg.MessageSize := MessageSizeType:Response_Data;
       }
     }
@@ -470,7 +477,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Destination.add(in_msg.Requestor);
         out_msg.DataBlk := getCacheEntry(address).DataBlk;
         out_msg.Dirty := getCacheEntry(address).Dirty;
-        out_msg.Acks := 2;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
         out_msg.MessageSize := MessageSizeType:Response_Data;
       }
     }
@@ -484,8 +495,13 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
         out_msg.DataBlk := getCacheEntry(address).DataBlk;
+        DEBUG_EXPR(out_msg.DataBlk);
         out_msg.Dirty := getCacheEntry(address).Dirty;
-        out_msg.Acks := 2;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
         out_msg.MessageSize := MessageSizeType:Response_Data;
       }
     }
@@ -499,6 +515,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
         out_msg.Acks := 1;
+        assert(in_msg.DirectedProbe == false);
         out_msg.MessageSize := MessageSizeType:Response_Control;
       }
     }
@@ -512,6 +529,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
         out_msg.Acks := 1;
+        assert(in_msg.DirectedProbe == false);
         out_msg.MessageSize := MessageSizeType:Response_Control;
       }
     }
@@ -527,6 +545,26 @@ machine(L1Cache, "AMD Hammer-like protocol")
     }
   }
 
+  action(gm_sendUnblockM, "gm", desc="Send unblock to memory and indicate M/O/E state") {
+    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      out_msg.Address := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCKM;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
+  action(gs_sendUnblockS, "gs", desc="Send unblock to memory and indicate S state") {
+    enqueue(unblockNetwork_out, ResponseMsg, latency=cache_response_latency) {
+      out_msg.Address := address;
+      out_msg.Type := CoherenceResponseType:UNBLOCKS;
+      out_msg.Sender := machineID;
+      out_msg.Destination.add(map_Address_to_Directory(address));
+      out_msg.MessageSize := MessageSizeType:Unblock_Control;
+    }
+  }
+
   action(h_load_hit, "h", desc="Notify sequencer the load completed.") {
     DEBUG_EXPR(getCacheEntry(address).DataBlk);
 
@@ -653,9 +691,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
         out_msg.Type := CoherenceResponseType:DATA;
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.Requestor);
+        DEBUG_EXPR(out_msg.Destination);
         out_msg.DataBlk := TBEs[address].DataBlk;
         out_msg.Dirty := TBEs[address].Dirty;
-        out_msg.Acks := 2;
+        if (in_msg.DirectedProbe) {
+          out_msg.Acks := machineCount(MachineType:L1Cache);
+        } else {
+          out_msg.Acks := 2;
+        }
         out_msg.MessageSize := MessageSizeType:Response_Data;
       }
     }
@@ -719,9 +762,11 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   action(v_writeDataToCacheVerify, "v", desc="Write data to cache, assert it was same as before") {
     peek(responseToCache_in, ResponseMsg) {
+      DEBUG_EXPR(getCacheEntry(address).DataBlk);
+      DEBUG_EXPR(in_msg.DataBlk);
       assert(getCacheEntry(address).DataBlk == in_msg.DataBlk);
       getCacheEntry(address).DataBlk := in_msg.DataBlk;
-      getCacheEntry(address).Dirty := in_msg.Dirty;
+      getCacheEntry(address).Dirty := in_msg.Dirty || getCacheEntry(address).Dirty;
     }
   }
   
@@ -813,7 +858,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     zz_recycleMandatoryQueue;
   }
 
-  transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) {
+  transition({IT, ST, OT, MT, MMT}, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
     // stall
   }
 
@@ -963,7 +1008,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     rr_deallocateL2CacheBlock;
   }
 
-  transition(I, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) {
+  transition(I, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
     f_sendAck;
     l_popForwardQueue;
   }
@@ -985,7 +1030,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     rr_deallocateL2CacheBlock;
   }
 
-  transition(S, Other_GETX, I) {
+  transition(S, {Other_GETX, Invalidate}, I) {
     f_sendAck;
     l_popForwardQueue;
   }
@@ -1015,7 +1060,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     rr_deallocateL2CacheBlock;
   }
 
-  transition(O, Other_GETX, I) {
+  transition(O, {Other_GETX, Invalidate}, I) {
     e_sendData;
     l_popForwardQueue;
   }
@@ -1042,7 +1087,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     rr_deallocateL2CacheBlock;
   }
 
-  transition(MM, Other_GETX, I) {
+  transition(MM, {Other_GETX, Invalidate}, I) {
     c_sendExclusiveData;
     l_popForwardQueue;
   }
@@ -1074,7 +1119,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     rr_deallocateL2CacheBlock;
   }
 
-  transition(M, Other_GETX, I) {
+  transition(M, {Other_GETX, Invalidate}, I) {
     c_sendExclusiveData;
     l_popForwardQueue;
   }
@@ -1086,7 +1131,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   // Transitions from IM
 
-  transition(IM, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) {
+  transition(IM, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
     f_sendAck;
     l_popForwardQueue;
   }
@@ -1118,7 +1163,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
     l_popForwardQueue;
   }
 
-  transition(SM, Other_GETX, IM) {
+  transition(SM, {Other_GETX, Invalidate}, IM) {
     f_sendAck;
     l_popForwardQueue;
   }
@@ -1145,14 +1190,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   transition(ISM, All_acks_no_sharers, MM) {
     sxt_trig_ext_store_hit;
-    g_sendUnblock;
+    gm_sendUnblockM;
     s_deallocateTBE;
     j_popTriggerQueue;
   }
 
   // Transitions from OM
 
-  transition(OM, Other_GETX, IM) {
+  transition(OM, {Other_GETX, Invalidate}, IM) {
     e_sendData;
     pp_incrementNumberOfMessagesByOne;
     l_popForwardQueue;
@@ -1171,14 +1216,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
 
   transition(OM, {All_acks, All_acks_no_sharers}, MM) {
     sxt_trig_ext_store_hit;
-    g_sendUnblock;
+    gm_sendUnblockM;
     s_deallocateTBE;
     j_popTriggerQueue;
   }
 
   // Transitions from IS
 
-  transition(IS, {Other_GETX, Other_GETS, Other_GETS_No_Mig}) {
+  transition(IS, {Other_GETX, Other_GETS, Other_GETS_No_Mig, Invalidate}) {
     f_sendAck;
     l_popForwardQueue;
   }
@@ -1237,14 +1282,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   transition(SS, All_acks, S) {
-    g_sendUnblock;
+    gs_sendUnblockS;
     s_deallocateTBE;
     j_popTriggerQueue;
   }
 
   transition(SS, All_acks_no_sharers, S) {
     // Note: The directory might still be the owner, so that is why we go to S
-    g_sendUnblock;
+    gs_sendUnblockS;
     s_deallocateTBE;
     j_popTriggerQueue;
   }
@@ -1263,7 +1308,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   transition(MM_W, All_acks_no_sharers, MM) {
-    g_sendUnblock;
+    gm_sendUnblockM;
     s_deallocateTBE;
     j_popTriggerQueue;
   }
@@ -1282,14 +1327,14 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   transition(M_W, All_acks_no_sharers, M) {
-    g_sendUnblock;
+    gm_sendUnblockM;
     s_deallocateTBE;
     j_popTriggerQueue;
   }
 
   // Transitions from OI/MI
 
-  transition({OI, MI}, Other_GETX, II) {
+  transition({OI, MI}, {Other_GETX, Invalidate}, II) {
     q_sendDataFromTBEToCache;
     l_popForwardQueue;
   }
@@ -1312,7 +1357,7 @@ machine(L1Cache, "AMD Hammer-like protocol")
   }
 
   // Transitions from II
-  transition(II, {Other_GETS, Other_GETS_No_Mig, Other_GETX}, II) {
+  transition(II, {Other_GETS, Other_GETS_No_Mig, Other_GETX, Invalidate}, II) {
     f_sendAck;
     l_popForwardQueue;
   }
diff --git a/src/mem/protocol/MOESI_hammer-dir.sm b/src/mem/protocol/MOESI_hammer-dir.sm
index d967c813c..ae282ba3a 100644
--- a/src/mem/protocol/MOESI_hammer-dir.sm
+++ b/src/mem/protocol/MOESI_hammer-dir.sm
@@ -35,8 +35,10 @@
 
 machine(Directory, "AMD Hammer-like protocol") 
 : DirectoryMemory * directory,
+  CacheMemory * probeFilter,
   MemoryControl * memBuffer,
-  int memory_controller_latency = 2
+  int memory_controller_latency = 2,
+  bool probe_filter_enabled = false
 {
 
   MessageBuffer forwardFromDir, network="To", virtual_network="3", ordered="false";
@@ -56,9 +58,16 @@ machine(Directory, "AMD Hammer-like protocol")
   // STATES
   enumeration(State, desc="Directory states", default="Directory_State_E") {
     // Base states
-    NO,             desc="Not Owner";
-    O,              desc="Owner";
-    E,              desc="Exclusive Owner (we can provide the data in exclusive)";
+    NX,             desc="Not Owner, probe filter entry exists, block in O at Owner";
+    NO,             desc="Not Owner, probe filter entry exists, block in E/M at Owner";
+    S,              desc="Data clean, probe filter entry exists pointing to the current owner";
+    O,              desc="Data clean, probe filter entry exists";
+    E,              desc="Exclusive Owner, no probe filter entry";
+
+    O_R,            desc="Was data Owner, replacing probe filter entry";
+    S_R,            desc="Was Not Owner or Sharer, replacing probe filter entry";
+    NO_R,           desc="Was Not Owner or Sharer, replacing probe filter entry";
+
     NO_B,  "NO^B",  desc="Not Owner, Blocked";
     O_B,   "O^B",   desc="Owner, Blocked";
     NO_B_W,         desc="Not Owner, Blocked, waiting for Dram";
@@ -83,11 +92,16 @@ machine(Directory, "AMD Hammer-like protocol")
     GETS,                      desc="A GETS arrives";
     PUT,                       desc="A PUT arrives"; 
     Unblock,                   desc="An unblock message arrives";
+    UnblockS,                   desc="An unblock message arrives, requestor now holds the block in S";
+    UnblockM,                   desc="An unblock message arrives, requestor now holds the block in M/O/E";
     Writeback_Clean,           desc="The final part of a PutX (no data)";
     Writeback_Dirty,           desc="The final part of a PutX (data)";
     Writeback_Exclusive_Clean, desc="The final part of a PutX (no data, exclusive)";
     Writeback_Exclusive_Dirty, desc="The final part of a PutX (data, exclusive)";
 
+    // Probe filter
+    Pf_Replacement,            desc="probe filter replacement";
+
     // DMA requests
     DMA_READ, desc="A DMA Read memory request";
     DMA_WRITE, desc="A DMA Write memory request";
@@ -100,10 +114,12 @@ machine(Directory, "AMD Hammer-like protocol")
     Ack,             desc="Received an ack message";
     Shared_Ack,      desc="Received an ack message, responder has a shared copy";
     Shared_Data,     desc="Received a data message, responder has a shared copy";
+    Data,            desc="Received a data message, responder had an owner or exclusive copy, they gave it to us";
     Exclusive_Data,  desc="Received a data message, responder had an exclusive copy, they gave it to us";
 
     // Triggers
-    All_acks_and_data,            desc="Received all required data and message acks";
+    All_acks_and_shared_data,     desc="Received shared data and message acks";
+    All_acks_and_owner_data,     desc="Received owner data and message acks";
     All_acks_and_data_no_sharers, desc="Received all acks and no other processor has a shared copy";
   }
 
@@ -115,18 +131,27 @@ machine(Directory, "AMD Hammer-like protocol")
     DataBlock DataBlk,             desc="data for the block";
   }
 
+  // ProbeFilterEntry
+  structure(PfEntry, desc="...", interface="AbstractCacheEntry") {
+    State PfState,                 desc="Directory state";
+    MachineID Owner,               desc="Owner node";
+    DataBlock DataBlk,             desc="data for the block";
+  }
+
   // TBE entries for DMA requests
   structure(TBE, desc="TBE entries for outstanding DMA requests") {
     Address PhysicalAddress, desc="physical address";
     State TBEState,        desc="Transient State";
     CoherenceResponseType ResponseType, desc="The type for the subsequent response message";
+    int Acks, default="0", desc="The number of acks that the waiting response represents";
     DataBlock DmaDataBlk,  desc="DMA Data to be written.  Partial blocks need to merged with system memory";
     DataBlock DataBlk,     desc="The current view of system memory";
     int Len,               desc="...";
     MachineID DmaRequestor, desc="DMA requestor";
     int NumPendingMsgs,    desc="Number of pending acks/messages";
-    bool CacheDirty,       desc="Indicates whether a cache has responded with dirty data";
-    bool Sharers,          desc="Indicates whether a cache has indicated it is currently a sharer";
+    bool CacheDirty, default="false", desc="Indicates whether a cache has responded with dirty data";
+    bool Sharers, default="false", desc="Indicates whether a cache has indicated it is currently a sharer";
+    bool Owned, default="false", desc="Indicates whether a cache has indicated it is currently the owner";
   }
 
   external_type(TBETable) {
@@ -144,10 +169,21 @@ machine(Directory, "AMD Hammer-like protocol")
     return static_cast(Entry, directory[addr]);
   }
 
+  PfEntry getPfEntry(Address addr), return_by_ref="yes" {
+    return static_cast(PfEntry, probeFilter[addr]);
+  }
+
   State getState(Address addr) {
     if (TBEs.isPresent(addr)) {
       return TBEs[addr].TBEState;
     } else {
+      if (probe_filter_enabled) {
+        if (probeFilter.isTagPresent(addr)) {
+          assert(getPfEntry(addr).PfState == getDirectoryEntry(addr).DirectoryState);
+        } else {
+          assert(getDirectoryEntry(addr).DirectoryState == State:E);
+        }
+      }
       return getDirectoryEntry(addr).DirectoryState;
     }
   }
@@ -156,9 +192,31 @@ machine(Directory, "AMD Hammer-like protocol")
     if (TBEs.isPresent(addr)) {
       TBEs[addr].TBEState := state;
     }
+    if (probe_filter_enabled) {
+      if (probeFilter.isTagPresent(addr)) {
+        getPfEntry(addr).PfState := state;
+      }
+      if (state == State:NX || state == State:NO || state == State:S || state == State:O) {
+        assert(probeFilter.isTagPresent(addr));
+      }
+    }
+    if (state == State:E || state == State:NX || state == State:NO || state == State:S || 
+        state == State:O) {
+      assert(TBEs.isPresent(addr) == false);
+    }
     getDirectoryEntry(addr).DirectoryState := state;
   }
   
+  Event cache_request_to_event(CoherenceRequestType type) {
+    if (type == CoherenceRequestType:GETS) {
+      return Event:GETS;
+    } else if (type == CoherenceRequestType:GETX) {
+      return Event:GETX;
+    } else {
+      error("Invalid CoherenceRequestType");
+    }
+  }
+
   MessageBuffer triggerQueue, ordered="true";
 
   // ** OUT_PORTS **
@@ -180,7 +238,9 @@ machine(Directory, "AMD Hammer-like protocol")
     if (triggerQueue_in.isReady()) {
       peek(triggerQueue_in, TriggerMsg) {
         if (in_msg.Type == TriggerType:ALL_ACKS) {
-          trigger(Event:All_acks_and_data, in_msg.Address);
+          trigger(Event:All_acks_and_owner_data, in_msg.Address);
+        } else if (in_msg.Type == TriggerType:ALL_ACKS_OWNER_EXISTS) {
+          trigger(Event:All_acks_and_shared_data, in_msg.Address);
         } else if (in_msg.Type == TriggerType:ALL_ACKS_NO_SHARERS) {
           trigger(Event:All_acks_and_data_no_sharers, in_msg.Address);
         } else {
@@ -195,6 +255,10 @@ machine(Directory, "AMD Hammer-like protocol")
       peek(unblockNetwork_in, ResponseMsg) {
         if (in_msg.Type == CoherenceResponseType:UNBLOCK) {
           trigger(Event:Unblock, in_msg.Address);
+        } else if (in_msg.Type == CoherenceResponseType:UNBLOCKS) {
+          trigger(Event:UnblockS, in_msg.Address);
+        } else if (in_msg.Type == CoherenceResponseType:UNBLOCKM) {
+          trigger(Event:UnblockM, in_msg.Address);
         } else if (in_msg.Type == CoherenceResponseType:WB_CLEAN) {
           trigger(Event:Writeback_Clean, in_msg.Address);
         } else if (in_msg.Type == CoherenceResponseType:WB_DIRTY) {
@@ -220,8 +284,9 @@ machine(Directory, "AMD Hammer-like protocol")
           trigger(Event:Shared_Ack, in_msg.Address);
         } else if (in_msg.Type == CoherenceResponseType:DATA_SHARED) {
           trigger(Event:Shared_Data, in_msg.Address);
-        } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE ||
-                   in_msg.Type == CoherenceResponseType:DATA) {
+        } else if (in_msg.Type == CoherenceResponseType:DATA) {
+          trigger(Event:Data, in_msg.Address);
+        } else if (in_msg.Type == CoherenceResponseType:DATA_EXCLUSIVE) {
           trigger(Event:Exclusive_Data, in_msg.Address);
         } else {
           error("Unexpected message");
@@ -247,14 +312,22 @@ machine(Directory, "AMD Hammer-like protocol")
   in_port(requestQueue_in, RequestMsg, requestToDir) {
     if (requestQueue_in.isReady()) {
       peek(requestQueue_in, RequestMsg) {
-        if (in_msg.Type == CoherenceRequestType:GETS) {
-          trigger(Event:GETS, in_msg.Address);
-        } else if (in_msg.Type == CoherenceRequestType:GETX) {
-          trigger(Event:GETX, in_msg.Address);
-        } else if (in_msg.Type == CoherenceRequestType:PUT) {
+        if (in_msg.Type == CoherenceRequestType:PUT) {
           trigger(Event:PUT, in_msg.Address);
         } else {
-          error("Invalid message");
+          if (probe_filter_enabled) {
+            if (probeFilter.isTagPresent(in_msg.Address)) {
+              trigger(cache_request_to_event(in_msg.Type), in_msg.Address);
+            } else {
+              if (probeFilter.cacheAvail(in_msg.Address)) {
+                trigger(cache_request_to_event(in_msg.Type), in_msg.Address);
+              } else {
+                trigger(Event:Pf_Replacement, probeFilter.cacheProbe(in_msg.Address));
+              }
+            }
+          } else {
+            trigger(cache_request_to_event(in_msg.Type), in_msg.Address);
+          }
         }
       }
     }
@@ -278,6 +351,31 @@ machine(Directory, "AMD Hammer-like protocol")
 
   // Actions
   
+  action(r_setMRU, "\rr", desc="manually set the MRU bit for pf entry" ) {
+    if (probe_filter_enabled) {
+      assert(probeFilter.isTagPresent(address)); 
+      probeFilter.setMRU(address);
+    }
+  }
+
+  action(auno_assertUnblockerNotOwner, "auno", desc="assert unblocker not owner") {
+    if (probe_filter_enabled) {
+      assert(probeFilter.isTagPresent(address));    
+      peek(unblockNetwork_in, ResponseMsg) {
+        assert(getPfEntry(address).Owner != in_msg.Sender);
+      }
+    }
+  }
+
+  action(uo_updateOwnerIfPf, "uo", desc="update owner") {
+    if (probe_filter_enabled) {
+      assert(probeFilter.isTagPresent(address));    
+      peek(unblockNetwork_in, ResponseMsg) {
+        getPfEntry(address).Owner := in_msg.Sender;
+      }
+    }
+  }
+
   action(a_sendWriteBackAck, "a", desc="Send writeback ack to requestor") {
     peek(requestQueue_in, RequestMsg) {
       enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
@@ -302,6 +400,27 @@ machine(Directory, "AMD Hammer-like protocol")
     }
   }
 
+  action(pfa_probeFilterAllocate, "pfa", desc="Allocate ProbeFilterEntry") {
+    if (probe_filter_enabled) {
+      peek(requestQueue_in, RequestMsg) {
+        probeFilter.allocate(address, new PfEntry);
+        getPfEntry(in_msg.Address).Owner := in_msg.Requestor;
+      }
+    }
+  }
+
+  action(pfd_probeFilterDeallocate, "pfd", desc="Deallocate ProbeFilterEntry") {
+    if (probe_filter_enabled) {
+      probeFilter.deallocate(address);
+    }
+  }
+
+  action(ppfd_possibleProbeFilterDeallocate, "ppfd", desc="Deallocate ProbeFilterEntry if one is present") {
+    if (probe_filter_enabled && probeFilter.isTagPresent(address)) {
+      probeFilter.deallocate(address);
+    }
+  }
+
   action(v_allocateTBE, "v", desc="Allocate TBE") {
     peek(requestQueue_in, RequestMsg) {
       TBEs.allocate(address);
@@ -330,10 +449,30 @@ machine(Directory, "AMD Hammer-like protocol")
     }
   }
 
+  action(pa_setPendingMsgsToAll, "pa", desc="set pending msgs to all") {
+    TBEs[address].NumPendingMsgs := machineCount(MachineType:L1Cache);
+  }
+
+  action(po_setPendingMsgsToOne, "po", desc="set pending msgs to one") {
+    TBEs[address].NumPendingMsgs := 1;
+  }
+
   action(w_deallocateTBE, "w", desc="Deallocate TBE") {
     TBEs.deallocate(address);
   }
 
+  action(sa_setAcksToOne, "sa", desc="Forwarded request, set the ack amount to one") {
+    TBEs[address].Acks := 1;
+  }   
+
+  action(saa_setAcksToAllIfPF, "saa", desc="Non-forwarded request, set the ack amount to all") {
+    if (probe_filter_enabled) {
+      TBEs[address].Acks := machineCount(MachineType:L1Cache);
+    } else {
+      TBEs[address].Acks := 1;
+    }
+  }   
+
   action(m_decrementNumberOfMessages, "m", desc="Decrement the number of messages for which we're waiting") {
     peek(responseToDir_in, ResponseMsg) {
       assert(in_msg.Acks > 0);
@@ -357,7 +496,11 @@ machine(Directory, "AMD Hammer-like protocol")
       enqueue(triggerQueue_out, TriggerMsg) {
         out_msg.Address := address;
         if (TBEs[address].Sharers) {
-          out_msg.Type := TriggerType:ALL_ACKS;
+          if (TBEs[address].Owned) {
+            out_msg.Type := TriggerType:ALL_ACKS_OWNER_EXISTS;
+          } else {
+            out_msg.Type := TriggerType:ALL_ACKS;
+          }
         } else {
           out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS;
         }
@@ -365,6 +508,22 @@ machine(Directory, "AMD Hammer-like protocol")
     }
   }
 
+  action(spa_setPendingAcksToZeroIfPF, "spa", desc="if probe filter, no need to wait for acks") {
+    if (probe_filter_enabled) {
+      TBEs[address].NumPendingMsgs := 0;
+    }
+  }
+
+  action(sc_signalCompletionIfPF, "sc", desc="indicate that we should skip waiting for cpu acks") {
+    if (TBEs[address].NumPendingMsgs == 0) {
+      assert(probe_filter_enabled);
+      enqueue(triggerQueue_out, TriggerMsg) {
+        out_msg.Address := address;
+        out_msg.Type := TriggerType:ALL_ACKS_NO_SHARERS;
+      }
+    }
+  }
+
   action(d_sendData, "d", desc="Send data to requestor") {
     peek(memQueue_in, MemoryMsg) {
       enqueue(responseNetwork_out, ResponseMsg, latency="1") {
@@ -373,8 +532,11 @@ machine(Directory, "AMD Hammer-like protocol")
         out_msg.Sender := machineID;
         out_msg.Destination.add(in_msg.OriginalRequestorMachId);
         out_msg.DataBlk := in_msg.DataBlk;
+        DEBUG_EXPR(out_msg.DataBlk);
         out_msg.Dirty := false; // By definition, the block is now clean
-        out_msg.Acks := 1;
+        out_msg.Acks := TBEs[address].Acks;
+        DEBUG_EXPR(out_msg.Acks);
+        assert(out_msg.Acks > 0);
         out_msg.MessageSize := MessageSizeType:Response_Data;
       }
     }
@@ -440,6 +602,11 @@ machine(Directory, "AMD Hammer-like protocol")
     TBEs[address].Sharers := true;
   }
 
+  action(so_setOwnerBit, "so", desc="We saw an owner, which also counts as a sharer") {
+    TBEs[address].Sharers := true;
+    TBEs[address].Owned := true;
+  }
+
   action(qf_queueMemoryFetchRequest, "qf", desc="Queue off-chip fetch request") {
     peek(requestQueue_in, RequestMsg) {
       enqueue(memQueue_out, MemoryMsg, latency="1") {
@@ -468,8 +635,8 @@ machine(Directory, "AMD Hammer-like protocol")
     }
   }
 
-  action(f_forwardRequest, "f", desc="Forward requests") {
-    if (machineCount(MachineType:L1Cache) > 1) {
+  action(fn_forwardRequestIfNecessary, "fn", desc="Forward requests if necessary") {
+    if ((machineCount(MachineType:L1Cache) > 1) && (TBEs[address].Acks <= 1)) {
       peek(requestQueue_in, RequestMsg) {
         enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
           out_msg.Address := address;
@@ -483,34 +650,105 @@ machine(Directory, "AMD Hammer-like protocol")
     }
   }
 
-  action(f_forwardWriteFromDma, "fw", desc="Forward requests") {
-    peek(dmaRequestQueue_in, DMARequestMsg) {
+  action(ia_invalidateAllRequest, "ia", desc="invalidate all copies") {
+    if (machineCount(MachineType:L1Cache) > 1) { // NOTE(review): nothing is sent with a single L1 - confirm the pending count agrees
       enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
         out_msg.Address := address;
-        out_msg.Type := CoherenceRequestType:GETX;
-        //
-        // Send to all L1 caches, since the requestor is the memory controller
-        // itself
-        //
+        out_msg.Type := CoherenceRequestType:INV;
         out_msg.Requestor := machineID;
-        out_msg.Destination.broadcast(MachineType:L1Cache); 
+        out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
         out_msg.MessageSize := MessageSizeType:Broadcast_Control;
       }
     }
   }
 
-  action(f_forwardReadFromDma, "fr", desc="Forward requests") {
-    peek(dmaRequestQueue_in, DMARequestMsg) {
+  action(io_invalidateOwnerRequest, "io", desc="invalidate the owner's copy") {
+    if (machineCount(MachineType:L1Cache) > 1) { // NOTE(review): nothing is sent with a single L1 - confirm the pending count agrees
       enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
         out_msg.Address := address;
-        out_msg.Type := CoherenceRequestType:GETS;
-        //
-        // Send to all L1 caches, since the requestor is the memory controller
-        // itself
-        //
+        out_msg.Type := CoherenceRequestType:INV;
         out_msg.Requestor := machineID;
-        out_msg.Destination.broadcast(MachineType:L1Cache); 
-        out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+        out_msg.Destination.add(getPfEntry(address).Owner); // only the owner recorded in the probe filter
+        out_msg.MessageSize := MessageSizeType:Request_Control;
+        out_msg.DirectedProbe := true; // flag the message as a directed probe rather than a broadcast
+      }
+    }
+  }
+
+  action(fb_forwardRequestBcast, "fb", desc="Forward requests to all nodes") {
+    if (machineCount(MachineType:L1Cache) > 1) { // with one L1 the destination set would be empty
+      peek(requestQueue_in, RequestMsg) {
+        enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
+          out_msg.Address := address;
+          out_msg.Type := in_msg.Type; // forward the request type unchanged
+          out_msg.Requestor := in_msg.Requestor; // preserve the original requestor
+          out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
+          out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
+          out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+        }
+      }
+    }
+  }
+
+  action(fc_forwardRequestConditionalOwner, "fc", desc="Forward request to one or more nodes") {
+    assert(machineCount(MachineType:L1Cache) > 1); // callers must guarantee more than one L1 cache
+    if (probe_filter_enabled) { // directed probe: only the owner recorded in the probe filter
+      peek(requestQueue_in, RequestMsg) {
+        enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
+          out_msg.Address := address;
+          out_msg.Type := in_msg.Type;
+          out_msg.Requestor := in_msg.Requestor;
+          out_msg.Destination.add(getPfEntry(address).Owner);
+          out_msg.MessageSize := MessageSizeType:Request_Control;
+          out_msg.DirectedProbe := true;
+        }
+      }      // end of directed-probe path
+    } else { // no probe filter: broadcast to every other L1 cache
+      peek(requestQueue_in, RequestMsg) {
+        enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
+          out_msg.Address := address;
+          out_msg.Type := in_msg.Type;
+          out_msg.Requestor := in_msg.Requestor;
+          out_msg.Destination.broadcast(MachineType:L1Cache); // Send to all L1 caches
+          out_msg.Destination.remove(in_msg.Requestor); // Don't include the original requestor
+          out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+        }
+      }
+    }
+  }
+
+  action(f_forwardWriteFromDma, "fw", desc="Forward requests") {
+    if (TBEs[address].NumPendingMsgs > 0) { // NOTE(review): presumably zero when the probe filter path expects no responses - confirm
+      peek(dmaRequestQueue_in, DMARequestMsg) {
+        enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
+          out_msg.Address := address;
+          out_msg.Type := CoherenceRequestType:GETX;
+          //
+          // Send to all L1 caches, since the requestor is the memory controller
+          // itself
+          //
+          out_msg.Requestor := machineID;
+          out_msg.Destination.broadcast(MachineType:L1Cache); // no requestor is removed from the broadcast
+          out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+        }
+      }
+    }
+  }
+
+  action(f_forwardReadFromDma, "fr", desc="Forward requests") {
+    if (TBEs[address].NumPendingMsgs > 0) { // NOTE(review): presumably zero when the probe filter path expects no responses - confirm
+      peek(dmaRequestQueue_in, DMARequestMsg) {
+        enqueue(forwardNetwork_out, RequestMsg, latency=memory_controller_latency) {
+          out_msg.Address := address;
+          out_msg.Type := CoherenceRequestType:GETS;
+          //
+          // Send to all L1 caches, since the requestor is the memory controller
+          // itself
+          //
+          out_msg.Requestor := machineID;
+          out_msg.Destination.broadcast(MachineType:L1Cache); // no requestor is removed from the broadcast
+          out_msg.MessageSize := MessageSizeType:Broadcast_Control;
+        }
       }
     }
   }
@@ -554,6 +792,14 @@ machine(Directory, "AMD Hammer-like protocol")
     }
   }
 
+  action(wr_writeResponseDataToMemory, "wr", desc="Write response data to memory") {
+    peek(responseToDir_in, ResponseMsg) {
+      getDirectoryEntry(address).DataBlk := in_msg.DataBlk; // replace the directory's block with the response data
+      DEBUG_EXPR(in_msg.Address);
+      DEBUG_EXPR(in_msg.DataBlk);
+    }
+  }
+
   action(l_writeDataToMemory, "l", desc="Write PUTX/PUTO data to memory") {
     peek(unblockNetwork_in, ResponseMsg) {
       assert(in_msg.Dirty);
@@ -565,14 +811,31 @@ machine(Directory, "AMD Hammer-like protocol")
   }
 
   action(dwt_writeDmaDataFromTBE, "dwt", desc="DMA Write data to memory from TBE") {
+    DEBUG_EXPR(getDirectoryEntry(address).DataBlk);
     getDirectoryEntry(address).DataBlk := TBEs[address].DataBlk;
+    DEBUG_EXPR(getDirectoryEntry(address).DataBlk);
     getDirectoryEntry(address).DataBlk.copyPartial(TBEs[address].DmaDataBlk, addressOffset(TBEs[address].PhysicalAddress), TBEs[address].Len);
+    DEBUG_EXPR(getDirectoryEntry(address).DataBlk);
+  }
+
+  action(wdt_writeDataFromTBE, "wdt", desc="Write data to memory from TBE") {
+    DEBUG_EXPR(getDirectoryEntry(address).DataBlk); // directory data before the update
+    getDirectoryEntry(address).DataBlk := TBEs[address].DataBlk; // whole-block copy; no partial DMA merge here
+    DEBUG_EXPR(getDirectoryEntry(address).DataBlk); // directory data after the update
   }
 
   action(a_assertCacheData, "ac", desc="Assert that a cache provided the data") {
     assert(TBEs[address].CacheDirty);
   }
 
+  action(ano_assertNotOwner, "ano", desc="Assert that request is not current owner") {
+    if (probe_filter_enabled) { // the check is skipped entirely without the probe filter
+      peek(requestQueue_in, RequestMsg) {
+        assert(getPfEntry(address).Owner != in_msg.Requestor); // requestor must differ from the recorded owner
+      }
+    }
+  }
+
   action(l_queueMemoryWBRequest, "lq", desc="Write PUTX data to memory") {
     peek(unblockNetwork_in, ResponseMsg) {
       enqueue(memQueue_out, MemoryMsg, latency="1") {
@@ -616,75 +879,152 @@ machine(Directory, "AMD Hammer-like protocol")
 
   // Transitions out of E state
   transition(E, GETX, NO_B_W) {
+    pfa_probeFilterAllocate;
     v_allocateTBE;
     rx_recordExclusiveInTBE;
+    saa_setAcksToAllIfPF;
     qf_queueMemoryFetchRequest;
-    f_forwardRequest;
+    fn_forwardRequestIfNecessary;
     i_popIncomingRequestQueue;
   }
 
   transition(E, GETS, NO_B_W) {
+    pfa_probeFilterAllocate;
     v_allocateTBE;
     rx_recordExclusiveInTBE;
+    saa_setAcksToAllIfPF;
     qf_queueMemoryFetchRequest;
-    f_forwardRequest;
+    fn_forwardRequestIfNecessary;
     i_popIncomingRequestQueue;
   }
 
   transition(E, DMA_READ, NO_DR_B_W) {
     vd_allocateDmaRequestInTBE;
     qd_queueMemoryRequestFromDmaRead;
+    spa_setPendingAcksToZeroIfPF;
     f_forwardReadFromDma;
     p_popDmaRequestQueue;
   }
 
+  transition(E, DMA_WRITE, NO_DW_B_W) {
+    vd_allocateDmaRequestInTBE;
+    spa_setPendingAcksToZeroIfPF;
+    sc_signalCompletionIfPF;
+    f_forwardWriteFromDma;
+    p_popDmaRequestQueue;
+  }
+
   // Transitions out of O state
   transition(O, GETX, NO_B_W) {
+    r_setMRU;
     v_allocateTBE;
     r_recordDataInTBE;
+    sa_setAcksToOne;
     qf_queueMemoryFetchRequest;
-    f_forwardRequest;
+    fb_forwardRequestBcast;
     i_popIncomingRequestQueue;
   }
 
+  // This transition is suboptimal: if a shared copy exists on-chip, that copy
+  // should provide the data rather than slow off-chip DRAM.  The problem is
+  // that the current caches don't provide data in S state.
   transition(O, GETS, O_B_W) {
+    r_setMRU;
     v_allocateTBE;
     r_recordDataInTBE;
+    saa_setAcksToAllIfPF;
     qf_queueMemoryFetchRequest;
-    f_forwardRequest;
+    fn_forwardRequestIfNecessary;
     i_popIncomingRequestQueue;
   }
 
   transition(O, DMA_READ, O_DR_B_W) {
     vd_allocateDmaRequestInTBE;
+    spa_setPendingAcksToZeroIfPF;
     qd_queueMemoryRequestFromDmaRead;
     f_forwardReadFromDma;
     p_popDmaRequestQueue;
   }
 
-  transition({E, O, NO}, DMA_WRITE, NO_DW_B_W) {
+  transition(O, Pf_Replacement, O_R) {
+    v_allocateTBE;
+    pa_setPendingMsgsToAll; // NOTE(review): presumably one expected response per L1 cache - confirm
+    ia_invalidateAllRequest; // broadcasts INV to all L1 caches
+    pfd_probeFilterDeallocate;
+  }
+
+  transition(S, Pf_Replacement, S_R) {
+    v_allocateTBE;
+    pa_setPendingMsgsToAll; // NOTE(review): presumably one expected response per L1 cache - confirm
+    ia_invalidateAllRequest; // broadcasts INV to all L1 caches
+    pfd_probeFilterDeallocate;
+  }
+
+  transition(NO, Pf_Replacement, NO_R) {
+    v_allocateTBE;
+    po_setPendingMsgsToOne; // NOTE(review): presumably a single response, from the owner - confirm
+    io_invalidateOwnerRequest; // sends INV only to the owner read from the probe filter entry
+    pfd_probeFilterDeallocate; // runs after io_, which reads the entry's Owner
+  }
+
+  transition(NX, Pf_Replacement, NO_R) { // NOTE(review): shares NO_R with the NO case - confirm intended
+    v_allocateTBE;
+    pa_setPendingMsgsToAll; // NOTE(review): presumably one expected response per L1 cache - confirm
+    ia_invalidateAllRequest; // unlike the NO case, INV goes to all L1 caches, not just the owner
+    pfd_probeFilterDeallocate;
+  }
+
+  transition({O, S, NO, NX}, DMA_WRITE, NO_DW_B_W) {
     vd_allocateDmaRequestInTBE;
     f_forwardWriteFromDma;
     p_popDmaRequestQueue;
   }
 
   // Transitions out of NO state
+  transition(NX, GETX, NO_B) {
+    r_setMRU;
+    fb_forwardRequestBcast;
+    i_popIncomingRequestQueue;
+  }
+
+  // Transitions out of NO, NX, and S states
   transition(NO, GETX, NO_B) {
-    f_forwardRequest;
+    r_setMRU;
+    ano_assertNotOwner;
+    fc_forwardRequestConditionalOwner;
     i_popIncomingRequestQueue;
   }
 
-  transition(NO, GETS, NO_B) {
-    f_forwardRequest;
+  transition(S, GETX, NO_B) {
+    r_setMRU;
+    fb_forwardRequestBcast;
     i_popIncomingRequestQueue;
   }
 
-  transition(NO, PUT, WB) {
+  transition(S, GETS, NO_B) {
+    r_setMRU;
+    ano_assertNotOwner;
+    fb_forwardRequestBcast;
+    i_popIncomingRequestQueue;
+  }
+
+  transition({NX, NO}, GETS, NO_B) {
+    r_setMRU;
+    ano_assertNotOwner;
+    fc_forwardRequestConditionalOwner;
+    i_popIncomingRequestQueue;
+  }
+
+  transition({NO, NX, S}, PUT, WB) {
+    //
+    // note that the PUT requestor may not be the current owner if an invalidate
+    // raced with PUT
+    //    
     a_sendWriteBackAck;
     i_popIncomingRequestQueue;
   }
 
-  transition(NO, DMA_READ, NO_DR_B_D) {
+  transition({NO, NX, S}, DMA_READ, NO_DR_B_D) {
     vd_allocateDmaRequestInTBE;
     f_forwardReadFromDma;
     p_popDmaRequestQueue;
@@ -699,23 +1039,28 @@ machine(Directory, "AMD Hammer-like protocol")
   // Blocked transient states
   transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, 
               NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, 
-              NO_W, O_W, WB, WB_E_W, WB_O_W}, 
-             {GETS, GETX, PUT}) {
+              NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R}, 
+             {GETS, GETX, PUT, Pf_Replacement}) {
     zz_recycleRequest;
   }
 
   transition({NO_B, O_B, NO_DR_B_W, NO_DW_B_W, NO_B_W, NO_DR_B_D, 
               NO_DR_B, O_DR_B, O_B_W, O_DR_B_W, NO_DW_W, 
-              NO_W, O_W, WB, WB_E_W, WB_O_W}, 
+              NO_W, O_W, WB, WB_E_W, WB_O_W, O_R, S_R, NO_R}, 
              {DMA_READ, DMA_WRITE}) {
     y_recycleDmaRequestQueue;
   }
 
-  transition(NO_B, Unblock, NO) {
+  transition(NO_B, UnblockS, NX) {
     j_popIncomingUnblockQueue;
   }
 
-  transition(O_B, Unblock, O) {
+  transition(NO_B, UnblockM, NO) {
+    uo_updateOwnerIfPf;
+    j_popIncomingUnblockQueue;
+  }
+
+  transition(O_B, UnblockS, O) {
     j_popIncomingUnblockQueue;
   }
 
@@ -744,7 +1089,32 @@ machine(Directory, "AMD Hammer-like protocol")
     n_popResponseQueue;
   }
 
-  transition(NO_DR_B_W, Ack) {
+  transition({O_R, S_R, NO_R}, Ack) {
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(S_R, Data) {
+    wr_writeResponseDataToMemory;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(NO_R, {Data, Exclusive_Data}) {
+    wr_writeResponseDataToMemory;
+    m_decrementNumberOfMessages;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition({O_R, S_R, NO_R}, All_acks_and_data_no_sharers, E) {
+    w_deallocateTBE;
+    g_popTriggerQueue;
+  }
+
+  transition({NO_DR_B_W, O_DR_B_W}, Ack) {
     m_decrementNumberOfMessages;
     n_popResponseQueue;
   }
@@ -755,6 +1125,19 @@ machine(Directory, "AMD Hammer-like protocol")
     n_popResponseQueue;
   }
 
+  transition(O_DR_B, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    so_setOwnerBit;
+    o_checkForCompletion;
+    n_popResponseQueue;
+  }
+
+  transition(O_DR_B_W, Shared_Ack) {
+    m_decrementNumberOfMessages;
+    so_setOwnerBit;
+    n_popResponseQueue;
+  }
+
   transition({NO_DR_B, NO_DR_B_D}, Shared_Ack) {
     m_decrementNumberOfMessages;
     r_setSharerBit;
@@ -765,7 +1148,7 @@ machine(Directory, "AMD Hammer-like protocol")
   transition(NO_DR_B_W, Shared_Data) {
     r_recordCacheData;
     m_decrementNumberOfMessages;
-    r_setSharerBit;
+    so_setOwnerBit;
     o_checkForCompletion;
     n_popResponseQueue;
   }
@@ -773,48 +1156,82 @@ machine(Directory, "AMD Hammer-like protocol")
   transition({NO_DR_B, NO_DR_B_D}, Shared_Data) {
     r_recordCacheData;
     m_decrementNumberOfMessages;
-    r_setSharerBit;
+    so_setOwnerBit;
     o_checkForCompletion;
     n_popResponseQueue;
   }
 
-  transition(NO_DR_B_W, Exclusive_Data) {
+  transition(NO_DR_B_W, {Exclusive_Data, Data}) {
     r_recordCacheData;
     m_decrementNumberOfMessages;
     n_popResponseQueue;
   }
 
-  transition({NO_DR_B, NO_DR_B_D, NO_DW_B_W}, Exclusive_Data) {
+  transition({NO_DR_B, NO_DR_B_D, NO_DW_B_W}, {Exclusive_Data, Data}) {
     r_recordCacheData;
     m_decrementNumberOfMessages;
     o_checkForCompletion;
     n_popResponseQueue;
   }
 
-  transition(NO_DR_B, All_acks_and_data, O) {
+  transition(NO_DR_B, All_acks_and_owner_data, O) {
+    //
+    // Note that the DMA consistency model allows us to send the DMA device
+    // a response as soon as we receive valid data and prior to receiving
+    // all acks.  However, to simplify the protocol we wait for all acks.
+    //
+    dt_sendDmaDataFromTbe;
+    wdt_writeDataFromTBE;
+    w_deallocateTBE;
+    g_popTriggerQueue;
+  }
+
+  transition(NO_DR_B, All_acks_and_shared_data, S) {
     //
     // Note that the DMA consistency model allows us to send the DMA device
     // a response as soon as we receive valid data and prior to receiving
     // all acks.  However, to simplify the protocol we wait for all acks.
     //
     dt_sendDmaDataFromTbe;
+    wdt_writeDataFromTBE;
     w_deallocateTBE;
     g_popTriggerQueue;
   }
 
-  transition(NO_DR_B_D, All_acks_and_data, O) {
+  transition(NO_DR_B_D, All_acks_and_owner_data, O) {
     //
     // Note that the DMA consistency model allows us to send the DMA device
     // a response as soon as we receive valid data and prior to receiving
     // all acks.  However, to simplify the protocol we wait for all acks.
     //
     dt_sendDmaDataFromTbe;
+    wdt_writeDataFromTBE;
+    w_deallocateTBE;
+    g_popTriggerQueue;
+  }
+
+  transition(NO_DR_B_D, All_acks_and_shared_data, S) {
+    //
+    // Note that the DMA consistency model allows us to send the DMA device
+    // a response as soon as we receive valid data and prior to receiving
+    // all acks.  However, to simplify the protocol we wait for all acks.
+    //
+    dt_sendDmaDataFromTbe;
+    wdt_writeDataFromTBE;
+    w_deallocateTBE;
+    g_popTriggerQueue;
+  }
+
+  transition(O_DR_B, All_acks_and_owner_data, O) {
+    wdt_writeDataFromTBE;
     w_deallocateTBE;
     g_popTriggerQueue;
   }
 
-  transition(O_DR_B, All_acks_and_data_no_sharers, O) {
+  transition(O_DR_B, All_acks_and_data_no_sharers, E) {
+    wdt_writeDataFromTBE;
     w_deallocateTBE;
+    pfd_probeFilterDeallocate;
     g_popTriggerQueue;
   }
 
@@ -825,7 +1242,9 @@ machine(Directory, "AMD Hammer-like protocol")
     // all acks.  However, to simplify the protocol we wait for all acks.
     //
     dt_sendDmaDataFromTbe;
+    wdt_writeDataFromTBE;
     w_deallocateTBE;
+    ppfd_possibleProbeFilterDeallocate;
     g_popTriggerQueue;
   }
 
@@ -837,7 +1256,9 @@ machine(Directory, "AMD Hammer-like protocol")
     // all acks.  However, to simplify the protocol we wait for all acks.
     //
     dt_sendDmaDataFromTbe;
+    wdt_writeDataFromTBE;
     w_deallocateTBE;
+    ppfd_possibleProbeFilterDeallocate;
     g_popTriggerQueue;
   }
 
@@ -850,6 +1271,7 @@ machine(Directory, "AMD Hammer-like protocol")
   transition(NO_DW_W, Memory_Ack, E) {
     da_sendDmaAck;
     w_deallocateTBE;
+    ppfd_possibleProbeFilterDeallocate;
     l_popMemQueue;
   }
 
@@ -859,11 +1281,11 @@ machine(Directory, "AMD Hammer-like protocol")
     l_popMemQueue;
   }
 
-  transition(NO_B_W, Unblock, NO_W) {
+  transition(NO_B_W, {UnblockM, UnblockS}, NO_W) {
     j_popIncomingUnblockQueue;
   }
 
-  transition(O_B_W, Unblock, O_W) {
+  transition(O_B_W, UnblockS, O_W) {
     j_popIncomingUnblockQueue;
   }
 
@@ -891,6 +1313,7 @@ machine(Directory, "AMD Hammer-like protocol")
   }
 
   transition(WB_E_W, Memory_Ack, E) {
+    pfd_probeFilterDeallocate;
     l_popMemQueue;
   }
 
@@ -905,10 +1328,12 @@ machine(Directory, "AMD Hammer-like protocol")
 
   transition(WB, Writeback_Exclusive_Clean, E) {
     ll_checkIncomingWriteback;
+    pfd_probeFilterDeallocate;
     j_popIncomingUnblockQueue;
   }
 
   transition(WB, Unblock, NO) {
+    auno_assertUnblockerNotOwner;
     j_popIncomingUnblockQueue;
   }
 }
diff --git a/src/mem/protocol/MOESI_hammer-msg.sm b/src/mem/protocol/MOESI_hammer-msg.sm
index 4856178a1..f414d599d 100644
--- a/src/mem/protocol/MOESI_hammer-msg.sm
+++ b/src/mem/protocol/MOESI_hammer-msg.sm
@@ -36,6 +36,7 @@ enumeration(CoherenceRequestType, desc="...") {
   PUT,       desc="Put Ownership";
   WB_ACK,    desc="Writeback ack";
   WB_NACK,   desc="Writeback neg. ack";
+  INV,       desc="Invalidate";
 }
 
 // CoherenceResponseType
@@ -49,7 +50,9 @@ enumeration(CoherenceResponseType, desc="...") {
   WB_DIRTY,           desc="Dirty writeback";
   WB_EXCLUSIVE_CLEAN, desc="Clean writeback of exclusive data";
   WB_EXCLUSIVE_DIRTY, desc="Dirty writeback of exclusive data";
-  UNBLOCK,            desc="Unblock";
+  UNBLOCK,            desc="Unblock for writeback";
+  UNBLOCKS,            desc="Unblock now in S";
+  UNBLOCKM,            desc="Unblock now in M/O/E";
   NULL,               desc="Null value";
 }
 
@@ -57,6 +60,7 @@ enumeration(CoherenceResponseType, desc="...") {
 enumeration(TriggerType, desc="...") {
   L2_to_L1,            desc="L2 to L1 transfer";
   ALL_ACKS,            desc="See corresponding event";
+  ALL_ACKS_OWNER_EXISTS,desc="See corresponding event";
   ALL_ACKS_NO_SHARERS, desc="See corresponding event";
 }
 
@@ -73,6 +77,7 @@ structure(RequestMsg, desc="...", interface="NetworkMessage") {
   MachineID Requestor,            desc="Node who initiated the request";
   NetDest Destination,             desc="Multicast destination mask";
   MessageSizeType MessageSize, desc="size category of the message";
+  bool DirectedProbe, default="false", desc="probe filter directed probe";
 }
 
 // ResponseMsg (and also unblock requests)
diff --git a/src/mem/ruby/system/Cache.py b/src/mem/ruby/system/Cache.py
index 06952afd1..ab3ec4b29 100644
--- a/src/mem/ruby/system/Cache.py
+++ b/src/mem/ruby/system/Cache.py
@@ -38,3 +38,4 @@ class RubyCache(SimObject):
     latency = Param.Int("");
     assoc = Param.Int("");
     replacement_policy = Param.String("PSEUDO_LRU", "");
+    start_index_bit = Param.Int(6, "index start, default 6 for 64-byte line");
diff --git a/src/mem/ruby/system/CacheMemory.cc b/src/mem/ruby/system/CacheMemory.cc
index 604113238..59f97e5fe 100644
--- a/src/mem/ruby/system/CacheMemory.cc
+++ b/src/mem/ruby/system/CacheMemory.cc
@@ -53,6 +53,7 @@ CacheMemory::CacheMemory(const Params *p)
     m_cache_assoc = p->assoc;
     m_policy = p->replacement_policy;
     m_profiler_ptr = new CacheProfiler(name());
+    m_start_index_bit = p->start_index_bit;
 }
 
 void
@@ -127,8 +128,8 @@ Index
 CacheMemory::addressToCacheSet(const Address& address) const
 {
     assert(address == line_address(address));
-    return address.bitSelect(RubySystem::getBlockSizeBits(),
-        RubySystem::getBlockSizeBits() + m_cache_num_set_bits - 1);
+    return address.bitSelect(m_start_index_bit,
+                             m_start_index_bit + m_cache_num_set_bits - 1);
 }
 
 // Given a cache index: returns the index of the tag in a set.
diff --git a/src/mem/ruby/system/CacheMemory.hh b/src/mem/ruby/system/CacheMemory.hh
index c1d49f784..3ef951821 100644
--- a/src/mem/ruby/system/CacheMemory.hh
+++ b/src/mem/ruby/system/CacheMemory.hh
@@ -169,6 +169,7 @@ class CacheMemory : public SimObject
     int m_cache_num_sets;
     int m_cache_num_set_bits;
     int m_cache_assoc;
+    int m_start_index_bit;
 };
 
 #endif // __MEM_RUBY_SYSTEM_CACHEMEMORY_HH__