5 files changed, 373 insertions, 139 deletions
diff --git a/src/cpu/inorder/InOrderCPU.py b/src/cpu/inorder/InOrderCPU.py
index 5d24ae4fd..4766a1ac1 100644
--- a/src/cpu/inorder/InOrderCPU.py
+++ b/src/cpu/inorder/InOrderCPU.py
@@ -48,6 +48,9 @@ class InOrderCPU(BaseCPU):
     dcache_port = Port("Data Port")
     _cached_ports = ['icache_port', 'dcache_port']
 
+    fetchBuffSize = Param.Unsigned(4, "Fetch Buffer Size (Number of Cache Blocks Stored)")
+    memBlockSize = Param.Unsigned(64, "Memory Block Size")
+
     predType = Param.String("tournament", "Branch predictor type ('local', 'tournament')")
     localPredictorSize = Param.Unsigned(2048, "Size of local predictor")
     localCtrBits = Param.Unsigned(2, "Bits per counter")
@@ -69,8 +72,6 @@ class InOrderCPU(BaseCPU):
     functionTraceStart = Param.Tick(0, "Cycle to start function trace")
     stageTracing = Param.Bool(False, "Enable tracing of each stage in CPU")
 
-    memBlockSize = Param.Unsigned(64, "Memory Block Size")
-
     multLatency = Param.Unsigned(1, "Latency for Multiply Operations")
     multRepeatRate = Param.Unsigned(1, "Repeat Rate for Multiply Operations")
     div8Latency = Param.Unsigned(1, "Latency for 8-bit Divide Operations")
diff --git a/src/cpu/inorder/resources/cache_unit.cc b/src/cpu/inorder/resources/cache_unit.cc
index 03f1c1527..6c9da67f5 100644
--- a/src/cpu/inorder/resources/cache_unit.cc
+++ b/src/cpu/inorder/resources/cache_unit.cc
@@ -97,7 +97,7 @@ CacheUnit::CachePort::recvRetry()
 CacheUnit::CacheUnit(string res_name, int res_id, int res_width,
         int res_latency, InOrderCPU *_cpu, ThePipeline::Params *params)
     : Resource(res_name, res_id, res_width, res_latency, _cpu),
-      cachePortBlocked(false), predecoder(NULL)
+      cachePortBlocked(false)
 {
     cachePort = new CachePort(this);
 
@@ -137,6 +137,9 @@ CacheUnit::init()
     // Switch to Timing TLB translations.
     resourceEvent = new CacheUnitEvent[width];
 
+    cacheBlkSize = this->cachePort->peerBlockSize();
+    cacheBlkMask = cacheBlkSize  - 1;
+
     initSlots();
 }
 
@@ -375,28 +378,20 @@ CacheUnit::requestAgain(DynInstPtr inst, bool &service_request)
     }
 }
 
-Fault
-CacheUnit::doTLBAccess(DynInstPtr inst, CacheReqPtr cache_req, int acc_size,
-                       int flags, TheISA::TLB::Mode tlb_mode)
+void
+CacheUnit::setupMemRequest(DynInstPtr inst, CacheReqPtr cache_req,
+                           int acc_size, int flags)
 {
     ThreadID tid = inst->readTid();
     Addr aligned_addr = inst->getMemAddr();
-    unsigned stage_num = cache_req->getStageNum();
-    unsigned slot_idx = cache_req->getSlot();
 
-    if (tlb_mode == TheISA::TLB::Execute) {
-        inst->fetchMemReq =
-            new Request(inst->readTid(), aligned_addr, acc_size, flags,
-                        inst->instAddr(), cpu->readCpuId(), inst->readTid());
-        cache_req->memReq = inst->fetchMemReq;
-    } else {
-        if (!cache_req->is2ndSplit()) {            
+    if (!cache_req->is2ndSplit()) {
             inst->dataMemReq =
                 new Request(cpu->asid[tid], aligned_addr, acc_size, flags,
                             inst->instAddr(), cpu->readCpuId(),
-                            inst->readTid());
+                            tid);
             cache_req->memReq = inst->dataMemReq;
-        } else {
+    } else {
             assert(inst->splitInst);
             
             inst->splitMemReq = new Request(cpu->asid[tid], 
@@ -407,9 +402,19 @@ CacheUnit::doTLBAccess(DynInstPtr inst, CacheReqPtr cache_req, int acc_size,
                                             cpu->readCpuId(), 
                                             tid);
             cache_req->memReq = inst->splitMemReq;            
-        }
     }
-    
+}
+
+Fault
+CacheUnit::doTLBAccess(DynInstPtr inst, CacheReqPtr cache_req, int acc_size,
+                       int flags, TheISA::TLB::Mode tlb_mode)
+{
+    ThreadID tid = inst->readTid();
+    //Addr aligned_addr = inst->getMemAddr();
+    unsigned stage_num = cache_req->getStageNum();
+    unsigned slot_idx = cache_req->getSlot();
+
+    setupMemRequest(inst, cache_req, acc_size, flags);
 
     cache_req->fault =
         _tlb->translateAtomic(cache_req->memReq,
@@ -842,8 +847,8 @@ CacheUnit::doCacheAccess(DynInstPtr inst, uint64_t *write_res,
     }
 
     DPRINTF(InOrderCachePort,
-            "[tid:%i] [sn:%i] attempting to access cache\n",
-            tid, inst->seqNum);
+            "[tid:%i] [sn:%i] attempting to access cache for addr %08p\n",
+            tid, inst->seqNum, cache_req->dataPkt->getAddr());
 
     if (do_access) {
         if (!cachePort->sendTiming(cache_req->dataPkt)) {
@@ -1086,6 +1091,24 @@ CacheUnit::squashDueToMemStall(DynInstPtr inst, int stage_num,
     squash(inst, stage_num, squash_seq_num + 1, tid);    
 }
 
+void
+CacheUnit::squashCacheRequest(CacheReqPtr req_ptr)
+{
+    DynInstPtr inst =  req_ptr->getInst();
+
+    req_ptr->setSquashed();
+    inst->setSquashed();
+    if (inst->validMemAddr()) {
+        DPRINTF(AddrDep, "Squash of [tid:%i] [sn:%i], attempting to "
+                "remove addr. %08p dependencies.\n",
+                inst->readTid(),
+                inst->seqNum,
+                inst->getMemAddr());
+
+        removeAddrDependency(inst);
+    }
+}
+
 
 void
 CacheUnit::squash(DynInstPtr inst, int stage_num,
@@ -1115,14 +1138,12 @@ CacheUnit::squash(DynInstPtr inst, int stage_num,
                 map_it++;                
                 continue;                
             }
-            
-            req_ptr->setSquashed();
-
-            req_ptr->getInst()->setSquashed();
 
             CacheReqPtr cache_req = dynamic_cast<CacheReqPtr>(req_ptr);
             assert(cache_req);
 
+            squashCacheRequest(cache_req);
+
             int req_slot_num = req_ptr->getSlot();
 
             if (cache_req->tlbStall) {
@@ -1152,15 +1173,6 @@ CacheUnit::squash(DynInstPtr inst, int stage_num,
                         req_ptr->getInst()->splitInst);
             }
 
-            if (req_ptr->getInst()->validMemAddr()) {                    
-                DPRINTF(AddrDep, "Squash of [tid:%i] [sn:%i], attempting to "
-                        "remove addr. %08p dependencies.\n",
-                        req_ptr->getInst()->readTid(),
-                        req_ptr->getInst()->seqNum, 
-                        req_ptr->getInst()->getMemAddr());
-                
-                removeAddrDependency(req_ptr->getInst());
-            }            
         }
 
         map_it++;
diff --git a/src/cpu/inorder/resources/cache_unit.hh b/src/cpu/inorder/resources/cache_unit.hh
index ece06be83..0d911999d 100644
--- a/src/cpu/inorder/resources/cache_unit.hh
+++ b/src/cpu/inorder/resources/cache_unit.hh
@@ -139,10 +139,16 @@ class CacheUnit : public Resource
     void squashDueToMemStall(DynInstPtr inst, int stage_num,
                              InstSeqNum squash_seq_num, ThreadID tid);
 
+    virtual void squashCacheRequest(CacheReqPtr req_ptr);
+
     /** After memory request is completedd in the cache, then do final
         processing to complete the request in the CPU.
     */
-   virtual void processCacheCompletion(PacketPtr pkt);
+    virtual void processCacheCompletion(PacketPtr pkt);
+
+    /** Create request that will interface w/TLB and Memory objects */
+    virtual void setupMemRequest(DynInstPtr inst, CacheReqPtr cache_req,
+                                 int acc_size, int flags);
 
     void recvRetry();
 
@@ -167,7 +173,7 @@ class CacheUnit : public Resource
     uint64_t getMemData(Packet *packet);
 
     void setAddrDependency(DynInstPtr inst);
-    void removeAddrDependency(DynInstPtr inst);
+    virtual void removeAddrDependency(DynInstPtr inst);
     
   protected:
     /** Cache interface. */
@@ -190,8 +196,6 @@ class CacheUnit : public Resource
         return (addr & ~(cacheBlkMask));
     }
 
-    TheISA::Predecoder predecoder;
-
     bool tlbBlocked[ThePipeline::MaxThreads];
 
     TheISA::TLB* tlb();
@@ -225,7 +229,7 @@ class CacheRequest : public ResourceRequest
           pktCmd(pkt_cmd), memReq(NULL), reqData(NULL), dataPkt(NULL),
           retryPkt(NULL), memAccComplete(false), memAccPending(false),
           tlbStall(false), splitAccess(false), splitAccessNum(-1),
-          split2ndAccess(false), instIdx(idx)
+          split2ndAccess(false), instIdx(idx), fetchBufferFill(false)
     { }
 
 
@@ -270,7 +274,9 @@ class CacheRequest : public ResourceRequest
     int splitAccessNum;
     bool split2ndAccess;
     int instIdx;    
-    
+
+    /** Should we expect block from cache access or fetch buffer? */
+    bool fetchBufferFill;
 };
 
 class CacheReqPacket : public Packet
diff --git a/src/cpu/inorder/resources/fetch_unit.cc b/src/cpu/inorder/resources/fetch_unit.cc
index 877d23375..7bbeffadd 100644
--- a/src/cpu/inorder/resources/fetch_unit.cc
+++ b/src/cpu/inorder/resources/fetch_unit.cc
@@ -37,6 +37,7 @@
 #include "arch/utility.hh"
 #include "arch/predecoder.hh"
 #include "config/the_isa.hh"
+#include "cpu/inorder/resources/cache_unit.hh"
 #include "cpu/inorder/resources/fetch_unit.hh"
 #include "cpu/inorder/pipeline_traits.hh"
 #include "cpu/inorder/cpu.hh"
@@ -50,10 +51,42 @@ using namespace ThePipeline;
 FetchUnit::FetchUnit(string res_name, int res_id, int res_width,
                      int res_latency, InOrderCPU *_cpu,
                      ThePipeline::Params *params)
-    : CacheUnit(res_name, res_id, res_width, res_latency, _cpu,
-                params)
+    : CacheUnit(res_name, res_id, res_width, res_latency, _cpu, params),
+      instSize(sizeof(TheISA::MachInst)), fetchBuffSize(params->fetchBuffSize),
+      predecoder(NULL)
 { }
 
+void
+FetchUnit::createMachInst(std::list<FetchBlock*>::iterator fetch_it,
+                          DynInstPtr inst)
+{
+    ExtMachInst ext_inst;
+    Addr block_addr = cacheBlockAlign(inst->getMemAddr());
+    Addr fetch_addr = inst->getMemAddr();
+    unsigned fetch_offset = (fetch_addr - block_addr) / instSize;
+    ThreadID tid = inst->readTid();
+    TheISA::PCState instPC = inst->pcState();
+
+
+    DPRINTF(InOrderCachePort, "Creating instruction [sn:%i] w/fetch data @"
+            "addr:%08p block:%08p\n", inst->seqNum, fetch_addr, block_addr);
+
+    assert((*fetch_it)->valid);
+
+    TheISA::MachInst *fetchInsts =
+        reinterpret_cast<TheISA::MachInst *>((*fetch_it)->block);
+
+    MachInst mach_inst =
+        TheISA::gtoh(fetchInsts[fetch_offset]);
+
+    predecoder.setTC(cpu->thread[tid]->getTC());
+    predecoder.moreBytes(instPC, inst->instAddr(), mach_inst);
+    ext_inst = predecoder.getExtMachInst(instPC);
+
+    inst->pcState(instPC);
+    inst->setMachInst(ext_inst);
+}
+
 int
 FetchUnit::getSlot(DynInstPtr inst)
 {
@@ -119,15 +152,64 @@ FetchUnit::setupMemRequest(DynInstPtr inst, CacheReqPtr cache_req,
                            int acc_size, int flags)
 {
     ThreadID tid = inst->readTid();
-    Addr aligned_addr = inst->getMemAddr();
+    Addr aligned_addr = cacheBlockAlign(inst->getMemAddr());
 
     inst->fetchMemReq =
-            new Request(inst->readTid(), aligned_addr, acc_size, flags,
-                        inst->instAddr(), cpu->readCpuId(), inst->readTid());
+            new Request(tid, aligned_addr, acc_size, flags,
+                        inst->instAddr(), cpu->readCpuId(), tid);
 
     cache_req->memReq = inst->fetchMemReq;
 }
 
+std::list<FetchUnit::FetchBlock*>::iterator
+FetchUnit::findBlock(std::list<FetchBlock*> &fetch_blocks, int asid,
+                     Addr block_addr)
+{
+    std::list<FetchBlock*>::iterator fetch_it = fetch_blocks.begin();
+    std::list<FetchBlock*>::iterator end_it = fetch_blocks.end();
+
+    while (fetch_it != end_it) {
+        if ((*fetch_it)->asid == asid &&
+            (*fetch_it)->addr == block_addr) {
+            return fetch_it;
+        }
+
+        fetch_it++;
+    }
+
+    return fetch_it;
+}
+
+std::list<FetchUnit::FetchBlock*>::iterator
+FetchUnit::findReplacementBlock()
+{
+    std::list<FetchBlock*>::iterator fetch_it = fetchBuffer.begin();
+    std::list<FetchBlock*>::iterator end_it = fetchBuffer.end();
+
+    while (fetch_it != end_it) {
+        if ((*fetch_it)->cnt == 0) {
+            return fetch_it;
+        } else {
+            DPRINTF(InOrderCachePort, "Block %08p has %i insts pending.\n",
+                    (*fetch_it)->addr, (*fetch_it)->cnt);
+        }
+        fetch_it++;
+    }
+
+    return fetch_it;
+}
+
+void
+FetchUnit::markBlockUsed(std::list<FetchBlock*>::iterator block_it)
+{
+    // Move block from whatever location it is in fetch buffer
+    // to the back (represents most-recently-used location)
+    if (block_it != fetchBuffer.end()) {
+        FetchBlock *mru_blk = *block_it;
+        fetchBuffer.erase(block_it);
+        fetchBuffer.push_back(mru_blk);
+    }
+}
 
 void
 FetchUnit::execute(int slot_num)
@@ -142,54 +224,157 @@ FetchUnit::execute(int slot_num)
     }
 
     DynInstPtr inst = cache_req->inst;
-#if TRACING_ON
     ThreadID tid = inst->readTid();
-    int seq_num = inst->seqNum;
-    std::string acc_type = "write";
-#endif
-
+    Addr block_addr = cacheBlockAlign(inst->getMemAddr());
+    int asid = cpu->asid[tid];
     cache_req->fault = NoFault;
 
     switch (cache_req->cmd)
     {
       case InitiateFetch:
         {
+            // Check to see if we've already got this request buffered
+            // or pending to be buffered
+            bool do_fetch = true;
+            std::list<FetchBlock*>::iterator pending_it;
+            pending_it = findBlock(pendingFetch, asid, block_addr);
+            if (pending_it != pendingFetch.end()) {
+                (*pending_it)->cnt++;
+                do_fetch = false;
+
+                DPRINTF(InOrderCachePort, "%08p is a pending fetch block "
+                        "(pending:%i).\n", block_addr,
+                        (*pending_it)->cnt);
+            } else if (pendingFetch.size() < fetchBuffSize) {
+                std::list<FetchBlock*>::iterator buff_it;
+                buff_it = findBlock(fetchBuffer, asid, block_addr);
+                if (buff_it  != fetchBuffer.end()) {
+                    (*buff_it)->cnt++;
+                    do_fetch = false;
+
+                    DPRINTF(InOrderCachePort, "%08p is in fetch buffer"
+                            "(pending:%i).\n", block_addr, (*buff_it)->cnt);
+                }
+            }
+
+            if (!do_fetch) {
+                DPRINTF(InOrderCachePort, "Inst. [sn:%i] marked to be filled "
+                        "through fetch buffer.\n", inst->seqNum);
+                cache_req->fetchBufferFill = true;
+                cache_req->setCompleted(true);
+                return;
+            }
+
+            // Check to see if there is room in the fetchbuffer for this instruction.
+            // If not, block this request.
+            if (pendingFetch.size() >= fetchBuffSize) {
+                DPRINTF(InOrderCachePort, "No room available in fetch buffer.\n");
+                cache_req->setCompleted(false);
+                return;
+            }
+
             doTLBAccess(inst, cache_req, cacheBlkSize, 0, TheISA::TLB::Execute);
 
             if (cache_req->fault == NoFault) {
                 DPRINTF(InOrderCachePort,
-                    "[tid:%u]: Initiating fetch access to %s for addr. %08p\n",
-                    tid, name(), cache_req->inst->getMemAddr());
+                        "[tid:%u]: Initiating fetch access to %s for "
+                        "addr:%#x (block:%#x)\n", tid, name(),
+                        cache_req->inst->getMemAddr(), block_addr);
 
-                cache_req->reqData = new uint8_t[cacheBlksize];
+                cache_req->reqData = new uint8_t[cacheBlkSize];
 
                 inst->setCurResSlot(slot_num);
 
                 doCacheAccess(inst);
+
+                if (cache_req->isMemAccPending()) {
+                    pendingFetch.push_back(new FetchBlock(asid, block_addr));
+                }
             }
 
             break;
         }
 
       case CompleteFetch:
+        if (cache_req->fetchBufferFill) {
+            // Block request if it's depending on a previous fetch, but it hasnt made it yet
+            std::list<FetchBlock*>::iterator fetch_it = findBlock(fetchBuffer, asid, block_addr);
+            if (fetch_it == fetchBuffer.end()) {
+                DPRINTF(InOrderCachePort, "%#x not available yet\n",
+                        block_addr);
+                cache_req->setCompleted(false);
+                return;
+            }
+
+            // Make New Instruction
+            createMachInst(fetch_it, inst);
+            if (inst->traceData) {
+                inst->traceData->setStaticInst(inst->staticInst);
+                inst->traceData->setPC(inst->pcState());
+            }
+
+            // FetchBuffer Book-Keeping
+            (*fetch_it)->cnt--;
+            assert((*fetch_it)->cnt >= 0);
+            markBlockUsed(fetch_it);
+
+            cache_req->done();
+            return;
+        }
+
         if (cache_req->isMemAccComplete()) {
+            if (fetchBuffer.size() >= fetchBuffSize) {
+                // If there is no replacement block, then we'll just have
+                // to wait till that gets cleared before satisfying the fetch
+                // for this instruction
+                std::list<FetchBlock*>::iterator repl_it  =
+                    findReplacementBlock();
+                if (repl_it == fetchBuffer.end()) {
+                    DPRINTF(InOrderCachePort, "Unable to find replacement block"
+                            " and complete fetch.\n");
+                    cache_req->setCompleted(false);
+                    return;
+                }
+
+                fetchBuffer.erase(repl_it);
+            }
+
             DPRINTF(InOrderCachePort,
                     "[tid:%i]: Completing Fetch Access for [sn:%i]\n",
                     tid, inst->seqNum);
 
+            // Make New Instruction
+            std::list<FetchBlock*>::iterator fetch_it  =
+                findBlock(pendingFetch, asid, block_addr);
+
+            assert(fetch_it != pendingFetch.end());
+            assert((*fetch_it)->valid);
+
+            createMachInst(fetch_it, inst);
+            if (inst->traceData) {
+                inst->traceData->setStaticInst(inst->staticInst);
+                inst->traceData->setPC(inst->pcState());
+            }
+
+
+            // Update instructions waiting on new fetch block
+            FetchBlock *new_block = (*fetch_it);
+            new_block->cnt--;
+            assert(new_block->cnt >= 0);
+
+            // Finally, update FetchBuffer w/Pending Block into the
+            // MRU location
+            pendingFetch.erase(fetch_it);
+            fetchBuffer.push_back(new_block);
 
             DPRINTF(InOrderCachePort, "[tid:%i]: Instruction [sn:%i] is: %s\n",
-                    tid, seq_num,
+                    tid, inst->seqNum,
                     inst->staticInst->disassemble(inst->instAddr()));
 
-            removeAddrDependency(inst);
+            inst->unsetMemAddr();
 
             delete cache_req->dataPkt;
 
-            // Do not stall and switch threads for fetch... for now..
-            // TODO: We need to detect cache misses for latencies > 1
-            // cache_req->setMemStall(false);
-
             cache_req->done();
         } else {
             DPRINTF(InOrderCachePort,
@@ -199,7 +384,9 @@ FetchUnit::execute(int slot_num)
                     "STALL: [tid:%i]: Fetch miss from %08p\n",
                     tid, cache_req->inst->instAddr());
             cache_req->setCompleted(false);
-            //cache_req->setMemStall(true);
+            // NOTE: For SwitchOnCacheMiss ThreadModel, we *don't* switch on
+            //       fetch miss, but we could ...
+            // cache_req->setMemStall(true);
         }
         break;
 
@@ -213,7 +400,6 @@ FetchUnit::processCacheCompletion(PacketPtr pkt)
 {
     // Cast to correct packet type
     CacheReqPacket* cache_pkt = dynamic_cast<CacheReqPacket*>(pkt);
-
     assert(cache_pkt);
 
     if (cache_pkt->cacheReq->isSquashed()) {
@@ -230,104 +416,108 @@ FetchUnit::processCacheCompletion(PacketPtr pkt)
         delete cache_pkt;
 
         cpu->wakeCPU();
-
         return;
     }
 
+    Addr block_addr = cacheBlockAlign(cache_pkt->cacheReq->
+                                      getInst()->getMemAddr());
+
     DPRINTF(InOrderCachePort,
-            "[tid:%u]: [sn:%i]: Waking from cache access to addr. %08p\n",
+            "[tid:%u]: [sn:%i]: Waking from fetch access to addr:%#x(phys:%#x), size:%i\n",
             cache_pkt->cacheReq->getInst()->readTid(),
             cache_pkt->cacheReq->getInst()->seqNum,
-            cache_pkt->cacheReq->getInst()->getMemAddr());
+            block_addr, cache_pkt->getAddr(), cache_pkt->getSize());
 
     // Cast to correct request type
     CacheRequest *cache_req = dynamic_cast<CacheReqPtr>(
         findRequest(cache_pkt->cacheReq->getInst(), cache_pkt->instIdx));
 
     if (!cache_req) {
-        panic("[tid:%u]: [sn:%i]: Can't find slot for cache access to "
+        panic("[tid:%u]: [sn:%i]: Can't find slot for fetch access to "
               "addr. %08p\n", cache_pkt->cacheReq->getInst()->readTid(),
               cache_pkt->cacheReq->getInst()->seqNum,
-              cache_pkt->cacheReq->getInst()->getMemAddr());
+              block_addr);
     }
 
-    assert(cache_req);
-
-
     // Get resource request info
     unsigned stage_num = cache_req->getStageNum();
     DynInstPtr inst = cache_req->inst;
     ThreadID tid = cache_req->inst->readTid();
+    short asid = cpu->asid[tid];
 
-    if (!cache_req->isSquashed()) {
-        assert(inst->resSched.top()->cmd == CompleteFetch);
+    assert(!cache_req->isSquashed());
+    assert(inst->resSched.top()->cmd == CompleteFetch);
 
-        DPRINTF(InOrderCachePort,
-                "[tid:%u]: [sn:%i]: Processing fetch access\n",
-                tid, inst->seqNum);
-
-        // NOTE: This is only allowing a thread to fetch one line
-        //       at a time. Re-examine when/if prefetching
-        //       gets implemented.
-        // memcpy(fetchData[tid], cache_pkt->getPtr<uint8_t>(),
-        //        cache_pkt->getSize());
-
-        // Get the instruction from the array of the cache line.
-        // @todo: update this
-        ExtMachInst ext_inst;
-        StaticInstPtr staticInst = NULL;
-        TheISA::PCState instPC = inst->pcState();
-        MachInst mach_inst =
-            TheISA::gtoh(*reinterpret_cast<TheISA::MachInst *>
-                         (cache_pkt->getPtr<uint8_t>()));
-
-        predecoder.setTC(cpu->thread[tid]->getTC());
-        predecoder.moreBytes(instPC, inst->instAddr(), mach_inst);
-        ext_inst = predecoder.getExtMachInst(instPC);
-        inst->pcState(instPC);
-
-        inst->setMachInst(ext_inst);
-
-        // Set Up More TraceData info
-        if (inst->traceData) {
-            inst->traceData->setStaticInst(inst->staticInst);
-            inst->traceData->setPC(instPC);
-        }
+    DPRINTF(InOrderCachePort,
+            "[tid:%u]: [sn:%i]: Processing fetch access for block %#x\n",
+            tid, inst->seqNum, block_addr);
 
-        cache_req->setMemAccPending(false);
-        cache_req->setMemAccCompleted();
+    std::list<FetchBlock*>::iterator pend_it = findBlock(pendingFetch, asid,
+                                                         block_addr);
+    assert(pend_it != pendingFetch.end());
 
-        if (cache_req->isMemStall() &&
-            cpu->threadModel == InOrderCPU::SwitchOnCacheMiss) {
-            DPRINTF(InOrderCachePort, "[tid:%u] Waking up from Cache Miss.\n",
-                    tid);
+    // Copy Data to pendingFetch queue...
+    (*pend_it)->block = new uint8_t[cacheBlkSize];
+    memcpy((*pend_it)->block, cache_pkt->getPtr<uint8_t>(), cacheBlkSize);
+    (*pend_it)->valid = true;
 
-            cpu->activateContext(tid);
+    cache_req->setMemAccPending(false);
+    cache_req->setMemAccCompleted();
 
-            DPRINTF(ThreadModel, "Activating [tid:%i] after return from cache"
-                    "miss.\n", tid);
-        }
+    if (cache_req->isMemStall() &&
+        cpu->threadModel == InOrderCPU::SwitchOnCacheMiss) {
+        DPRINTF(InOrderCachePort, "[tid:%u] Waking up from Cache Miss.\n",
+                tid);
 
-        // Wake up the CPU (if it went to sleep and was waiting on this
-        // completion event).
-        cpu->wakeCPU();
+        cpu->activateContext(tid);
+
+        DPRINTF(ThreadModel, "Activating [tid:%i] after return from cache"
+                "miss.\n", tid);
+    }
 
-        DPRINTF(Activity, "[tid:%u] Activating %s due to cache completion\n",
+    // Wake up the CPU (if it went to sleep and was waiting on this
+    // completion event).
+    cpu->wakeCPU();
+
+    DPRINTF(Activity, "[tid:%u] Activating %s due to cache completion\n",
             tid, cpu->pipelineStage[stage_num]->name());
 
-        cpu->switchToActive(stage_num);
-    } else {
-        DPRINTF(InOrderCachePort,
-                "[tid:%u] Miss on block @ %08p completed, but squashed\n",
-                tid, cache_req->inst->instAddr());
-        cache_req->setMemAccCompleted();
-    }
+    cpu->switchToActive(stage_num);
 }
 
 void
-FetchUnit::squash(DynInstPtr inst, int stage_num,
-                  InstSeqNum squash_seq_num, ThreadID tid)
+FetchUnit::squashCacheRequest(CacheReqPtr req_ptr)
 {
-    CacheUnit::squash(inst, stage_num, squash_seq_num, tid);
+    DynInstPtr inst = req_ptr->getInst();
+    ThreadID tid = inst->readTid();
+    Addr block_addr = cacheBlockAlign(inst->getMemAddr());
+    int asid = cpu->asid[tid];
+
+    // Check Fetch Buffer (or pending fetch) for this block and
+    // update pending counts
+    std::list<FetchBlock*>::iterator buff_it = findBlock(fetchBuffer,
+                                                         asid,
+                                                         block_addr);
+    if (buff_it != fetchBuffer.end()) {
+        (*buff_it)->cnt--;
+        DPRINTF(InOrderCachePort, "[sn:%i] Removing Pending Fetch "
+                "for Buffer block %08p (cnt=%i)\n", inst->seqNum,
+                block_addr, (*buff_it)->cnt);
+    } else {
+        std::list<FetchBlock*>::iterator block_it = findBlock(pendingFetch,
+                                                              asid,
+                                                              block_addr);
+        if (block_it != pendingFetch.end()) {
+            (*block_it)->cnt--;
+            if ((*block_it)->cnt == 0) {
+                DPRINTF(InOrderCachePort, "[sn:%i] Removing Pending Fetch "
+                        "for block %08p (cnt=%i)\n", inst->seqNum,
+                        block_addr, (*block_it)->cnt);
+                pendingFetch.erase(block_it);
+            }
+        }
+    }
+
+    CacheUnit::squashCacheRequest(req_ptr);
 }
 
diff --git a/src/cpu/inorder/resources/fetch_unit.hh b/src/cpu/inorder/resources/fetch_unit.hh
index 257a8c50a..035f3f4a1 100644
--- a/src/cpu/inorder/resources/fetch_unit.hh
+++ b/src/cpu/inorder/resources/fetch_unit.hh
@@ -52,19 +52,31 @@
 class FetchUnit : public CacheUnit
 {
   public:
-    typedef ThePipeline::DynInstPtr DynInstPtr;
-
-  public:
     FetchUnit(std::string res_name, int res_id, int res_width,
               int res_latency, InOrderCPU *_cpu, ThePipeline::Params *params);
 
-    /** Actions that this resources can take on an instruction */
+    typedef ThePipeline::DynInstPtr DynInstPtr;
+    typedef TheISA::ExtMachInst ExtMachInst;
+
+    struct FetchBlock {
+        int asid;
+        Addr addr;
+        uint8_t *block;
+        short cnt;
+        bool valid;
+
+        FetchBlock(int _asid, Addr _addr)
+            : asid(_asid), addr(_addr), block(NULL), cnt(1), valid(false)
+        { }
+    };
+
+    /** Actions that this resource can take on an instruction */
     enum Command {
         InitiateFetch,
         CompleteFetch
     };
 
-  public:
+
     ResourceRequest* getRequest(DynInstPtr _inst, int stage_num,
                                 int res_idx, int slot_num,
                                 unsigned cmd);
@@ -74,8 +86,11 @@ class FetchUnit : public CacheUnit
     /** Executes one of the commands from the "Command" enum */
     void execute(int slot_num);
 
-    void squash(DynInstPtr inst, int stage_num,
-                InstSeqNum squash_seq_num, ThreadID tid);
+  private:
+    void squashCacheRequest(CacheReqPtr req_ptr);
+
+    void createMachInst(std::list<FetchBlock*>::iterator fetch_it,
+                        DynInstPtr inst);
 
     /** After memory request is completed, then turn the fetched data
         into an instruction.
@@ -94,14 +109,24 @@ class FetchUnit : public CacheUnit
 
     void removeAddrDependency(DynInstPtr inst);
 
-  public:
-    /** The mem line being fetched. */
-    uint8_t *fetchData[ThePipeline::MaxThreads];
+    std::list<FetchBlock*>::iterator findReplacementBlock();
+    std::list<FetchBlock*>::iterator findBlock(std::list<FetchBlock*>
+                                               &fetch_blocks, int asid,
+                                               Addr block_addr);
+
+    void markBlockUsed(std::list<FetchBlock*>::iterator block_it);
+
+    int instSize;
+
+    int fetchBuffSize;
+
+    TheISA::Predecoder predecoder;
 
+    /** Valid Cache Blocks*/
+    std::list<FetchBlock*> fetchBuffer;
 
-    /** The Addr of the cacheline that has been loaded. */
-    //Addr cacheBlockAddr[ThePipeline::MaxThreads];
-    //unsigned fetchOffset[ThePipeline::MaxThreads];
+    /** Cache lines that are pending */
+    std::list<FetchBlock*> pendingFetch;
 };
 
 #endif //__CPU_FETCH_UNIT_HH__