From ee9a331fe59356a11e6f610967cd5aa08cef3db9 Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Tue, 18 Jan 2011 16:30:02 -0600 Subject: O3: Support timing translations for O3 CPU fetch. --- src/cpu/o3/fetch.hh | 38 ++++- src/cpu/o3/fetch_impl.hh | 371 ++++++++++++++++++++++------------------------- 2 files changed, 212 insertions(+), 197 deletions(-) (limited to 'src/cpu/o3') diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index b86ccf81e..92691720b 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -1,4 +1,16 @@ /* + * Copyright (c) 2010 ARM Limited + * All rights reserved + * + * The license below extends only to copyright in the software and shall + * not be construed as granting a license to any other intellectual + * property including but not limited to intellectual property relating + * to a hardware implementation of the functionality of the software + * licensed hereunder. You may use the software subject to the license + * terms below provided that you ensure that this notice is replicated + * unmodified and in its entirety in all distributions of the software, + * modified or unmodified, in source code or in binary form. + * * Copyright (c) 2004-2006 The Regents of The University of Michigan * All rights reserved. * @@ -38,6 +50,7 @@ #include "cpu/timebuf.hh" #include "config/the_isa.hh" #include "cpu/pc_event.hh" +#include "cpu/translation.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "sim/eventq.hh" @@ -113,6 +126,25 @@ class DefaultFetch virtual void recvRetry(); }; + class FetchTranslation : public BaseTLB::Translation + { + protected: + DefaultFetch *fetch; + + public: + FetchTranslation(DefaultFetch *_fetch) + : fetch(_fetch) + {} + + void + finish(Fault fault, RequestPtr req, ThreadContext *tc, + BaseTLB::Mode mode) + { + assert(mode == BaseTLB::Execute); + fetch->finishTranslation(fault, req); + delete this; + } + }; public: /** Overall fetch status. 
Used to determine if the CPU can @@ -133,6 +165,7 @@ class DefaultFetch TrapPending, QuiescePending, SwitchOut, + ItlbWait, IcacheWaitResponse, IcacheWaitRetry, IcacheAccessComplete @@ -242,7 +275,8 @@ class DefaultFetch * @param pc The actual PC of the current instruction. * @return Any fault that occured. */ - bool fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, Addr pc); + bool fetchCacheLine(Addr vaddr, ThreadID tid, Addr pc); + void finishTranslation(Fault fault, RequestPtr mem_req); /** Check if an interrupt is pending and that we need to handle @@ -468,6 +502,8 @@ class DefaultFetch Stats::Scalar fetchCycles; /** Stat for total number of cycles spent squashing. */ Stats::Scalar fetchSquashCycles; + /** Stat for total number of cycles spent waiting for translation */ + Stats::Scalar fetchTlbCycles; /** Stat for total number of cycles spent blocked due to other stages in * the pipeline. */ diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 1875d9c50..880158dfc 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -237,6 +237,11 @@ DefaultFetch::regStats() .desc("Number of cycles fetch has spent squashing") .prereq(fetchSquashCycles); + fetchTlbCycles + .name(name() + ".TlbCycles") + .desc("Number of cycles fetch has spent waiting for tlb") + .prereq(fetchTlbCycles); + fetchIdleCycles .name(name() + ".IdleCycles") .desc("Number of cycles fetch was idle") @@ -548,11 +553,11 @@ DefaultFetch::lookupAndUpdateNextPC( template bool -DefaultFetch::fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, - Addr pc) +DefaultFetch::fetchCacheLine(Addr vaddr, ThreadID tid, Addr pc) { Fault fault = NoFault; + // @todo: not sure if these should block translation. //AlphaDep if (cacheBlocked) { DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, cache blocked\n", @@ -575,11 +580,6 @@ DefaultFetch::fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, // Align the fetch address so it's at the start of a cache block. 
Addr block_PC = icacheBlockAlignPC(vaddr); - // If we've already got the block, no need to try to fetch it again. - if (cacheDataValid[tid] && block_PC == cacheDataPC[tid]) { - return true; - } - // Setup the memReq to do a read of the first instruction's address. // Set the appropriate read size and flags as well. // Build request here. @@ -589,27 +589,23 @@ DefaultFetch::fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, memReq[tid] = mem_req; - // Translate the instruction request. - fault = cpu->itb->translateAtomic(mem_req, cpu->thread[tid]->getTC(), - BaseTLB::Execute); + // Initiate translation of the icache block + fetchStatus[tid] = ItlbWait; + FetchTranslation *trans = new FetchTranslation(this); + cpu->itb->translateTiming(mem_req, cpu->thread[tid]->getTC(), + trans, BaseTLB::Execute); + return true; +} - // In the case of faults, the fetch stage may need to stall and wait - // for the ITB miss to be handled. +template +void +DefaultFetch::finishTranslation(Fault fault, RequestPtr mem_req) +{ + ThreadID tid = mem_req->threadId(); + Addr block_PC = mem_req->getVaddr(); - // If translation was successful, attempt to read the first - // instruction. + // If translation was successful, attempt to read the icache block. if (fault == NoFault) { -#if 0 - if (cpu->system->memctrl->badaddr(memReq[tid]->paddr) || - memReq[tid]->isUncacheable()) { - DPRINTF(Fetch, "Fetch: Bad address %#x (hopefully on a " - "misspeculating path)!", - memReq[tid]->paddr); - ret_fault = TheISA::genMachineCheckFault(); - return false; - } -#endif - // Build packet here. PacketPtr data_pkt = new Packet(mem_req, MemCmd::ReadReq, Packet::Broadcast); @@ -617,39 +613,54 @@ DefaultFetch::fetchCacheLine(Addr vaddr, Fault &ret_fault, ThreadID tid, cacheDataPC[tid] = block_PC; cacheDataValid[tid] = false; - DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); fetchedCacheLines++; - // Now do the timing access to see whether or not the instruction - // exists within the cache. 
+ // Access the cache. if (!icachePort->sendTiming(data_pkt)) { assert(retryPkt == NULL); assert(retryTid == InvalidThreadID); DPRINTF(Fetch, "[tid:%i] Out of MSHRs!\n", tid); + fetchStatus[tid] = IcacheWaitRetry; retryPkt = data_pkt; retryTid = tid; cacheBlocked = true; - return false; + } else { + DPRINTF(Fetch, "[tid:%i]: Doing Icache access.\n", tid); + DPRINTF(Activity, "[tid:%i]: Activity: Waiting on I-cache " + "response.\n", tid); + + lastIcacheStall[tid] = curTick(); + fetchStatus[tid] = IcacheWaitResponse; } + } else { + // Translation faulted, icache request won't be sent. + delete mem_req; + memReq[tid] = NULL; - DPRINTF(Fetch, "[tid:%i]: Doing cache access.\n", tid); + // Send the fault to commit. This thread will not do anything + // until commit handles the fault. The only other way it can + // wake up is if a squash comes along and changes the PC. + TheISA::PCState fetchPC = pc[tid]; - lastIcacheStall[tid] = curTick(); + // We will use a nop in order to carry the fault. + DynInstPtr instruction = buildInst(tid, + StaticInstPtr(TheISA::NoopMachInst, fetchPC.instAddr()), + NULL, fetchPC, fetchPC, false); - DPRINTF(Activity, "[tid:%i]: Activity: Waiting on I-cache " - "response.\n", tid); + instruction->setPredTarg(fetchPC); + instruction->fault = fault; + wroteToTimeBuffer = true; - fetchStatus[tid] = IcacheWaitResponse; - } else { - delete mem_req; - memReq[tid] = NULL; - } + fetchStatus[tid] = TrapPending; - ret_fault = fault; - return true; + DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n", tid); + DPRINTF(Fetch, "[tid:%i]: fault (%s) detected @ PC %s.\n", + tid, fault->name(), pc[tid]); + } + _status = updateFetchStatus(); } template @@ -1044,9 +1055,6 @@ DefaultFetch::fetch(bool &status_change) // The current PC. TheISA::PCState thisPC = pc[tid]; - // Fault code for memory access. 
- Fault fault = NoFault; - Addr pcOffset = fetchOffset[tid]; Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; @@ -1054,22 +1062,30 @@ DefaultFetch::fetch(bool &status_change) // to running, otherwise do the cache access. Possibly move this up // to tick() function. if (fetchStatus[tid] == IcacheAccessComplete) { - DPRINTF(Fetch, "[tid:%i]: Icache miss is complete.\n",tid); + DPRINTF(Fetch, "[tid:%i]: Icache miss is complete.\n", tid); fetchStatus[tid] = Running; status_change = true; } else if (fetchStatus[tid] == Running) { - DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " - "instruction, starting at PC %#x.\n", tid, fetchAddr); + // Align the fetch PC so it's at the start of a cache block. + Addr block_PC = icacheBlockAlignPC(fetchAddr); + + // Unless buffer already got the block, fetch it from icache. + if (!cacheDataValid[tid] || block_PC != cacheDataPC[tid]) { + DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " + "instruction, starting at PC %s.\n", tid, thisPC); + + fetchCacheLine(fetchAddr, tid, thisPC.instAddr()); - bool fetch_success = fetchCacheLine(fetchAddr, fault, tid, - thisPC.instAddr()); - if (!fetch_success) { - if (cacheBlocked) { + if (fetchStatus[tid] == IcacheWaitResponse) ++icacheStallCycles; - } else { + else if (fetchStatus[tid] == ItlbWait) + ++fetchTlbCycles; + else ++fetchMiscStallCycles; - } + return; + } else if (checkInterrupt(thisPC.instAddr()) || isSwitchedOut()) { + ++fetchMiscStallCycles; return; } } else { @@ -1084,145 +1100,140 @@ DefaultFetch::fetch(bool &status_change) DPRINTF(Fetch, "[tid:%i]: Fetch is squashing!\n", tid); } else if (fetchStatus[tid] == IcacheWaitResponse) { ++icacheStallCycles; - DPRINTF(Fetch, "[tid:%i]: Fetch is waiting cache response!\n", tid); + DPRINTF(Fetch, "[tid:%i]: Fetch is waiting cache response!\n", + tid); + } else if (fetchStatus[tid] == ItlbWait) { + DPRINTF(Fetch, "[tid:%i]: Fetch is waiting ITLB walk to " + "finish! 
\n", tid); + ++fetchTlbCycles; } - // Status is Idle, Squashing, Blocked, or IcacheWaitResponse, so - // fetch should do nothing. + // Status is Idle, Squashing, Blocked, ItlbWait or IcacheWaitResponse + // so fetch should do nothing. return; } ++fetchCycles; - // If we had a stall due to an icache miss, then return. - if (fetchStatus[tid] == IcacheWaitResponse) { - ++icacheStallCycles; - status_change = true; - return; - } - TheISA::PCState nextPC = thisPC; StaticInstPtr staticInst = NULL; StaticInstPtr curMacroop = macroop[tid]; - if (fault == NoFault) { + // If the read of the first instruction was successful, then grab the + // instructions from the rest of the cache line and put them into the + // queue heading to decode. - // If the read of the first instruction was successful, then grab the - // instructions from the rest of the cache line and put them into the - // queue heading to decode. - - DPRINTF(Fetch, - "[tid:%i]: Adding instructions to queue to decode.\n", tid); - - // Need to keep track of whether or not a predicted branch - // ended this fetch block. - bool predictedBranch = false; - - TheISA::MachInst *cacheInsts = - reinterpret_cast(cacheData[tid]); - - const unsigned numInsts = cacheBlkSize / instSize; - unsigned blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; - - // Loop through instruction memory from the cache. - while (blkOffset < numInsts && - numInst < fetchWidth && - !predictedBranch) { - - // If we need to process more memory, do it now. - if (!curMacroop && !predecoder.extMachInstReady()) { - if (ISA_HAS_DELAY_SLOT && pcOffset == 0) { - // Walk past any annulled delay slot instructions. 
- Addr pcAddr = thisPC.instAddr() & BaseCPU::PCMask; - while (fetchAddr != pcAddr && blkOffset < numInsts) { - blkOffset++; - fetchAddr += instSize; - } - if (blkOffset >= numInsts) - break; - } - MachInst inst = TheISA::gtoh(cacheInsts[blkOffset]); + DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to " + "decode.\n", tid); + + // Need to keep track of whether or not a predicted branch + // ended this fetch block. + bool predictedBranch = false; - predecoder.setTC(cpu->thread[tid]->getTC()); - predecoder.moreBytes(thisPC, fetchAddr, inst); + TheISA::MachInst *cacheInsts = + reinterpret_cast(cacheData[tid]); - if (predecoder.needMoreBytes()) { + const unsigned numInsts = cacheBlkSize / instSize; + unsigned blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; + + // Loop through instruction memory from the cache. + while (blkOffset < numInsts && + numInst < fetchWidth && + !predictedBranch) { + + // If we need to process more memory, do it now. + if (!curMacroop && !predecoder.extMachInstReady()) { + if (ISA_HAS_DELAY_SLOT && pcOffset == 0) { + // Walk past any annulled delay slot instructions. + Addr pcAddr = thisPC.instAddr() & BaseCPU::PCMask; + while (fetchAddr != pcAddr && blkOffset < numInsts) { blkOffset++; fetchAddr += instSize; - pcOffset += instSize; } + if (blkOffset >= numInsts) + break; } + MachInst inst = TheISA::gtoh(cacheInsts[blkOffset]); - // Extract as many instructions and/or microops as we can from - // the memory we've processed so far. - do { - if (!curMacroop) { - if (predecoder.extMachInstReady()) { - ExtMachInst extMachInst; - - extMachInst = predecoder.getExtMachInst(thisPC); - pcOffset = 0; - staticInst = StaticInstPtr(extMachInst, - thisPC.instAddr()); - - // Increment stat of fetched instructions. - ++fetchedInsts; - - if (staticInst->isMacroop()) - curMacroop = staticInst; - } else { - // We need more bytes for this instruction. 
- break; - } - } - if (curMacroop) { - staticInst = curMacroop->fetchMicroop(thisPC.microPC()); - if (staticInst->isLastMicroop()) - curMacroop = NULL; + predecoder.setTC(cpu->thread[tid]->getTC()); + predecoder.moreBytes(thisPC, fetchAddr, inst); + + if (predecoder.needMoreBytes()) { + blkOffset++; + fetchAddr += instSize; + pcOffset += instSize; + } + } + + // Extract as many instructions and/or microops as we can from + // the memory we've processed so far. + do { + if (!curMacroop) { + if (predecoder.extMachInstReady()) { + ExtMachInst extMachInst; + + extMachInst = predecoder.getExtMachInst(thisPC); + pcOffset = 0; + staticInst = StaticInstPtr(extMachInst, + thisPC.instAddr()); + + // Increment stat of fetched instructions. + ++fetchedInsts; + + if (staticInst->isMacroop()) + curMacroop = staticInst; + } else { + // We need more bytes for this instruction. + break; } + } + if (curMacroop) { + staticInst = curMacroop->fetchMicroop(thisPC.microPC()); + if (staticInst->isLastMicroop()) + curMacroop = NULL; + } - DynInstPtr instruction = - buildInst(tid, staticInst, curMacroop, - thisPC, nextPC, true); + DynInstPtr instruction = + buildInst(tid, staticInst, curMacroop, + thisPC, nextPC, true); - numInst++; + numInst++; - nextPC = thisPC; + nextPC = thisPC; - // If we're branching after this instruction, quite fetching - // from the same block then. - predictedBranch |= thisPC.branching(); - predictedBranch |= - lookupAndUpdateNextPC(instruction, nextPC); - if (predictedBranch) { - DPRINTF(Fetch, "Branch detected with PC = %s\n", thisPC); - } + // If we're branching after this instruction, quite fetching + // from the same block then. + predictedBranch |= thisPC.branching(); + predictedBranch |= + lookupAndUpdateNextPC(instruction, nextPC); + if (predictedBranch) { + DPRINTF(Fetch, "Branch detected with PC = %s\n", thisPC); + } - // Move to the next instruction, unless we have a branch. - thisPC = nextPC; + // Move to the next instruction, unless we have a branch. 
+ thisPC = nextPC; - if (instruction->isQuiesce()) { - DPRINTF(Fetch, - "Quiesce instruction encountered, halting fetch!"); - fetchStatus[tid] = QuiescePending; - status_change = true; - break; - } - } while ((curMacroop || predecoder.extMachInstReady()) && - numInst < fetchWidth); - } + if (instruction->isQuiesce()) { + DPRINTF(Fetch, + "Quiesce instruction encountered, halting fetch!"); + fetchStatus[tid] = QuiescePending; + status_change = true; + break; + } + } while ((curMacroop || predecoder.extMachInstReady()) && + numInst < fetchWidth); + } - if (predictedBranch) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " - "instruction encountered.\n", tid); - } else if (numInst >= fetchWidth) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " - "for this cycle.\n", tid); - } else if (blkOffset >= cacheBlkSize) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " - "block.\n", tid); - } + if (predictedBranch) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " + "instruction encountered.\n", tid); + } else if (numInst >= fetchWidth) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " + "for this cycle.\n", tid); + } else if (blkOffset >= cacheBlkSize) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " + "block.\n", tid); } macroop[tid] = curMacroop; @@ -1232,39 +1243,7 @@ DefaultFetch::fetch(bool &status_change) wroteToTimeBuffer = true; } - // Now that fetching is completed, update the PC to signify what the next - // cycle will be. - if (fault == NoFault) { - pc[tid] = nextPC; - DPRINTF(Fetch, "[tid:%i]: Setting PC to %s.\n", tid, nextPC); - } else { - // We shouldn't be in an icache miss and also have a fault (an ITB - // miss) - if (fetchStatus[tid] == IcacheWaitResponse) { - panic("Fetch should have exited prior to this!"); - } - - // Send the fault to commit. This thread will not do anything - // until commit handles the fault. 
The only other way it can - // wake up is if a squash comes along and changes the PC. Send the - // fault on a dummy nop. - staticInst = StaticInstPtr(TheISA::NoopMachInst, thisPC.instAddr()); - - DynInstPtr instruction = - buildInst(tid, staticInst, NULL, thisPC, nextPC, false); - - TheISA::advancePC(nextPC, staticInst); - instruction->setPredTarg(nextPC); - instruction->fault = fault; - - DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid); - - fetchStatus[tid] = TrapPending; - status_change = true; - - DPRINTF(Fetch, "[tid:%i]: fault (%s) detected @ PC %s, sending nop " - "[sn:%lli]\n", tid, fault->name(), thisPC, inst_seq); - } + pc[tid] = thisPC; } template -- cgit v1.2.3