/* * Copyright (c) 2014 The University of Wisconsin * * Copyright (c) 2006 INRIA (Institut National de Recherche en * Informatique et en Automatique / French National Research Institute * for Computer Science and Applied Mathematics) * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer; * redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution; * neither the name of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Authors: Vignyan Reddy, Dibakar Gope and Arthur Perais, * from André Seznec's code. */ /* @file * Implementation of a L-TAGE branch predictor */ #include "cpu/pred/ltage.hh" #include "base/intmath.hh" #include "base/logging.hh" #include "base/random.hh" #include "base/trace.hh" #include "debug/Fetch.hh" #include "debug/LTage.hh" LTAGE::LTAGE(const LTAGEParams *params) : BPredUnit(params), logSizeBiMP(params->logSizeBiMP), logSizeTagTables(params->logSizeTagTables), logSizeLoopPred(params->logSizeLoopPred), nHistoryTables(params->nHistoryTables), tagTableCounterBits(params->tagTableCounterBits), histBufferSize(params->histBufferSize), minHist(params->minHist), maxHist(params->maxHist), minTagWidth(params->minTagWidth), threadHistory(params->numThreads) { assert(params->histBufferSize > params->maxHist * 2); useAltPredForNewlyAllocated = 0; logTick = 19; tCounter = ULL(1) << (logTick - 1); for (auto& history : threadHistory) { history.pathHist = 0; history.globalHistory = new uint8_t[histBufferSize]; history.gHist = history.globalHistory; memset(history.gHist, 0, histBufferSize); history.ptGhist = 0; } histLengths = new int [nHistoryTables+1]; histLengths[1] = minHist; histLengths[nHistoryTables] = maxHist; for (int i = 2; i <= nHistoryTables; i++) { histLengths[i] = (int) (((double) minHist * pow ((double) (maxHist) / (double) minHist, (double) (i - 1) / (double) ((nHistoryTables- 1)))) + 0.5); } tagWidths[1] = minTagWidth; tagWidths[2] = minTagWidth; tagWidths[3] = minTagWidth + 1; tagWidths[4] = minTagWidth + 1; tagWidths[5] = minTagWidth + 2; tagWidths[6] = minTagWidth + 3; tagWidths[7] = minTagWidth + 4; tagWidths[8] = minTagWidth + 5; tagWidths[9] = minTagWidth + 5; tagWidths[10] = minTagWidth + 6; tagWidths[11] = minTagWidth + 7; tagWidths[12] = minTagWidth + 8; for (int i = 1; i <= 2; i++) tagTableSizes[i] = logSizeTagTables - 1; for (int i = 3; i <= 6; i++) tagTableSizes[i] = logSizeTagTables; for (int i = 7; i <= 10; i++) tagTableSizes[i] = logSizeTagTables - 1; for (int i = 11; i <= 12; i++) tagTableSizes[i] = logSizeTagTables - 2; for (auto& history : threadHistory) { history.computeIndices = new FoldedHistory[nHistoryTables+1]; history.computeTags[0] = new FoldedHistory[nHistoryTables+1]; history.computeTags[1] = new FoldedHistory[nHistoryTables+1]; for (int i = 1; i <= nHistoryTables; i++) { history.computeIndices[i].init(histLengths[i], (tagTableSizes[i])); history.computeTags[0][i].init( history.computeIndices[i].origLength, tagWidths[i]); history.computeTags[1][i].init( history.computeIndices[i].origLength, tagWidths[i] - 1); DPRINTF(LTage, "HistLength:%d, TTSize:%d, TTTWidth:%d\n", histLengths[i], tagTableSizes[i], tagWidths[i]); } } btable = new BimodalEntry[ULL(1) << logSizeBiMP]; ltable = new LoopEntry[ULL(1) << logSizeLoopPred]; gtable = new TageEntry*[nHistoryTables + 1]; for (int i = 1; i <= nHistoryTables; i++) { gtable[i] = new TageEntry[1<<(tagTableSizes[i])]; } tableIndices = new int [nHistoryTables+1]; tableTags = new int [nHistoryTables+1]; loopUseCounter = 0; } int LTAGE::bindex(Addr pc_in) const { return ((pc_in) & ((ULL(1) << (logSizeBiMP)) - 1)); } int LTAGE::lindex(Addr pc_in) const { return (((pc_in) & ((ULL(1) << (logSizeLoopPred - 2)) - 1)) << 2); } int LTAGE::F(int A, int size, int bank) const { int A1, A2; A = A & ((ULL(1) << size) - 1); A1 = (A & ((ULL(1) << tagTableSizes[bank]) - 1)); A2 = (A >> tagTableSizes[bank]); A2 = ((A2 << bank) & ((ULL(1) << tagTableSizes[bank]) - 1)) + (A2 >> (tagTableSizes[bank] - bank)); A = A1 ^ A2; A = ((A << bank) & ((ULL(1) << tagTableSizes[bank]) - 1)) + (A >> (tagTableSizes[bank] - bank)); return (A); } // gindex computes a full hash of pc, ghist and pathHist int LTAGE::gindex(ThreadID tid, Addr pc, int bank) const { int index; int hlen = (histLengths[bank] > 16) ? 16 : histLengths[bank]; index = (pc) ^ ((pc) >> ((int) abs(tagTableSizes[bank] - bank) + 1)) ^ threadHistory[tid].computeIndices[bank].comp ^ F(threadHistory[tid].pathHist, hlen, bank); return (index & ((ULL(1) << (tagTableSizes[bank])) - 1)); } // Tag computation uint16_t LTAGE::gtag(ThreadID tid, Addr pc, int bank) const { int tag = (pc) ^ threadHistory[tid].computeTags[0][bank].comp ^ (threadHistory[tid].computeTags[1][bank].comp << 1); return (tag & ((ULL(1) << tagWidths[bank]) - 1)); } // Up-down saturating counter void LTAGE::ctrUpdate(int8_t & ctr, bool taken, int nbits) { assert(nbits <= sizeof(int8_t) << 3); if (taken) { if (ctr < ((1 << (nbits - 1)) - 1)) ctr++; } else { if (ctr > -(1 << (nbits - 1))) ctr--; } } // Bimodal prediction bool LTAGE::getBimodePred(Addr pc, BranchInfo* bi) const { return (btable[bi->bimodalIndex].pred > 0); } // Update the bimodal predictor: a hysteresis bit is shared among 4 prediction // bits void LTAGE::baseUpdate(Addr pc, bool taken, BranchInfo* bi) { int inter = (btable[bi->bimodalIndex].pred << 1) + btable[bi->bimodalIndex ].hyst; if (taken) { if (inter < 3) inter++; } else if (inter > 0) { inter--; } btable[bi->bimodalIndex].pred = inter >> 1; btable[bi->bimodalIndex].hyst = (inter & 1); DPRINTF(LTage, "Updating branch %lx, pred:%d, hyst:%d\n", pc, btable[bi->bimodalIndex].pred,btable[bi->bimodalIndex].hyst); } //loop prediction: only used if high confidence bool LTAGE::getLoop(Addr pc, BranchInfo* bi) const { bi->loopHit = -1; bi->loopPredValid = false; bi->loopIndex = lindex(pc); bi->loopTag = ((pc) >> (logSizeLoopPred - 2)); for (int i = 0; i < 4; i++) { if (ltable[bi->loopIndex + i].tag == bi->loopTag) { bi->loopHit = i; bi->loopPredValid = (ltable[bi->loopIndex + i].confidence >= 3); bi->currentIter = ltable[bi->loopIndex + i].currentIterSpec; if (ltable[bi->loopIndex + i].currentIterSpec + 1 == ltable[bi->loopIndex + i].numIter) { return !(ltable[bi->loopIndex + i].dir); }else { return (ltable[bi->loopIndex + i].dir); } } } return false; } void LTAGE::specLoopUpdate(Addr pc, bool taken, BranchInfo* bi) { if (bi->loopHit>=0) { int index = lindex(pc); if (taken != ltable[index].dir) { ltable[index].currentIterSpec = 0; } else { ltable[index].currentIterSpec++; } } } void LTAGE::loopUpdate(Addr pc, bool taken, BranchInfo* bi) { int idx = bi->loopIndex + bi->loopHit; if (bi->loopHit >= 0) { //already a hit if (bi->loopPredValid) { if (taken != bi->loopPred) { // free the entry ltable[idx].numIter = 0; ltable[idx].age = 0; ltable[idx].confidence = 0; ltable[idx].currentIter = 0; return; } else if (bi->loopPred != bi->tagePred) { DPRINTF(LTage, "Loop Prediction success:%lx\n",pc); if (ltable[idx].age < 7) ltable[idx].age++; } } ltable[idx].currentIter++; if (ltable[idx].currentIter > ltable[idx].numIter) { ltable[idx].confidence = 0; if (ltable[idx].numIter != 0) { // free the entry ltable[idx].numIter = 0; ltable[idx].age = 0; ltable[idx].confidence = 0; } } if (taken != ltable[idx].dir) { if (ltable[idx].currentIter == ltable[idx].numIter) { DPRINTF(LTage, "Loop End predicted successfully:%lx\n", pc); if (ltable[idx].confidence < 7) { ltable[idx].confidence++; } //just do not predict when the loop count is 1 or 2 if (ltable[idx].numIter < 3) { // free the entry ltable[idx].dir = taken; ltable[idx].numIter = 0; ltable[idx].age = 0; ltable[idx].confidence = 0; } } else { DPRINTF(LTage, "Loop End predicted incorrectly:%lx\n", pc); if (ltable[idx].numIter == 0) { // first complete nest; ltable[idx].confidence = 0; ltable[idx].numIter = ltable[idx].currentIter; } else { //not the same number of iterations as last time: free the //entry ltable[idx].numIter = 0; ltable[idx].age = 0; ltable[idx].confidence = 0; } } ltable[idx].currentIter = 0; } } else if (taken) { //try to allocate an entry on taken branch int nrand = random_mt.random(); for (int i = 0; i < 4; i++) { int loop_hit = (nrand + i) & 3; idx = bi->loopIndex + loop_hit; if (ltable[idx].age == 0) { DPRINTF(LTage, "Allocating loop pred entry for branch %lx\n", pc); ltable[idx].dir = !taken; ltable[idx].tag = bi->loopTag; ltable[idx].numIter = 0; ltable[idx].age = 7; ltable[idx].confidence = 0; ltable[idx].currentIter = 1; break; } else ltable[idx].age--; } } } // shifting the global history: we manage the history in a big table in order // to reduce simulation time void LTAGE::updateGHist(uint8_t * &h, bool dir, uint8_t * tab, int &pt) { if (pt == 0) { DPRINTF(LTage, "Rolling over the histories\n"); // Copy beginning of globalHistoryBuffer to end, such that // the last maxHist outcomes are still reachable // through pt[0 .. maxHist - 1]. for (int i = 0; i < maxHist; i++) tab[histBufferSize - maxHist + i] = tab[i]; pt = histBufferSize - maxHist; h = &tab[pt]; } pt--; h--; h[0] = (dir) ? 1 : 0; } // Get GHR for hashing indirect predictor // Build history backwards from pointer in // bp_history. unsigned LTAGE::getGHR(ThreadID tid, void *bp_history) const { BranchInfo* bi = static_cast(bp_history); unsigned val = 0; for (unsigned i = 0; i < 32; i++) { // Make sure we don't go out of bounds int gh_offset = bi->ptGhist + i; assert(&(threadHistory[tid].globalHistory[gh_offset]) < threadHistory[tid].globalHistory + histBufferSize); val |= ((threadHistory[tid].globalHistory[gh_offset] & 0x1) << i); } return val; } //prediction bool LTAGE::predict(ThreadID tid, Addr branch_pc, bool cond_branch, void* &b) { BranchInfo *bi = new BranchInfo(nHistoryTables+1); b = (void*)(bi); Addr pc = branch_pc; bool pred_taken = true; bi->loopHit = -1; if (cond_branch) { // TAGE prediction // computes the table addresses and the partial tags for (int i = 1; i <= nHistoryTables; i++) { tableIndices[i] = gindex(tid, pc, i); bi->tableIndices[i] = tableIndices[i]; tableTags[i] = gtag(tid, pc, i); bi->tableTags[i] = tableTags[i]; } bi->bimodalIndex = bindex(pc); bi->hitBank = 0; bi->altBank = 0; //Look for the bank with longest matching history for (int i = nHistoryTables; i > 0; i--) { if (gtable[i][tableIndices[i]].tag == tableTags[i]) { bi->hitBank = i; bi->hitBankIndex = tableIndices[bi->hitBank]; break; } } //Look for the alternate bank for (int i = bi->hitBank - 1; i > 0; i--) { if (gtable[i][tableIndices[i]].tag == tableTags[i]) { bi->altBank = i; bi->altBankIndex = tableIndices[bi->altBank]; break; } } //computes the prediction and the alternate prediction if (bi->hitBank > 0) { if (bi->altBank > 0) { bi->altTaken = gtable[bi->altBank][tableIndices[bi->altBank]].ctr >= 0; }else { bi->altTaken = getBimodePred(pc, bi); } bi->longestMatchPred = gtable[bi->hitBank][tableIndices[bi->hitBank]].ctr >= 0; bi->pseudoNewAlloc = abs(2 * gtable[bi->hitBank][bi->hitBankIndex].ctr + 1) <= 1; //if the entry is recognized as a newly allocated entry and //useAltPredForNewlyAllocated is positive use the alternate //prediction if ((useAltPredForNewlyAllocated < 0) || abs(2 * gtable[bi->hitBank][tableIndices[bi->hitBank]].ctr + 1) > 1) bi->tagePred = bi->longestMatchPred; else bi->tagePred = bi->altTaken; } else { bi->altTaken = getBimodePred(pc, bi); bi->tagePred = bi->altTaken; bi->longestMatchPred = bi->altTaken; } //end TAGE prediction bi->loopPred = getLoop(pc, bi); // loop prediction pred_taken = (((loopUseCounter >= 0) && bi->loopPredValid)) ? (bi->loopPred): (bi->tagePred); DPRINTF(LTage, "Predict for %lx: taken?:%d, loopTaken?:%d, " "loopValid?:%d, loopUseCounter:%d, tagePred:%d, altPred:%d\n", branch_pc, pred_taken, bi->loopPred, bi->loopPredValid, loopUseCounter, bi->tagePred, bi->altTaken); } bi->branchPC = branch_pc; bi->condBranch = cond_branch; specLoopUpdate(branch_pc, pred_taken, bi); return pred_taken; } // PREDICTOR UPDATE void LTAGE::update(ThreadID tid, Addr branch_pc, bool taken, void* bp_history, bool squashed) { assert(bp_history); BranchInfo *bi = static_cast(bp_history); if (squashed) { // This restores the global history, then update it // and recomputes the folded histories. squash(tid, taken, bp_history); return; } int nrand = random_mt.random(0,3); Addr pc = branch_pc; if (bi->condBranch) { DPRINTF(LTage, "Updating tables for branch:%lx; taken?:%d\n", branch_pc, taken); // first update the loop predictor loopUpdate(pc, taken, bi); if (bi->loopPredValid) { if (bi->tagePred != bi->loopPred) { ctrUpdate(loopUseCounter, (bi->loopPred== taken), 7); } } // TAGE UPDATE // try to allocate a new entries only if prediction was wrong bool longest_match_pred = false; bool alloc = (bi->tagePred != taken) && (bi->hitBank < nHistoryTables); if (bi->hitBank > 0) { // Manage the selection between longest matching and alternate // matching for "pseudo"-newly allocated longest matching entry longest_match_pred = bi->longestMatchPred; bool PseudoNewAlloc = bi->pseudoNewAlloc; // an entry is considered as newly allocated if its prediction // counter is weak if (PseudoNewAlloc) { if (longest_match_pred == taken) { alloc = false; } // if it was delivering the correct prediction, no need to // allocate new entry even if the overall prediction was false if (longest_match_pred != bi->altTaken) { ctrUpdate(useAltPredForNewlyAllocated, bi->altTaken == taken, 4); } } } if (alloc) { // is there some "unuseful" entry to allocate int8_t min = 1; for (int i = nHistoryTables; i > bi->hitBank; i--) { if (gtable[i][bi->tableIndices[i]].u < min) { min = gtable[i][bi->tableIndices[i]].u; } } // we allocate an entry with a longer history // to avoid ping-pong, we do not choose systematically the next // entry, but among the 3 next entries int Y = nrand & ((ULL(1) << (nHistoryTables - bi->hitBank - 1)) - 1); int X = bi->hitBank + 1; if (Y & 1) { X++; if (Y & 2) X++; } // No entry available, forces one to be available if (min > 0) { gtable[X][bi->tableIndices[X]].u = 0; } //Allocate only one entry for (int i = X; i <= nHistoryTables; i++) { if ((gtable[i][bi->tableIndices[i]].u == 0)) { gtable[i][bi->tableIndices[i]].tag = bi->tableTags[i]; gtable[i][bi->tableIndices[i]].ctr = (taken) ? 0 : -1; gtable[i][bi->tableIndices[i]].u = 0; //? } } } //periodic reset of u: reset is not complete but bit by bit tCounter++; if ((tCounter & ((ULL(1) << logTick) - 1)) == 0) { // reset least significant bit // most significant bit becomes least significant bit for (int i = 1; i <= nHistoryTables; i++) { for (int j = 0; j < (ULL(1) << tagTableSizes[i]); j++) { gtable[i][j].u = gtable[i][j].u >> 1; } } } if (bi->hitBank > 0) { DPRINTF(LTage, "Updating tag table entry (%d,%d) for branch %lx\n", bi->hitBank, bi->hitBankIndex, branch_pc); ctrUpdate(gtable[bi->hitBank][bi->hitBankIndex].ctr, taken, tagTableCounterBits); // if the provider entry is not certified to be useful also update // the alternate prediction if (gtable[bi->hitBank][bi->hitBankIndex].u == 0) { if (bi->altBank > 0) { ctrUpdate(gtable[bi->altBank][bi->altBankIndex].ctr, taken, tagTableCounterBits); DPRINTF(LTage, "Updating tag table entry (%d,%d) for" " branch %lx\n", bi->hitBank, bi->hitBankIndex, branch_pc); } if (bi->altBank == 0) { baseUpdate(pc, taken, bi); } } // update the u counter if (longest_match_pred != bi->altTaken) { if (longest_match_pred == taken) { if (gtable[bi->hitBank][bi->hitBankIndex].u < 1) { gtable[bi->hitBank][bi->hitBankIndex].u++; } } } } else { baseUpdate(pc, taken, bi); } //END PREDICTOR UPDATE } if (!squashed) { delete bi; } } void LTAGE::updateHistories(ThreadID tid, Addr branch_pc, bool taken, void* b) { BranchInfo* bi = (BranchInfo*)(b); ThreadHistory& tHist = threadHistory[tid]; // UPDATE HISTORIES bool pathbit = ((branch_pc) & 1); //on a squash, return pointers to this and recompute indices. //update user history updateGHist(tHist.gHist, taken, tHist.globalHistory, tHist.ptGhist); tHist.pathHist = (tHist.pathHist << 1) + pathbit; tHist.pathHist = (tHist.pathHist & ((ULL(1) << 16) - 1)); bi->ptGhist = tHist.ptGhist; bi->pathHist = tHist.pathHist; //prepare next index and tag computations for user branchs for (int i = 1; i <= nHistoryTables; i++) { bi->ci[i] = tHist.computeIndices[i].comp; bi->ct0[i] = tHist.computeTags[0][i].comp; bi->ct1[i] = tHist.computeTags[1][i].comp; tHist.computeIndices[i].update(tHist.gHist); tHist.computeTags[0][i].update(tHist.gHist); tHist.computeTags[1][i].update(tHist.gHist); } DPRINTF(LTage, "Updating global histories with branch:%lx; taken?:%d, " "path Hist: %x; pointer:%d\n", branch_pc, taken, tHist.pathHist, tHist.ptGhist); } void LTAGE::squash(ThreadID tid, bool taken, void *bp_history) { BranchInfo* bi = (BranchInfo*)(bp_history); ThreadHistory& tHist = threadHistory[tid]; DPRINTF(LTage, "Restoring branch info: %lx; taken? %d; PathHistory:%x, " "pointer:%d\n", bi->branchPC,taken, bi->pathHist, bi->ptGhist); tHist.pathHist = bi->pathHist; tHist.ptGhist = bi->ptGhist; tHist.gHist = &(tHist.globalHistory[tHist.ptGhist]); tHist.gHist[0] = (taken ? 1 : 0); for (int i = 1; i <= nHistoryTables; i++) { tHist.computeIndices[i].comp = bi->ci[i]; tHist.computeTags[0][i].comp = bi->ct0[i]; tHist.computeTags[1][i].comp = bi->ct1[i]; tHist.computeIndices[i].update(tHist.gHist); tHist.computeTags[0][i].update(tHist.gHist); tHist.computeTags[1][i].update(tHist.gHist); } if (bi->condBranch) { if (bi->loopHit >= 0) { int idx = bi->loopIndex + bi->loopHit; ltable[idx].currentIterSpec = bi->currentIter; } } } void LTAGE::squash(ThreadID tid, void *bp_history) { BranchInfo* bi = (BranchInfo*)(bp_history); DPRINTF(LTage, "Deleting branch info: %lx\n", bi->branchPC); if (bi->condBranch) { if (bi->loopHit >= 0) { int idx = bi->loopIndex + bi->loopHit; ltable[idx].currentIterSpec = bi->currentIter; } } delete bi; } bool LTAGE::lookup(ThreadID tid, Addr branch_pc, void* &bp_history) { bool retval = predict(tid, branch_pc, true, bp_history); DPRINTF(LTage, "Lookup branch: %lx; predict:%d\n", branch_pc, retval); updateHistories(tid, branch_pc, retval, bp_history); assert(threadHistory[tid].gHist == &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); return retval; } void LTAGE::btbUpdate(ThreadID tid, Addr branch_pc, void* &bp_history) { BranchInfo* bi = (BranchInfo*) bp_history; ThreadHistory& tHist = threadHistory[tid]; DPRINTF(LTage, "BTB miss resets prediction: %lx\n", branch_pc); assert(tHist.gHist == &tHist.globalHistory[tHist.ptGhist]); tHist.gHist[0] = 0; for (int i = 1; i <= nHistoryTables; i++) { tHist.computeIndices[i].comp = bi->ci[i]; tHist.computeTags[0][i].comp = bi->ct0[i]; tHist.computeTags[1][i].comp = bi->ct1[i]; tHist.computeIndices[i].update(tHist.gHist); tHist.computeTags[0][i].update(tHist.gHist); tHist.computeTags[1][i].update(tHist.gHist); } } void LTAGE::uncondBranch(ThreadID tid, Addr br_pc, void* &bp_history) { DPRINTF(LTage, "UnConditionalBranch: %lx\n", br_pc); predict(tid, br_pc, false, bp_history); updateHistories(tid, br_pc, true, bp_history); assert(threadHistory[tid].gHist == &threadHistory[tid].globalHistory[threadHistory[tid].ptGhist]); } LTAGE* LTAGEParams::create() { return new LTAGE(this); }