summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Gabe Black <gblack@eecs.umich.edu> 2013-01-04 19:00:44 -0600
committer Gabe Black <gblack@eecs.umich.edu> 2013-01-04 19:00:44 -0600
commit d1965af22045d2a62b1cd1bc473b836413d79b46 (patch)
tree 3b5a67ba03489fc8789923b724bc04f04070bea3
parent 63b10907ef822aa6873d7f4f4d1ee849a188b2c4 (diff)
download gem5-d1965af22045d2a62b1cd1bc473b836413d79b46.tar.xz
X86: Move address based decode caching in front of the predecoder.
The predecoder in x86 does a lot of work, most of which can be skipped if the decoder cache is put in front of it. Committed by: Nilay Vaish <nilay@cs.wisc.edu>
-rw-r--r-- src/arch/x86/decoder.cc 157
-rw-r--r-- src/arch/x86/decoder.hh 137
-rw-r--r-- src/arch/x86/isa.cc 21
-rw-r--r-- src/arch/x86/isa.hh 3
4 files changed, 246 insertions, 72 deletions
diff --git a/src/arch/x86/decoder.cc b/src/arch/x86/decoder.cc
index 5fb2dcaf4..9dcb02902 100644
--- a/src/arch/x86/decoder.cc
+++ b/src/arch/x86/decoder.cc
@@ -38,10 +38,15 @@
namespace X86ISA
{
-void Decoder::doReset()
+
+Decoder::State
+Decoder::doResetState()
{
origPC = basePC + offset;
DPRINTF(Decoder, "Setting origPC to %#x\n", origPC);
+ instBytes = &decodePages->lookup(origPC);
+ chunkIdx = 0;
+
emi.rex = 0;
emi.legacy = 0;
emi.opcode.num = 0;
@@ -55,12 +60,17 @@ void Decoder::doReset()
emi.modRM = 0;
emi.sib = 0;
- m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
- emi.mode.mode = m5Reg.mode;
- emi.mode.submode = m5Reg.submode;
+
+ if (instBytes->si) {
+ return FromCacheState;
+ } else {
+ instBytes->chunks.clear();
+ return PrefixState;
+ }
}
-void Decoder::process()
+void
+Decoder::process()
{
//This function drives the decoder state machine.
@@ -70,15 +80,18 @@ void Decoder::process()
assert(!outOfBytes);
assert(!instDone);
+ if (state == ResetState)
+ state = doResetState();
+ if (state == FromCacheState) {
+ state = doFromCacheState();
+ } else {
+ instBytes->chunks.push_back(fetchChunk);
+ }
+
//While there's still something to do...
- while(!instDone && !outOfBytes)
- {
+ while (!instDone && !outOfBytes) {
uint8_t nextByte = getNextByte();
- switch(state)
- {
- case ResetState:
- doReset();
- state = PrefixState;
+ switch (state) {
case PrefixState:
state = doPrefixState(nextByte);
break;
@@ -105,9 +118,42 @@ void Decoder::process()
}
}
+Decoder::State
+Decoder::doFromCacheState()
+{
+ DPRINTF(Decoder, "Looking at cache state.\n");
+ if ((fetchChunk & instBytes->masks[chunkIdx]) !=
+ instBytes->chunks[chunkIdx]) {
+ DPRINTF(Decoder, "Decode cache miss.\n");
+ // The chached chunks didn't match what was fetched. Fall back to the
+ // predecoder.
+ instBytes->chunks[chunkIdx] = fetchChunk;
+ instBytes->chunks.resize(chunkIdx + 1);
+ instBytes->si = NULL;
+ chunkIdx = 0;
+ fetchChunk = instBytes->chunks[0];
+ offset = origPC % sizeof(MachInst);
+ basePC = origPC - offset;
+ return PrefixState;
+ } else if (chunkIdx == instBytes->chunks.size() - 1) {
+ // We matched the cache, so use its value.
+ instDone = true;
+ offset = instBytes->lastOffset;
+ if (offset == sizeof(MachInst))
+ outOfBytes = true;
+ return ResetState;
+ } else {
+ // We matched so far, but need to check more chunks.
+ chunkIdx++;
+ outOfBytes = true;
+ return FromCacheState;
+ }
+}
+
//Either get a prefix and record it in the ExtMachInst, or send the
//state machine on to get the opcode(s).
-Decoder::State Decoder::doPrefixState(uint8_t nextByte)
+Decoder::State
+Decoder::doPrefixState(uint8_t nextByte)
{
uint8_t prefix = Prefixes[nextByte];
State nextState = PrefixState;
@@ -164,7 +210,8 @@ Decoder::State Decoder::doPrefixState(uint8_t nextByte)
//Load all the opcodes (currently up to 2) and then figure out
//what immediate and/or ModRM is needed.
-Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
+Decoder::State
+Decoder::doOpcodeState(uint8_t nextByte)
{
State nextState = ErrorState;
emi.opcode.num++;
@@ -194,9 +241,9 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
if (emi.rex.w)
logOpSize = 3; // 64 bit operand size
else if (emi.legacy.op)
- logOpSize = m5Reg.altOp;
+ logOpSize = altOp;
else
- logOpSize = m5Reg.defOp;
+ logOpSize = defOp;
//Set the actual op size
emi.opSize = 1 << logOpSize;
@@ -205,16 +252,16 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
//a fixed value at the decoder level.
int logAddrSize;
if(emi.legacy.addr)
- logAddrSize = m5Reg.altAddr;
+ logAddrSize = altAddr;
else
- logAddrSize = m5Reg.defAddr;
+ logAddrSize = defAddr;
//Set the actual address size
emi.addrSize = 1 << logAddrSize;
//Figure out the effective stack width. This can be overriden to
//a fixed value at the decoder level.
- emi.stackSize = 1 << m5Reg.stack;
+ emi.stackSize = 1 << stack;
//Figure out how big of an immediate we'll retreive based
//on the opcode.
@@ -242,13 +289,14 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte)
//Get the ModRM byte and determine what displacement, if any, there is.
//Also determine whether or not to get the SIB byte, displacement, or
//immediate next.
-Decoder::State Decoder::doModRMState(uint8_t nextByte)
+Decoder::State
+Decoder::doModRMState(uint8_t nextByte)
{
State nextState = ErrorState;
ModRM modRM;
modRM = nextByte;
DPRINTF(Decoder, "Found modrm byte %#x.\n", nextByte);
- if (m5Reg.defOp == 1) {
+ if (defOp == 1) {
//figure out 16 bit displacement size
if ((modRM.mod == 0 && modRM.rm == 6) || modRM.mod == 2)
displacementSize = 2;
@@ -297,7 +345,8 @@ Decoder::State Decoder::doModRMState(uint8_t nextByte)
//Get the SIB byte. We don't do anything with it at this point, other
//than storing it in the ExtMachInst. Determine if we need to get a
//displacement or immediate next.
-Decoder::State Decoder::doSIBState(uint8_t nextByte)
+Decoder::State
+Decoder::doSIBState(uint8_t nextByte)
{
State nextState = ErrorState;
emi.sib = nextByte;
@@ -318,7 +367,8 @@ Decoder::State Decoder::doSIBState(uint8_t nextByte)
//Gather up the displacement, or at least as much of it
//as we can get.
-Decoder::State Decoder::doDisplacementState()
+Decoder::State
+Decoder::doDisplacementState()
{
State nextState = ErrorState;
@@ -365,7 +415,8 @@ Decoder::State Decoder::doDisplacementState()
//Gather up the immediate, or at least as much of it
//as we can get
-Decoder::State Decoder::doImmediateState()
+Decoder::State
+Decoder::doImmediateState()
{
State nextState = ErrorState;
@@ -408,24 +459,62 @@ Decoder::State Decoder::doImmediateState()
return nextState;
}
-DecodeCache::InstMap Decoder::instMap;
-DecodeCache::AddrMap<StaticInstPtr> Decoder::decodePages;
+Decoder::InstBytes Decoder::dummy;
+Decoder::InstCacheMap Decoder::instCacheMap;
StaticInstPtr
Decoder::decode(ExtMachInst mach_inst, Addr addr)
{
- StaticInstPtr &si = decodePages.lookup(addr);
- if (si && (si->machInst == mach_inst))
- return si;
+ DecodeCache::InstMap::iterator iter = instMap->find(mach_inst);
+ if (iter != instMap->end())
+ return iter->second;
+
+ StaticInstPtr si = decodeInst(mach_inst);
+ (*instMap)[mach_inst] = si;
+ return si;
+}
+
+StaticInstPtr
+Decoder::decode(PCState &nextPC)
+{
+ if (!instDone)
+ return NULL;
+ instDone = false;
+ updateNPC(nextPC);
- DecodeCache::InstMap::iterator iter = instMap.find(mach_inst);
- if (iter != instMap.end()) {
- si = iter->second;
+ StaticInstPtr &si = instBytes->si;
+ if (si)
return si;
+
+ // We didn't match in the AddrMap, but we still populated an entry. Fix
+ // up its byte masks.
+ const int chunkSize = sizeof(MachInst);
+
+ instBytes->lastOffset = offset;
+
+ Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize;
+ Addr firstOffset = origPC - firstBasePC;
+ Addr totalSize = instBytes->lastOffset - firstOffset +
+ (instBytes->chunks.size() - 1) * chunkSize;
+ int start = firstOffset;
+ instBytes->masks.clear();
+
+ while (totalSize) {
+ int end = start + totalSize;
+ end = (chunkSize < end) ? chunkSize : end;
+ int size = end - start;
+ int idx = instBytes->masks.size();
+
+ MachInst maskVal = mask(size * 8) << (start * 8);
+ assert(maskVal);
+
+ instBytes->masks.push_back(maskVal);
+ instBytes->chunks[idx] &= instBytes->masks[idx];
+ totalSize -= size;
+ start = 0;
}
- si = decodeInst(mach_inst);
- instMap[mach_inst] = si;
+ si = decode(emi, origPC);
return si;
}
diff --git a/src/arch/x86/decoder.hh b/src/arch/x86/decoder.hh
index 24194d839..796f9eef9 100644
--- a/src/arch/x86/decoder.hh
+++ b/src/arch/x86/decoder.hh
@@ -32,6 +32,7 @@
#define __ARCH_X86_DECODER_HH__
#include <cassert>
+#include <vector>
#include "arch/x86/regs/misc.hh"
#include "arch/x86/types.hh"
@@ -58,9 +59,24 @@ class Decoder
static const uint8_t SizeTypeToSize[3][10];
protected:
+ struct InstBytes
+ {
+ StaticInstPtr si;
+ std::vector<MachInst> chunks;
+ std::vector<MachInst> masks;
+ int lastOffset;
+
+ InstBytes() : lastOffset(0)
+ {}
+ };
+
+ static InstBytes dummy;
+
ThreadContext * tc;
//The bytes to be predecoded
MachInst fetchChunk;
+ InstBytes *instBytes;
+ int chunkIdx;
//The pc of the start of fetchChunk
Addr basePC;
//The pc the current instruction started at
@@ -69,9 +85,16 @@ class Decoder
int offset;
//The extended machine instruction being generated
ExtMachInst emi;
- HandyM5Reg m5Reg;
-
- inline uint8_t getNextByte()
+ //Predecoding state
+ X86Mode mode;
+ X86SubMode submode;
+ uint8_t altOp;
+ uint8_t defOp;
+ uint8_t altAddr;
+ uint8_t defAddr;
+ uint8_t stack;
+
+ uint8_t getNextByte()
{
return ((uint8_t *)&fetchChunk)[offset];
}
@@ -99,23 +122,34 @@ class Decoder
consumeBytes(toGet);
}
- inline void consumeByte()
+ void updateOffsetState()
{
- offset++;
assert(offset <= sizeof(MachInst));
- if(offset == sizeof(MachInst))
- outOfBytes = true;
+ if (offset == sizeof(MachInst)) {
+ DPRINTF(Decoder, "At the end of a chunk, idx = %d, chunks = %d.\n",
+ chunkIdx, instBytes->chunks.size());
+ chunkIdx++;
+ if (chunkIdx == instBytes->chunks.size()) {
+ outOfBytes = true;
+ } else {
+ offset = 0;
+ fetchChunk = instBytes->chunks[chunkIdx];
+ basePC += sizeof(MachInst);
+ }
+ }
}
- inline void consumeBytes(int numBytes)
+ void consumeByte()
{
- offset += numBytes;
- assert(offset <= sizeof(MachInst));
- if(offset == sizeof(MachInst))
- outOfBytes = true;
+ offset++;
+ updateOffsetState();
}
- void doReset();
+ void consumeBytes(int numBytes)
+ {
+ offset += numBytes;
+ updateOffsetState();
+ }
//State machine state
protected:
@@ -133,6 +167,7 @@ class Decoder
enum State {
ResetState,
+ FromCacheState,
PrefixState,
OpcodeState,
ModRMState,
@@ -146,6 +181,8 @@ class Decoder
State state;
//Functions to handle each of the states
+ State doResetState();
+ State doFromCacheState();
State doPrefixState(uint8_t);
State doOpcodeState(uint8_t);
State doModRMState(uint8_t);
@@ -153,6 +190,20 @@ class Decoder
State doDisplacementState();
State doImmediateState();
+ protected:
+ /// Caching for decoded instruction objects.
+
+ typedef MiscReg CacheKey;
+
+ typedef DecodeCache::AddrMap<Decoder::InstBytes> DecodePages;
+ DecodePages *decodePages;
+ typedef m5::hash_map<CacheKey, DecodePages *> AddrCacheMap;
+ AddrCacheMap addrCacheMap;
+
+ DecodeCache::InstMap *instMap;
+ typedef m5::hash_map<CacheKey, DecodeCache::InstMap *> InstCacheMap;
+ static InstCacheMap instCacheMap;
+
public:
Decoder(ThreadContext * _tc) :
tc(_tc), basePC(0), origPC(0), offset(0),
@@ -160,9 +211,47 @@ class Decoder
state(ResetState)
{
memset(&emi, 0, sizeof(emi));
- emi.mode.mode = LongMode;
- emi.mode.submode = SixtyFourBitMode;
- m5Reg = 0;
+ mode = LongMode;
+ submode = SixtyFourBitMode;
+ emi.mode.mode = mode;
+ emi.mode.submode = submode;
+ altOp = 0;
+ defOp = 0;
+ altAddr = 0;
+ defAddr = 0;
+ stack = 0;
+ instBytes = &dummy;
+ decodePages = NULL;
+ instMap = NULL;
+ }
+
+ void setM5Reg(HandyM5Reg m5Reg)
+ {
+ mode = (X86Mode)(uint64_t)m5Reg.mode;
+ submode = (X86SubMode)(uint64_t)m5Reg.submode;
+ emi.mode.mode = mode;
+ emi.mode.submode = submode;
+ altOp = m5Reg.altOp;
+ defOp = m5Reg.defOp;
+ altAddr = m5Reg.altAddr;
+ defAddr = m5Reg.defAddr;
+ stack = m5Reg.stack;
+
+ AddrCacheMap::iterator amIter = addrCacheMap.find(m5Reg);
+ if (amIter != addrCacheMap.end()) {
+ decodePages = amIter->second;
+ } else {
+ decodePages = new DecodePages;
+ addrCacheMap[m5Reg] = decodePages;
+ }
+
+ InstCacheMap::iterator imIter = instCacheMap.find(m5Reg);
+ if (imIter != instCacheMap.end()) {
+ instMap = imIter->second;
+ } else {
+ instMap = new DecodeCache::InstMap;
+ instCacheMap[m5Reg] = instMap;
+ }
}
void reset()
@@ -218,11 +307,6 @@ class Decoder
}
}
- protected:
- /// Caching for decoded instruction objects.
- static DecodeCache::InstMap instMap;
- static DecodeCache::AddrMap<StaticInstPtr> decodePages;
-
public:
StaticInstPtr decodeInst(ExtMachInst mach_inst);
@@ -230,16 +314,7 @@ class Decoder
/// @param mach_inst The binary instruction to decode.
/// @retval A pointer to the corresponding StaticInst object.
StaticInstPtr decode(ExtMachInst mach_inst, Addr addr);
-
- StaticInstPtr
- decode(X86ISA::PCState &nextPC)
- {
- if (!instDone)
- return NULL;
- instDone = false;
- updateNPC(nextPC);
- return decode(emi, origPC);
- }
+ StaticInstPtr decode(X86ISA::PCState &nextPC);
};
} // namespace X86ISA
diff --git a/src/arch/x86/isa.cc b/src/arch/x86/isa.cc
index 5305b1058..1a9b39840 100644
--- a/src/arch/x86/isa.cc
+++ b/src/arch/x86/isa.cc
@@ -28,6 +28,7 @@
* Authors: Gabe Black
*/
+#include "arch/x86/decoder.hh"
#include "arch/x86/isa.hh"
#include "arch/x86/tlb.hh"
#include "cpu/base.hh"
@@ -39,7 +40,8 @@ namespace X86ISA
void
ISA::updateHandyM5Reg(Efer efer, CR0 cr0,
- SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags)
+ SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags,
+ ThreadContext *tc)
{
HandyM5Reg m5reg = 0;
if (efer.lma) {
@@ -94,6 +96,8 @@ ISA::updateHandyM5Reg(Efer efer, CR0 cr0,
}
regVal[MISCREG_M5_REG] = m5reg;
+ if (tc)
+ tc->getDecoderPtr()->setM5Reg(m5reg);
}
void
@@ -184,7 +188,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
newCR0,
regVal[MISCREG_CS_ATTR],
regVal[MISCREG_SS_ATTR],
- regVal[MISCREG_RFLAGS]);
+ regVal[MISCREG_RFLAGS],
+ tc);
}
break;
case MISCREG_CR2:
@@ -225,7 +230,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
regVal[MISCREG_CR0],
newCSAttr,
regVal[MISCREG_SS_ATTR],
- regVal[MISCREG_RFLAGS]);
+ regVal[MISCREG_RFLAGS],
+ tc);
}
break;
case MISCREG_SS_ATTR:
@@ -233,7 +239,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
regVal[MISCREG_CR0],
regVal[MISCREG_CS_ATTR],
val,
- regVal[MISCREG_RFLAGS]);
+ regVal[MISCREG_RFLAGS],
+ tc);
break;
// These segments always actually use their bases, or in other words
// their effective bases must stay equal to their actual bases.
@@ -340,7 +347,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc)
regVal[MISCREG_CR0],
regVal[MISCREG_CS_ATTR],
regVal[MISCREG_SS_ATTR],
- regVal[MISCREG_RFLAGS]);
+ regVal[MISCREG_RFLAGS],
+ tc);
return;
default:
break;
@@ -363,7 +371,8 @@ ISA::unserialize(EventManager *em, Checkpoint * cp,
regVal[MISCREG_CR0],
regVal[MISCREG_CS_ATTR],
regVal[MISCREG_SS_ATTR],
- regVal[MISCREG_RFLAGS]);
+ regVal[MISCREG_RFLAGS],
+ NULL);
}
}
diff --git a/src/arch/x86/isa.hh b/src/arch/x86/isa.hh
index 463a249a4..7b0c7b61a 100644
--- a/src/arch/x86/isa.hh
+++ b/src/arch/x86/isa.hh
@@ -50,7 +50,8 @@ namespace X86ISA
protected:
MiscReg regVal[NUM_MISCREGS];
void updateHandyM5Reg(Efer efer, CR0 cr0,
- SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags);
+ SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags,
+ ThreadContext *tc);
public:
void clear();