diff options
Diffstat (limited to 'src')
61 files changed, 1285 insertions, 509 deletions
diff --git a/src/arch/alpha/ev5.cc b/src/arch/alpha/ev5.cc index 796ed07de..7595423c3 100644 --- a/src/arch/alpha/ev5.cc +++ b/src/arch/alpha/ev5.cc @@ -554,6 +554,7 @@ AlphaISA::MiscRegFile::setIpr(int idx, uint64_t val, ThreadContext *tc) return NoFault; } + void AlphaISA::copyIprs(ThreadContext *src, ThreadContext *dest) { @@ -562,6 +563,7 @@ AlphaISA::copyIprs(ThreadContext *src, ThreadContext *dest) } } + /** * Check for special simulator handling of specific PAL calls. * If return value is false, actual PAL call will be suppressed. diff --git a/src/arch/alpha/freebsd/system.cc b/src/arch/alpha/freebsd/system.cc index 8d50e1612..99be25057 100644 --- a/src/arch/alpha/freebsd/system.cc +++ b/src/arch/alpha/freebsd/system.cc @@ -105,6 +105,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(FreebsdAlphaSystem) Param<string> boot_osflags; Param<string> readfile; + Param<string> symbolfile; Param<unsigned int> init_param; Param<uint64_t> system_type; @@ -124,6 +125,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(FreebsdAlphaSystem) INIT_PARAM_DFLT(boot_osflags, "flags to pass to the kernel during boot", "a"), INIT_PARAM_DFLT(readfile, "file to read startup script from", ""), + INIT_PARAM_DFLT(symbolfile, "file to read symbols from", ""), INIT_PARAM_DFLT(init_param, "numerical value to pass into simulator", 0), INIT_PARAM_DFLT(system_type, "Type of system we are emulating", 34), INIT_PARAM_DFLT(system_rev, "Revision of system we are emulating", 1<<10) @@ -143,6 +145,7 @@ CREATE_SIM_OBJECT(FreebsdAlphaSystem) p->boot_osflags = boot_osflags; p->init_param = init_param; p->readfile = readfile; + p->symbolfile = symbolfile; p->system_type = system_type; p->system_rev = system_rev; return new FreebsdAlphaSystem(p); diff --git a/src/arch/alpha/isa/decoder.isa b/src/arch/alpha/isa/decoder.isa index 30959c72e..4fc9da3f3 100644 --- a/src/arch/alpha/isa/decoder.isa +++ b/src/arch/alpha/isa/decoder.isa @@ -779,10 +779,10 @@ decode OPCODE default Unknown::unknown() { }}, IsNonSpeculative, IsQuiesce); 0x03: quiesceCycles({{ AlphaPseudo::quiesceCycles(xc->tcBase(), R16); - }}, IsNonSpeculative, IsQuiesce); + }}, IsNonSpeculative, IsQuiesce, IsUnverifiable); 0x04: quiesceTime({{ R0 = AlphaPseudo::quiesceTime(xc->tcBase()); - }}, IsNonSpeculative); + }}, IsNonSpeculative, IsUnverifiable); 0x10: ivlb({{ AlphaPseudo::ivlb(xc->tcBase()); }}, No_OpClass, IsNonSpeculative); @@ -795,6 +795,9 @@ decode OPCODE default Unknown::unknown() { 0x21: m5exit({{ AlphaPseudo::m5exit(xc->tcBase(), R16); }}, No_OpClass, IsNonSpeculative); + 0x31: loadsymbol({{ + AlphaPseudo::loadsymbol(xc->tcBase()); + }}, No_OpClass, IsNonSpeculative); 0x30: initparam({{ Ra = xc->tcBase()->getCpuPtr()->system->init_param; }}); 0x40: resetstats({{ AlphaPseudo::resetstats(xc->tcBase(), R16, R17); diff --git a/src/arch/alpha/isa/mem.isa b/src/arch/alpha/isa/mem.isa index a5dda7fc6..fe69c36a5 100644 --- a/src/arch/alpha/isa/mem.isa +++ b/src/arch/alpha/isa/mem.isa @@ -528,7 +528,7 @@ def template MiscInitiateAcc {{ Fault %(class_name)s::initiateAcc(%(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const { - panic("Misc instruction does not support split access method!"); + warn("Misc instruction does not support split access method!"); return NoFault; } }}; @@ -539,7 +539,7 @@ def template MiscCompleteAcc {{ %(CPU_exec_context)s *xc, Trace::InstRecord *traceData) const { - panic("Misc instruction does not support split access method!"); + warn("Misc instruction does not support split access method!"); return NoFault; } diff --git a/src/arch/alpha/isa_traits.hh b/src/arch/alpha/isa_traits.hh index 4f439b8df..c59d93238 100644 --- a/src/arch/alpha/isa_traits.hh +++ b/src/arch/alpha/isa_traits.hh @@ -42,7 +42,55 @@ class StaticInstPtr; namespace AlphaISA { - using namespace LittleEndianGuest; + + typedef uint32_t MachInst; + typedef uint64_t ExtMachInst; + typedef uint8_t RegIndex; + + const int NumIntArchRegs = 32; + const int NumPALShadowRegs = 8; + const int NumFloatArchRegs = 32; + // @todo: Figure out what this number really should be. + const int NumMiscArchRegs = 32; + + // Static instruction parameters + const int MaxInstSrcRegs = 3; + const int MaxInstDestRegs = 2; + + // semantically meaningful register indices + const int ZeroReg = 31; // architecturally meaningful + // the rest of these depend on the ABI + const int StackPointerReg = 30; + const int GlobalPointerReg = 29; + const int ProcedureValueReg = 27; + const int ReturnAddressReg = 26; + const int ReturnValueReg = 0; + const int FramePointerReg = 15; + const int ArgumentReg0 = 16; + const int ArgumentReg1 = 17; + const int ArgumentReg2 = 18; + const int ArgumentReg3 = 19; + const int ArgumentReg4 = 20; + const int ArgumentReg5 = 21; + const int SyscallNumReg = ReturnValueReg; + const int SyscallPseudoReturnReg = ArgumentReg4; + const int SyscallSuccessReg = 19; + + + + const int LogVMPageSize = 13; // 8K bytes + const int VMPageSize = (1 << LogVMPageSize); + + const int BranchPredAddrShiftAmt = 2; // instructions are 4-byte aligned + + const int WordBytes = 4; + const int HalfwordBytes = 2; + const int ByteBytes = 1; + + + const int NumIntRegs = NumIntArchRegs + NumPALShadowRegs; + const int NumFloatRegs = NumFloatArchRegs; + const int NumMiscRegs = NumMiscArchRegs; // These enumerate all the registers for dependence tracking. enum DependenceTags { @@ -57,231 +105,241 @@ namespace AlphaISA IPR_Base_DepTag = 76 }; - StaticInstPtr decodeInst(ExtMachInst); + typedef uint64_t IntReg; + typedef IntReg IntRegFile[NumIntRegs]; - // Alpha Does NOT have a delay slot - #define ISA_HAS_DELAY_SLOT 0 + // floating point register file entry type + typedef union { + uint64_t q; + double d; + } FloatReg; - const Addr PageShift = 13; - const Addr PageBytes = ULL(1) << PageShift; - const Addr PageMask = ~(PageBytes - 1); - const Addr PageOffset = PageBytes - 1; + typedef union { + uint64_t q[NumFloatRegs]; // integer qword view + double d[NumFloatRegs]; // double-precision floating point view -#if FULL_SYSTEM + void clear() + { bzero(d, sizeof(d)); } + } FloatRegFile; - //////////////////////////////////////////////////////////////////////// - // - // Translation stuff - // +extern const Addr PageShift; +extern const Addr PageBytes; +extern const Addr PageMask; +extern const Addr PageOffset; - const Addr PteShift = 3; - const Addr NPtePageShift = PageShift - PteShift; - const Addr NPtePage = ULL(1) << NPtePageShift; - const Addr PteMask = NPtePage - 1; +// redirected register map, really only used for the full system case. +extern const int reg_redir[NumIntRegs]; - // User Virtual - const Addr USegBase = ULL(0x0); - const Addr USegEnd = ULL(0x000003ffffffffff); +#if FULL_SYSTEM - // Kernel Direct Mapped - const Addr K0SegBase = ULL(0xfffffc0000000000); - const Addr K0SegEnd = ULL(0xfffffdffffffffff); + typedef uint64_t InternalProcReg; - // Kernel Virtual - const Addr K1SegBase = ULL(0xfffffe0000000000); - const Addr K1SegEnd = ULL(0xffffffffffffffff); +#include "arch/alpha/isa_fullsys_traits.hh" - // For loading... XXX This maybe could be USegEnd?? --ali - const Addr LoadAddrMask = ULL(0xffffffffff); +#else + const int NumInternalProcRegs = 0; +#endif - //////////////////////////////////////////////////////////////////////// - // - // Interrupt levels - // - enum InterruptLevels - { - INTLEVEL_SOFTWARE_MIN = 4, - INTLEVEL_SOFTWARE_MAX = 19, + // control register file contents + typedef uint64_t MiscReg; + class MiscRegFile { + protected: + uint64_t fpcr; // floating point condition codes + uint64_t uniq; // process-unique register + bool lock_flag; // lock flag for LL/SC + Addr lock_addr; // lock address for LL/SC - INTLEVEL_EXTERNAL_MIN = 20, - INTLEVEL_EXTERNAL_MAX = 34, + public: + MiscReg readReg(int misc_reg); - INTLEVEL_IRQ0 = 20, - INTLEVEL_IRQ1 = 21, - INTINDEX_ETHERNET = 0, - INTINDEX_SCSI = 1, - INTLEVEL_IRQ2 = 22, - INTLEVEL_IRQ3 = 23, + //These functions should be removed once the simplescalar cpu model + //has been replaced. + int getInstAsid(); + int getDataAsid(); - INTLEVEL_SERIAL = 33, + MiscReg readRegWithEffect(int misc_reg, Fault &fault, ExecContext *xc); - NumInterruptLevels = INTLEVEL_EXTERNAL_MAX - }; + Fault setReg(int misc_reg, const MiscReg &val); + Fault setRegWithEffect(int misc_reg, const MiscReg &val, + ExecContext *xc); - // EV5 modes - enum mode_type - { - mode_kernel = 0, // kernel - mode_executive = 1, // executive (unused by unix) - mode_supervisor = 2, // supervisor (unused by unix) - mode_user = 3, // user mode - mode_number // number of modes - }; + void serialize(std::ostream &os); -#endif + void unserialize(Checkpoint *cp, const std::string §ion); + + void clear() + { + fpcr = uniq = 0; + lock_flag = 0; + lock_addr = 0; + } #if FULL_SYSTEM - //////////////////////////////////////////////////////////////////////// - // - // Internal Processor Reigsters - // - enum md_ipr_names - { - IPR_ISR = 0x100, // interrupt summary register - IPR_ITB_TAG = 0x101, // ITLB tag register - IPR_ITB_PTE = 0x102, // ITLB page table entry register - IPR_ITB_ASN = 0x103, // ITLB address space register - IPR_ITB_PTE_TEMP = 0x104, // ITLB page table entry temp register - IPR_ITB_IA = 0x105, // ITLB invalidate all register - IPR_ITB_IAP = 0x106, // ITLB invalidate all process register - IPR_ITB_IS = 0x107, // ITLB invalidate select register - IPR_SIRR = 0x108, // software interrupt request register - IPR_ASTRR = 0x109, // asynchronous system trap request register - IPR_ASTER = 0x10a, // asynchronous system trap enable register - IPR_EXC_ADDR = 0x10b, // exception address register - IPR_EXC_SUM = 0x10c, // exception summary register - IPR_EXC_MASK = 0x10d, // exception mask register - IPR_PAL_BASE = 0x10e, // PAL base address register - IPR_ICM = 0x10f, // instruction current mode - IPR_IPLR = 0x110, // interrupt priority level register - IPR_INTID = 0x111, // interrupt ID register - IPR_IFAULT_VA_FORM = 0x112, // formatted faulting virtual addr register - IPR_IVPTBR = 0x113, // virtual page table base register - IPR_HWINT_CLR = 0x115, // H/W interrupt clear register - IPR_SL_XMIT = 0x116, // serial line transmit register - IPR_SL_RCV = 0x117, // serial line receive register - IPR_ICSR = 0x118, // instruction control and status register - IPR_IC_FLUSH = 0x119, // instruction cache flush control - IPR_IC_PERR_STAT = 0x11a, // inst cache parity error status register - IPR_PMCTR = 0x11c, // performance counter register - - // PAL temporary registers... - // register meanings gleaned from osfpal.s source code - IPR_PALtemp0 = 0x140, // local scratch - IPR_PALtemp1 = 0x141, // local scratch - IPR_PALtemp2 = 0x142, // entUna - IPR_PALtemp3 = 0x143, // CPU specific impure area pointer - IPR_PALtemp4 = 0x144, // memory management temp - IPR_PALtemp5 = 0x145, // memory management temp - IPR_PALtemp6 = 0x146, // memory management temp - IPR_PALtemp7 = 0x147, // entIF - IPR_PALtemp8 = 0x148, // intmask - IPR_PALtemp9 = 0x149, // entSys - IPR_PALtemp10 = 0x14a, // ?? - IPR_PALtemp11 = 0x14b, // entInt - IPR_PALtemp12 = 0x14c, // entArith - IPR_PALtemp13 = 0x14d, // reserved for platform specific PAL - IPR_PALtemp14 = 0x14e, // reserved for platform specific PAL - IPR_PALtemp15 = 0x14f, // reserved for platform specific PAL - IPR_PALtemp16 = 0x150, // scratch / whami<7:0> / mces<4:0> - IPR_PALtemp17 = 0x151, // sysval - IPR_PALtemp18 = 0x152, // usp - IPR_PALtemp19 = 0x153, // ksp - IPR_PALtemp20 = 0x154, // PTBR - IPR_PALtemp21 = 0x155, // entMM - IPR_PALtemp22 = 0x156, // kgp - IPR_PALtemp23 = 0x157, // PCBB - - IPR_DTB_ASN = 0x200, // DTLB address space number register - IPR_DTB_CM = 0x201, // DTLB current mode register - IPR_DTB_TAG = 0x202, // DTLB tag register - IPR_DTB_PTE = 0x203, // DTLB page table entry register - IPR_DTB_PTE_TEMP = 0x204, // DTLB page table entry temporary register - - IPR_MM_STAT = 0x205, // data MMU fault status register - IPR_VA = 0x206, // fault virtual address register - IPR_VA_FORM = 0x207, // formatted virtual address register - IPR_MVPTBR = 0x208, // MTU virtual page table base register - IPR_DTB_IAP = 0x209, // DTLB invalidate all process register - IPR_DTB_IA = 0x20a, // DTLB invalidate all register - IPR_DTB_IS = 0x20b, // DTLB invalidate single register - IPR_ALT_MODE = 0x20c, // alternate mode register - IPR_CC = 0x20d, // cycle counter register - IPR_CC_CTL = 0x20e, // cycle counter control register - IPR_MCSR = 0x20f, // MTU control register - - IPR_DC_FLUSH = 0x210, - IPR_DC_PERR_STAT = 0x212, // Dcache parity error status register - IPR_DC_TEST_CTL = 0x213, // Dcache test tag control register - IPR_DC_TEST_TAG = 0x214, // Dcache test tag register - IPR_DC_TEST_TAG_TEMP = 0x215, // Dcache test tag temporary register - IPR_DC_MODE = 0x216, // Dcache mode register - IPR_MAF_MODE = 0x217, // miss address file mode register - - NumInternalProcRegs // number of IPR registers - }; -#else - const int NumInternalProcRegs = 0; -#endif + protected: + InternalProcReg ipr[NumInternalProcRegs]; // Internal processor regs - // Constants Related to the number of registers + private: + MiscReg readIpr(int idx, Fault &fault, ExecContext *xc); - const int NumIntArchRegs = 32; - const int NumPALShadowRegs = 8; - const int NumFloatArchRegs = 32; - // @todo: Figure out what this number really should be. - const int NumMiscArchRegs = 32; + Fault setIpr(int idx, uint64_t val, ExecContext *xc); - const int NumIntRegs = NumIntArchRegs + NumPALShadowRegs; - const int NumFloatRegs = NumFloatArchRegs; - const int NumMiscRegs = NumMiscArchRegs; + void copyIprs(ExecContext *xc); +#endif + friend class RegFile; + }; const int TotalNumRegs = NumIntRegs + NumFloatRegs + NumMiscRegs + NumInternalProcRegs; const int TotalDataRegs = NumIntRegs + NumFloatRegs; - // Static instruction parameters - const int MaxInstSrcRegs = 3; - const int MaxInstDestRegs = 2; + typedef union { + IntReg intreg; + FloatReg fpreg; + MiscReg ctrlreg; + } AnyReg; - // semantically meaningful register indices - const int ZeroReg = 31; // architecturally meaningful - // the rest of these depend on the ABI - const int StackPointerReg = 30; - const int GlobalPointerReg = 29; - const int ProcedureValueReg = 27; - const int ReturnAddressReg = 26; - const int ReturnValueReg = 0; - const int FramePointerReg = 15; - const int ArgumentReg0 = 16; - const int ArgumentReg1 = 17; - const int ArgumentReg2 = 18; - const int ArgumentReg3 = 19; - const int ArgumentReg4 = 20; - const int ArgumentReg5 = 21; - const int SyscallNumReg = ReturnValueReg; - const int SyscallPseudoReturnReg = ArgumentReg4; - const int SyscallSuccessReg = 19; + struct RegFile { + IntRegFile intRegFile; // (signed) integer register file + FloatRegFile floatRegFile; // floating point register file + MiscRegFile miscRegs; // control register file + Addr pc; // program counter + Addr npc; // next-cycle program counter + Addr nnpc; - const int LogVMPageSize = 13; // 8K bytes - const int VMPageSize = (1 << LogVMPageSize); +#if FULL_SYSTEM + int intrflag; // interrupt flag + inline int instAsid() + { return EV5::ITB_ASN_ASN(miscRegs.ipr[IPR_ITB_ASN]); } + inline int dataAsid() + { return EV5::DTB_ASN_ASN(miscRegs.ipr[IPR_DTB_ASN]); } +#endif // FULL_SYSTEM + + void serialize(std::ostream &os); + void unserialize(Checkpoint *cp, const std::string §ion); + + void clear() + { + bzero(intRegFile, sizeof(intRegFile)); + floatRegFile.clear(); + miscRegs.clear(); + } + }; - const int BranchPredAddrShiftAmt = 2; // instructions are 4-byte aligned + static inline ExtMachInst makeExtMI(MachInst inst, const uint64_t &pc); - const int MachineBytes = 8; - const int WordBytes = 4; - const int HalfwordBytes = 2; - const int ByteBytes = 1; + StaticInstPtr decodeInst(ExtMachInst); + + // Alpha Does NOT have a delay slot + #define ISA_HAS_DELAY_SLOT 0 // return a no-op instruction... used for instruction fetch faults - // Alpha UNOP (ldq_u r31,0(r0)) - const ExtMachInst NoopMachInst = 0x2ffe0000; + extern const ExtMachInst NoopMachInst; + + enum annotes { + ANNOTE_NONE = 0, + // An impossible number for instruction annotations + ITOUCH_ANNOTE = 0xffffffff, + }; + + static inline bool isCallerSaveIntegerRegister(unsigned int reg) { + panic("register classification not implemented"); + return (reg >= 1 && reg <= 8 || reg >= 22 && reg <= 25 || reg == 27); + } + + static inline bool isCalleeSaveIntegerRegister(unsigned int reg) { + panic("register classification not implemented"); + return (reg >= 9 && reg <= 15); + } + + static inline bool isCallerSaveFloatRegister(unsigned int reg) { + panic("register classification not implemented"); + return false; + } + + static inline bool isCalleeSaveFloatRegister(unsigned int reg) { + panic("register classification not implemented"); + return false; + } + + static inline Addr alignAddress(const Addr &addr, + unsigned int nbytes) { + return (addr & ~(nbytes - 1)); + } + + // Instruction address compression hooks + static inline Addr realPCToFetchPC(const Addr &addr) { + return addr; + } + + static inline Addr fetchPCToRealPC(const Addr &addr) { + return addr; + } + + // the size of "fetched" instructions (not necessarily the size + // of real instructions for PISA) + static inline size_t fetchInstSize() { + return sizeof(MachInst); + } + + static inline MachInst makeRegisterCopy(int dest, int src) { + panic("makeRegisterCopy not implemented"); + return 0; + } + + // Machine operations + + void saveMachineReg(AnyReg &savereg, const RegFile ®_file, + int regnum); + + void restoreMachineReg(RegFile ®s, const AnyReg ®, + int regnum); + +#if 0 + static void serializeSpecialRegs(const Serializable::Proxy &proxy, + const RegFile ®s); + + static void unserializeSpecialRegs(const IniFile *db, + const std::string &category, + ConfigNode *node, + RegFile ®s); +#endif + + /** + * Function to insure ISA semantics about 0 registers. + * @param xc The execution context. + */ + template <class XC> + void zeroRegisters(XC *xc); - // redirected register map, really only used for the full system case. - extern const int reg_redir[NumIntRegs]; + const Addr MaxAddr = (Addr)-1; +#if !FULL_SYSTEM + static inline void setSyscallReturn(SyscallReturn return_value, RegFile *regs) + { + // check for error condition. Alpha syscall convention is to + // indicate success/failure in reg a3 (r19) and put the + // return value itself in the standard return value reg (v0). + if (return_value.successful()) { + // no error + regs->intRegFile[SyscallSuccessReg] = 0; + regs->intRegFile[ReturnValueReg] = return_value.value(); + } else { + // got an error, return details + regs->intRegFile[SyscallSuccessReg] = (IntReg) -1; + regs->intRegFile[ReturnValueReg] = -return_value.value(); + } + } +#endif + + void copyRegs(ExecContext *src, ExecContext *dest); + + void copyMiscRegs(ExecContext *src, ExecContext *dest); + +#if FULL_SYSTEM + void copyIprs(ExecContext *src, ExecContext *dest); +#endif }; #endif // __ARCH_ALPHA_ISA_TRAITS_HH__ diff --git a/src/arch/alpha/linux/system.cc b/src/arch/alpha/linux/system.cc index ef4e18cb5..7cf234eeb 100644 --- a/src/arch/alpha/linux/system.cc +++ b/src/arch/alpha/linux/system.cc @@ -199,6 +199,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(LinuxAlphaSystem) Param<string> boot_osflags; Param<string> readfile; + Param<string> symbolfile; Param<unsigned int> init_param; Param<uint64_t> system_type; @@ -218,6 +219,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(LinuxAlphaSystem) INIT_PARAM_DFLT(boot_osflags, "flags to pass to the kernel during boot", "a"), INIT_PARAM_DFLT(readfile, "file to read startup script from", ""), + INIT_PARAM_DFLT(symbolfile, "file to read symbols from", ""), INIT_PARAM_DFLT(init_param, "numerical value to pass into simulator", 0), INIT_PARAM_DFLT(system_type, "Type of system we are emulating", 34), INIT_PARAM_DFLT(system_rev, "Revision of system we are emulating", 1<<10) @@ -237,6 +239,7 @@ CREATE_SIM_OBJECT(LinuxAlphaSystem) p->boot_osflags = boot_osflags; p->init_param = init_param; p->readfile = readfile; + p->symbolfile = symbolfile; p->system_type = system_type; p->system_rev = system_rev; return new LinuxAlphaSystem(p); diff --git a/src/arch/alpha/system.cc b/src/arch/alpha/system.cc index a7e615531..5597eaedc 100644 --- a/src/arch/alpha/system.cc +++ b/src/arch/alpha/system.cc @@ -229,6 +229,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(AlphaSystem) Param<std::string> boot_osflags; Param<std::string> readfile; + Param<std::string> symbolfile; Param<unsigned int> init_param; Param<uint64_t> system_type; @@ -248,6 +249,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(AlphaSystem) INIT_PARAM_DFLT(boot_osflags, "flags to pass to the kernel during boot", "a"), INIT_PARAM_DFLT(readfile, "file to read startup script from", ""), + INIT_PARAM_DFLT(symbolfile, "file to read symbols from", ""), INIT_PARAM_DFLT(init_param, "numerical value to pass into simulator", 0), INIT_PARAM_DFLT(system_type, "Type of system we are emulating", 34), INIT_PARAM_DFLT(system_rev, "Revision of system we are emulating", 1<<10) @@ -267,6 +269,7 @@ CREATE_SIM_OBJECT(AlphaSystem) p->boot_osflags = boot_osflags; p->init_param = init_param; p->readfile = readfile; + p->symbolfile = symbolfile; p->system_type = system_type; p->system_rev = system_rev; return new AlphaSystem(p); diff --git a/src/arch/alpha/tru64/system.cc b/src/arch/alpha/tru64/system.cc index 3ef1e4d3c..00918bda4 100644 --- a/src/arch/alpha/tru64/system.cc +++ b/src/arch/alpha/tru64/system.cc @@ -103,6 +103,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(Tru64AlphaSystem) Param<string> boot_osflags; Param<string> readfile; + Param<string> symbolfile; Param<unsigned int> init_param; Param<uint64_t> system_type; @@ -122,6 +123,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(Tru64AlphaSystem) INIT_PARAM_DFLT(boot_osflags, "flags to pass to the kernel during boot", "a"), INIT_PARAM_DFLT(readfile, "file to read startup script from", ""), + INIT_PARAM_DFLT(symbolfile, "file to read symbols from", ""), INIT_PARAM_DFLT(init_param, "numerical value to pass into simulator", 0), INIT_PARAM_DFLT(system_type, "Type of system we are emulating", 12), INIT_PARAM_DFLT(system_rev, "Revision of system we are emulating", 2<<1) @@ -141,6 +143,7 @@ CREATE_SIM_OBJECT(Tru64AlphaSystem) p->boot_osflags = boot_osflags; p->init_param = init_param; p->readfile = readfile; + p->symbolfile = symbolfile; p->system_type = system_type; p->system_rev = system_rev; diff --git a/src/cpu/base.cc b/src/cpu/base.cc index ce440aeff..f00dad7d6 100644 --- a/src/cpu/base.cc +++ b/src/cpu/base.cc @@ -48,6 +48,9 @@ #include "base/trace.hh" +// Hack +#include "sim/stat_control.hh" + using namespace std; vector<BaseCPU *> BaseCPU::cpuList; @@ -57,6 +60,30 @@ vector<BaseCPU *> BaseCPU::cpuList; // been initialized int maxThreadsPerCPU = 1; +void +CPUProgressEvent::process() +{ + Counter temp = cpu->totalInstructions(); +#ifndef NDEBUG + double ipc = double(temp - lastNumInst) / (interval / cpu->cycles(1)); + + DPRINTFN("%s progress event, instructions committed: %lli, IPC: %0.8d\n", + cpu->name(), temp - lastNumInst, ipc); + ipc = 0.0; +#else + cprintf("%lli: %s progress event, instructions committed: %lli\n", + curTick, cpu->name(), temp - lastNumInst); +#endif + lastNumInst = temp; + schedule(curTick + interval); +} + +const char * +CPUProgressEvent::description() +{ + return "CPU Progress event"; +} + #if FULL_SYSTEM BaseCPU::BaseCPU(Params *p) : MemObject(p->name), clock(p->clock), checkInterrupts(true), @@ -67,6 +94,7 @@ BaseCPU::BaseCPU(Params *p) number_of_threads(p->numberOfThreads), system(p->system) #endif { +// currentTick = curTick; DPRINTF(FullCPU, "BaseCPU: Creating object, mem address %#x.\n", this); // add self to global list of CPUs @@ -128,6 +156,12 @@ BaseCPU::BaseCPU(Params *p) p->max_loads_all_threads, *counter); } + if (p->stats_reset_inst != 0) { + Stats::SetupEvent(Stats::Reset, p->stats_reset_inst, 0, comInstEventQueue[0]); + cprintf("Stats reset event scheduled for %lli insts\n", + p->stats_reset_inst); + } + #if FULL_SYSTEM memset(interrupts, 0, sizeof(interrupts)); intstatus = 0; @@ -153,7 +187,6 @@ BaseCPU::BaseCPU(Params *p) if (params->profile) profileEvent = new ProfileEvent(this, params->profile); #endif - } BaseCPU::Params::Params() @@ -188,6 +221,11 @@ BaseCPU::startup() if (!params->deferRegistration && profileEvent) profileEvent->schedule(curTick); #endif + + if (params->progress_interval) { + new CPUProgressEvent(&mainEventQueue, params->progress_interval, + this); + } } @@ -238,7 +276,11 @@ BaseCPU::registerThreadContexts() void BaseCPU::switchOut() { - panic("This CPU doesn't support sampling!"); +// panic("This CPU doesn't support sampling!"); +#if FULL_SYSTEM + if (profileEvent && profileEvent->scheduled()) + profileEvent->deschedule(); +#endif } void @@ -261,18 +303,22 @@ BaseCPU::takeOverFrom(BaseCPU *oldCPU) assert(newTC->getProcessPtr() == oldTC->getProcessPtr()); newTC->getProcessPtr()->replaceThreadContext(newTC, newTC->readCpuId()); #endif + +// TheISA::compareXCs(oldXC, newXC); } #if FULL_SYSTEM for (int i = 0; i < TheISA::NumInterruptLevels; ++i) interrupts[i] = oldCPU->interrupts[i]; intstatus = oldCPU->intstatus; + checkInterrupts = oldCPU->checkInterrupts; for (int i = 0; i < threadContexts.size(); ++i) threadContexts[i]->profileClear(); - if (profileEvent) - profileEvent->schedule(curTick); + // The Sampler must take care of this! +// if (profileEvent) +// profileEvent->schedule(curTick); #endif } diff --git a/src/cpu/base.hh b/src/cpu/base.hh index 2be6e4e81..2a3fd9b56 100644 --- a/src/cpu/base.hh +++ b/src/cpu/base.hh @@ -46,6 +46,23 @@ class ThreadContext; class System; class Port; +class CPUProgressEvent : public Event +{ + protected: + Tick interval; + Counter lastNumInst; + BaseCPU *cpu; + + public: + CPUProgressEvent(EventQueue *q, Tick ival, BaseCPU *_cpu) + : Event(q, Event::Stat_Event_Pri), interval(ival), lastNumInst(0), cpu(_cpu) + { schedule(curTick + interval); } + + void process(); + + virtual const char *description(); +}; + class BaseCPU : public MemObject { protected: @@ -53,6 +70,7 @@ class BaseCPU : public MemObject Tick clock; public: +// Tick currentTick; inline Tick frequency() const { return Clock::Frequency / clock; } inline Tick cycles(int numCycles) const { return clock * numCycles; } inline Tick curCycle() const { return curTick / clock; } @@ -120,6 +138,7 @@ class BaseCPU : public MemObject Counter max_insts_all_threads; Counter max_loads_any_thread; Counter max_loads_all_threads; + Counter stats_reset_inst; Tick clock; bool functionTrace; Tick functionTraceStart; @@ -128,6 +147,7 @@ class BaseCPU : public MemObject int cpu_id; Tick profile; #endif + Tick progress_interval; BaseCPU *checker; Params(); diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 3158aa9cf..926bfcbb2 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -197,7 +197,7 @@ class BaseDynInst : public FastAlloc, public RefCounted union Result { uint64_t integer; - float fp; +// float fp; double dbl; }; @@ -394,7 +394,7 @@ class BaseDynInst : public FastAlloc, public RefCounted uint64_t readIntResult() { return instResult.integer; } /** Returns the result of a floating point instruction. */ - float readFloatResult() { return instResult.fp; } + float readFloatResult() { return (float)instResult.dbl; } /** Returns the result of a floating point (double) instruction. */ double readDoubleResult() { return instResult.dbl; } @@ -419,7 +419,8 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Records an fp register being set to a value. */ void setFloatReg(const StaticInst *si, int idx, FloatReg val) { - instResult.fp = val; +// instResult.fp = val; + instResult.dbl = (double)val; } /** Records an fp register being set to an integer value. */ diff --git a/src/cpu/checker/cpu.hh b/src/cpu/checker/cpu.hh index 6d6ae1e0a..737b4b5d4 100644 --- a/src/cpu/checker/cpu.hh +++ b/src/cpu/checker/cpu.hh @@ -102,6 +102,7 @@ class CheckerCPU : public BaseCPU Process *process; #endif bool exitOnError; + bool updateOnError; bool warnOnlyOnLoadError; }; @@ -148,7 +149,7 @@ class CheckerCPU : public BaseCPU union Result { uint64_t integer; - float fp; +// float fp; double dbl; }; @@ -269,7 +270,7 @@ class CheckerCPU : public BaseCPU { int reg_idx = si->destRegIdx(idx) - TheISA::FP_Base_DepTag; thread->setFloatReg(reg_idx, val); - result.fp = val; + result.dbl = (double)val; } void setFloatRegBits(const StaticInst *si, int idx, FloatRegBits val, @@ -318,7 +319,7 @@ class CheckerCPU : public BaseCPU return thread->setMiscRegWithEffect(misc_reg, val); } - void recordPCChange(uint64_t val) { changedPC = true; } + void recordPCChange(uint64_t val) { changedPC = true; newPC = val; } void recordNextPCChange(uint64_t val) { changedNextPC = true; } bool translateInstReq(Request *req); @@ -360,6 +361,7 @@ class CheckerCPU : public BaseCPU uint64_t newPC; bool changedNextPC; bool exitOnError; + bool updateOnError; bool warnOnlyOnLoadError; InstSeqNum youngestSN; @@ -376,7 +378,7 @@ class Checker : public CheckerCPU { public: Checker(Params *p) - : CheckerCPU(p) + : CheckerCPU(p), updateThisCycle(false), unverifiedInst(NULL) { } void switchOut(); @@ -393,12 +395,19 @@ class Checker : public CheckerCPU private: void handleError(DynInstPtr &inst) { - if (exitOnError) + if (exitOnError) { dumpAndExit(inst); + } else if (updateOnError) { + updateThisCycle = true; + } } void dumpAndExit(DynInstPtr &inst); + bool updateThisCycle; + + DynInstPtr unverifiedInst; + std::list<DynInstPtr> instList; typedef typename std::list<DynInstPtr>::iterator InstListIt; void dumpInsts(); diff --git a/src/cpu/checker/cpu_impl.hh b/src/cpu/checker/cpu_impl.hh index 81f97726c..3bb81c4b9 100644 --- a/src/cpu/checker/cpu_impl.hh +++ b/src/cpu/checker/cpu_impl.hh @@ -94,6 +94,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) } } + unverifiedInst = inst; + // Try to check all instructions that are completed, ending if we // run out of instructions to check or if an instruction is not // yet completed. @@ -171,7 +173,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) thread->setPC(thread->readNextPC()); thread->setNextPC(thread->readNextPC() + sizeof(MachInst)); - return; + break; } else { // The instruction is carrying an ITB fault. Handle // the fault and see if our results match the CPU on @@ -220,7 +222,8 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) thread->funcExeInst++; - fault = curStaticInst->execute(this, NULL); + if (!inst->isUnverifiable()) + fault = curStaticInst->execute(this, NULL); // Checks to make sure instrution results are correct. validateExecution(inst); @@ -289,6 +292,7 @@ Checker<DynInstPtr>::verify(DynInstPtr &completed_inst) break; } } + unverifiedInst = NULL; } template <class DynInstPtr> @@ -395,6 +399,23 @@ template <class DynInstPtr> void Checker<DynInstPtr>::validateState() { + if (updateThisCycle) { + warn("%lli: Instruction PC %#x results didn't match up, copying all " + "registers from main CPU", curTick, unverifiedInst->readPC()); + // Heavy-weight copying of all registers + cpuXC->copyArchRegs(unverifiedInst->xcBase()); + // Also advance the PC. Hopefully no PC-based events happened. +#if THE_ISA != MIPS_ISA + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextPC() + sizeof(MachInst)); +#else + // go to the next instruction + cpuXC->setPC(cpuXC->readNextPC()); + cpuXC->setNextPC(cpuXC->readNextNPC()); + cpuXC->setNextNPC(cpuXC->readNextNPC() + sizeof(MachInst)); +#endif + updateThisCycle = false; } template <class DynInstPtr> diff --git a/src/cpu/o3/alpha/cpu_builder.cc b/src/cpu/o3/alpha/cpu_builder.cc index 5e767655d..fbf1f342c 100644 --- a/src/cpu/o3/alpha/cpu_builder.cc +++ b/src/cpu/o3/alpha/cpu_builder.cc @@ -56,6 +56,7 @@ SimObjectParam<System *> system; Param<int> cpu_id; SimObjectParam<AlphaITB *> itb; SimObjectParam<AlphaDTB *> dtb; +Param<Tick> profile; #else SimObjectVectorParam<Process *> workload; #endif // FULL_SYSTEM @@ -68,6 +69,8 @@ Param<Counter> max_insts_any_thread; Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; +Param<Counter> stats_reset_inst; +Param<Tick> progress_interval; Param<unsigned> cachePorts; @@ -162,6 +165,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU) INIT_PARAM(cpu_id, "processor ID"), INIT_PARAM(itb, "Instruction translation buffer"), INIT_PARAM(dtb, "Data translation buffer"), + INIT_PARAM(profile, ""), #else INIT_PARAM(workload, "Processes to run"), #endif // FULL_SYSTEM @@ -184,6 +188,10 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivO3CPU) "Terminate when all threads have reached this load" "count", 0), + INIT_PARAM_DFLT(stats_reset_inst, + "blah", + 0), + INIT_PARAM_DFLT(progress_interval, "Progress interval", 0), INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), @@ -305,6 +313,7 @@ CREATE_SIM_OBJECT(DerivO3CPU) params->cpu_id = cpu_id; params->itb = itb; params->dtb = dtb; + params->profile = profile; #else params->workload = workload; #endif // FULL_SYSTEM @@ -317,6 +326,8 @@ CREATE_SIM_OBJECT(DerivO3CPU) params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; params->max_loads_all_threads = max_loads_all_threads; + params->stats_reset_inst = stats_reset_inst; + params->progress_interval = progress_interval; // // Caches diff --git a/src/cpu/o3/checker_builder.cc b/src/cpu/o3/checker_builder.cc index 782d963b0..ad83ec57a 100644 --- a/src/cpu/o3/checker_builder.cc +++ b/src/cpu/o3/checker_builder.cc @@ -64,6 +64,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker) Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; + Param<Counter> stats_reset_inst; + Param<Tick> progress_interval; #if FULL_SYSTEM SimObjectParam<AlphaITB *> itb; @@ -78,6 +80,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(O3Checker) Param<bool> defer_registration; Param<bool> exitOnError; + Param<bool> updateOnError; Param<bool> warnOnlyOnLoadError; Param<bool> function_trace; Param<Tick> function_trace_start; @@ -94,6 +97,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker) "terminate when any thread reaches this load count"), INIT_PARAM(max_loads_all_threads, "terminate when all threads have reached this load count"), + INIT_PARAM(stats_reset_inst, + "blah"), + INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0), #if FULL_SYSTEM INIT_PARAM(itb, "Instruction TLB"), @@ -109,6 +115,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(O3Checker) INIT_PARAM(defer_registration, "defer system registration (for sampling)"), INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"), INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load " "result errors", false), INIT_PARAM(function_trace, "Enable function trace"), @@ -126,7 +133,9 @@ CREATE_SIM_OBJECT(O3Checker) params->max_insts_all_threads = 0; params->max_loads_any_thread = 0; params->max_loads_all_threads = 0; + params->stats_reset_inst = 0; params->exitOnError = exitOnError; + params->updateOnError = updateOnError; params->warnOnlyOnLoadError = warnOnlyOnLoadError; params->deferRegistration = defer_registration; params->functionTrace = function_trace; @@ -139,6 +148,10 @@ CREATE_SIM_OBJECT(O3Checker) temp = max_insts_all_threads; temp = max_loads_any_thread; temp = max_loads_all_threads; + temp = stats_reset_inst; + Tick temp2 = progress_interval; + params->progress_interval = 0; + temp2++; #if FULL_SYSTEM params->itb = itb; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 34f487e2c..6ae01ae67 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -1083,12 +1083,26 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) // Generate trap squash event. generateTrapEvent(tid); - +// warn("%lli fault (%d) handled @ PC %08p", curTick, inst_fault->name(), head_inst->readPC()); return false; } updateComInstStats(head_inst); +#if FULL_SYSTEM + if (thread[tid]->profile) { +// bool usermode = +// (cpu->readMiscReg(AlphaISA::IPR_DTB_CM, tid) & 0x18) != 0; +// thread[tid]->profilePC = usermode ? 1 : head_inst->readPC(); + thread[tid]->profilePC = head_inst->readPC(); + ProfileNode *node = thread[tid]->profile->consume(thread[tid]->getXCProxy(), + head_inst->staticInst); + + if (node) + thread[tid]->profileNode = node; + } +#endif + if (head_inst->traceData) { head_inst->traceData->setFetchSeq(head_inst->seqNum); head_inst->traceData->setCPSeq(thread[tid]->numInst); @@ -1102,6 +1116,9 @@ DefaultCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num) head_inst->renamedDestRegIdx(i)); } + if (head_inst->isCopy()) + panic("Should not commit any copy instructions!"); + // Finally clear the head ROB entry. rob->retireHead(tid); diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 19ab7f4c5..4279df6f7 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -33,6 +33,7 @@ #include "config/use_checker.hh" #if FULL_SYSTEM +#include "cpu/quiesce_event.hh" #include "sim/system.hh" #else #include "sim/process.hh" @@ -793,6 +794,8 @@ template <class Impl> unsigned int FullO3CPU<Impl>::drain(Event *drain_event) { + DPRINTF(O3CPU, "Switching out\n"); + BaseCPU::switchOut(_sampler); drainCount = 0; fetch.drain(); decode.drain(); @@ -863,6 +866,7 @@ FullO3CPU<Impl>::switchOut() { fetch.switchOut(); rename.switchOut(); + iew.switchOut(); commit.switchOut(); instList.clear(); while (!removeList.empty()) { @@ -931,6 +935,45 @@ FullO3CPU<Impl>::takeOverFrom(BaseCPU *oldCPU) } template <class Impl> +void +FullO3CPU<Impl>::serialize(std::ostream &os) +{ + BaseCPU::serialize(os); + nameOut(os, csprintf("%s.tickEvent", name())); + tickEvent.serialize(os); + + // Use SimpleThread's ability to checkpoint to make it easier to + // write out the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + for (int i = 0; i < thread.size(); i++) { + nameOut(os, csprintf("%s.xc.%i", name(), i)); + temp.copyXC(thread[i]->getXCProxy()); + temp.serialize(os); + } +} + +template <class Impl> +void +FullO3CPU<Impl>::unserialize(Checkpoint *cp, const std::string §ion) +{ + BaseCPU::unserialize(cp, section); + tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); + + // Use SimpleThread's ability to checkpoint to make it easier to + // read in the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + for (int i = 0; i < thread.size(); i++) { + temp.copyXC(thread[i]->getXCProxy()); + temp.unserialize(cp, csprintf("%s.xc.%i", section, i)); + thread[i]->getXCProxy()->copyArchRegs(temp.getProxy()); + } +} + +template <class Impl> uint64_t FullO3CPU<Impl>::readIntReg(int reg_idx) { diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 1e080181c..2d447bfe5 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -442,6 +442,7 @@ DefaultFetch<Impl>::takeOverFrom() wroteToTimeBuffer = false; _status = Inactive; switchedOut = false; + interruptPending = false; branchPred.takeOverFrom(); } @@ -563,7 +564,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid unsigned flags = 0; #endif // FULL_SYSTEM - if (cacheBlocked || (interruptPending && flags == 0)) { + if (cacheBlocked || isSwitchedOut() || (interruptPending && flags == 0)) { // Hold off fetch from getting new instructions when: // Cache is blocked, or // while an interrupt is pending and we're not in PAL mode, or @@ -1152,8 +1153,8 @@ DefaultFetch<Impl>::fetch(bool &status_change) fetch_PC = next_PC; if (instruction->isQuiesce()) { - warn("cycle %lli: Quiesce instruction encountered, halting fetch!", - curTick); +// warn("%lli: Quiesce instruction encountered, halting fetch!", +// curTick); fetchStatus[tid] = QuiescePending; ++numInst; status_change = true; @@ -1268,7 +1269,7 @@ DefaultFetch<Impl>::fetch(bool &status_change) fetchStatus[tid] = TrapPending; status_change = true; - warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]); +// warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]); #else // !FULL_SYSTEM warn("cycle %lli: fault (%s) detected @ PC %08p", curTick, fault->name(), PC[tid]); #endif // FULL_SYSTEM diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 76fa008ee..a400c9fa8 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -216,6 +216,7 @@ class DefaultIEW if (++wbOutstanding == wbMax) ableToIssue = false; DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding); + assert(wbOutstanding <= wbMax); #ifdef DEBUG wbList.insert(sn); #endif @@ -226,6 +227,7 @@ class DefaultIEW if (wbOutstanding-- == wbMax) ableToIssue = true; DPRINTF(IEW, "wbOutstanding: %i\n", wbOutstanding); + assert(wbOutstanding >= 0); #ifdef DEBUG assert(wbList.find(sn) != wbList.end()); wbList.erase(sn); @@ -450,7 +452,9 @@ class DefaultIEW unsigned wbCycle; /** Number of instructions in flight that will writeback. */ - unsigned wbOutstanding; + + /** Number of instructions in flight that will writeback. */ + int wbOutstanding; /** Writeback width. */ unsigned wbWidth; @@ -507,6 +511,8 @@ class DefaultIEW Stats::Scalar<> iewExecutedInsts; /** Stat for total number of executed load instructions. */ Stats::Vector<> iewExecLoadInsts; + /** Stat for total number of executed store instructions. */ +// Stats::Scalar<> iewExecStoreInsts; /** Stat for total number of squashed instructions skipped at execute. */ Stats::Scalar<> iewExecSquashedInsts; /** Number of executed software prefetches. */ diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index e9b24a6d4..c82f6dd21 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -162,17 +162,17 @@ DefaultIEW<Impl>::regStats() branchMispredicts = predictedTakenIncorrect + predictedNotTakenIncorrect; iewExecutedInsts - .name(name() + ".EXEC:insts") + .name(name() + ".iewExecutedInsts") .desc("Number of executed instructions"); iewExecLoadInsts .init(cpu->number_of_threads) - .name(name() + ".EXEC:loads") + .name(name() + ".iewExecLoadInsts") .desc("Number of load instructions executed") .flags(total); iewExecSquashedInsts - .name(name() + ".EXEC:squashedInsts") + .name(name() + ".iewExecSquashedInsts") .desc("Number of squashed instructions skipped in execute"); iewExecutedSwp @@ -372,6 +372,8 @@ DefaultIEW<Impl>::switchOut() { // Clear any state. switchedOut = true; + assert(insts[0].empty()); + assert(skidBuffer[0].empty()); instQueue.switchOut(); ldstQueue.switchOut(); @@ -410,7 +412,6 @@ DefaultIEW<Impl>::takeOverFrom() updateLSQNextCycle = false; - // @todo: Fix hardcoded number for (int i = 0; i < issueToExecQueue.getSize(); ++i) { issueToExecQueue.advance(); } @@ -611,9 +612,11 @@ DefaultIEW<Impl>::instToCommit(DynInstPtr &inst) wbNumInst = 0; } - assert((wbCycle * wbWidth + wbNumInst) < wbMax); + assert((wbCycle * wbWidth + wbNumInst) <= wbMax); } + DPRINTF(IEW, "Current wb cycle: %i, width: %i, numInst: %i\nwbActual:%i\n", + wbCycle, wbWidth, wbNumInst, wbCycle * wbWidth + wbNumInst); // Add finished instruction to queue to commit. (*iewQueue)[wbCycle].insts[wbNumInst] = inst; (*iewQueue)[wbCycle].size++; @@ -903,6 +906,22 @@ DefaultIEW<Impl>::emptyRenameInsts(unsigned tid) template <class Impl> void +DefaultIEW<Impl>::emptyRenameInsts(unsigned tid) +{ + while (!insts[tid].empty()) { + if (insts[tid].front()->isLoad() || + insts[tid].front()->isStore() ) { + toRename->iewInfo[tid].dispatchedToLSQ++; + } + + toRename->iewInfo[tid].dispatched++; + + insts[tid].pop(); + } +} + +template <class Impl> +void DefaultIEW<Impl>::wakeCPU() { cpu->wakeCPU(); @@ -1273,13 +1292,23 @@ DefaultIEW<Impl>::executeInsts() // event adds the instruction to the queue to commit fault = ldstQueue.executeLoad(inst); } else if (inst->isStore()) { - ldstQueue.executeStore(inst); + fault = ldstQueue.executeStore(inst); // If the store had a fault then it may not have a mem req - if (inst->req && !(inst->req->getFlags() & LOCKED)) { + if (!inst->isStoreConditional() && fault == NoFault) { + inst->setExecuted(); + + instToCommit(inst); + } else if (fault != NoFault) { + // If the instruction faulted, then we need to send it along to commit + // without the instruction completing. + + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. inst->setExecuted(); instToCommit(inst); + activityThisCycle(); } // Store conditionals will mark themselves as @@ -1404,7 +1433,7 @@ DefaultIEW<Impl>::writebackInsts() // E.g. Uncached loads have not actually executed when they // are first sent to commit. Instead commit must tell the LSQ // when it's ready to execute the uncached load. - if (!inst->isSquashed() && inst->isExecuted()) { + if (!inst->isSquashed() && inst->isExecuted() && inst->getFault() == NoFault) { int dependents = instQueue.wakeDependents(inst); for (int i = 0; i < inst->numDestRegs(); i++) { diff --git a/src/cpu/o3/inst_queue.hh b/src/cpu/o3/inst_queue.hh index d745faf7b..3dd4dc658 100644 --- a/src/cpu/o3/inst_queue.hh +++ b/src/cpu/o3/inst_queue.hh @@ -479,13 +479,13 @@ class InstructionQueue /** Distribution of number of instructions in the queue. * @todo: Need to create struct to track the entry time for each * instruction. */ - Stats::VectorDistribution<> queueResDist; +// Stats::VectorDistribution<> queueResDist; /** Distribution of the number of instructions issued. */ Stats::Distribution<> numIssuedDist; /** Distribution of the cycles it takes to issue an instruction. * @todo: Need to create struct to track the ready time for each * instruction. */ - Stats::VectorDistribution<> issueDelayDist; +// Stats::VectorDistribution<> issueDelayDist; /** Number of times an instruction could not be issued because a * FU was busy. diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index 47634f645..6edb528a9 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -230,7 +230,7 @@ InstructionQueue<Impl>::regStats() .name(name() + ".iqSquashedNonSpecRemoved") .desc("Number of squashed non-spec instructions that were removed") .prereq(iqSquashedNonSpecRemoved); - +/* queueResDist .init(Num_OpClasses, 0, 99, 2) .name(name() + ".IQ:residence:") @@ -240,6 +240,7 @@ InstructionQueue<Impl>::regStats() for (int i = 0; i < Num_OpClasses; ++i) { queueResDist.subname(i, opClassStrings[i]); } +*/ numIssuedDist .init(0,totalWidth,1) .name(name() + ".ISSUE:issued_per_cycle") @@ -268,7 +269,7 @@ InstructionQueue<Impl>::regStats() // // How long did instructions for a particular FU type wait prior to issue // - +/* issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") @@ -281,7 +282,7 @@ InstructionQueue<Impl>::regStats() subname << opClassStrings[i] << "_delay"; issueDelayDist.subname(i, subname.str()); } - +*/ issueRate .name(name() + ".ISSUE:rate") .desc("Inst issue rate") @@ -385,8 +386,16 @@ template <class Impl> void InstructionQueue<Impl>::switchOut() { +/* + if (!instList[0].empty() || (numEntries != freeEntries) || + !readyInsts[0].empty() || !nonSpecInsts.empty() || !listOrder.empty()) { + dumpInsts(); +// assert(0); + } +*/ resetState(); dependGraph.reset(); + instsToExecute.clear(); switchedOut = true; for (int i = 0; i < numThreads; ++i) { memDepUnit[i].switchOut(); @@ -642,9 +651,12 @@ template <class Impl> void InstructionQueue<Impl>::processFUCompletion(DynInstPtr &inst, int fu_idx) { + DPRINTF(IQ, "Processing FU completion [sn:%lli]\n", inst->seqNum); // The CPU could have been sleeping until this op completed (*extremely* // long latency op). Wake it if it was. This may be overkill. if (isSwitchedOut()) { + DPRINTF(IQ, "FU completion not processed, IQ is switched out [sn:%lli]\n", + inst->seqNum); return; } @@ -1036,6 +1048,10 @@ InstructionQueue<Impl>::doSquash(unsigned tid) (squashed_inst->isMemRef() && !squashed_inst->memOpDone)) { + DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " + "squashed.\n", + tid, squashed_inst->seqNum, squashed_inst->readPC()); + // Remove the instruction from the dependency list. if (!squashed_inst->isNonSpeculative() && !squashed_inst->isStoreConditional() && @@ -1066,7 +1082,7 @@ InstructionQueue<Impl>::doSquash(unsigned tid) ++iqSquashedOperandsExamined; } - } else { + } else if (!squashed_inst->isStoreConditional() || !squashed_inst->isCompleted()) { NonSpecMapIt ns_inst_it = nonSpecInsts.find(squashed_inst->seqNum); assert(ns_inst_it != nonSpecInsts.end()); @@ -1093,10 +1109,6 @@ InstructionQueue<Impl>::doSquash(unsigned tid) count[squashed_inst->threadNumber]--; ++freeEntries; - - DPRINTF(IQ, "[tid:%i]: Instruction [sn:%lli] PC %#x " - "squashed.\n", - tid, squashed_inst->seqNum, squashed_inst->readPC()); } instList[tid].erase(squash_it--); diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 2bbab71f0..a1ac5adb8 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -167,6 +167,16 @@ LSQ<Impl>::regStats() template<class Impl> void +LSQ<Impl>::regStats() +{ + //Initialize LSQs + for (int tid=0; tid < numThreads; tid++) { + thread[tid].regStats(); + } +} + +template<class Impl> +void LSQ<Impl>::setActiveThreads(std::list<unsigned> *at_ptr) { activeThreads = at_ptr; diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 1358a3699..8537e9dd7 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -407,20 +407,9 @@ class LSQUnit { // Will also need how many read/write ports the Dcache has. Or keep track // of that in stage that is one level up, and only call executeLoad/Store // the appropriate number of times. - /** Total number of loads forwaded from LSQ stores. */ Stats::Scalar<> lsqForwLoads; - /** Total number of loads ignored due to invalid addresses. */ - Stats::Scalar<> invAddrLoads; - - /** Total number of squashed loads. */ - Stats::Scalar<> lsqSquashedLoads; - - /** Total number of responses from the memory system that are - * ignored due to the instruction already being squashed. */ - Stats::Scalar<> lsqIgnoredResponses; - /** Total number of squashed stores. */ Stats::Scalar<> lsqSquashedStores; diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index fa716c712..2922b81bd 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -180,6 +180,10 @@ LSQUnit<Impl>::regStats() .name(name() + ".ignoredResponses") .desc("Number of memory responses ignored because the instruction is squashed"); + lsqMemOrderViolation + .name(name() + ".memOrderViolation") + .desc("Number of memory ordering violations"); + lsqSquashedStores .name(name() + ".squashedStores") .desc("Number of stores squashed"); @@ -220,8 +224,10 @@ void LSQUnit<Impl>::switchOut() { switchedOut = true; - for (int i = 0; i < loadQueue.size(); ++i) + for (int i = 0; i < loadQueue.size(); ++i) { + assert(!loadQueue[i]); loadQueue[i] = NULL; + } assert(storesToWB == 0); } @@ -408,6 +414,11 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst) if (load_fault != NoFault) { // Send this instruction to commit, also make sure iew stage // realizes there is activity. + // Mark it as executed unless it is an uncached load that + // needs to hit the head of commit. + if (!(inst->req->flags & UNCACHEABLE) || inst->isAtCommit()) { + inst->setExecuted(); + } iewStage->instToCommit(inst); iewStage->activityThisCycle(); } @@ -467,6 +478,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst) // A load incorrectly passed this store. Squash and refetch. // For now return a fault to show that it was unsuccessful. memDepViolator = loadQueue[load_idx]; + ++lsqMemOrderViolation; return genMachineCheckFault(); } diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index 16f67a4e0..c649ca385 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -109,6 +109,9 @@ template <class MemDepPred, class Impl> void MemDepUnit<MemDepPred, Impl>::switchOut() { + assert(instList[0].empty()); + assert(instsToReplay.empty()); + assert(memDepHash.empty()); // Clear any state. for (int i = 0; i < Impl::MaxThreads; ++i) { instList[i].clear(); diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index ba26a01dd..177b9cb87 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -417,6 +417,8 @@ class DefaultRename /** The maximum skid buffer size. */ unsigned skidBufferMax; + PhysRegIndex maxPhysicalRegs; + /** Enum to record the source of a structure full stall. Can come from * either ROB, IQ, LSQ, and it is priortized in that order. */ diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index 782c0fe5f..248d7deb6 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -41,7 +41,8 @@ DefaultRename<Impl>::DefaultRename(Params *params) commitToRenameDelay(params->commitToRenameDelay), renameWidth(params->renameWidth), commitWidth(params->commitWidth), - numThreads(params->numberOfThreads) + numThreads(params->numberOfThreads), + maxPhysicalRegs(params->numPhysIntRegs + params->numPhysFloatRegs) { _status = Inactive; @@ -286,6 +287,11 @@ DefaultRename<Impl>::switchOut() // Put the renamed physical register back on the free list. freeList->addReg(hb_it->newPhysReg); + // Be sure to mark its register as ready if it's a misc register. + if (hb_it->newPhysReg >= maxPhysicalRegs) { + scoreboard->setReg(hb_it->newPhysReg); + } + historyBuffer[i].erase(hb_it++); } insts[i].clear(); @@ -889,6 +895,11 @@ DefaultRename<Impl>::doSquash(const InstSeqNum &squashed_seq_num, unsigned tid) // Put the renamed physical register back on the free list. freeList->addReg(hb_it->newPhysReg); + // Be sure to mark its register as ready if it's a misc register. + if (hb_it->newPhysReg >= maxPhysicalRegs) { + scoreboard->setReg(hb_it->newPhysReg); + } + historyBuffer[tid].erase(hb_it++); ++renameUndoneMaps; diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh index b6f2e14c0..0247deb52 100644 --- a/src/cpu/o3/thread_state.hh +++ b/src/cpu/o3/thread_state.hh @@ -31,8 +31,11 @@ #ifndef __CPU_O3_THREAD_STATE_HH__ #define __CPU_O3_THREAD_STATE_HH__ +#include "base/callback.hh" +#include "base/output.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" +#include "sim/sim_exit.hh" class Event; class Process; @@ -75,8 +78,22 @@ struct O3ThreadState : public ThreadState { #if FULL_SYSTEM O3ThreadState(O3CPU *_cpu, int _thread_num) : ThreadState(-1, _thread_num), - inSyscall(0), trapPending(0) - { } + cpu(_cpu), inSyscall(0), trapPending(0) + { + if (cpu->params->profile) { + profile = new FunctionProfile(cpu->params->system->kernelSymtab); + Callback *cb = + new MakeCallback<O3ThreadState, + &O3ThreadState::dumpFuncProfile>(this); + registerExitCallback(cb); + } + + // let's fill with a dummy node for now so we don't get a segfault + // on the first cycle when there's no node available. + static ProfileNode dummyNode; + profileNode = &dummyNode; + profilePC = 3; + } #else O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process, int _asid, MemObject *mem) @@ -95,6 +112,14 @@ struct O3ThreadState : public ThreadState { /** Handles the syscall. */ void syscall(int64_t callnum) { process->syscall(callnum, tc); } #endif + +#if FULL_SYSTEM + void dumpFuncProfile() + { + std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name())); + profile->dump(xcProxy, *os); + } +#endif }; #endif // __CPU_O3_THREAD_STATE_HH__ diff --git a/src/cpu/o3/tournament_pred.cc b/src/cpu/o3/tournament_pred.cc index 7cf78dcb1..ffb941c77 100644 --- a/src/cpu/o3/tournament_pred.cc +++ b/src/cpu/o3/tournament_pred.cc @@ -62,6 +62,8 @@ TournamentBP::TournamentBP(unsigned _localPredictorSize, for (int i = 0; i < localPredictorSize; ++i) localCtrs[i].setBits(localCtrBits); + localPredictorMask = floorPow2(localPredictorSize) - 1; + if (!isPowerOf2(localHistoryTableSize)) { fatal("Invalid local history table size!\n"); } @@ -158,7 +160,7 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history) //Lookup in the local predictor to get its branch prediction local_history_idx = calcLocHistIdx(branch_addr); local_predictor_idx = localHistoryTable[local_history_idx] - & localHistoryMask; + & localPredictorMask; local_prediction = localCtrs[local_predictor_idx].read() > threshold; //Lookup in the global predictor to get its branch prediction @@ -176,7 +178,8 @@ TournamentBP::lookup(Addr &branch_addr, void * &bp_history) bp_history = (void *)history; assert(globalHistory < globalPredictorSize && - local_history_idx < localPredictorSize); + local_history_idx < localHistoryTableSize && + local_predictor_idx < localPredictorSize); // Commented code is for doing speculative update of counters and // all histories. @@ -234,7 +237,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history) // Get the local predictor's current prediction local_history_idx = calcLocHistIdx(branch_addr); local_predictor_hist = localHistoryTable[local_history_idx]; - local_predictor_idx = local_predictor_hist & localHistoryMask; + local_predictor_idx = local_predictor_hist & localPredictorMask; // Update the choice predictor to tell it which one was correct if // there was a prediction. @@ -256,6 +259,7 @@ TournamentBP::update(Addr &branch_addr, bool taken, void *bp_history) } assert(globalHistory < globalPredictorSize && + local_history_idx < localHistoryTableSize && local_predictor_idx < localPredictorSize); // Update the counters and local history with the proper diff --git a/src/cpu/o3/tournament_pred.hh b/src/cpu/o3/tournament_pred.hh index 66b4aaae2..472944910 100644 --- a/src/cpu/o3/tournament_pred.hh +++ b/src/cpu/o3/tournament_pred.hh @@ -159,6 +159,9 @@ class TournamentBP /** Size of the local predictor. */ unsigned localPredictorSize; + /** Mask to get the proper index bits into the predictor. */ + unsigned localPredictorMask; + /** Number of bits of the local predictor's counters. */ unsigned localCtrBits; diff --git a/src/cpu/ozone/checker_builder.cc b/src/cpu/ozone/checker_builder.cc index c372e51d6..99ba3e308 100644 --- a/src/cpu/ozone/checker_builder.cc +++ b/src/cpu/ozone/checker_builder.cc @@ -65,6 +65,8 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; + Param<Counter> stats_reset_inst; + Param<Tick> progress_interval; #if FULL_SYSTEM SimObjectParam<AlphaITB *> itb; @@ -79,6 +81,7 @@ BEGIN_DECLARE_SIM_OBJECT_PARAMS(OzoneChecker) Param<bool> defer_registration; Param<bool> exitOnError; + Param<bool> updateOnError; Param<bool> warnOnlyOnLoadError; Param<bool> function_trace; Param<Tick> function_trace_start; @@ -95,6 +98,9 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) "terminate when any thread reaches this load count"), INIT_PARAM(max_loads_all_threads, "terminate when all threads have reached this load count"), + INIT_PARAM(stats_reset_inst, + "blah"), + INIT_PARAM_DFLT(progress_interval, "CPU Progress Interval", 0), #if FULL_SYSTEM INIT_PARAM(itb, "Instruction TLB"), @@ -110,6 +116,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(OzoneChecker) INIT_PARAM(defer_registration, "defer system registration (for sampling)"), INIT_PARAM(exitOnError, "exit on error"), + INIT_PARAM(updateOnError, "Update the checker with the main CPU's state on error"), INIT_PARAM_DFLT(warnOnlyOnLoadError, "warn, but don't exit, if a load " "result errors", false), INIT_PARAM(function_trace, "Enable function trace"), @@ -127,7 +134,9 @@ CREATE_SIM_OBJECT(OzoneChecker) params->max_insts_all_threads = 0; params->max_loads_any_thread = 0; params->max_loads_all_threads = 0; + params->stats_reset_inst = 0; params->exitOnError = exitOnError; + params->updateOnError = updateOnError; params->warnOnlyOnLoadError = warnOnlyOnLoadError; params->deferRegistration = defer_registration; params->functionTrace = function_trace; @@ -140,6 +149,10 @@ CREATE_SIM_OBJECT(OzoneChecker) temp = max_insts_all_threads; temp = max_loads_any_thread; temp = max_loads_all_threads; + temp = stats_reset_inst; + Tick temp2 = progress_interval; + temp2++; + params->progress_interval = 0; #if FULL_SYSTEM params->itb = itb; diff --git a/src/cpu/ozone/cpu.hh b/src/cpu/ozone/cpu.hh index e411c12bd..ece68282f 100644 --- a/src/cpu/ozone/cpu.hh +++ b/src/cpu/ozone/cpu.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 The Regents of The University of Michigan + * Copyright (c) 2006 The Regents of The University of Michigan * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -81,13 +81,13 @@ template <class> class Checker; /** - * Declaration of Out-of-Order CPU class. Basically it is a SimpleCPU with - * simple out-of-order capabilities added to it. It is still a 1 CPI machine - * (?), but is capable of handling cache misses. Basically it models having - * a ROB/IQ by only allowing a certain amount of instructions to execute while - * the cache miss is outstanding. + * Light weight out of order CPU model that approximates an out of + * order CPU. It is separated into a front end and a back end, with + * the template parameter Impl describing the classes used for each. + * The goal is to be able to specify through the Impl the class to use + * for the front end and back end, with different classes used to + * model different levels of detail. */ - template <class Impl> class OzoneCPU : public BaseCPU { @@ -273,6 +273,7 @@ class OzoneCPU : public BaseCPU typedef OzoneThreadState<Impl> ImplState; private: + // Committed thread state for the OzoneCPU. OzoneThreadState<Impl> thread; public: @@ -310,12 +311,6 @@ class OzoneCPU : public BaseCPU tickEvent.squash(); } - private: - Trace::InstRecord *traceData; - - template<typename T> - void trace_data(T data); - public: enum Status { Running, @@ -326,8 +321,6 @@ class OzoneCPU : public BaseCPU Status _status; public: - bool checkInterrupts; - void post_interrupt(int int_num, int index); void zero_fill_64(Addr addr) { @@ -379,6 +372,7 @@ class OzoneCPU : public BaseCPU FrontEnd *frontEnd; BackEnd *backEnd; + private: Status status() const { return _status; } void setStatus(Status new_status) { _status = new_status; } @@ -410,12 +404,11 @@ class OzoneCPU : public BaseCPU // number of idle cycles Stats::Average<> notIdleFraction; Stats::Formula idleFraction; - public: + public: virtual void serialize(std::ostream &os); virtual void unserialize(Checkpoint *cp, const std::string §ion); - #if FULL_SYSTEM /** Translates instruction requestion. */ Fault translateInstReq(RequestPtr &req, OzoneThreadState<Impl> *thread) @@ -582,12 +575,9 @@ class OzoneCPU : public BaseCPU Fault copy(Addr dest); - InstSeqNum globalSeqNum; - public: void squashFromTC(); - // @todo: This can be a useful debug function. Implement it. void dumpInsts() { frontEnd->dumpInsts(); } #if FULL_SYSTEM @@ -605,7 +595,6 @@ class OzoneCPU : public BaseCPU ThreadContext *tcBase() { return tc; } - bool decoupledFrontEnd; struct CommStruct { InstSeqNum doneSeqNum; InstSeqNum nonSpecSeqNum; @@ -614,8 +603,13 @@ class OzoneCPU : public BaseCPU bool stall; }; + + InstSeqNum globalSeqNum; + TimeBuffer<CommStruct> comm; + bool decoupledFrontEnd; + bool lockFlag; Stats::Scalar<> quiesceCycles; diff --git a/src/cpu/ozone/cpu_builder.cc b/src/cpu/ozone/cpu_builder.cc index e239b7a94..e3e4ec433 100644 --- a/src/cpu/ozone/cpu_builder.cc +++ b/src/cpu/ozone/cpu_builder.cc @@ -63,6 +63,7 @@ SimObjectParam<System *> system; Param<int> cpu_id; SimObjectParam<AlphaITB *> itb; SimObjectParam<AlphaDTB *> dtb; +Param<Tick> profile; #else SimObjectVectorParam<Process *> workload; //SimObjectParam<PageTable *> page_table; @@ -76,16 +77,19 @@ Param<Counter> max_insts_any_thread; Param<Counter> max_insts_all_threads; Param<Counter> max_loads_any_thread; Param<Counter> max_loads_all_threads; +Param<Counter> stats_reset_inst; +Param<Tick> progress_interval; //SimObjectParam<BaseCache *> icache; //SimObjectParam<BaseCache *> dcache; Param<unsigned> cachePorts; Param<unsigned> width; +Param<unsigned> frontEndLatency; Param<unsigned> frontEndWidth; +Param<unsigned> backEndLatency; Param<unsigned> backEndWidth; Param<unsigned> backEndSquashLatency; -Param<unsigned> backEndLatency; Param<unsigned> maxInstBufferSize; Param<unsigned> numPhysicalRegs; Param<unsigned> maxOutstandingMemOps; @@ -140,6 +144,7 @@ Param<unsigned> RASSize; Param<unsigned> LQEntries; Param<unsigned> SQEntries; +Param<bool> lsqLimits; Param<unsigned> LFSTSize; Param<unsigned> SSITSize; @@ -181,6 +186,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(cpu_id, "processor ID"), INIT_PARAM(itb, "Instruction translation buffer"), INIT_PARAM(dtb, "Data translation buffer"), + INIT_PARAM(profile, ""), #else INIT_PARAM(workload, "Processes to run"), // INIT_PARAM(page_table, "Page table"), @@ -204,16 +210,21 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) "Terminate when all threads have reached this load" "count", 0), + INIT_PARAM_DFLT(stats_reset_inst, + "blah", + 0), + INIT_PARAM_DFLT(progress_interval, "Progress interval", 0), // INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL), // INIT_PARAM_DFLT(dcache, "L1 data cache", NULL), INIT_PARAM_DFLT(cachePorts, "Cache Ports", 200), INIT_PARAM_DFLT(width, "Width", 1), + INIT_PARAM_DFLT(frontEndLatency, "Front end latency", 1), INIT_PARAM_DFLT(frontEndWidth, "Front end width", 1), + INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(backEndWidth, "Back end width", 1), INIT_PARAM_DFLT(backEndSquashLatency, "Back end squash latency", 1), - INIT_PARAM_DFLT(backEndLatency, "Back end latency", 1), INIT_PARAM_DFLT(maxInstBufferSize, "Maximum instruction buffer size", 16), INIT_PARAM(numPhysicalRegs, "Number of physical registers"), INIT_PARAM_DFLT(maxOutstandingMemOps, "Maximum outstanding memory operations", 4), @@ -274,6 +285,7 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(DerivOzoneCPU) INIT_PARAM(LQEntries, "Number of load queue entries"), INIT_PARAM(SQEntries, "Number of store queue entries"), + INIT_PARAM_DFLT(lsqLimits, "LSQ size limits dispatch", true), INIT_PARAM(LFSTSize, "Last fetched store table size"), INIT_PARAM(SSITSize, "Store set ID table size"), @@ -336,6 +348,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->cpu_id = cpu_id; params->itb = itb; params->dtb = dtb; + params->profile = profile; #else params->workload = workload; // params->pTable = page_table; @@ -347,6 +360,8 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->max_insts_all_threads = max_insts_all_threads; params->max_loads_any_thread = max_loads_any_thread; params->max_loads_all_threads = max_loads_all_threads; + params->stats_reset_inst = stats_reset_inst; + params->progress_interval = progress_interval; // // Caches @@ -357,6 +372,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->width = width; params->frontEndWidth = frontEndWidth; + params->frontEndLatency = frontEndLatency; params->backEndWidth = backEndWidth; params->backEndSquashLatency = backEndSquashLatency; params->backEndLatency = backEndLatency; @@ -414,6 +430,7 @@ CREATE_SIM_OBJECT(DerivOzoneCPU) params->LQEntries = LQEntries; params->SQEntries = SQEntries; + params->lsqLimits = lsqLimits; params->SSITSize = SSITSize; params->LFSTSize = LFSTSize; diff --git a/src/cpu/ozone/cpu_impl.hh b/src/cpu/ozone/cpu_impl.hh index 80f18434c..5c8b5001d 100644 --- a/src/cpu/ozone/cpu_impl.hh +++ b/src/cpu/ozone/cpu_impl.hh @@ -50,7 +50,6 @@ #include "arch/alpha/types.hh" #include "arch/vtophys.hh" #include "base/callback.hh" -//#include "base/remote_gdb.hh" #include "cpu/profile.hh" #include "kern/kernel_stats.hh" #include "sim/faults.hh" @@ -68,15 +67,6 @@ using namespace TheISA; template <class Impl> -template<typename T> -void -OzoneCPU<Impl>::trace_data(T data) { - if (traceData) { - traceData->setData(data); - } -} - -template <class Impl> OzoneCPU<Impl>::TickEvent::TickEvent(OzoneCPU *c, int w) : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c), width(w) { @@ -112,7 +102,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) _status = Idle; if (p->checker) { -#if USE_CHECKER + BaseCPU *temp_checker = p->checker; checker = dynamic_cast<Checker<DynInstPtr> *>(temp_checker); checker->setMemory(mem); @@ -126,6 +116,8 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) panic("Checker enabled but not compiled in!"); #endif } else { + // If checker is not being used, then the xcProxy points + // directly to the CPU's ExecContext. checker = NULL; thread.tc = &ozoneTC; tc = &ozoneTC; @@ -138,7 +130,7 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) thread.setStatus(ThreadContext::Suspended); #if FULL_SYSTEM - /***** All thread state stuff *****/ + // Setup thread state stuff. thread.cpu = this; thread.setTid(0); @@ -187,12 +179,15 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) frontEnd->setBackEnd(backEnd); backEnd->setFrontEnd(frontEnd); - decoupledFrontEnd = p->decoupledFrontEnd; - globalSeqNum = 1; +#if FULL_SYSTEM checkInterrupts = false; +#endif + + lockFlag = 0; + // Setup rename table, initializing all values to ready. for (int i = 0; i < TheISA::TotalNumRegs; ++i) { thread.renameTable[i] = new DynInst(this); thread.renameTable[i]->setResultReady(); @@ -233,8 +228,6 @@ OzoneCPU<Impl>::OzoneCPU(Params *p) thread.setVirtPort(virt_port); #endif - lockFlag = 0; - DPRINTF(OzoneCPU, "OzoneCPU: Created Ozone cpu object.\n"); } @@ -247,6 +240,7 @@ template <class Impl> void OzoneCPU<Impl>::switchOut() { + BaseCPU::switchOut(_sampler); switchCount = 0; // Front end needs state from back end, so switch out the back end first. backEnd->switchOut(); @@ -257,6 +251,8 @@ template <class Impl> void OzoneCPU<Impl>::signalSwitched() { + // Only complete the switchout when both the front end and back + // end have signalled they are ready to switch. if (++switchCount == 2) { backEnd->doSwitchOut(); frontEnd->doSwitchOut(); @@ -266,6 +262,17 @@ OzoneCPU<Impl>::signalSwitched() #endif _status = SwitchedOut; +#ifndef NDEBUG + // Loop through all registers + for (int i = 0; i < AlphaISA::TotalNumRegs; ++i) { + assert(thread.renameTable[i] == frontEnd->renameTable[i]); + + assert(thread.renameTable[i] == backEnd->renameTable[i]); + + DPRINTF(OzoneCPU, "Checking if register %i matches.\n", i); + } +#endif + if (tickEvent.scheduled()) tickEvent.squash(); } @@ -278,13 +285,25 @@ OzoneCPU<Impl>::takeOverFrom(BaseCPU *oldCPU) { BaseCPU::takeOverFrom(oldCPU); + thread.trapPending = false; + thread.inSyscall = false; + backEnd->takeOverFrom(); frontEnd->takeOverFrom(); + frontEnd->renameTable.copyFrom(thread.renameTable); + backEnd->renameTable.copyFrom(thread.renameTable); assert(!tickEvent.scheduled()); +#ifndef NDEBUG + // Check rename table. + for (int i = 0; i < TheISA::TotalNumRegs; ++i) { + assert(thread.renameTable[i]->isResultReady()); + } +#endif + // @todo: Fix hardcoded number // Clear out any old information in time buffer. - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < 15; ++i) { comm.advance(); } @@ -316,6 +335,10 @@ OzoneCPU<Impl>::activateContext(int thread_num, int delay) notIdleFraction++; scheduleTickEvent(delay); _status = Running; +#if FULL_SYSTEM + if (thread.quiesceEvent && thread.quiesceEvent->scheduled()) + thread.quiesceEvent->deschedule(); +#endif thread.setStatus(ThreadContext::Active); frontEnd->wakeFromQuiesce(); } @@ -393,7 +416,7 @@ template <class Impl> void OzoneCPU<Impl>::resetStats() { - startNumInst = numInst; +// startNumInst = numInst; notIdleFraction = (_status != Idle); } @@ -441,6 +464,15 @@ OzoneCPU<Impl>::serialize(std::ostream &os) ozoneTC.serialize(os); nameOut(os, csprintf("%s.tickEvent", name())); tickEvent.serialize(os); + + // Use SimpleThread's ability to checkpoint to make it easier to + // write out the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + nameOut(os, csprintf("%s.xc.0", name())); + temp.copyXC(thread.getXCProxy()); + temp.serialize(os); } template <class Impl> @@ -451,6 +483,15 @@ OzoneCPU<Impl>::unserialize(Checkpoint *cp, const std::string §ion) UNSERIALIZE_ENUM(_status); ozoneTC.unserialize(cp, csprintf("%s.tc", section)); tickEvent.unserialize(cp, csprintf("%s.tickEvent", section)); + + // Use SimpleThread's ability to checkpoint to make it easier to + // read in the registers. Also make this static so it doesn't + // get instantiated multiple times (causes a panic in statistics). + static CPUExecContext temp; + + temp.copyXC(thread.getXCProxy()); + temp.unserialize(cp, csprintf("%s.xc.0", section)); + thread.getXCProxy()->copyArchRegs(temp.getProxy()); } template <class Impl> @@ -810,7 +851,9 @@ OzoneCPU<Impl>::OzoneTC::halt() template <class Impl> void OzoneCPU<Impl>::OzoneTC::dumpFuncProfile() -{ } +{ + thread->dumpFuncProfile(); +} #endif template <class Impl> @@ -829,6 +872,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context) copyArchRegs(old_context); setCpuId(old_context->readCpuId()); + thread->inst = old_context->getInst(); #if !FULL_SYSTEM setFuncExeInst(old_context->readFuncExeInst()); #else @@ -842,6 +886,7 @@ OzoneCPU<Impl>::OzoneTC::takeOverFrom(ThreadContext *old_context) thread->quiesceEvent->tc = this; } + // Copy kernel stats pointer from old context. thread->kernelStats = old_context->getKernelStats(); // storeCondFailures = 0; cpu->lockFlag = false; @@ -863,7 +908,11 @@ OzoneCPU<Impl>::OzoneTC::regStats(const std::string &name) template <class Impl> void OzoneCPU<Impl>::OzoneTC::serialize(std::ostream &os) -{ } +{ + // Once serialization is added, serialize the quiesce event and + // kernel stats. Will need to make sure there aren't multiple + // things that serialize them. +} template <class Impl> void @@ -896,16 +945,14 @@ template <class Impl> void OzoneCPU<Impl>::OzoneTC::profileClear() { - if (thread->profile) - thread->profile->clear(); + thread->profileClear(); } template <class Impl> void OzoneCPU<Impl>::OzoneTC::profileSample() { - if (thread->profile) - thread->profile->sample(thread->profileNode, thread->profilePC); + thread->profileSample(); } #endif @@ -916,7 +963,6 @@ OzoneCPU<Impl>::OzoneTC::getThreadNum() return thread->readTid(); } -// Also somewhat obnoxious. Really only used for the TLB fault. template <class Impl> TheISA::MachInst OzoneCPU<Impl>::OzoneTC::getInst() @@ -934,14 +980,20 @@ OzoneCPU<Impl>::OzoneTC::copyArchRegs(ThreadContext *tc) cpu->frontEnd->setPC(thread->PC); cpu->frontEnd->setNextPC(thread->nextPC); - for (int i = 0; i < TheISA::TotalNumRegs; ++i) { - if (i < TheISA::FP_Base_DepTag) { - thread->renameTable[i]->setIntResult(tc->readIntReg(i)); - } else if (i < (TheISA::FP_Base_DepTag + TheISA::NumFloatRegs)) { - int fp_idx = i - TheISA::FP_Base_DepTag; - thread->renameTable[i]->setDoubleResult( - tc->readFloatReg(fp_idx, 64)); - } + // First loop through the integer registers. + for (int i = 0; i < TheISA::NumIntRegs; ++i) { +/* DPRINTF(OzoneCPU, "Copying over register %i, had data %lli, " + "now has data %lli.\n", + i, thread->renameTable[i]->readIntResult(), + tc->readIntReg(i)); +*/ + thread->renameTable[i]->setIntResult(tc->readIntReg(i)); + } + + // Then loop through the floating point registers. + for (int i = 0; i < TheISA::NumFloatRegs; ++i) { + int fp_idx = i + TheISA::FP_Base_DepTag; + thread->renameTable[fp_idx]->setIntResult(tc->readFloatRegBits(i)); } #if !FULL_SYSTEM diff --git a/src/cpu/ozone/front_end.hh b/src/cpu/ozone/front_end.hh index 3ed3c4d18..5ffd3666e 100644 --- a/src/cpu/ozone/front_end.hh +++ b/src/cpu/ozone/front_end.hh @@ -34,6 +34,7 @@ #include <deque> #include "arch/utility.hh" +#include "base/timebuf.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/bpred_unit.hh" #include "cpu/ozone/rename_table.hh" @@ -246,15 +247,21 @@ class FrontEnd void dumpInsts(); private: + TimeBuffer<int> numInstsReady; + typedef typename std::deque<DynInstPtr> InstBuff; typedef typename InstBuff::iterator InstBuffIt; + InstBuff feBuffer; + InstBuff instBuffer; int instBufferSize; int maxInstBufferSize; + int latency; + int width; int freeRegs; diff --git a/src/cpu/ozone/front_end_impl.hh b/src/cpu/ozone/front_end_impl.hh index 1b120460a..d34716de6 100644 --- a/src/cpu/ozone/front_end_impl.hh +++ b/src/cpu/ozone/front_end_impl.hh @@ -92,8 +92,10 @@ FrontEnd<Impl>::FrontEnd(Params *params) : branchPred(params), icachePort(this), mem(params->mem), + numInstsReady(params->frontEndLatency, 0), instBufferSize(0), maxInstBufferSize(params->maxInstBufferSize), + latency(params->frontEndLatency), width(params->frontEndWidth), freeRegs(params->numPhysicalRegs), numPhysRegs(params->numPhysicalRegs), @@ -326,6 +328,18 @@ FrontEnd<Impl>::tick() if (switchedOut) return; + for (int insts_to_queue = numInstsReady[-latency]; + !instBuffer.empty() && insts_to_queue; + --insts_to_queue) + { + DPRINTF(FE, "Transferring instruction [sn:%lli] to the feBuffer\n", + instBuffer.front()->seqNum); + feBuffer.push_back(instBuffer.front()); + instBuffer.pop_front(); + } + + numInstsReady.advance(); + // @todo: Maybe I want to just have direct communication... if (fromCommit->doneSeqNum) { branchPred.update(fromCommit->doneSeqNum, 0); @@ -339,8 +353,8 @@ FrontEnd<Impl>::tick() cacheBlkValid = true; status = Running; - if (barrierInst) - status = SerializeBlocked; +// if (barrierInst) +// status = SerializeBlocked; if (freeRegs <= 0) status = RenameBlocked; checkBE(); @@ -414,11 +428,12 @@ FrontEnd<Impl>::tick() // latency instBuffer.push_back(inst); ++instBufferSize; + numInstsReady[0]++; ++num_inst; #if FULL_SYSTEM if (inst->isQuiesce()) { - warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); +// warn("%lli: Quiesce instruction encountered, halting fetch!", curTick); status = QuiescePending; break; } @@ -572,10 +587,10 @@ FrontEnd<Impl>::processBarriers(DynInstPtr &inst) // Change status over to SerializeBlocked so that other stages know // what this is blocked on. - status = SerializeBlocked; +// status = SerializeBlocked; - barrierInst = inst; - return true; +// barrierInst = inst; +// return true; } else if ((inst->isStoreConditional() || inst->isSerializeAfter()) && !inst->isSerializeHandled()) { DPRINTF(FE, "Serialize after instruction encountered.\n"); @@ -620,6 +635,7 @@ FrontEnd<Impl>::handleFault(Fault &fault) instruction->fault = fault; instruction->setCanIssue(); instBuffer.push_back(instruction); + numInstsReady[0]++; ++instBufferSize; } @@ -649,6 +665,21 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC, freeRegs+= inst->numDestRegs(); } + while (!feBuffer.empty() && + feBuffer.back()->seqNum > squash_num) { + DynInstPtr inst = feBuffer.back(); + + DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n", + inst->seqNum, inst->readPC()); + + inst->clearDependents(); + + feBuffer.pop_back(); + --instBufferSize; + + freeRegs+= inst->numDestRegs(); + } + // Copy over rename table from the back end. renameTable.copyFrom(backEnd->renameTable); @@ -666,12 +697,12 @@ FrontEnd<Impl>::squash(const InstSeqNum &squash_num, const Addr &next_PC, DPRINTF(FE, "Squashing outstanding Icache access.\n"); memReq = NULL; } - +/* if (status == SerializeBlocked) { assert(barrierInst->seqNum > squash_num); barrierInst = NULL; } - +*/ // Unless this squash originated from the front end, we're probably // in running mode now. // Actually might want to make this latency dependent. @@ -683,13 +714,22 @@ template <class Impl> typename Impl::DynInstPtr FrontEnd<Impl>::getInst() { - if (instBufferSize == 0) { + if (feBuffer.empty()) { return NULL; } - DynInstPtr inst = instBuffer.front(); + DynInstPtr inst = feBuffer.front(); - instBuffer.pop_front(); + if (inst->isSerializeBefore() || inst->isIprAccess()) { + DPRINTF(FE, "Back end is getting a serialize before inst\n"); + if (!backEnd->robEmpty()) { + DPRINTF(FE, "Rob is not empty yet, not returning inst\n"); + return NULL; + } + inst->clearSerializeBefore(); + } + + feBuffer.pop_front(); --instBufferSize; @@ -784,11 +824,11 @@ FrontEnd<Impl>::updateStatus() } if (status == BEBlocked && !be_block) { - if (barrierInst) { - status = SerializeBlocked; - } else { +// if (barrierInst) { +// status = SerializeBlocked; +// } else { status = Running; - } +// } ret_val = true; } return ret_val; @@ -810,6 +850,7 @@ template <class Impl> typename Impl::DynInstPtr FrontEnd<Impl>::getInstFromCacheline() { +/* if (status == SerializeComplete) { DynInstPtr inst = barrierInst; status = Running; @@ -817,7 +858,7 @@ FrontEnd<Impl>::getInstFromCacheline() inst->clearSerializeBefore(); return inst; } - +*/ InstSeqNum inst_seq; MachInst inst; // @todo: Fix this magic number used here to handle word offset (and @@ -932,6 +973,7 @@ FrontEnd<Impl>::doSwitchOut() squash(0, 0); instBuffer.clear(); instBufferSize = 0; + feBuffer.clear(); status = Idle; } diff --git a/src/cpu/ozone/inorder_back_end_impl.hh b/src/cpu/ozone/inorder_back_end_impl.hh index 701fc0ee9..16ebac163 100644 --- a/src/cpu/ozone/inorder_back_end_impl.hh +++ b/src/cpu/ozone/inorder_back_end_impl.hh @@ -284,7 +284,7 @@ InorderBackEnd<Impl>::executeInsts() } inst->setExecuted(); - inst->setCompleted(); + inst->setResultReady(); inst->setCanCommit(); instList.pop_front(); diff --git a/src/cpu/ozone/inst_queue_impl.hh b/src/cpu/ozone/inst_queue_impl.hh index f2d80e621..32a940241 100644 --- a/src/cpu/ozone/inst_queue_impl.hh +++ b/src/cpu/ozone/inst_queue_impl.hh @@ -850,13 +850,13 @@ template <class Impl> void InstQueue<Impl>::addReadyMemInst(DynInstPtr &ready_inst) { - OpClass op_class = ready_inst->opClass(); +// OpClass op_class = ready_inst->opClass(); readyInsts.push(ready_inst); DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", - ready_inst->readPC(), op_class, ready_inst->seqNum); + ready_inst->readPC(), ready_inst->opClass(), ready_inst->seqNum); } /* template <class Impl> @@ -1177,11 +1177,11 @@ InstQueue<Impl>::addIfReady(DynInstPtr &inst) return; } - OpClass op_class = inst->opClass(); +// OpClass op_class = inst->opClass(); DPRINTF(IQ, "Instruction is ready to issue, putting it onto " "the ready list, PC %#x opclass:%i [sn:%lli].\n", - inst->readPC(), op_class, inst->seqNum); + inst->readPC(), inst->opClass(), inst->seqNum); readyInsts.push(inst); } diff --git a/src/cpu/ozone/lw_back_end.hh b/src/cpu/ozone/lw_back_end.hh index d836ceebd..49c6a1ae2 100644 --- a/src/cpu/ozone/lw_back_end.hh +++ b/src/cpu/ozone/lw_back_end.hh @@ -80,7 +80,7 @@ class LWBackEnd TimeBuffer<IssueToExec> i2e; typename TimeBuffer<IssueToExec>::wire instsToExecute; TimeBuffer<ExecToCommit> e2c; - TimeBuffer<Writeback> numInstsToWB; + TimeBuffer<int> numInstsToWB; TimeBuffer<CommStruct> *comm; typename TimeBuffer<CommStruct>::wire toIEW; @@ -139,7 +139,7 @@ class LWBackEnd Tick lastCommitCycle; - bool robEmpty() { return instList.empty(); } + bool robEmpty() { return numInsts == 0; } bool isFull() { return numInsts >= numROBEntries; } bool isBlocked() { return status == Blocked || dispatchStatus == Blocked; } @@ -194,6 +194,7 @@ class LWBackEnd } void instToCommit(DynInstPtr &inst); + void readyInstsForCommit(); void switchOut(); void doSwitchOut(); @@ -255,12 +256,13 @@ class LWBackEnd RenameTable<Impl> renameTable; private: + int latency; + // General back end width. Used if the more specific isn't given. int width; // Dispatch width. int dispatchWidth; - int numDispatchEntries; int dispatchSize; int waitingInsts; @@ -285,6 +287,7 @@ class LWBackEnd int numROBEntries; int numInsts; + bool lsqLimits; std::set<InstSeqNum> waitingMemOps; typedef std::set<InstSeqNum>::iterator MemIt; @@ -295,9 +298,6 @@ class LWBackEnd InstSeqNum squashSeqNum; Addr squashNextPC; - Fault faultFromFetch; - bool fetchHasFault; - bool switchedOut; bool switchPending; @@ -321,8 +321,6 @@ class LWBackEnd std::list<DynInstPtr> replayList; std::list<DynInstPtr> writeback; - int latency; - int squashLatency; bool exactFullStall; @@ -331,37 +329,39 @@ class LWBackEnd /* Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; */ - Stats::Vector<> rob_cap_events; - Stats::Vector<> rob_cap_inst_count; - Stats::Vector<> iq_cap_events; - Stats::Vector<> iq_cap_inst_count; + Stats::Vector<> robCapEvents; + Stats::Vector<> robCapInstCount; + Stats::Vector<> iqCapEvents; + Stats::Vector<> iqCapInstCount; // total number of instructions executed - Stats::Vector<> exe_inst; - Stats::Vector<> exe_swp; - Stats::Vector<> exe_nop; - Stats::Vector<> exe_refs; - Stats::Vector<> exe_loads; - Stats::Vector<> exe_branches; + Stats::Vector<> exeInst; + Stats::Vector<> exeSwp; + Stats::Vector<> exeNop; + Stats::Vector<> exeRefs; + Stats::Vector<> exeLoads; + Stats::Vector<> exeBranches; - Stats::Vector<> issued_ops; + Stats::Vector<> issuedOps; // total number of loads forwaded from LSQ stores - Stats::Vector<> lsq_forw_loads; + Stats::Vector<> lsqForwLoads; // total number of loads ignored due to invalid addresses - Stats::Vector<> inv_addr_loads; + Stats::Vector<> invAddrLoads; // total number of software prefetches ignored due to invalid addresses - Stats::Vector<> inv_addr_swpfs; + Stats::Vector<> invAddrSwpfs; // ready loads blocked due to memory disambiguation - Stats::Vector<> lsq_blocked_loads; + Stats::Vector<> lsqBlockedLoads; Stats::Scalar<> lsqInversion; - Stats::Vector<> n_issued_dist; - Stats::VectorDistribution<> issue_delay_dist; + Stats::Vector<> nIssuedDist; +/* + Stats::VectorDistribution<> issueDelayDist; - Stats::VectorDistribution<> queue_res_dist; + Stats::VectorDistribution<> queueResDist; +*/ /* Stats::Vector<> stat_fu_busy; Stats::Vector2d<> stat_fuBusy; @@ -379,37 +379,37 @@ class LWBackEnd Stats::Formula commit_ipb; Stats::Formula lsq_inv_rate; */ - Stats::Vector<> writeback_count; - Stats::Vector<> producer_inst; - Stats::Vector<> consumer_inst; - Stats::Vector<> wb_penalized; + Stats::Vector<> writebackCount; + Stats::Vector<> producerInst; + Stats::Vector<> consumerInst; + Stats::Vector<> wbPenalized; - Stats::Formula wb_rate; - Stats::Formula wb_fanout; - Stats::Formula wb_penalized_rate; + Stats::Formula wbRate; + Stats::Formula wbFanout; + Stats::Formula wbPenalizedRate; // total number of instructions committed - Stats::Vector<> stat_com_inst; - Stats::Vector<> stat_com_swp; - Stats::Vector<> stat_com_refs; - Stats::Vector<> stat_com_loads; - Stats::Vector<> stat_com_membars; - Stats::Vector<> stat_com_branches; + Stats::Vector<> statComInst; + Stats::Vector<> statComSwp; + Stats::Vector<> statComRefs; + Stats::Vector<> statComLoads; + Stats::Vector<> statComMembars; + Stats::Vector<> statComBranches; - Stats::Distribution<> n_committed_dist; + Stats::Distribution<> nCommittedDist; - Stats::Scalar<> commit_eligible_samples; - Stats::Vector<> commit_eligible; + Stats::Scalar<> commitEligibleSamples; + Stats::Vector<> commitEligible; Stats::Vector<> squashedInsts; Stats::Vector<> ROBSquashedInsts; - Stats::Scalar<> ROB_fcount; - Stats::Formula ROB_full_rate; + Stats::Scalar<> ROBFcount; + Stats::Formula ROBFullRate; - Stats::Vector<> ROB_count; // cumulative ROB occupancy - Stats::Formula ROB_occ_rate; - Stats::VectorDistribution<> ROB_occ_dist; + Stats::Vector<> ROBCount; // cumulative ROB occupancy + Stats::Formula ROBOccRate; +// Stats::VectorDistribution<> ROBOccDist; public: void dumpInsts(); diff --git a/src/cpu/ozone/lw_back_end_impl.hh b/src/cpu/ozone/lw_back_end_impl.hh index a4f1d805e..f87a2bc57 100644 --- a/src/cpu/ozone/lw_back_end_impl.hh +++ b/src/cpu/ozone/lw_back_end_impl.hh @@ -141,13 +141,14 @@ LWBackEnd<Impl>::replayMemInst(DynInstPtr &inst) template <class Impl> LWBackEnd<Impl>::LWBackEnd(Params *params) - : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), + : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(params->backEndLatency, 0), trapSquash(false), tcSquash(false), - width(params->backEndWidth), exactFullStall(true) + latency(params->backEndLatency), + width(params->backEndWidth), lsqLimits(params->lsqLimits), + exactFullStall(true) { numROBEntries = params->numROBEntries; numInsts = 0; - numDispatchEntries = 32; maxOutstandingMemOps = params->maxOutstandingMemOps; numWaitingMemOps = 0; waitingInsts = 0; @@ -184,78 +185,79 @@ void LWBackEnd<Impl>::regStats() { using namespace Stats; - rob_cap_events + LSQ.regStats(); + + robCapEvents .init(cpu->number_of_threads) .name(name() + ".ROB:cap_events") .desc("number of cycles where ROB cap was active") .flags(total) ; - rob_cap_inst_count + robCapInstCount .init(cpu->number_of_threads) .name(name() + ".ROB:cap_inst") .desc("number of instructions held up by ROB cap") .flags(total) ; - iq_cap_events + iqCapEvents .init(cpu->number_of_threads) .name(name() +".IQ:cap_events" ) .desc("number of cycles where IQ cap was active") .flags(total) ; - iq_cap_inst_count + iqCapInstCount .init(cpu->number_of_threads) .name(name() + ".IQ:cap_inst") .desc("number of instructions held up by IQ cap") .flags(total) ; - - exe_inst + exeInst .init(cpu->number_of_threads) .name(name() + ".ISSUE:count") .desc("number of insts issued") .flags(total) ; - exe_swp + exeSwp .init(cpu->number_of_threads) .name(name() + ".ISSUE:swp") .desc("number of swp insts issued") .flags(total) ; - exe_nop + exeNop .init(cpu->number_of_threads) .name(name() + ".ISSUE:nop") .desc("number of nop insts issued") .flags(total) ; - exe_refs + exeRefs .init(cpu->number_of_threads) .name(name() + ".ISSUE:refs") .desc("number of memory reference insts issued") .flags(total) ; - exe_loads + exeLoads .init(cpu->number_of_threads) .name(name() + ".ISSUE:loads") .desc("number of load insts issued") .flags(total) ; - exe_branches + exeBranches .init(cpu->number_of_threads) .name(name() + ".ISSUE:branches") .desc("Number of branches issued") .flags(total) ; - issued_ops + issuedOps .init(cpu->number_of_threads) .name(name() + ".ISSUE:op_count") .desc("number of insts issued") @@ -272,28 +274,28 @@ LWBackEnd<Impl>::regStats() // // Other stats // - lsq_forw_loads + lsqForwLoads .init(cpu->number_of_threads) .name(name() + ".LSQ:forw_loads") .desc("number of loads forwarded via LSQ") .flags(total) ; - inv_addr_loads + invAddrLoads .init(cpu->number_of_threads) .name(name() + ".ISSUE:addr_loads") .desc("number of invalid-address loads") .flags(total) ; - inv_addr_swpfs + invAddrSwpfs .init(cpu->number_of_threads) .name(name() + ".ISSUE:addr_swpfs") .desc("number of invalid-address SW prefetches") .flags(total) ; - lsq_blocked_loads + lsqBlockedLoads .init(cpu->number_of_threads) .name(name() + ".LSQ:blocked_loads") .desc("number of ready loads not issued due to memory disambiguation") @@ -305,51 +307,52 @@ LWBackEnd<Impl>::regStats() .desc("Number of times LSQ instruction issued early") ; - n_issued_dist + nIssuedDist .init(issueWidth + 1) .name(name() + ".ISSUE:issued_per_cycle") .desc("Number of insts issued each cycle") .flags(total | pdf | dist) ; - issue_delay_dist +/* + issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") .desc("cycles from operands ready to issue") .flags(pdf | cdf) ; - queue_res_dist + queueResDist .init(Num_OpClasses, 0, 99, 2) .name(name() + ".IQ:residence:") .desc("cycles from dispatch to issue") .flags(total | pdf | cdf ) ; for (int i = 0; i < Num_OpClasses; ++i) { - queue_res_dist.subname(i, opClassStrings[i]); + queueResDist.subname(i, opClassStrings[i]); } - - writeback_count +*/ + writebackCount .init(cpu->number_of_threads) .name(name() + ".WB:count") .desc("cumulative count of insts written-back") .flags(total) ; - producer_inst + producerInst .init(cpu->number_of_threads) .name(name() + ".WB:producers") .desc("num instructions producing a value") .flags(total) ; - consumer_inst + consumerInst .init(cpu->number_of_threads) .name(name() + ".WB:consumers") .desc("num instructions consuming a value") .flags(total) ; - wb_penalized + wbPenalized .init(cpu->number_of_threads) .name(name() + ".WB:penalized") .desc("number of instrctions required to write to 'other' IQ") @@ -357,71 +360,71 @@ LWBackEnd<Impl>::regStats() ; - wb_penalized_rate + wbPenalizedRate .name(name() + ".WB:penalized_rate") .desc ("fraction of instructions written-back that wrote to 'other' IQ") .flags(total) ; - wb_penalized_rate = wb_penalized / writeback_count; + wbPenalizedRate = wbPenalized / writebackCount; - wb_fanout + wbFanout .name(name() + ".WB:fanout") .desc("average fanout of values written-back") .flags(total) ; - wb_fanout = producer_inst / consumer_inst; + wbFanout = producerInst / consumerInst; - wb_rate + wbRate .name(name() + ".WB:rate") .desc("insts written-back per cycle") .flags(total) ; - wb_rate = writeback_count / cpu->numCycles; + wbRate = writebackCount / cpu->numCycles; - stat_com_inst + statComInst .init(cpu->number_of_threads) .name(name() + ".COM:count") .desc("Number of instructions committed") .flags(total) ; - stat_com_swp + statComSwp .init(cpu->number_of_threads) .name(name() + ".COM:swp_count") .desc("Number of s/w prefetches committed") .flags(total) ; - stat_com_refs + statComRefs .init(cpu->number_of_threads) .name(name() + ".COM:refs") .desc("Number of memory references committed") .flags(total) ; - stat_com_loads + statComLoads .init(cpu->number_of_threads) .name(name() + ".COM:loads") .desc("Number of loads committed") .flags(total) ; - stat_com_membars + statComMembars .init(cpu->number_of_threads) .name(name() + ".COM:membars") .desc("Number of memory barriers committed") .flags(total) ; - stat_com_branches + statComBranches .init(cpu->number_of_threads) .name(name() + ".COM:branches") .desc("Number of branches committed") .flags(total) ; - n_committed_dist + nCommittedDist .init(0,commitWidth,1) .name(name() + ".COM:committed_per_cycle") .desc("Number of insts commited each cycle") @@ -441,14 +444,14 @@ LWBackEnd<Impl>::regStats() // -> The standard deviation is computed only over cycles where // we reached the BW limit // - commit_eligible + commitEligible .init(cpu->number_of_threads) .name(name() + ".COM:bw_limited") .desc("number of insts not committed due to BW limits") .flags(total) ; - commit_eligible_samples + commitEligibleSamples .name(name() + ".COM:bw_lim_events") .desc("number cycles where commit BW limit reached") ; @@ -465,37 +468,38 @@ LWBackEnd<Impl>::regStats() .desc("Number of instructions removed from inst list when they reached the head of the ROB") ; - ROB_fcount + ROBFcount .name(name() + ".ROB:full_count") .desc("number of cycles where ROB was full") ; - ROB_count + ROBCount .init(cpu->number_of_threads) .name(name() + ".ROB:occupancy") .desc(name() + ".ROB occupancy (cumulative)") .flags(total) ; - ROB_full_rate + ROBFullRate .name(name() + ".ROB:full_rate") .desc("ROB full per cycle") ; - ROB_full_rate = ROB_fcount / cpu->numCycles; + ROBFullRate = ROBFcount / cpu->numCycles; - ROB_occ_rate + ROBOccRate .name(name() + ".ROB:occ_rate") .desc("ROB occupancy rate") .flags(total) ; - ROB_occ_rate = ROB_count / cpu->numCycles; - - ROB_occ_dist + ROBOccRate = ROBCount / cpu->numCycles; +/* + ROBOccDist .init(cpu->number_of_threads,0,numROBEntries,2) .name(name() + ".ROB:occ_dist") .desc("ROB Occupancy per cycle") .flags(total | cdf) ; +*/ } template <class Impl> @@ -588,17 +592,21 @@ LWBackEnd<Impl>::tick() { DPRINTF(BE, "Ticking back end\n"); + // Read in any done instruction information and update the IQ or LSQ. + updateStructures(); + if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) { cpu->signalSwitched(); return; } - ROB_count[0]+= numInsts; + readyInstsForCommit(); - wbCycle = 0; + numInstsToWB.advance(); - // Read in any done instruction information and update the IQ or LSQ. - updateStructures(); + ROBCount[0]+= numInsts; + + wbCycle = 0; #if FULL_SYSTEM checkInterrupts(); @@ -674,6 +682,10 @@ LWBackEnd<Impl>::dispatchInsts() while (numInsts < numROBEntries && numWaitingMemOps < maxOutstandingMemOps) { // Get instruction from front of time buffer + if (lsqLimits && LSQ.isFull()) { + break; + } + DynInstPtr inst = frontEnd->getInst(); if (!inst) { break; @@ -732,6 +744,7 @@ LWBackEnd<Impl>::dispatchInsts() inst->setIssued(); inst->setExecuted(); inst->setCanCommit(); + numInstsToWB[0]++; } else { DPRINTF(BE, "Instruction [sn:%lli] ready, addding to " "exeList.\n", @@ -866,8 +879,17 @@ LWBackEnd<Impl>::executeInsts() if (inst->isLoad()) { LSQ.executeLoad(inst); } else if (inst->isStore()) { - LSQ.executeStore(inst); - if (inst->req && !(inst->req->getFlags() & LOCKED)) { + Fault fault = LSQ.executeStore(inst); + + if (!inst->isStoreConditional() && fault == NoFault) { + inst->setExecuted(); + + instToCommit(inst); + } else if (fault != NoFault) { + // If the instruction faulted, then we need to send it along to commit + // without the instruction completing. + // Send this instruction to commit, also make sure iew stage + // realizes there is activity. inst->setExecuted(); instToCommit(inst); @@ -908,36 +930,54 @@ LWBackEnd<Impl>::executeInsts() } } - issued_ops[0]+= num_executed; - n_issued_dist[num_executed]++; + issuedOps[0]+= num_executed; + nIssuedDist[num_executed]++; } template<class Impl> void LWBackEnd<Impl>::instToCommit(DynInstPtr &inst) { - DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n", inst->seqNum, inst->readPC()); if (!inst->isSquashed()) { - DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", - inst->seqNum, inst->readPC()); - - inst->setCanCommit(); - if (inst->isExecuted()) { inst->setResultReady(); int dependents = wakeDependents(inst); if (dependents) { - producer_inst[0]++; - consumer_inst[0]+= dependents; + producerInst[0]++; + consumerInst[0]+= dependents; } } } - writeback_count[0]++; + writeback.push_back(inst); + + numInstsToWB[0]++; + + writebackCount[0]++; +} + +template <class Impl> +void +LWBackEnd<Impl>::readyInstsForCommit() +{ + for (int i = numInstsToWB[-latency]; + !writeback.empty() && i; + --i) + { + DynInstPtr inst = writeback.front(); + writeback.pop_front(); + if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + } + } } + #if 0 template <class Impl> void @@ -1010,7 +1050,7 @@ LWBackEnd<Impl>::commitInst(int inst_num) // or store inst. Signal backwards that it should be executed. if (!inst->isExecuted()) { if (inst->isNonSpeculative() || - inst->isStoreConditional() || + (inst->isStoreConditional() && inst->getFault() == NoFault) || inst->isMemBarrier() || inst->isWriteBarrier()) { #if !FULL_SYSTEM @@ -1151,6 +1191,20 @@ LWBackEnd<Impl>::commitInst(int inst_num) ++freed_regs; } +#if FULL_SYSTEM + if (thread->profile) { +// bool usermode = +// (xc->readMiscReg(AlphaISA::IPR_DTB_CM) & 0x18) != 0; +// thread->profilePC = usermode ? 1 : inst->readPC(); + thread->profilePC = inst->readPC(); + ProfileNode *node = thread->profile->consume(thread->getXCProxy(), + inst->staticInst); + + if (node) + thread->profileNode = node; + } +#endif + if (inst->traceData) { inst->traceData->setFetchSeq(inst->seqNum); inst->traceData->setCPSeq(thread->numInst); @@ -1158,6 +1212,9 @@ LWBackEnd<Impl>::commitInst(int inst_num) inst->traceData = NULL; } + if (inst->isCopy()) + panic("Should not commit any copy instructions!"); + inst->clearDependents(); frontEnd->addFreeRegs(freed_regs); @@ -1207,9 +1264,9 @@ LWBackEnd<Impl>::commitInsts() while (!instList.empty() && inst_num < commitWidth) { if (instList.back()->isSquashed()) { instList.back()->clearDependents(); + ROBSquashedInsts[instList.back()->threadNumber]++; instList.pop_back(); --numInsts; - ROBSquashedInsts[instList.back()->threadNumber]++; continue; } @@ -1221,7 +1278,7 @@ LWBackEnd<Impl>::commitInsts() break; } } - n_committed_dist.sample(inst_num); + nCommittedDist.sample(inst_num); } template <class Impl> @@ -1231,10 +1288,10 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) LSQ.squash(sn); int freed_regs = 0; - InstListIt waiting_list_end = waitingList.end(); + InstListIt insts_end_it = waitingList.end(); InstListIt insts_it = waitingList.begin(); - while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn) + while (insts_it != insts_end_it && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { ++insts_it; @@ -1260,6 +1317,7 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) while (!instList.empty() && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { + panic("Instruction should not be already squashed and on list!"); ++insts_it; continue; } @@ -1291,18 +1349,6 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) --numInsts; } - insts_it = waitingList.begin(); - while (!waitingList.empty() && insts_it != waitingList.end()) { - if ((*insts_it)->seqNum < sn) { - ++insts_it; - continue; - } - assert((*insts_it)->isSquashed()); - - waitingList.erase(insts_it++); - waitingInsts--; - } - while (memBarrier && memBarrier->seqNum > sn) { DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously " "squashed)\n", memBarrier->seqNum); @@ -1320,6 +1366,18 @@ LWBackEnd<Impl>::squash(const InstSeqNum &sn) } } + insts_it = replayList.begin(); + insts_end_it = replayList.end(); + while (!replayList.empty() && insts_it != insts_end_it) { + if ((*insts_it)->seqNum < sn) { + ++insts_it; + continue; + } + assert((*insts_it)->isSquashed()); + + replayList.erase(insts_it++); + } + frontEnd->addFreeRegs(freed_regs); } @@ -1392,14 +1450,6 @@ LWBackEnd<Impl>::squashDueToMemBlocked(DynInstPtr &inst) template <class Impl> void -LWBackEnd<Impl>::fetchFault(Fault &fault) -{ - faultFromFetch = fault; - fetchHasFault = true; -} - -template <class Impl> -void LWBackEnd<Impl>::switchOut() { switchPending = true; @@ -1416,17 +1466,25 @@ LWBackEnd<Impl>::doSwitchOut() // yet written back. assert(robEmpty()); assert(!LSQ.hasStoresToWB()); - + writeback.clear(); + for (int i = 0; i < numInstsToWB.getSize() + 1; ++i) + numInstsToWB.advance(); + +// squash(0); + assert(waitingList.empty()); + assert(instList.empty()); + assert(replayList.empty()); + assert(writeback.empty()); LSQ.switchOut(); - - squash(0); } template <class Impl> void LWBackEnd<Impl>::takeOverFrom(ThreadContext *old_tc) { - switchedOut = false; + assert(!squashPending); + squashSeqNum = 0; + squashNextPC = 0; tcSquash = false; trapSquash = false; @@ -1451,27 +1509,27 @@ LWBackEnd<Impl>::updateExeInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) - exe_swp[thread_number]++; + exeSwp[thread_number]++; else - exe_inst[thread_number]++; + exeInst[thread_number]++; #else - exe_inst[thread_number]++; + exeInst[thread_number]++; #endif // // Control operations // if (inst->isControl()) - exe_branches[thread_number]++; + exeBranches[thread_number]++; // // Memory operations // if (inst->isMemRef()) { - exe_refs[thread_number]++; + exeRefs[thread_number]++; if (inst->isLoad()) - exe_loads[thread_number]++; + exeLoads[thread_number]++; } } @@ -1491,33 +1549,33 @@ LWBackEnd<Impl>::updateComInstStats(DynInstPtr &inst) // #ifdef TARGET_ALPHA if (inst->isDataPrefetch()) { - stat_com_swp[tid]++; + statComSwp[tid]++; } else { - stat_com_inst[tid]++; + statComInst[tid]++; } #else - stat_com_inst[tid]++; + statComInst[tid]++; #endif // // Control Instructions // if (inst->isControl()) - stat_com_branches[tid]++; + statComBranches[tid]++; // // Memory references // if (inst->isMemRef()) { - stat_com_refs[tid]++; + statComRefs[tid]++; if (inst->isLoad()) { - stat_com_loads[tid]++; + statComLoads[tid]++; } } if (inst->isMemBarrier()) { - stat_com_membars[tid]++; + statComMembars[tid]++; } } @@ -1569,6 +1627,45 @@ LWBackEnd<Impl>::dumpInsts() ++num; } + inst_list_it = --(writeback.end()); + + cprintf("Writeback list size: %i\n", writeback.size()); + + while (inst_list_it != writeback.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. + ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + cprintf("Waiting list size: %i\n", waitingList.size()); inst_list_it = --(waitingList.end()); diff --git a/src/cpu/ozone/lw_lsq.hh b/src/cpu/ozone/lw_lsq.hh index 9a21a9d01..6640a9f34 100644 --- a/src/cpu/ozone/lw_lsq.hh +++ b/src/cpu/ozone/lw_lsq.hh @@ -84,6 +84,8 @@ class OzoneLWLSQ { /** Returns the name of the LSQ unit. */ std::string name() const; + void regStats(); + /** Sets the CPU pointer. */ void setCPU(OzoneCPU *cpu_ptr); @@ -179,7 +181,7 @@ class OzoneLWLSQ { int numLoads() { return loads; } /** Returns the number of stores in the SQ. */ - int numStores() { return stores; } + int numStores() { return stores + storesInFlight; } /** Returns if either the LQ or SQ is full. */ bool isFull() { return lqFull() || sqFull(); } @@ -188,7 +190,7 @@ class OzoneLWLSQ { bool lqFull() { return loads >= (LQEntries - 1); } /** Returns if the SQ is full. */ - bool sqFull() { return stores >= (SQEntries - 1); } + bool sqFull() { return (stores + storesInFlight) >= (SQEntries - 1); } /** Debugging function to dump instructions in the LSQ. */ void dumpInsts(); @@ -223,7 +225,9 @@ class OzoneLWLSQ { void storePostSend(Packet *pkt, DynInstPtr &inst); /** Completes the store at the specified index. */ - void completeStore(int store_idx); + void completeStore(DynInstPtr &inst); + + void removeStore(int store_idx); /** Handles doing the retry. */ void recvRetry(); @@ -394,6 +398,10 @@ class OzoneLWLSQ { int storesToWB; + public: + int storesInFlight; + + private: /// @todo Consider moving to a more advanced model with write vs read ports /** The number of cache ports available each cycle. */ int cachePorts; @@ -403,6 +411,9 @@ class OzoneLWLSQ { //list<InstSeqNum> mshrSeqNums; + /** Tota number of memory ordering violations. */ + Stats::Scalar<> lsqMemOrderViolation; + //Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; @@ -525,7 +536,7 @@ OzoneLWLSQ<Impl>::read(RequestPtr req, T &data, int load_idx) store_size = (*sq_it).size; - if (store_size == 0) { + if (store_size == 0 || (*sq_it).committed) { sq_it++; continue; } diff --git a/src/cpu/ozone/lw_lsq_impl.hh b/src/cpu/ozone/lw_lsq_impl.hh index 7eef4b11f..31ffa9d67 100644 --- a/src/cpu/ozone/lw_lsq_impl.hh +++ b/src/cpu/ozone/lw_lsq_impl.hh @@ -132,7 +132,7 @@ OzoneLWLSQ<Impl>::completeDataAccess(PacketPtr pkt) template <class Impl> OzoneLWLSQ<Impl>::OzoneLWLSQ() : switchedOut(false), dcachePort(this), loads(0), stores(0), - storesToWB(0), stalled(false), isStoreBlocked(false), + storesToWB(0), storesInFlight(0), stalled(false), isStoreBlocked(false), isLoadBlocked(false), loadBlockedHandled(false) { } @@ -173,6 +173,11 @@ OzoneLWLSQ<Impl>::name() const template<class Impl> void +OzoneLWLSQ<Impl>::regStats() +{ + lsqMemOrderViolation + .name(name() + ".memOrderViolation") + .desc("Number of memory ordering violations"); OzoneLWLSQ<Impl>::setCPU(OzoneCPU *cpu_ptr) { cpu = cpu_ptr; @@ -321,7 +326,7 @@ unsigned OzoneLWLSQ<Impl>::numFreeEntries() { unsigned free_lq_entries = LQEntries - loads; - unsigned free_sq_entries = SQEntries - stores; + unsigned free_sq_entries = SQEntries - (stores + storesInFlight); // Both the LQ and SQ entries have an extra dummy entry to differentiate // empty/full conditions. Subtract 1 from the free entries. @@ -385,6 +390,9 @@ OzoneLWLSQ<Impl>::executeLoad(DynInstPtr &inst) // Actually probably want the oldest faulting load if (load_fault != NoFault) { DPRINTF(OzoneLSQ, "Load [sn:%lli] has a fault\n", inst->seqNum); + if (!(inst->req->flags & UNCACHEABLE && !inst->isAtCommit())) { + inst->setExecuted(); + } // Maybe just set it as can commit here, although that might cause // some other problems with sending traps to the ROB too quickly. be->instToCommit(inst); @@ -461,6 +469,7 @@ OzoneLWLSQ<Impl>::executeStore(DynInstPtr &store_inst) // A load incorrectly passed this store. Squash and refetch. // For now return a fault to show that it was unsuccessful. memDepViolator = (*lq_it); + ++lsqMemOrderViolation; return TheISA::genMachineCheckFault(); } @@ -553,8 +562,8 @@ OzoneLWLSQ<Impl>::writebackStores() if ((*sq_it).size == 0 && !(*sq_it).completed) { sq_it--; - completeStore(inst->sqIdx); - + removeStore(inst->sqIdx); + completeStore(inst); continue; } @@ -626,6 +635,8 @@ OzoneLWLSQ<Impl>::writebackStores() inst->sqIdx,inst->readPC(), req->paddr, *(req->data), inst->seqNum); + DPRINTF(OzoneLSQ, "StoresInFlight: %i\n", + storesInFlight + 1); if (dcacheInterface) { assert(!req->completionEvent); @@ -687,6 +698,8 @@ OzoneLWLSQ<Impl>::writebackStores() } sq_it--; } + ++storesInFlight; +// removeStore(inst->sqIdx); } else { panic("Must HAVE DCACHE!!!!!\n"); } @@ -704,7 +717,7 @@ void OzoneLWLSQ<Impl>::squash(const InstSeqNum &squashed_num) { DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!" - "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + "(Loads:%i Stores:%i)\n",squashed_num,loads,stores+storesInFlight); LQIt lq_it = loadQueue.begin(); @@ -881,7 +894,7 @@ OzoneLWLSQ<Impl>::writeback(DynInstPtr &inst, PacketPtr pkt) template <class Impl> void -OzoneLWLSQ<Impl>::completeStore(int store_idx) +OzoneLWLSQ<Impl>::removeStore(int store_idx) { SQHashIt sq_hash_it = SQItHash.find(store_idx); assert(sq_hash_it != SQItHash.end()); @@ -891,8 +904,6 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx) (*sq_it).completed = true; DynInstPtr inst = (*sq_it).inst; - --storesToWB; - if (isStalled() && inst->seqNum == stallingStoreIsn) { DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " @@ -910,6 +921,13 @@ OzoneLWLSQ<Impl>::completeStore(int store_idx) SQItHash.erase(sq_hash_it); SQIndices.push(inst->sqIdx); storeQueue.erase(sq_it); +} + +template <class Impl> +void +OzoneLWLSQ<Impl>::completeStore(DynInstPtr &inst) +{ + --storesToWB; --stores; inst->setCompleted(); @@ -935,9 +953,14 @@ OzoneLWLSQ<Impl>::switchOut() switchedOut = true; // Clear the queue to free up resources + assert(stores == 0); + assert(storeQueue.empty()); + assert(loads == 0); + assert(loadQueue.empty()); + assert(storesInFlight == 0); storeQueue.clear(); loadQueue.clear(); - loads = stores = storesToWB = 0; + loads = stores = storesToWB = storesInFlight = 0; } template <class Impl> diff --git a/src/cpu/ozone/simple_params.hh b/src/cpu/ozone/simple_params.hh index 11cee716f..3f63d2e1d 100644 --- a/src/cpu/ozone/simple_params.hh +++ b/src/cpu/ozone/simple_params.hh @@ -71,10 +71,11 @@ class SimpleParams : public BaseCPU::Params unsigned cachePorts; unsigned width; + unsigned frontEndLatency; unsigned frontEndWidth; + unsigned backEndLatency; unsigned backEndWidth; unsigned backEndSquashLatency; - unsigned backEndLatency; unsigned maxInstBufferSize; unsigned numPhysicalRegs; unsigned maxOutstandingMemOps; @@ -150,6 +151,7 @@ class SimpleParams : public BaseCPU::Params // unsigned LQEntries; unsigned SQEntries; + bool lsqLimits; // // Memory dependence diff --git a/src/cpu/ozone/thread_state.hh b/src/cpu/ozone/thread_state.hh index 8234cf938..adaa8e71b 100644 --- a/src/cpu/ozone/thread_state.hh +++ b/src/cpu/ozone/thread_state.hh @@ -34,9 +34,12 @@ #include "arch/faults.hh" #include "arch/types.hh" #include "arch/regfile.hh" +#include "base/callback.hh" +#include "base/output.hh" #include "cpu/thread_context.hh" #include "cpu/thread_state.hh" #include "sim/process.hh" +#include "sim/sim_exit.hh" class Event; //class Process; @@ -65,8 +68,21 @@ struct OzoneThreadState : public ThreadState { #if FULL_SYSTEM OzoneThreadState(CPUType *_cpu, int _thread_num) : ThreadState(-1, _thread_num), - intrflag(0), inSyscall(0), trapPending(0) + cpu(_cpu), intrflag(0), inSyscall(0), trapPending(0) { + if (cpu->params->profile) { + profile = new FunctionProfile(cpu->params->system->kernelSymtab); + Callback *cb = + new MakeCallback<OzoneThreadState, + &OzoneThreadState::dumpFuncProfile>(this); + registerExitCallback(cb); + } + + // let's fill with a dummy node for now so we don't get a segfault + // on the first cycle when there's no node available. + static ProfileNode dummyNode; + profileNode = &dummyNode; + profilePC = 3; miscRegFile.clear(); } #else @@ -130,6 +146,14 @@ struct OzoneThreadState : public ThreadState { void setNextPC(uint64_t val) { nextPC = val; } + +#if FULL_SYSTEM + void dumpFuncProfile() + { + std::ostream *os = simout.create(csprintf("profile.%s.dat", cpu->name())); + profile->dump(xcProxy, *os); + } +#endif }; #endif // __CPU_OZONE_THREAD_STATE_HH__ diff --git a/src/cpu/simple/base.cc b/src/cpu/simple/base.cc index f801b93fa..522fe79aa 100644 --- a/src/cpu/simple/base.cc +++ b/src/cpu/simple/base.cc @@ -170,7 +170,7 @@ BaseSimpleCPU::regStats() void BaseSimpleCPU::resetStats() { - startNumInst = numInst; +// startNumInst = numInst; // notIdleFraction = (_status != Idle); } diff --git a/src/cpu/simple_thread.cc b/src/cpu/simple_thread.cc index 5f86cf2b7..4fc47c982 100644 --- a/src/cpu/simple_thread.cc +++ b/src/cpu/simple_thread.cc @@ -162,6 +162,11 @@ SimpleThread::takeOverFrom(ThreadContext *oldContext) if (quiesceEvent) { quiesceEvent->tc = tc; } + + Kernel::Statistics *stats = oldContext->getKernelStats(); + if (stats) { + kernelStats = stats; + } #endif storeCondFailures = 0; diff --git a/src/cpu/thread_state.hh b/src/cpu/thread_state.hh index 6e985054f..5479f8478 100644 --- a/src/cpu/thread_state.hh +++ b/src/cpu/thread_state.hh @@ -32,6 +32,7 @@ #define __CPU_THREAD_STATE_HH__ #include "arch/types.hh" +#include "cpu/profile.hh" #include "cpu/thread_context.hh" #if !FULL_SYSTEM @@ -191,6 +192,21 @@ struct ThreadState { // simulation only; all functional memory accesses should use // one of the FunctionalMemory pointers above. short asid; + +#endif + +#if FULL_SYSTEM + void profileClear() + { + if (profile) + profile->clear(); + } + + void profileSample() + { + if (profile) + profile->sample(profileNode, profilePC); + } #endif /** Current instruction the thread is committing. Only set and diff --git a/src/dev/ide_disk.hh b/src/dev/ide_disk.hh index 0bc0b73ab..2ed860013 100644 --- a/src/dev/ide_disk.hh +++ b/src/dev/ide_disk.hh @@ -242,6 +242,10 @@ class IdeDisk : public SimObject Stats::Scalar<> dmaWriteFullPages; Stats::Scalar<> dmaWriteBytes; Stats::Scalar<> dmaWriteTxs; + Stats::Formula rdBandwidth; + Stats::Formula wrBandwidth; + Stats::Formula totBandwidth; + Stats::Formula totBytes; public: /** diff --git a/src/python/m5/objects/BaseCPU.py b/src/python/m5/objects/BaseCPU.py index 3dd0bda01..05ccbca6a 100644 --- a/src/python/m5/objects/BaseCPU.py +++ b/src/python/m5/objects/BaseCPU.py @@ -26,6 +26,9 @@ class BaseCPU(SimObject): "terminate when all threads have reached this load count") max_loads_any_thread = Param.Counter(0, "terminate when any thread reaches this load count") + stats_reset_inst = Param.Counter(0, + "reset stats once this many instructions are committed") + progress_interval = Param.Tick(0, "interval to print out the progress message") defer_registration = Param.Bool(False, "defer registration with system (for sampling)") diff --git a/src/python/m5/objects/O3CPU.py b/src/python/m5/objects/O3CPU.py index 5100c7ccb..59b40c6e8 100644 --- a/src/python/m5/objects/O3CPU.py +++ b/src/python/m5/objects/O3CPU.py @@ -9,6 +9,8 @@ class DerivO3CPU(BaseCPU): activity = Param.Unsigned(0, "Initial count") numThreads = Param.Unsigned(1, "number of HW thread contexts") + if build_env['FULL_SYSTEM']: + profile = Param.Latency('0ns', "trace the kernel stack") if build_env['USE_CHECKER']: if not build_env['FULL_SYSTEM']: checker = Param.BaseCPU(O3Checker(workload=Parent.workload, diff --git a/src/python/m5/objects/OzoneCPU.py b/src/python/m5/objects/OzoneCPU.py index 8f25d77ed..0913e044c 100644 --- a/src/python/m5/objects/OzoneCPU.py +++ b/src/python/m5/objects/OzoneCPU.py @@ -8,12 +8,15 @@ class DerivOzoneCPU(BaseCPU): numThreads = Param.Unsigned("number of HW thread contexts") checker = Param.BaseCPU("Checker CPU") + if build_env['FULL_SYSTEM']: + profile = Param.Latency('0ns', "trace the kernel stack") icache_port = Port("Instruction Port") dcache_port = Port("Data Port") width = Param.Unsigned("Width") frontEndWidth = Param.Unsigned("Front end width") + frontEndLatency = Param.Unsigned("Front end latency") backEndWidth = Param.Unsigned("Back end width") backEndSquashLatency = Param.Unsigned("Back end squash latency") backEndLatency = Param.Unsigned("Back end latency") @@ -76,6 +79,7 @@ class DerivOzoneCPU(BaseCPU): LQEntries = Param.Unsigned("Number of load queue entries") SQEntries = Param.Unsigned("Number of store queue entries") + lsqLimits = Param.Bool(True, "LSQ size limits dispatch") LFSTSize = Param.Unsigned("Last fetched store table size") SSITSize = Param.Unsigned("Store set ID table size") diff --git a/src/python/m5/objects/Root.py b/src/python/m5/objects/Root.py index f01fc06c4..8e8d87f6d 100644 --- a/src/python/m5/objects/Root.py +++ b/src/python/m5/objects/Root.py @@ -1,6 +1,7 @@ from m5.SimObject import SimObject from m5.params import * from Serialize import Serialize +from Serialize import Statreset from Statistics import Statistics from Trace import Trace from ExeTrace import ExecutionTrace diff --git a/src/python/m5/objects/System.py b/src/python/m5/objects/System.py index bc2a002cb..e7dd1bc60 100644 --- a/src/python/m5/objects/System.py +++ b/src/python/m5/objects/System.py @@ -16,6 +16,7 @@ class System(SimObject): boot_osflags = Param.String("a", "boot flags to pass to the kernel") kernel = Param.String("file that contains the kernel code") readfile = Param.String("", "file to read startup script from") + symbolfile = Param.String("", "file to get the symbols from") class AlphaSystem(System): type = 'AlphaSystem' diff --git a/src/sim/eventq.hh b/src/sim/eventq.hh index 430473df3..537bfb918 100644 --- a/src/sim/eventq.hh +++ b/src/sim/eventq.hh @@ -46,6 +46,7 @@ #include "sim/host.hh" // for Tick #include "base/fast_alloc.hh" +#include "base/misc.hh" #include "base/trace.hh" #include "sim/serialize.hh" @@ -135,7 +136,7 @@ class Event : public Serializable, public FastAlloc /// same cycle (after unscheduling the old CPU's tick event). /// The switch needs to come before any tick events to make /// sure we don't tick both CPUs in the same cycle. - CPU_Switch_Pri = 31, + CPU_Switch_Pri = -31, /// Serailization needs to occur before tick events also, so /// that a serialize/unserialize is identical to an on-line @@ -351,7 +352,8 @@ inline void Event::schedule(Tick t) { assert(!scheduled()); - assert(t >= curTick); +// if (t < curTick) +// warn("t is less than curTick, ensure you don't want cycles"); setFlags(Scheduled); #if TRACING_ON diff --git a/src/sim/pseudo_inst.cc b/src/sim/pseudo_inst.cc index bd26e9dc5..aae2f6021 100644 --- a/src/sim/pseudo_inst.cc +++ b/src/sim/pseudo_inst.cc @@ -149,6 +149,54 @@ namespace AlphaPseudo } void + loadsymbol(ExecContext *xc) + { + const string &filename = xc->getCpuPtr()->system->params()->symbolfile; + if (filename.empty()) { + return; + } + + std::string buffer; + ifstream file(filename.c_str()); + + if (!file) + fatal("file error: Can't open symbol table file %s\n", filename); + + while (!file.eof()) { + getline(file, buffer); + + if (buffer.empty()) + continue; + + int idx = buffer.find(' '); + if (idx == string::npos) + continue; + + string address = "0x" + buffer.substr(0, idx); + eat_white(address); + if (address.empty()) + continue; + + // Skip over letter and space + string symbol = buffer.substr(idx + 3); + eat_white(symbol); + if (symbol.empty()) + continue; + + Addr addr; + if (!to_number(address, addr)) + continue; + + if (!xc->getSystemPtr()->kernelSymtab->insert(addr, symbol)) + continue; + + + DPRINTF(Loader, "Loaded symbol: %s @ %#llx\n", symbol, addr); + } + file.close(); + } + + void resetstats(ThreadContext *tc, Tick delay, Tick period) { if (!doStatisticsInsts) diff --git a/src/sim/pseudo_inst.hh b/src/sim/pseudo_inst.hh index da2fb4ee3..d211de44e 100644 --- a/src/sim/pseudo_inst.hh +++ b/src/sim/pseudo_inst.hh @@ -51,6 +51,7 @@ namespace AlphaPseudo void ivle(ThreadContext *tc); void m5exit(ThreadContext *tc, Tick delay); void m5exit_old(ThreadContext *tc); + void loadsymbol(ThreadContext *xc); void resetstats(ThreadContext *tc, Tick delay, Tick period); void dumpstats(ThreadContext *tc, Tick delay, Tick period); void dumpresetstats(ThreadContext *tc, Tick delay, Tick period); diff --git a/src/sim/serialize.cc b/src/sim/serialize.cc index 6a1d084b7..941f0b1c6 100644 --- a/src/sim/serialize.cc +++ b/src/sim/serialize.cc @@ -52,6 +52,9 @@ #include "sim/sim_exit.hh" #include "sim/sim_object.hh" +// For stat reset hack +#include "sim/stat_control.hh" + using namespace std; int Serializable::ckptMaxCount = 0; @@ -404,3 +407,36 @@ Checkpoint::sectionExists(const std::string §ion) { return db->sectionExists(section); } + +/** Hacked stat reset event */ + +class StatresetParamContext : public ParamContext +{ + public: + StatresetParamContext(const string §ion); + ~StatresetParamContext(); + void startup(); +}; + +StatresetParamContext statParams("statsreset"); + +Param<Tick> reset_cycle(&statParams, "reset_cycle", + "Cycle to reset stats on", 0); + +StatresetParamContext::StatresetParamContext(const string §ion) + : ParamContext(section) +{ } + +StatresetParamContext::~StatresetParamContext() +{ +} + +void +StatresetParamContext::startup() +{ + if (reset_cycle > 0) { + Stats::SetupEvent(Stats::Reset, curTick + reset_cycle, 0); + cprintf("Stats reset event scheduled for %lli\n", + curTick + reset_cycle); + } +} diff --git a/src/sim/stat_control.cc b/src/sim/stat_control.cc index 041830ab7..dfed2a0c8 100644 --- a/src/sim/stat_control.cc +++ b/src/sim/stat_control.cc @@ -160,13 +160,13 @@ class StatEvent : public Event Tick repeat; public: - StatEvent(int _flags, Tick _when, Tick _repeat); + StatEvent(EventQueue *queue, int _flags, Tick _when, Tick _repeat); virtual void process(); virtual const char *description(); }; -StatEvent::StatEvent(int _flags, Tick _when, Tick _repeat) - : Event(&mainEventQueue, Stat_Event_Pri), +StatEvent::StatEvent(EventQueue *queue, int _flags, Tick _when, Tick _repeat) + : Event(queue, Stat_Event_Pri), flags(_flags), repeat(_repeat) { setFlags(AutoDelete); @@ -185,8 +185,10 @@ StatEvent::process() if (flags & Stats::Dump) DumpNow(); - if (flags & Stats::Reset) + if (flags & Stats::Reset) { + cprintf("Resetting stats!\n"); reset(); + } if (repeat) schedule(curTick + repeat); @@ -214,9 +216,12 @@ DumpNow() } void -SetupEvent(int flags, Tick when, Tick repeat) +SetupEvent(int flags, Tick when, Tick repeat, EventQueue *queue) { - new StatEvent(flags, when, repeat); + if (queue == NULL) + queue = &mainEventQueue; + + new StatEvent(queue, flags, when, repeat); } /* namespace Stats */ } diff --git a/src/sim/stat_control.hh b/src/sim/stat_control.hh index fb369f640..67f7cc491 100644 --- a/src/sim/stat_control.hh +++ b/src/sim/stat_control.hh @@ -34,6 +34,8 @@ #include <fstream> #include <list> +class EventQueue; + namespace Stats { enum { @@ -45,7 +47,7 @@ class Output; extern std::list<Output *> OutputList; void DumpNow(); -void SetupEvent(int flags, Tick when, Tick repeat = 0); +void SetupEvent(int flags, Tick when, Tick repeat = 0, EventQueue *queue = NULL); void InitSimStats(); diff --git a/src/sim/system.hh b/src/sim/system.hh index 11f4f0c90..3ab1d81f2 100644 --- a/src/sim/system.hh +++ b/src/sim/system.hh @@ -182,6 +182,7 @@ class System : public SimObject std::string kernel_path; std::string readfile; + std::string symbolfile; #endif }; |