//
// This file has been modified by Kevin Moore and Dan Nussbaum of the
// Scalable Systems Research Group at Sun Microsystems Laboratories
// (http://research.sun.com/scalable/) to support the Adaptive
// Transactional Memory Test Platform (ATMTP).  For information about
// ATMTP, see the GEMS website: http://www.cs.wisc.edu/gems/.
//
// Please send email to atmtp-interest@sun.com with feedback, questions, or
// to request future announcements about ATMTP.
//
// ----------------------------------------------------------------------
//
// File modification date: 2008-02-23
//
// ----------------------------------------------------------------------
//
// ATMTP is distributed as part of the GEMS software toolset and is
// available for use and modification under the terms of version 2 of the
// GNU General Public License.  The GNU General Public License is contained
// in the file $GEMS/LICENSE.
//
// Multifacet GEMS is free software; you can redistribute it and/or modify
// it under the terms of version 2 of the GNU General Public License as
// published by the Free Software Foundation.
//
// Multifacet GEMS is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with the Multifacet GEMS; if not, write to the Free Software Foundation,
// Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
//
// ----------------------------------------------------------------------
//

g_RANDOM_SEED: 1

g_DEADLOCK_THRESHOLD: 500000

// determines how many Simics cycles advance for every Ruby cycle
//  (does not apply when running Opal)
SIMICS_RUBY_MULTIPLIER: 4

// Ruby cycles between when a sequencer issues a request and it arrives at
// the L1 cache controller
//
// ** important ** this parameter determines the L2 hit latency when
//  using the SMP protocols with a combined L1/L2 controller (-cache.sm)
//
SEQUENCER_TO_CONTROLLER_LATENCY: 4


// When set to false, the L1 cache structures are probed for a hit in Sequencer.C
//  If a request hits, it is *not* issued to the cache controller
// When set to true, all processor data requests issue to cache controller
//
// ** important ** this parameter must be set to false for proper L1/L2 hit timing
//  for the SMP protocols with combined L1/L2 controllers (-cache.sm)
//
REMOVE_SINGLE_CYCLE_DCACHE_FAST_PATH: false


// When running with Opal in SMT configurations, this indicates the number of threads per physical processor
g_NUM_SMT_THREADS: 1


// Maximum number of requests (including SW prefetches) outstanding from
// the sequencer (Note: this also include items buffered in the store
// buffer)
g_SEQUENCER_OUTSTANDING_REQUESTS: 16


PROTOCOL_DEBUG_TRACE: true
DEBUG_FILTER_STRING: none
DEBUG_VERBOSITY_STRING: none
DEBUG_START_TIME: 0
DEBUG_OUTPUT_FILENAME: none


TRANSACTION_TRACE_ENABLED: false
USER_MODE_DATA_ONLY: false
PROFILE_HOT_LINES: false

PROFILE_ALL_INSTRUCTIONS: false
PRINT_INSTRUCTION_TRACE: false
g_DEBUG_CYCLE: 0
BLOCK_STC: false
PERFECT_MEMORY_SYSTEM: false
PERFECT_MEMORY_SYSTEM_LATENCY: 0
DATA_BLOCK: false


// *********************************************
// CACHE & MEMORY PARAMETERS
// *********************************************


L1_CACHE_ASSOC: 4
L1_CACHE_NUM_SETS_BITS: 8
L2_CACHE_ASSOC: 4
L2_CACHE_NUM_SETS_BITS: 16

// Configured for a 1 GB (2^30 byte) address space; a full 32-bit / 4 GB
// space would be 4294967296.
g_MEMORY_SIZE_BYTES: 1073741824 //4294967296
g_DATA_BLOCK_BYTES: 64
g_PAGE_SIZE_BYTES: 4096
g_REPLACEMENT_POLICY: PSEDUO_LRU // pseudo-LRU (the misspelling matches the identifier the config parser expects); currently, only other option is LRU

g_PROCS_PER_CHIP: 1


// set automatically
g_NUM_PROCESSORS: 0
g_NUM_L2_BANKS: 0
g_NUM_MEMORIES: 0

// The following group of parameters are calculated.  They must
// _always_ be left at zero.
g_NUM_CHIPS: 0
g_NUM_CHIP_BITS: 0
g_MEMORY_SIZE_BITS: 0
g_DATA_BLOCK_BITS: 0
g_PAGE_SIZE_BITS: 0
g_NUM_PROCESSORS_BITS: 0
g_PROCS_PER_CHIP_BITS: 0
g_NUM_L2_BANKS_BITS: 0
g_NUM_L2_BANKS_PER_CHIP: 0
g_NUM_L2_BANKS_PER_CHIP_BITS: 0
g_NUM_MEMORIES_BITS: 0
g_NUM_MEMORIES_PER_CHIP: 0
g_MEMORY_MODULE_BITS: 0
g_MEMORY_MODULE_BLOCKS: 0


// For certain CMP protocols, determines whether the lowest bits of a block address
// are used to index to a L2 cache bank or into the sets of a
// single bank
//        lowest                                                             highest
// true:   g_DATA_BLOCK_BITS | g_NUM_L2_BANKS_PER_CHIP_BITS | L2_CACHE_NUM_SETS_BITS
// false:  g_DATA_BLOCK_BITS | L2_CACHE_NUM_SETS_BITS | g_NUM_L2_BANKS_PER_CHIP_BITS
MAP_L2BANKS_TO_LOWEST_BITS: false



// TIMING PARAMETERS  -- many of these are protocol specific.  See SLICC files
//                       to determine where they apply

MEMORY_RESPONSE_LATENCY_MINUS_2: 158  // determines memory response latency
DIRECTORY_CACHE_LATENCY: 6
NULL_LATENCY: 1
ISSUE_LATENCY: 2
CACHE_RESPONSE_LATENCY: 12
L1_RESPONSE_LATENCY: 3
L2_RESPONSE_LATENCY: 6
L2_TAG_LATENCY: 6
DIRECTORY_LATENCY: 80
NETWORK_LINK_LATENCY: 1
COPY_HEAD_LATENCY: 4
ON_CHIP_LINK_LATENCY: 1
RECYCLE_LATENCY: 10
L2_RECYCLE_LATENCY: 5
TIMER_LATENCY: 10000
TBE_RESPONSE_LATENCY: 1
PERIODIC_TIMER_WAKEUPS: true


// constants used by CMP protocols
// cache bank access times
L1_REQUEST_LATENCY: 2
L2_REQUEST_LATENCY: 4


// Number of transitions each controller state machines can complete per cycle
// i.e. the number of ports to each controller
// L1cache is the sum of the L1I and L1D cache ports
L1CACHE_TRANSITIONS_PER_RUBY_CYCLE: 32
// Note: if SINGLE_ACCESS_L2_BANKS is enabled, this will probably enforce a
// much greater constraint on the concurrency of a L2 cache bank
L2CACHE_TRANSITIONS_PER_RUBY_CYCLE: 32
DIRECTORY_TRANSITIONS_PER_RUBY_CYCLE: 32
DMA_TRANSITIONS_PER_RUBY_CYCLE: 1


// Number of TBEs available for demand misses, ALL prefetches, and replacements
// used by one-level protocols
NUMBER_OF_TBES: 128
// two-level protocols
NUMBER_OF_L1_TBES: 32
NUMBER_OF_L2_TBES: 32

// ** INTERCONNECT PARAMETERS **
//
g_PRINT_TOPOLOGY: true
g_NETWORK_TOPOLOGY: HIERARCHICAL_SWITCH
g_CACHE_DESIGN: NUCA  // specifies file prefix for FILE_SPECIFIED topology
FAN_OUT_DEGREE: 4  // for HIERARCHICAL SWITCH topology

g_adaptive_routing: true
NUMBER_OF_VIRTUAL_NETWORKS: 6

// bandwidth unit is 1/1000 byte per cycle.  the following parameter is multiplied by
//  topology specific link weights
g_endpoint_bandwidth: 10000


// ** finite buffering parameters
//
// note: Finite buffering allows us to simulate a realistic virtual cut-through
// routed network with idealized flow control.  this feature is NOT heavily tested
FINITE_BUFFERING: false
// All message buffers within the network (i.e. the switch's input and
// output buffers) are set to the size specified below by the FINITE_BUFFER_SIZE
FINITE_BUFFER_SIZE: 3
// g_SEQUENCER_OUTSTANDING_REQUESTS (above) controls the number of demand requests
// issued by the sequencer.  The PROCESSOR_BUFFER_SIZE controls the
// number of requests in the mandatory queue
// Only affects the simulation when FINITE_BUFFERING is enabled
PROCESSOR_BUFFER_SIZE: 10
// The PROTOCOL_BUFFER_SIZE limits the size of all other buffers connecting to
// Controllers.  Controls the number of requests issued by the L2 HW Prefetcher
PROTOCOL_BUFFER_SIZE: 32
// ** end finite buffering parameters


// (deprecated)
// Allows only a single access to a multi-cycle L2 bank.
// Ensures the cache array is only accessed once for every L2_REQUEST_LATENCY
// number of cycles.  However the TBE table can be accessed in parallel.
SINGLE_ACCESS_L2_BANKS: true


// MOESI_CMP_token parameters (some might be deprecated)
g_FILTERING_ENABLED: false
g_DISTRIBUTED_PERSISTENT_ENABLED: true
g_RETRY_THRESHOLD: 1
g_DYNAMIC_TIMEOUT_ENABLED: true
g_FIXED_TIMEOUT_LATENCY: 300


// tester parameters (overridden by testerconfig.defaults)
//
//  injects random message delays to excite protocol races
RANDOMIZATION: false
g_SYNTHETIC_DRIVER: false
g_DETERMINISTIC_DRIVER: false
g_trace_warmup_length: 1000000
g_bash_bandwidth_adaptive_threshold: 0.75

g_tester_length: 0
// # of synthetic locks == 16 * 128
g_synthetic_locks: 2048
g_deterministic_addrs: 1
g_SpecifiedGenerator: DetermInvGenerator
g_callback_counter: 0
g_NUM_COMPLETIONS_BEFORE_PASS: 0
// parameters used by locking synthetic tester
g_think_time: 5
g_hold_time:  5
g_wait_time:  5

// Princeton Network (Garnet)
g_GARNET_NETWORK: true
g_DETAIL_NETWORK: false
g_NETWORK_TESTING: false
g_FLIT_SIZE: 16
g_NUM_PIPE_STAGES: 4
g_VCS_PER_CLASS: 4
g_BUFFER_SIZE: 4

///////////////////////////////////////////////////////////////////////////////
//
// MemoryControl:

// Basic cycle time of the memory controller.  This defines the period which is
// used as the memory channel clock period, the address bus bit time, and the
// memory controller cycle time.
// Assuming a 200 MHz memory channel (DDR-400, which transfers data at 400 MT/s),
// and a 2 GHz Ruby clock:
MEM_BUS_CYCLE_MULTIPLIER: 10

// How many internal banks in each DRAM chip:
BANKS_PER_RANK: 8

// How many sets of DRAM chips per DIMM.
RANKS_PER_DIMM: 2

// How many DIMMs per channel.  (Currently the only thing that
// matters is the number of ranks per channel, i.e. the product
// of this parameter and RANKS_PER_DIMM.  But if and when this is
// expanded to do FB-DIMMs, the distinction between the two
// will matter.)
DIMMS_PER_CHANNEL: 2

// Which bits to use to find the bank, rank, and DIMM numbers.
// You could choose to have the bank bits, rank bits, and DIMM bits
// in any order; here they are in that order.
// For these defaults, we assume this format for addresses:
//    Offset within line:     [5:0]
//    Memory controller #:    [7:6]
//    Bank:                  [10:8]
//    Rank:                    [11]
//    DIMM:                    [12]
//    Row addr / Col addr: [top:13]
// If you get these bits wrong, then some banks won't see any
// requests; you need to check for this in the .stats output.
BANK_BIT_0: 8
RANK_BIT_0: 11
DIMM_BIT_0: 12

// Number of entries max in each bank queues; set to whatever you want.
// If it is too small, you will see in the .stats file a lot of delay
// time spent in the common input queue.
BANK_QUEUE_SIZE: 12

// Bank cycle time (tRC) measured in memory cycles:
BANK_BUSY_TIME: 11

// This is how many memory address cycles to delay between reads to
// different ranks of DRAMs to allow for clock skew:
RANK_RANK_DELAY: 1

// This is how many memory address cycles to delay between a read
// and a write.  This is based on two things:  (1) the data bus is
// used one cycle earlier in the operation; (2) a round-trip wire
// delay from the controller to the DIMM that did the reading.
READ_WRITE_DELAY: 2

// Basic address and data bus occupancy.  If you are assuming a
// 16-byte-wide data bus (pairs of DIMMs side-by-side), then
// the data bus occupancy matches the address bus occupancy at
// two cycles.  But if the channel is only 8 bytes wide, you
// need to increase this bus occupancy time to 4 cycles.
BASIC_BUS_BUSY_TIME: 2

// Latency to returning read request or writeback acknowledgement.
// Measured in memory address cycles.
// This equals tRCD + CL + AL + (four bit times)
//                            + (round trip on channel)
//                            + (memory control internal delays)
// It's going to be an approximation, so pick what you like.
// Note:  The fact that latency is a constant, and does not depend on two
// low-order address bits, implies that our memory controller either:
// (a) tells the DRAM to read the critical word first, and sends the
// critical word first back to the CPU, or (b) waits until it has
// seen all four bit times on the data wires before sending anything
// back.  Either is plausible.  If (a), remove the "four bit times"
// term from the calculation above.
MEM_CTL_LATENCY: 12

// refresh_period is the number of memory cycles between refresh
// of row x in bank n and refresh of row x+1 in bank n.  For DDR-400,
// this is typically 7.8 usec for commercial systems; after 8192 such
// refreshes, this will have refreshed the whole chip in 64 msec.  If
// we have a 5 nsec memory clock, 7800 / 5 = 1560 cycles.  The memory
// controller will divide this by the total number of banks, and kick
// off a refresh to *somebody* every time that amount is counted
// down to zero. (There will be some rounding error there, but it
// should have minimal effect.)
REFRESH_PERIOD: 1560

// tFAW is a DRAM chip parameter which restricts the number of
// activates that can be done within a certain window of time.
// The window is specified here in terms of number of memory
// controller cycles.  At most four activates may be done during
// any such sliding window.  If this number is set to be no more
// than 4 * BASIC_BUS_BUSY_TIME, it will have no effect.
// It is typical in real systems for tFAW to have no effect, but
// it may be useful in throttling power.  Set to zero to ignore.
TFAW: 0

// By default, the memory controller uses round-robin to arbitrate
// between ready bank queues for use of the address bus.  If you
// wish to add randomness to the system, set this parameter to
// one instead, and it will restart the round-robin pointer at a
// random bank number each cycle.  If you want additional
// nondeterminism, set the parameter to some integer n >= 2, and
// it will in addition add a n% chance each cycle that a ready bank
// will be delayed an additional cycle.  Note that if you are
// in MEM_FIXED_DELAY mode (see below), MEM_RANDOM_ARBITRATE=1 will
// have no effect, but MEM_RANDOM_ARBITRATE=2 or more will.
MEM_RANDOM_ARBITRATE: 0

// The following parameter, if nonzero, will disable the memory
// controller and instead give every request a fixed latency.  The
// nonzero value specified here is measured in memory cycles and is
// just added to MEM_CTL_LATENCY.  It will also show up in the stats
// file as a contributor to memory_delays_stalled_at_head_of_bank_queue.
MEM_FIXED_DELAY: 0

// If instead of DDR-400, you wanted DDR-800, the channel gets faster
// but the basic operation of the DRAM core is unchanged.
// Busy times appear to double just because they are measured
// in smaller clock cycles.  The performance advantage comes because
// the bus busy times don't actually quite double.
// You would use something like these values:
//
// MEM_BUS_CYCLE_MULTIPLIER: 5
// BANK_BUSY_TIME: 22
// RANK_RANK_DELAY: 2
// READ_WRITE_DELAY: 3
// BASIC_BUS_BUSY_TIME: 3
// MEM_CTL_LATENCY: 20
// REFRESH_PERIOD: 3120