diff options
Diffstat (limited to 'ext')
104 files changed, 48876 insertions, 0 deletions
diff --git a/ext/mcpat/ARM_A9.xml b/ext/mcpat/ARM_A9.xml new file mode 100644 index 000000000..9289b6644 --- /dev/null +++ b/ext/mcpat/ARM_A9.xml @@ -0,0 +1,415 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="2"/> + <param name="number_of_L1Directories" value="2"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="40"/><!-- nm --> + <param name="target_core_clockrate" value="2000"/><!--MHz --> + <param name="temperature" value="380"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="1"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="1"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when approperiate --> + <param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors? 
--> + <param name="machine_bits" value="32"/> + <param name="virtual_address_width" value="32"/> + <param name="physical_address_width" value="32"/> + <param name="virtual_memory_page_size" value="4096"/> + <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is completely different from the page size in the main memory section. This page size is the size of + virtual memory from OS/Archi perspective; the page size in the main memory section is the actual physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="2000"/> + <!-- for cores with unknown timing, set to 0 to force off the opt flag --> + <param name="opt_local" value="1"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="7"/> + <param name="x86" value="0"/> + <param name="micro_opcode_width" value="8"/> + <param name="machine_type" value="0"/> + <!-- inorder/OoO; 1 inorder; 0 OOO--> + <param name="number_hardware_threads" value="1"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="2"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="2"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="4"/> + <param name="peak_issue_width" value="7"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="4"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="1"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="8,8"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="3"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="1"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT 
support 2 types of OoO cores, RS based and physical reg based--> + <param name="instruction_window_size" value="20"/> + <param name="fp_instruction_window_size" value="15"/> + <!-- Numbers need to be confirmed --> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="0"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="64"/> + <param name="phy_Regs_FRF_size" value="64"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="0"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="4"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="0"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="400000"/> + <stat name="int_instructions" value="200000"/> + <stat name="fp_instructions" value="100000"/> + <stat name="branch_instructions" value="100000"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="0"/> + <stat name="store_instructions" value="50000"/> + <stat name="committed_instructions" value="400000"/> + <stat name="committed_int_instructions" value="200000"/> + <stat name="committed_fp_instructions" value="100000"/> + <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="400000"/> + <stat name="ROB_writes" value="400000"/> + <!-- RAT accesses --> + <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic --> + <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic --> + <stat name="fp_rename_reads" value="200000"/> + <stat name="fp_rename_writes" value="100000"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="400000"/> + <stat name="inst_window_writes" value="400000"/> + <stat name="inst_window_wakeup_accesses" value="800000"/> + <stat name="fp_inst_window_reads" value="200000"/> + <stat name="fp_inst_window_writes" value="200000"/> + <stat name="fp_inst_window_wakeup_accesses" value="400000"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="600000"/> + <stat name="float_regfile_reads" value="100000"/> + <stat name="int_regfile_writes" value="300000"/> + <stat name="float_regfile_writes" value="50000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="300000"/> + <stat name="fpu_accesses" value="100000"/> + <stat name="mul_accesses" value="200000"/> + <stat name="cdb_alu_accesses" value="300000"/> + <stat name="cdb_mul_accesses" value="200000"/> + <stat name="cdb_fpu_accesses" value="100000"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. 
But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. + Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="1"/> + <stat name="LSU_duty_cycle" value="0.5"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.5"/> + <stat name="ALU_duty_cycle" value="1"/> + <stat name="MUL_duty_cycle" value="0.3"/> + <stat name="FPU_duty_cycle" value="0.3"/> + <stat name="ALU_cdb_duty_cycle" value="1"/> + <stat name="MUL_cdb_duty_cycle" value="0.3"/> + <stat name="FPU_cdb_duty_cycle" value="0.3"/> + <param name="number_of_BPT" value="2"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although 
writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" value="32768,8,4,1,10,10,32,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="4, 4, 4,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/><!--dual threads--> + <stat name="total_accesses" value="400000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/> + <param name="buffer_sizes" value="4, 4, 4, 4"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <param name="number_of_BTB" value="2"/> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="2048,4,2, 2, 1,3"/> <!--should be 4096 + 1024 --> + <!-- the parameters are capacity,block_width,associativity,bank, 
throughput w.r.t. core clock, latency w.r.t. core clock,--> + <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline --> + <stat name="write_accesses" value="0"/> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="2000"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.1"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="2000"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="2000"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="800"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="11824"/> + <stat name="write_accesses" value="11276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="2000"/> + <param name="type" value="0"/> + <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus + at each time only one node can send req --> + <param name="horizontal_nodes" value="1"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="1"/> + <param name="output_ports" value="1"/> + <!-- For bus the I/O ports should be 1 --> + <param name="flit_bits" value="128"/> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. + chip_coverage <=1 --> + <param name="link_routing_over_percentage" value="0.5"/> + <!-- Links can route over other components or occupy whole area. 
+ by default, 50% of the NoC global links route over other + components --> + <stat name="total_accesses" value="100000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="1"/> + </component> +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvements on MC will be added in later versions. 
--> + <param name="mc_clock" value="400"/><!--MHz--> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="llc_line_length" value="64"/><!--B--> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="66666"/> + <stat name="memory_reads" value="33333"/> + <stat name="memory_writes" value="33333"/> + <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates + the average power per MC or per channel. This is sufficient for most applications. + Further trackdown can be easily added in later versions. 
--> + </component> +<!--**********************************************************************--> + </component> +</component> diff --git a/ext/mcpat/ARM_A9_2000.xml b/ext/mcpat/ARM_A9_2000.xml new file mode 100644 index 000000000..c040e1be3 --- /dev/null +++ b/ext/mcpat/ARM_A9_2000.xml @@ -0,0 +1,463 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <!--Duty cycles in this file are set according to "ARM MPcore + ARchitecture performance Enhancement" in MPF Japan 2008 --> + <param name="number_of_cores" value="2"/> + <param name="number_of_L1Directories" value="2"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="22"/><!-- nm --> + <param name="target_core_clockrate" value="2000"/><!--MHz --> + <param name="temperature" value="340"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="1"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" 
value="1"/><!-- 0 no use; 1 use when approperiate --> + <param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors? --> + <param name="opt_clockrate" value="1"/> + <param name="machine_bits" value="32"/> + <param name="virtual_address_width" value="32"/> + <param name="physical_address_width" value="32"/> + <param name="virtual_memory_page_size" value="4096"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="2000"/> + <!-- for cores with unknow timing, set to 0 to force off the opt flag --> + <param name="opt_local" value="1"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="7"/> + <param name="x86" value="0"/> + <param name="micro_opcode_width" value="8"/> + <param name="machine_type" value="0"/> + <!-- inorder/OoO; 1 inorder; 0 OOO--> + <param name="number_hardware_threads" value="1"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="2"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="2"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="4"/> + <param name="peak_issue_width" value="7"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="4"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="1"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="8,8"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="3"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="1"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT 
support 2 types of OoO cores, RS based and physical reg based--> + <param name="instruction_window_size" value="20"/> + <param name="fp_instruction_window_size" value="15"/> + <!-- Numbers need to be confirmed --> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="0"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="64"/> + <param name="phy_Regs_FRF_size" value="64"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="0"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="4"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="0"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="4"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="400000"/> + <stat name="int_instructions" value="200000"/> + <stat name="fp_instructions" value="100000"/> + <stat name="branch_instructions" value="100000"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="0"/> + <stat name="store_instructions" value="50000"/> + <stat name="committed_instructions" value="400000"/> + <stat name="committed_int_instructions" value="200000"/> + <stat name="committed_fp_instructions" value="100000"/> + <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="400000"/> + <stat name="ROB_writes" value="400000"/> + <!-- RAT accesses --> + <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic --> + <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic --> + <stat name="fp_rename_reads" value="200000"/> + <stat name="fp_rename_writes" value="100000"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="400000"/> + <stat name="inst_window_writes" value="400000"/> + <stat name="inst_window_wakeup_accesses" value="800000"/> + <stat name="fp_inst_window_reads" value="200000"/> + <stat name="fp_inst_window_writes" value="200000"/> + <stat name="fp_inst_window_wakeup_accesses" value="400000"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="600000"/> + <stat name="float_regfile_reads" value="100000"/> + <stat name="int_regfile_writes" value="300000"/> + <stat name="float_regfile_writes" value="50000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="300000"/> + <stat name="fpu_accesses" value="100000"/> + <stat name="mul_accesses" value="200000"/> + <stat name="cdb_alu_accesses" value="300000"/> + <stat name="cdb_mul_accesses" value="200000"/> + <stat name="cdb_fpu_accesses" value="100000"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. 
But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. + Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.9"/> + <stat name="BR_duty_cycle" value="0.72"/><!--branch--> + <stat name="LSU_duty_cycle" value="0.71"/> + <stat name="MemManU_I_duty_cycle" value="0.9"/> + <stat name="MemManU_D_duty_cycle" value="0.71"/> + <stat name="ALU_duty_cycle" value="0.76"/> + <!-- (.78*2+.71)/3 --> + <stat name="MUL_duty_cycle" value="0.82"/> + <stat name="FPU_duty_cycle" value="0.0"/> + <stat name="ALU_cdb_duty_cycle" value="0.76"/> + <stat name="MUL_cdb_duty_cycle" value="0.82"/> + <stat name="FPU_cdb_duty_cycle" value="0.0"/> + <param name="number_of_BPT" value="2"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="4"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" 
value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" value="32768,8,4,1,10,10,32,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="4, 4, 4,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/><!--dual threads--> + <stat name="total_accesses" value="400000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/> + <param name="buffer_sizes" value="4, 4, 4, 4"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <param name="number_of_BTB" value="2"/> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="4096,4,2, 2, 1,1"/> + 
<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline --> + <stat name="write_accesses" value="0"/> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="2000"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="2"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.1"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.1"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="800"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="11824"/> + <stat name="write_accesses" value="11276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="2000"/> + <param name="type" value="0"/> + <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus + at each time only one node can send req --> + <param name="horizontal_nodes" value="1"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="1"/> + <param name="output_ports" value="1"/> + <!-- For bus the I/O ports should be 1 --> + <param name="flit_bits" value="64"/> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. + chip_coverage <=1 --> + <param name="link_routing_over_percentage" value="0.5"/> + <!-- Links can route over other components or occupy whole area. 
+ by default, 50% of the NoC global links route over other + components --> + <stat name="total_accesses" value="100000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="0.2"/> + </component> +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvements on MC will be added in later versions. 
--> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="400"/><!--MHz--> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer --> + <param name="number_mcs" value="1"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="66666"/> + <stat name="memory_reads" value="33333"/> + <stat name="memory_writes" value="33333"/> + <param name="withPHY" value="1"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the low bound of clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="1"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates + the average power per nic or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the low bound of clock rate of a PCIe per lane logic is 120MHz --> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="1"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates + the average power per pcie controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="1"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual flash controllers, instead, it takes the total accesses and calculates + the average power per fc or per channel. This is sufficient for most applications --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/ARM_A9_800.xml b/ext/mcpat/ARM_A9_800.xml new file mode 100644 index 000000000..fd7b21438 --- /dev/null +++ b/ext/mcpat/ARM_A9_800.xml @@ -0,0 +1,463 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <!--Duty cycles in this file are set according to "ARM MPcore + Architecture performance Enhancement" in MPF Japan 2008 --> + <param name="number_of_cores" value="2"/> + <param name="number_of_L1Directories" value="2"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param 
name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="32"/><!-- nm --> + <param name="target_core_clockrate" value="800"/><!--MHz --> + <param name="temperature" value="340"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="1"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="2"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when approperiate --> + <param name="Embedded" value="1"/><!-- Embedded processor like ARM or general purpose processors? --> + <param name="opt_clockrate" value="0"/> + <param name="machine_bits" value="32"/> + <param name="virtual_address_width" value="32"/> + <param name="physical_address_width" value="32"/> + <param name="virtual_memory_page_size" value="4096"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. 
this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="800"/> + <!-- for cores with unknow timing, set to 0 to force off the opt flag --> + <param name="opt_local" value="1"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="7"/> + <param name="x86" value="0"/> + <param name="micro_opcode_width" value="8"/> + <param name="machine_type" value="0"/> + <!-- inorder/OoO; 1 inorder; 0 OOO--> + <param name="number_hardware_threads" value="1"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="2"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="2"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="4"/> + <param name="peak_issue_width" value="7"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="4"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="1"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are 
reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="8,8"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="3"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="1"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and physical reg based--> + <param name="instruction_window_size" value="20"/> + <param name="fp_instruction_window_size" value="15"/> + <!-- Numbers need to be confirmed --> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="0"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. 
--> + <param name="phy_Regs_IRF_size" value="64"/> + <param name="phy_Regs_FRF_size" value="64"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="0"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. --> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="4"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="0"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="4"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="400000"/> + <stat name="int_instructions" value="200000"/> + <stat name="fp_instructions" value="100000"/> + <stat name="branch_instructions" value="100000"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="0"/> + <stat name="store_instructions" value="50000"/> + <stat 
name="committed_instructions" value="400000"/> + <stat name="committed_int_instructions" value="200000"/> + <stat name="committed_fp_instructions" value="100000"/> + <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. --> + <stat name="ROB_reads" value="400000"/> + <stat name="ROB_writes" value="400000"/> + <!-- RAT accesses --> + <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic --> + <stat name="rename_writes" value="400000"/><!--update dest regs. 
renaming logic --> + <stat name="fp_rename_reads" value="200000"/> + <stat name="fp_rename_writes" value="100000"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="400000"/> + <stat name="inst_window_writes" value="400000"/> + <stat name="inst_window_wakeup_accesses" value="800000"/> + <stat name="fp_inst_window_reads" value="200000"/> + <stat name="fp_inst_window_writes" value="200000"/> + <stat name="fp_inst_window_wakeup_accesses" value="400000"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="600000"/> + <stat name="float_regfile_reads" value="100000"/> + <stat name="int_regfile_writes" value="300000"/> + <stat name="float_regfile_writes" value="50000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="300000"/> + <stat name="fpu_accesses" value="100000"/> + <stat name="mul_accesses" value="200000"/> + <stat name="cdb_alu_accesses" value="300000"/> + <stat name="cdb_mul_accesses" value="200000"/> + <stat name="cdb_fpu_accesses" value="100000"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.9"/> + <stat name="BR_duty_cycle" value="0.72"/><!--branch--> + <stat name="LSU_duty_cycle" value="0.71"/> + <stat name="MemManU_I_duty_cycle" value="0.9"/> + <stat name="MemManU_D_duty_cycle" value="0.71"/> + <stat name="ALU_duty_cycle" value="0.76"/> + <!-- (.78*2+.71)/3 --> + <stat name="MUL_duty_cycle" value="0.82"/> + <stat name="FPU_duty_cycle" value="0.0"/> + <stat name="ALU_cdb_duty_cycle" value="0.76"/> + <stat name="MUL_cdb_duty_cycle" value="0.82"/> + <stat name="FPU_cdb_duty_cycle" value="0.0"/> + <param name="number_of_BPT" value="2"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="4"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests 
to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" value="32768,8,4,1,10,10,32,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="4, 4, 4,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/><!--dual threads--> + <stat name="total_accesses" value="400000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="32768,8,4,1, 10,10, 32,1 "/> + <param name="buffer_sizes" value="4, 4, 4, 4"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <param name="number_of_BTB" value="2"/> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="4096,4,2, 2, 1,1"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline --> + <stat name="write_accesses" value="0"/> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="2048,1,0,1, 4, 4, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="800"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="2"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.1"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.1"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="800"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="11824"/> + <stat name="write_accesses" value="11276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="800"/> + <param name="type" value="0"/> + <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus + at each time only one node can send req --> + <param name="horizontal_nodes" value="1"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="1"/> + <param name="output_ports" value="1"/> + <!-- For bus the I/O ports should be 1 --> + <param name="flit_bits" value="64"/> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. + chip_coverage <=1 --> + <param name="link_routing_over_percentage" value="0.5"/> + <!-- Links can route over other components or occupy whole area. 
+ by default, 50% of the NoC global links route over other + components --> + <stat name="total_accesses" value="100000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="0.2"/> + </component> +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvements on MC will be added in later versions. 
--> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="400"/><!--MHz--> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer --> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="66666"/> + <stat name="memory_reads" value="33333"/> + <stat name="memory_writes" value="33333"/> + <param name="withPHY" value="1"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the low bound of clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates + the average power per nic or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the low bound of clock rate of a PCIe per lane logic is 120MHz --> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates + the average power per pcie controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable reak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth --> + <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate + the average power per fc or per channel. This is sufficent for most application --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/Alpha21364.xml b/ext/mcpat/Alpha21364.xml new file mode 100644 index 000000000..c40c4f50b --- /dev/null +++ b/ext/mcpat/Alpha21364.xml @@ -0,0 +1,456 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="1"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="1"/> + <param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" 
value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="90"/><!-- nm --> + <param name="target_core_clockrate" value="1200"/><!--MHz --> + <param name="temperature" value="380"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="0"/><!-- 0 no use; 1 use when approperiate --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. 
this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="1200"/> + <!-- for cores with unknow timing, set to 0 to force off the opt flag --> + <param name="opt_local" value="1"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="7"/> + <param name="x86" value="0"/> + <param name="micro_opcode_width" value="8"/> + <param name="machine_type" value="0"/> + <!-- inorder/OoO; 1 inorder; 0 OOO--> + <param name="number_hardware_threads" value="1"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="4"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="4"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="4"/> + <param name="peak_issue_width" value="6"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="4"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="2"/> + <param name="prediction_width" value="1"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are 
reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="7,7"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="4"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="0"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="1"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and physical reg based--> + <param name="instruction_window_size" value="20"/> + <param name="fp_instruction_window_size" value="15"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. 
--> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="72"/> + <!-- rename logic --> + <param name="rename_scheme" value="1"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="0"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. --> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="32"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="32"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="2"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="400000"/> + <stat name="int_instructions" value="200000"/> + <stat name="fp_instructions" value="100000"/> + <stat name="branch_instructions" value="100000"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="0"/> + <stat name="store_instructions" value="50000"/> + <stat 
name="committed_instructions" value="400000"/> + <stat name="committed_int_instructions" value="200000"/> + <stat name="committed_fp_instructions" value="100000"/> + <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. --> + <stat name="ROB_reads" value="400000"/> + <stat name="ROB_writes" value="400000"/> + <!-- RAT accesses --> + <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic --> + <stat name="rename_writes" value="400000"/><!--update dest regs. 
renaming logic --> + <stat name="fp_rename_reads" value="200000"/> + <stat name="fp_rename_writes" value="100000"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="400000"/> + <stat name="inst_window_writes" value="400000"/> + <stat name="inst_window_wakeup_accesses" value="800000"/> + <stat name="fp_inst_window_reads" value="200000"/> + <stat name="fp_inst_window_writes" value="200000"/> + <stat name="fp_inst_window_wakeup_accesses" value="400000"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="600000"/> + <stat name="float_regfile_reads" value="100000"/> + <stat name="int_regfile_writes" value="300000"/> + <stat name="float_regfile_writes" value="50000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="300000"/> + <stat name="fpu_accesses" value="100000"/> + <stat name="mul_accesses" value="200000"/> + <stat name="cdb_alu_accesses" value="300000"/> + <stat name="cdb_mul_accesses" value="200000"/> + <stat name="cdb_fpu_accesses" value="100000"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="1"/> + <stat name="LSU_duty_cycle" value="1"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="1"/> + <stat name="ALU_duty_cycle" value="1"/> + <stat name="MUL_duty_cycle" value="0.3"/> + <stat name="FPU_duty_cycle" value="1"/> + <stat name="ALU_cdb_duty_cycle" value="1"/> + <stat name="MUL_cdb_duty_cycle" value="0.3"/> + <stat name="FPU_cdb_duty_cycle" value="1"/> + <param name="number_of_BPT" value="2"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="128"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param 
name="icache_config" value="65536,16,2,1,1,2,16,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="128"/><!--dual threads--> + <stat name="total_accesses" value="400000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="65536,16,2,1,1,3,16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <param name="number_of_BTB" value="2"/> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="6144,4,2,1, 1,3"/> <!--48Kbits --> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline --> + <stat name="write_accesses" value="0"/> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="4096,2,0,1,100,100, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="512,4,0,1,1, 1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="1200"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1835008,16, 8, 16, 32, 32, 12, 1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="1200"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="850"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="11824"/> + <stat name="write_accesses" value="11276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="1200"/> + <param name="type" value="1"/> + <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus + at each time only one node can send req --> + <param name="horizontal_nodes" value="1"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="1"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="8"/> + <param name="output_ports" value="7"/> + <!-- For bus the I/O ports should be 1 --> + <param name="virtual_channel_per_port" value="2"/> + <param name="input_buffer_entries_per_vc" value="128"/> + <param name="flit_bits" value="40"/> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. + chip_coverage <=1 --> + <param name="link_routing_over_percentage" value="1.0"/> + <!-- Links can route over other components or occupy whole area. 
+ by default, 50% of the NoC global links route over other + components --> + <stat name="total_accesses" value="100000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="1"/> + </component> +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="180"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvements on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="800"/><!--MHz--> + <param name="peak_transfer_rate" value="1600"/><!--MB/S--> + <param name="block_size" value="16"/><!--B--> + <param name="number_mcs" value="2"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="2"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="32"/> + <param name="addressbus_width" value="32"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="6666"/> + <stat name="memory_reads" value="3333"/> + <stat name="memory_writes" value="3333"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the low bound of clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates + the average power per NIC or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the low bound of clock rate of a PCIe per lane logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates + the average power per PCIe controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates + the average power per fc or per channel. This is sufficient for most applications --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/Niagara1.xml b/ext/mcpat/Niagara1.xml new file mode 100644 index 000000000..ae748e246 --- /dev/null +++ b/ext/mcpat/Niagara1.xml @@ -0,0 +1,442 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="8"/> + <param name="number_of_L1Directories" value="4"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="4"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homogeneous --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherence hardware --> + 
<param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="90"/><!-- nm --> + <param name="target_core_clockrate" value="1200"/><!--MHz --> + <param name="temperature" value="380"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="1200"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="9"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO--> + <!-- inorder/OoO --> + <param name="number_hardware_threads" value="4"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="1"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="1"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="1"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="1"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="0"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="6,6"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="1"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="0.125"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="16"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and 
physical reg based--> + <param name="instruction_window_size" value="16"/> + <param name="fp_instruction_window_size" value="16"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="80"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="8"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="32"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="32"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="800000"/> + <stat name="int_instructions" value="600000"/> + <stat name="fp_instructions" value="20000"/> + <stat name="branch_instructions" value="0"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="100000"/> + <stat name="store_instructions" value="100000"/> + <stat name="committed_instructions" value="800000"/> + <stat name="committed_int_instructions" value="600000"/> + <stat name="committed_fp_instructions" value="20000"/> + <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="263886"/> + <stat name="ROB_writes" value="263886"/> + <!-- RAT accesses --> + <stat name="rename_accesses" value="263886"/> + <stat name="fp_rename_accesses" value="263886"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="263886"/> + <stat name="inst_window_writes" value="263886"/> + <stat name="inst_window_wakeup_accesses" value="263886"/> + <stat name="fp_inst_window_reads" value="263886"/> + <stat name="fp_inst_window_writes" value="263886"/> + <stat name="fp_inst_window_wakeup_accesses" value="263886"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="1600000"/> + <stat name="float_regfile_reads" value="40000"/> + <stat name="int_regfile_writes" value="800000"/> + <stat name="float_regfile_writes" value="20000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="800000"/> + <stat name="fpu_accesses" value="10000"/> + <stat name="mul_accesses" value="100000"/> + <stat name="cdb_alu_accesses" value="1000000"/> + <stat name="cdb_mul_accesses" value="0"/> + <stat name="cdb_fpu_accesses" value="0"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.25"/> + <stat name="LSU_duty_cycle" value="0.25"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.25"/> + <stat name="ALU_duty_cycle" value="0.9"/> + <stat name="MUL_duty_cycle" value="0.5"/> + <stat name="FPU_duty_cycle" value="0.4"/> + <stat name="ALU_cdb_duty_cycle" value="0.9"/> + <stat name="MUL_cdb_duty_cycle" value="0.5"/> + <stat name="FPU_cdb_duty_cycle" value="0.4"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="800000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" 
value="16384,32,4,1,1,3,8,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="8192,4,2,1, 1,3"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. 
core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="1200"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="1200"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/> + <!-- consider 4-way bank interleaving for Niagara 1 --> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="1200"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="0"/> + <stat name="read_misses" value="0"/> + <stat name="write_misses" value="0"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.5"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,output_width, cache policy --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.35"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="1200"/> + <param name="type" value="1"/> + <!-- 1 NoC, O bus --> + <param name="horizontal_nodes" value="2"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="8"/> + <param name="output_ports" value="5"/> + <param name="virtual_channel_per_port" value="1"/> + <!-- input buffer; in classic routers only input ports need buffers --> + <param name="flit_bits" value="136"/> + <param name="input_buffer_entries_per_vc" value="2"/><!--VCs within the same ports share input buffers whose size is propotional to the number of VCs--> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
chip_coverage <=1 --> + <stat name="total_accesses" value="360000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="0.6"/> + </component> + +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memeory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvments on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="block_size" value="64"/><!--B--> + <param name="number_mcs" value="4"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates + the average power per MC or per channel. This is sufficient for most applications. + Finer-grained tracking can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the low bound of clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only has one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates + the average power per NIC or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the low bound of clock rate of a PCIe per lane logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates + the average power per PCIe controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates + the average power per fc or per channel. This is sufficient for most applications --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/Niagara1_sharing.xml b/ext/mcpat/Niagara1_sharing.xml new file mode 100644 index 000000000..93531aebd --- /dev/null +++ b/ext/mcpat/Niagara1_sharing.xml @@ -0,0 +1,400 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="64"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homogeneous --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache 
coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="22"/><!-- nm --> + <param name="target_core_clockrate" value="3500"/><!--MHz --> + <param name="temperature" value="360"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="3500"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="9"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO--> + <!-- inorder/OoO --> + <param name="number_hardware_threads" value="4"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="1"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="1"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="1"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="1"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="0"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="6,6"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="1"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="0.125"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="16"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and 
physical reg based--> + <param name="instruction_window_size" value="16"/> + <param name="fp_instruction_window_size" value="16"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="80"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="8"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="32"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="32"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="800000"/> + <stat name="int_instructions" value="600000"/> + <stat name="fp_instructions" value="20000"/> + <stat name="branch_instructions" value="0"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="100000"/> + <stat name="store_instructions" value="100000"/> + <stat name="committed_instructions" value="800000"/> + <stat name="committed_int_instructions" value="600000"/> + <stat name="committed_fp_instructions" value="20000"/> + <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="263886"/> + <stat name="ROB_writes" value="263886"/> + <!-- RAT accesses --> + <stat name="rename_accesses" value="263886"/> + <stat name="fp_rename_accesses" value="263886"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="263886"/> + <stat name="inst_window_writes" value="263886"/> + <stat name="inst_window_wakeup_accesses" value="263886"/> + <stat name="fp_inst_window_reads" value="263886"/> + <stat name="fp_inst_window_writes" value="263886"/> + <stat name="fp_inst_window_wakeup_accesses" value="263886"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="1600000"/> + <stat name="float_regfile_reads" value="40000"/> + <stat name="int_regfile_writes" value="800000"/> + <stat name="float_regfile_writes" value="20000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="800000"/> + <stat name="fpu_accesses" value="10000"/> + <stat name="mul_accesses" value="100000"/> + <stat name="cdb_alu_accesses" value="1000000"/> + <stat name="cdb_mul_accesses" value="0"/> + <stat name="cdb_fpu_accesses" value="0"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.25"/> + <stat name="LSU_duty_cycle" value="0.25"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.25"/> + <stat name="ALU_duty_cycle" value="0.9"/> + <stat name="MUL_duty_cycle" value="0.5"/> + <stat name="FPU_duty_cycle" value="0.4"/> + <stat name="ALU_cdb_duty_cycle" value="0.9"/> + <stat name="MUL_cdb_duty_cycle" value="0.5"/> + <stat name="FPU_cdb_duty_cycle" value="0.4"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="800000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" 
value="16384,32,4,1,1,3,8,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="8192,4,2,1, 1,3"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. 
core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer-related parameters are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- although there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer-related parameters are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- although there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/> + <!-- consider 4-way bank interleaving for Niagara 1 --> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="0"/> + <stat name="read_misses" value="0"/> + <stat name="write_misses" value="0"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.5"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,output_width, cache policy --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.35"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="3500"/> + <param name="type" value="1"/> + <!-- 1 NoC, 0 bus --> + <param name="horizontal_nodes" value="8"/> + <param name="vertical_nodes" value="8"/> + <param name="has_global_link" value="1"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- throughput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="5"/> + <param name="output_ports" value="5"/> + <param name="virtual_channel_per_port" value="1"/> + <!-- input buffer; in classic routers only input ports need buffers --> + <param name="flit_bits" value="256"/> + <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs--> + <param name="chip_coverage" value="1"/> + <!-- When multiple NoCs are present, one NoC will cover part of the whole chip. 
chip_coverage <=1 --> + <stat name="total_accesses" value="360000"/> + <!-- This is the total number of accesses within the whole network, not for each router --> + <stat name="duty_cycle" value="0.1"/> + </component> + +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvements on MC will be added in later versions. 
--> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="llc_line_length" value="64"/><!--B--> + <param name="number_mcs" value="4"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates + the average power per MC or per channel. This is sufficient for most applications. + Further trackdown can be easily added in later versions. 
--> + </component> +<!--**********************************************************************--> + </component> +</component> diff --git a/ext/mcpat/Niagara1_sharing_DC.xml b/ext/mcpat/Niagara1_sharing_DC.xml new file mode 100644 index 000000000..574ec8157 --- /dev/null +++ b/ext/mcpat/Niagara1_sharing_DC.xml @@ -0,0 +1,442 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="64"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="8"/> + <param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="22"/><!-- nm --> + <param name="target_core_clockrate" value="3500"/><!--MHz --> + <param name="temperature" value="360"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" 
value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="3500"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="9"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO--> + <!-- inorder/OoO --> + <param name="number_hardware_threads" value="4"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="1"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="1"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="1"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="1"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="0"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="6,6"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="1"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="0.125"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="16"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and 
physical reg based--> + <param name="instruction_window_size" value="16"/> + <param name="fp_instruction_window_size" value="16"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="80"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="8"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="32"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="32"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="800000"/> + <stat name="int_instructions" value="600000"/> + <stat name="fp_instructions" value="20000"/> + <stat name="branch_instructions" value="0"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="100000"/> + <stat name="store_instructions" value="100000"/> + <stat name="committed_instructions" value="800000"/> + <stat name="committed_int_instructions" value="600000"/> + <stat name="committed_fp_instructions" value="20000"/> + <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="263886"/> + <stat name="ROB_writes" value="263886"/> + <!-- RAT accesses --> + <stat name="rename_accesses" value="263886"/> + <stat name="fp_rename_accesses" value="263886"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="263886"/> + <stat name="inst_window_writes" value="263886"/> + <stat name="inst_window_wakeup_accesses" value="263886"/> + <stat name="fp_inst_window_reads" value="263886"/> + <stat name="fp_inst_window_writes" value="263886"/> + <stat name="fp_inst_window_wakeup_accesses" value="263886"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="1600000"/> + <stat name="float_regfile_reads" value="40000"/> + <stat name="int_regfile_writes" value="800000"/> + <stat name="float_regfile_writes" value="20000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="800000"/> + <stat name="fpu_accesses" value="10000"/> + <stat name="mul_accesses" value="100000"/> + <stat name="cdb_alu_accesses" value="1000000"/> + <stat name="cdb_mul_accesses" value="0"/> + <stat name="cdb_fpu_accesses" value="0"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.25"/> + <stat name="LSU_duty_cycle" value="0.25"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.25"/> + <stat name="ALU_duty_cycle" value="0.9"/> + <stat name="MUL_duty_cycle" value="0.5"/> + <stat name="FPU_duty_cycle" value="0.4"/> + <stat name="ALU_cdb_duty_cycle" value="0.9"/> + <stat name="MUL_cdb_duty_cycle" value="0.5"/> + <stat name="FPU_cdb_duty_cycle" value="0.4"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="800000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" 
value="16384,32,4,1,1,3,8,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="8192,4,2,1, 1,3"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. 
core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer-related parameters are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- although there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,9,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer-related parameters are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- although there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/> + <!-- consider 4-way bank interleaving for Niagara 1 --> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="0"/> + <stat name="read_misses" value="0"/> + <stat name="write_misses" value="0"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.5"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,output_width, cache policy --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.35"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="3500"/> + <param name="type" value="1"/> + <!-- 1 NoC, O bus --> + <param name="horizontal_nodes" value="8"/> + <param name="vertical_nodes" value="8"/> + <param name="has_global_link" value="1"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="5"/> + <param name="output_ports" value="5"/> + <param name="virtual_channel_per_port" value="1"/> + <!-- input buffer; in classic routers only input ports need buffers --> + <param name="flit_bits" value="256"/> + <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is propotional to the number of VCs--> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
chip_coverage <=1 --> + <stat name="total_accesses" value="360000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="0.1"/> + </component> + +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memeory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvments on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="block_size" value="64"/><!--B--> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the lower bound on the clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates + the average power per NIC or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the lower bound on the clock rate of the PCIe per-lane logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates + the average power per PCIe controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates + the average power per fc or per channel. This is sufficient for most applications --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/Niagara1_sharing_SBT.xml b/ext/mcpat/Niagara1_sharing_SBT.xml new file mode 100644 index 000000000..32eeca382 --- /dev/null +++ b/ext/mcpat/Niagara1_sharing_SBT.xml @@ -0,0 +1,455 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip a component if its number is set to 0 --> + <param name="number_of_cores" value="64"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="64"/> <!-- This number is how many L2 clusters there are; each cluster can have multiple banks/ports --> + <param name="number_of_L3s" value="0"/> <!-- This number is how many L3 clusters there are --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" 
value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="22"/><!-- nm --> + <param name="target_core_clockrate" value="3500"/><!--MHz --> + <param name="temperature" value="360"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="3500"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="9"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO--> + <!-- inorder/OoO --> + <param name="number_hardware_threads" value="4"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="1"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="1"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="1"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="1"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="0"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="6,6"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="1"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="0.125"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="16"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and 
physical reg based--> + <param name="instruction_window_size" value="16"/> + <param name="fp_instruction_window_size" value="16"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="80"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="8"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="32"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="32"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="800000"/> + <stat name="int_instructions" value="600000"/> + <stat name="fp_instructions" value="20000"/> + <stat name="branch_instructions" value="0"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="100000"/> + <stat name="store_instructions" value="100000"/> + <stat name="committed_instructions" value="800000"/> + <stat name="committed_int_instructions" value="600000"/> + <stat name="committed_fp_instructions" value="20000"/> + <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="263886"/> + <stat name="ROB_writes" value="263886"/> + <!-- RAT accesses --> + <stat name="rename_accesses" value="263886"/> + <stat name="fp_rename_accesses" value="263886"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="263886"/> + <stat name="inst_window_writes" value="263886"/> + <stat name="inst_window_wakeup_accesses" value="263886"/> + <stat name="fp_inst_window_reads" value="263886"/> + <stat name="fp_inst_window_writes" value="263886"/> + <stat name="fp_inst_window_wakeup_accesses" value="263886"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="1600000"/> + <stat name="float_regfile_reads" value="40000"/> + <stat name="int_regfile_writes" value="800000"/> + <stat name="float_regfile_writes" value="20000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="800000"/> + <stat name="fpu_accesses" value="10000"/> + <stat name="mul_accesses" value="100000"/> + <stat name="cdb_alu_accesses" value="1000000"/> + <stat name="cdb_mul_accesses" value="0"/> + <stat name="cdb_fpu_accesses" value="0"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.25"/> + <stat name="LSU_duty_cycle" value="0.25"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.25"/> + <stat name="ALU_duty_cycle" value="0.9"/> + <stat name="MUL_duty_cycle" value="0.5"/> + <stat name="FPU_duty_cycle" value="0.4"/> + <stat name="ALU_cdb_duty_cycle" value="0.9"/> + <stat name="MUL_cdb_duty_cycle" value="0.5"/> + <stat name="FPU_cdb_duty_cycle" value="0.4"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="800000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" 
value="16384,32,4,1,1,3,8,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="8192,4,2,1, 1,3"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 
1 directory cache, 2 static-cache bank --> + <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank --> + <param name="Dir_config" value="8388608,9,0,1,100, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,8"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="merged_dir" value="1"/><!--if static bank tag is used as the directory --> + <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/> + <!-- consider 4-way bank interleaving for Niagara 1 --> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="0"/> + <stat name="read_misses" value="0"/> + <stat name="write_misses" value="0"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.5"/> + <stat name="coherent_read_accesses" value="400000"/> + <stat name="coherent_write_accesses" value="0"/> + <stat name="coherent_read_misses" value="400000"/> + <stat name="coherent_write_misses" value="0"/> + <stat name="dir_duty_cycle" value="0.5"/> + + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,output_width, cache policy --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.35"/> + <param name="Merged_dir" value="1"/><!--if static bank tag is used as the directory --> + <stat name="coherent_read_accesses" value="400000"/> + <stat name="coherent_write_accesses" value="0"/> + <stat name="coherent_read_misses" value="400000"/> + <stat name="coherent_write_misses" value="0"/> + <stat name="dir_duty_cycle" value="0.5"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="3500"/> + <param name="type" value="1"/> + <!-- 1 NoC, O bus --> + <param name="horizontal_nodes" value="8"/> + <param name="vertical_nodes" value="8"/> + <param name="has_global_link" value="1"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="5"/> + <param name="output_ports" value="5"/> + <param name="virtual_channel_per_port" value="1"/> + <!-- input buffer; in classic routers only input ports need buffers --> + <param name="flit_bits" value="256"/> + <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is propotional to the number of VCs--> + <param 
name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 --> + <stat name="total_accesses" value="360000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="0.1"/> + </component> + +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memeory controllers are for DDR(2,3...) 
DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvments on MC will be added in later versions. --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="block_size" value="64"/><!--B--> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the lower bound on the clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates + the average power per NIC or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the lower bound on the clock rate of the per-lane PCIe logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual pcie controllers; instead, it takes the total accesses and calculates + the average power per pcie controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable reak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth --> + <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate + the average power per fc or per channel. This is sufficent for most application --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/Niagara1_sharing_ST.xml b/ext/mcpat/Niagara1_sharing_ST.xml new file mode 100644 index 000000000..3f0573fe9 --- /dev/null +++ b/ext/mcpat/Niagara1_sharing_ST.xml @@ -0,0 +1,443 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="64"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="1"/> + <param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" 
value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="22"/><!-- nm --> + <param name="target_core_clockrate" value="3500"/><!--MHz --> + <param name="temperature" value="360"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="3500"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="9"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO--> + <!-- inorder/OoO --> + <param name="number_hardware_threads" value="4"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="1"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="1"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="1"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="1"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="0"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="6,6"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="1"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="0.125"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="16"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and 
physical reg based--> + <param name="instruction_window_size" value="16"/> + <param name="fp_instruction_window_size" value="16"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="80"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="8"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="32"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="32"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="800000"/> + <stat name="int_instructions" value="600000"/> + <stat name="fp_instructions" value="20000"/> + <stat name="branch_instructions" value="0"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="100000"/> + <stat name="store_instructions" value="100000"/> + <stat name="committed_instructions" value="800000"/> + <stat name="committed_int_instructions" value="600000"/> + <stat name="committed_fp_instructions" value="20000"/> + <stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="263886"/> + <stat name="ROB_writes" value="263886"/> + <!-- RAT accesses --> + <stat name="rename_accesses" value="263886"/> + <stat name="fp_rename_accesses" value="263886"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="263886"/> + <stat name="inst_window_writes" value="263886"/> + <stat name="inst_window_wakeup_accesses" value="263886"/> + <stat name="fp_inst_window_reads" value="263886"/> + <stat name="fp_inst_window_writes" value="263886"/> + <stat name="fp_inst_window_wakeup_accesses" value="263886"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="1600000"/> + <stat name="float_regfile_reads" value="40000"/> + <stat name="int_regfile_writes" value="800000"/> + <stat name="float_regfile_writes" value="20000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="800000"/> + <stat name="fpu_accesses" value="10000"/> + <stat name="mul_accesses" value="100000"/> + <stat name="cdb_alu_accesses" value="1000000"/> + <stat name="cdb_mul_accesses" value="0"/> + <stat name="cdb_fpu_accesses" value="0"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to infer the implicit accesses + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.25"/> + <stat name="LSU_duty_cycle" value="0.25"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.25"/> + <stat name="ALU_duty_cycle" value="0.9"/> + <stat name="MUL_duty_cycle" value="0.5"/> + <stat name="FPU_duty_cycle" value="0.4"/> + <stat name="ALU_cdb_duty_cycle" value="0.9"/> + <stat name="MUL_cdb_duty_cycle" value="0.5"/> + <stat name="FPU_cdb_duty_cycle" value="0.4"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="800000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there are no write requests to itlb, although writes happen to itlb after a miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there are no write requests to icache, although writes happen to it after a miss, + which is actually a replacement --> + <param name="icache_config" 
value="16384,32,4,1,1,3,8,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="8192,16,4,1,1,3,16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="8192,4,2,1, 1,3"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 
1 directory cache, 2 static-cache bank --> + <param name="Dir_config" value="2048,1,0,1, 4, 4,8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank --> + <param name="Dir_config" value="8388608,9,0,1,100, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3500"/> + <param name="ports" value="0,0,8"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + <stat name="duty_cycle" value="0.45"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/> + <param name="Merged_dir" value="1"/> + <!-- consider 4-way bank interleaving for Niagara 1 --> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="0"/> + <stat name="read_misses" value="0"/> + <stat name="write_misses" value="0"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.5"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,output_width, cache policy --> + <param name="Merged_dir" value="1"/> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.35"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="3500"/> + <param name="type" value="1"/> + <!-- 1 NoC, O bus --> + <param name="horizontal_nodes" value="8"/> + <param name="vertical_nodes" value="8"/> + <param name="has_global_link" value="1"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="5"/> + <param name="output_ports" value="5"/> + <param name="virtual_channel_per_port" value="1"/> + <!-- input buffer; in classic routers only input ports need buffers --> + <param name="flit_bits" value="256"/> + <param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is propotional to the number of VCs--> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
chip_coverage <=1 --> + <stat name="total_accesses" value="360000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="0.1"/> + </component> + +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memeory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvments on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="block_size" value="64"/><!--B--> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the lower bound on the clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates + the average power per NIC or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the lower bound on the clock rate of the per-lane PCIe logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual pcie controllers; instead, it takes the total accesses and calculates + the average power per pcie controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates + the average power per fc or per channel. This is sufficient for most applications --> + </component> +<!--**********************************************************************--> + </component> +</component>
\ No newline at end of file diff --git a/ext/mcpat/Niagara2.xml b/ext/mcpat/Niagara2.xml new file mode 100644 index 000000000..c7e311ff8 --- /dev/null +++ b/ext/mcpat/Niagara2.xml @@ -0,0 +1,438 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="8"/> + <param name="number_of_L1Directories" value="8"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="8"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="65"/><!-- nm --> + <param name="target_core_clockrate" value="1400"/><!--MHz --> + <param name="temperature" value="380"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <stat name="total_cycles" value="100000"/> + <stat 
name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="1400"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="9"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO--> + <!-- inorder/OoO --> + <param name="number_hardware_threads" value="4"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="1"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="1"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="1"/> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="1"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="1"/> + <param name="prediction_width" value="0"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="2,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="8,8"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="2"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="0"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="1"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and physical 
reg based--> + <param name="instruction_window_size" value="16"/> + <param name="fp_instruction_window_size" value="16"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="80"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="32"/> + <param name="archi_Regs_FRF_size" value="32"/> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="80"/> + <param name="phy_Regs_FRF_size" value="80"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="8"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="64"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="64"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="1"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="32"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="1600000"/> + <stat name="int_instructions" value="1200000"/> + <stat name="fp_instructions" value="40000"/> + <stat name="branch_instructions" value="0"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="200000"/> + <stat name="store_instructions" value="200000"/> + <stat name="committed_instructions" value="1600000"/> + <stat name="committed_int_instructions" value="1200000"/> + <stat name="committed_fp_instructions" value="40000"/> + <stat name="pipeline_duty_cycle" value="0.5"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. 
--> + <stat name="ROB_reads" value="263886"/> + <stat name="ROB_writes" value="263886"/> + <!-- RAT accesses --> + <stat name="rename_accesses" value="263886"/> + <stat name="fp_rename_accesses" value="263886"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="263886"/> + <stat name="inst_window_writes" value="263886"/> + <stat name="inst_window_wakeup_accesses" value="263886"/> + <stat name="fp_inst_window_reads" value="263886"/> + <stat name="fp_inst_window_writes" value="263886"/> + <stat name="fp_inst_window_wakeup_accesses" value="263886"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="3200000"/> + <stat name="float_regfile_reads" value="80000"/> + <stat name="int_regfile_writes" value="1600000"/> + <stat name="float_regfile_writes" value="40000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="1600000"/> + <stat name="fpu_accesses" value="10000"/> + <stat name="mul_accesses" value="100000"/> + <stat name="cdb_alu_accesses" value="1200000"/> + <stat name="cdb_mul_accesses" value="0"/> + <stat name="cdb_fpu_accesses" value="0"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="0.5"/> + <stat name="LSU_duty_cycle" value="0.25"/> + <stat name="MemManU_I_duty_cycle" value="0.5"/> + <stat name="MemManU_D_duty_cycle" value="0.25"/> + <stat name="ALU_duty_cycle" value="0.9"/> + <stat name="MUL_duty_cycle" value="0"/> + <stat name="FPU_duty_cycle" value="0.6"/> + <!--FPU also handles Mul/div --> + <stat name="ALU_cdb_duty_cycle" value="0.9"/> + <stat name="MUL_cdb_duty_cycle" value="0"/> + <stat name="FPU_cdb_duty_cycle" value="0.6"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="64"/> + <stat name="total_accesses" value="800000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + 
<param name="icache_config" value="16384,32,8,1,1,7,8,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="128"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="8192,16,4,1, 1,3, 16,0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="8192,4,2,1, 1,3"/> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 
1 directory cache --> + <param name="Dir_config" value="1024,2,0,1,1,1, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="1400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="1400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="524228,64,16,1, 8,23, 64,1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="1400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="400000"/> + <stat name="write_accesses" value="0"/> + <stat name="read_misses" value="0"/> + <stat name="write_misses" value="0"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="1048576,64,16,1, 2,100, 64, 1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="3500"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="0.35"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="1400"/> + <param name="horizontal_nodes" value="2"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="9"/> + <param name="output_ports" value="8"/> + <param name="virtual_channel_per_port" value="1"/> + <!-- input buffer; in classic routers only input ports need buffers --> + <param name="flit_bits" value="136"/> + <param name="input_buffer_entries_per_vc" value="16"/><!--VCs within the same ports share input buffers whose size is propotional to the number of VCs--> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. 
chip_coverage <=1 --> + <stat name="total_accesses" value="160000"/> + <!-- This is the number of total accesses within the whole network, not for each router --> + <stat name="duty_cycle" value="0.1"/> + </component> + +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is the clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvements on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="400"/><!--MHz--> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer --> + <param name="number_mcs" value="4"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="66666"/> + <stat name="memory_reads" value="33333"/> + <stat name="memory_writes" value="33333"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates + the average power per MC or per channel. This is sufficient for most applications. + Finer-grained tracking can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the low bound of clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="2"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates + the average power per nic or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the low bound of clock rate of a PCIe per lane logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="1"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates + the average power per pcie controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculates + the average power per fc or per channel. This is sufficient for most applications --> + </component> +<!--**********************************************************************--> + + </component> +</component> diff --git a/ext/mcpat/Penryn.xml b/ext/mcpat/Penryn.xml new file mode 100644 index 000000000..fe9715b77 --- /dev/null +++ b/ext/mcpat/Penryn.xml @@ -0,0 +1,456 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="2"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports --> + <param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent --> + <param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param 
name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="45"/><!-- nm --> + <param name="target_core_clockrate" value="3700"/><!--MHz --> + <param name="temperature" value="380"/> <!-- Kelvin --> + <param name="number_cache_levels" value="2"/> + <param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when approperiate --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is complete different from the page size in Main memo secction. 
this page size is the size of + virtual memory from OS/Archi perspective; the page size in Main memo secction is the actuall physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="3700"/> + <!-- for cores with unknow timing, set to 0 to force off the opt flag --> + <param name="opt_local" value="1"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="16"/> + <param name="x86" value="1"/> + <param name="micro_opcode_width" value="8"/> + <param name="machine_type" value="0"/> + <!-- inorder/OoO; 1 inorder; 0 OOO--> + <param name="number_hardware_threads" value="1"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. BTB ports always equals to fetch ports since + branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="4"/> + <!-- fetch_width determins the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="4"/> + <!-- decode_width determins the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="4"/> + <param name="peak_issue_width" value="6"/><!--As shown in Wiki figure which has max 5 ports, store data/address is modeled + as a single port.--> + <!-- issue_width determins the number of ports of Issue window and other logic + as in the complexity effective proccessors paper; issue_width==dispatch_width --> + <param name="commit_width" value="4"/> + <!-- commit_width determins the number of ports of register files --> + <param name="fp_issue_width" value="2"/> + <param name="prediction_width" value="1"/> + <!-- number of branch instructions can be predicted simultannouesl--> + <!-- 
Current version of McPAT does not distinguish int and floating point pipelines + Theses parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="14,14"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="6"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="2"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/><!--Inst. + micro-op --> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="1"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT support 2 types of OoO cores, RS based and physical reg based--> + <param name="instruction_window_size" value="32"/> + <param name="fp_instruction_window_size" value="32"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="96"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR --> + <param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM --> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. 
--> + <param name="phy_Regs_IRF_size" value="256"/> + <param name="phy_Regs_FRF_size" value="256"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme have the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="0"/> + <!-- how many windows in the windowed register file, sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued whether inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to exeute out-of-order though. --> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="96"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="48"/> + <!-- number of ports refer to sustainable concurrent memory accesses --> + <param name="memory_ports" value="2"/> + <!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="64"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for senity check --> + <!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops --> + <stat name="total_instructions" value="400000"/> + <stat name="int_instructions" value="200000"/> + <stat name="fp_instructions" value="100000"/> + <stat name="branch_instructions" value="100000"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="0"/> + <stat name="store_instructions" value="50000"/> + <stat 
name="committed_instructions" value="400000"/> + <stat name="committed_int_instructions" value="200000"/> + <stat name="committed_fp_instructions" value="100000"/> + <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous --> + <!-- the following cycle stats are used for heterogeneouse cores only, + please ignore them if homogeneouse cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of commited instructions. --> + <stat name="ROB_reads" value="400000"/> + <stat name="ROB_writes" value="400000"/> + <!-- RAT accesses --> + <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic --> + <stat name="rename_writes" value="400000"/><!--update dest regs. 
renaming logic --> + <stat name="fp_rename_reads" value="200000"/> + <stat name="fp_rename_writes" value="100000"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="400000"/> + <stat name="inst_window_writes" value="400000"/> + <stat name="inst_window_wakeup_accesses" value="800000"/> + <stat name="fp_inst_window_reads" value="200000"/> + <stat name="fp_inst_window_writes" value="200000"/> + <stat name="fp_inst_window_wakeup_accesses" value="400000"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="600000"/> + <stat name="float_regfile_reads" value="100000"/> + <stat name="int_regfile_writes" value="300000"/> + <stat name="float_regfile_writes" value="50000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of Windowes switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="300000"/> + <stat name="fpu_accesses" value="100000"/> + <stat name="mul_accesses" value="200000"/> + <stat name="cdb_alu_accesses" value="300000"/> + <stat name="cdb_mul_accesses" value="200000"/> + <stat name="cdb_fpu_accesses" value="100000"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. 
+ Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="1"/> + <stat name="LSU_duty_cycle" value="0.5"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.5"/> + <stat name="ALU_duty_cycle" value="1"/> + <stat name="MUL_duty_cycle" value="0.3"/> + <stat name="FPU_duty_cycle" value="0.3"/> + <stat name="ALU_cdb_duty_cycle" value="1"/> + <stat name="MUL_cdb_duty_cycle" value="0.3"/> + <stat name="FPU_cdb_duty_cycle" value="0.3"/> + <param name="number_of_BPT" value="2"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="128"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + 
<param name="icache_config" value="32768,32,8,1,4,4,32,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="256"/><!--dual threads--> + <stat name="total_accesses" value="400000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="32768,32,8,1, 4,6, 32,1 "/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <param name="number_of_BTB" value="2"/> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 --> + <!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline --> + <stat name="write_accesses" value="0"/> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="4096,2,0,1,100,100, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="6291456,64, 16, 8, 8, 23, 32, 1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3700"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="850"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="11824"/> + <stat name="write_accesses" value="11276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="3400"/> + <param name="type" value="0"/> + <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus + at each time only one node can send req --> + <param name="horizontal_nodes" value="1"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="1"/> + <param name="output_ports" value="1"/> + <!-- For bus the I/O ports should be 1 --> + <param name="flit_bits" value="256"/> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. + chip_coverage <=1 --> + <param name="link_routing_over_percentage" value="0.5"/> + <!-- Links can route over other components or occupy whole area. 
+ by default, 50% of the NoC global links routes over other + components --> + <stat name="total_accesses" value="100000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="1"/> + </component> +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memeory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvments on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="block_size" value="64"/><!--B--> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate + the average power per MC or per channel. This is sufficent for most application. + Further trackdown can be easily added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the low bound of clock rate of a 10Gb MAC is 150Mhz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achived load to total achivable bandwidth --> + <!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate + the average power per nic or per channel. This is sufficent for most application. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the low bound of clock rate of a PCIe per lane logic is 120Mhz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth --> + <!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate + the average power per pcie controller or per channel. This is sufficent for most application. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable reak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth --> + <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate + the average power per fc or per channel. This is sufficent for most application --> + </component> +<!--**********************************************************************--> + + </component> +</component> + diff --git a/ext/mcpat/README b/ext/mcpat/README new file mode 100644 index 000000000..4887b1037 --- /dev/null +++ b/ext/mcpat/README @@ -0,0 +1,226 @@ + __ __ ____ _ _____ ____ _ +| \/ | ___| _ \ / \|_ _| | __ ) ___| |_ __ _ +| |\/| |/ __| |_) / _ \ | | | _ \ / _ \ __|/ _` | +| | | | (__| __/ ___ \| | | |_) | __/ |_| (_| | +|_| |_|\___|_| /_/ \_\_| |____/ \___|\__|\__,_| + +McPAT: Multicore Power, Area, and Timing +Current version 0.8Beta +=============================== + +McPAT is an architectural modeling tool for chip multiprocessors (CMP) +The main focus of McPAT is accurate power and area +modeling, and a target clock rate is used as a design constraint. +McPAT performs automatic extensive search to find optimal designs +that satisfy the target clock frequency. + +For complete documentation of the McPAT, please refer McPAT 1.0 +technical report and the following paper, +"McPAT: An Integrated Power, Area, and Timing Modeling + Framework for Multicore and Manycore Architectures", +that appears in MICRO 2009. Please cite the paper, if you use +McPAT in your work. 
The bibtex entry is provided below for your convenience. + + @inproceedings{mcpat:micro, + author = {Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay B. Brockman and Dean M. Tullsen and Norman P. Jouppi}, + title = "{McPAT: An Integrated Power, Area, and Timing Modeling Framework for Multicore and Manycore Architectures}", + booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture}, + year = {2009}, + pages = {469--480}, + } + +Current McPAT is in its beta release. +List of features of beta release +=============================== +The following is the list of features supported by the tool. + +* Power, area, and timing models for CMPs with: + Inorder cores both single and multithreaded + OOO cores both single and multithreaded + Shared/coherent caches with directory hardware: + including directory cache, shadowed tag directory + and static bank mapped tag directory + Network-on-Chip + On-chip memory controllers + +* Internal models are based on real modern processors: + Inorder models are based on Sun Niagara family + OOO models are based on Intel P6 for reservation + station based OOO cores, and on Intel Netburst and + Alpha 21264 for physical register file based OOO cores. + +* Leakage power modeling considers both sub-threshold leakage + and gate leakage power. The impact of operating temperature + on both types of leakage power is considered. Longer channel devices + that can reduce leakage significantly with modest performance + penalty are also modeled. + +* McPAT supports automatic extensive search to find optimal designs + that satisfy the target clock frequency. The timing constraints + include both throughput and latency. + +* Interconnect model with different delay, power, and area + properties, as well as both the aggressive and conservative + interconnect projections on wire technologies. 
+ +* All process specific values used by the McPAT are obtained + from ITRS and currently, the McPAT supports 90nm, 65nm, 45nm, + 32nm, and 22nm technology nodes. At 32nm and 22nm nodes, SOI + and DG devices are used. After 45nm, Hi-K metal gates are used. + +How to use the tool? +==================== + +McPAT takes input parameters from an XML-based interface, +then it computes area and peak power of the target processor. +Please note that the peak power is the absolute worst case power, +which could be even higher than TDP. + +1. Steps to run McPAT: + -> define the target processor using inorder.xml or OOO.xml + -> run the "mcpat" binary: + ./mcpat -infile <*.xml> -print_level < level of detailed output> + ./mcpat -h (or mcpat --help) will show the quick help message. + + Rather than being hardwired to certain simulators, McPAT + uses an XML-based interface to enable easy integration + with various performance simulators. Our collaborator, + Richard Strong, at University of California, San Diego, + designed an experimental parser for the M5 simulator, aiming for + streamlining the integration of McPAT and M5. Please check the M5 + repository/ for the latest version of the parser. + +2. Optimize: + McPAT will try its best to satisfy the target clock rate. + When it cannot find a valid solution, it gives out warnings, + while still giving a solution that is closest to the timing + constraints and calculating power based on it. The optimization + will lead to larger power/area numbers for higher target clock + rates. McPAT also provides the option "-opt_for_clk" to turn on + ("-opt_for_clk 1") and off this strict optimization for the + timing constraint. When it is off, McPAT always optimizes + components for ED^2P without worrying about meeting the + target clock frequency. By turning it off, the computation time + can be reduced, which suits situations where the target clock rate + is conservative. + +3. The output: + McPAT outputs results in a hierarchical manner. 
Increasing + the "-print_level" will show detailed results inside each + component. For each component, major parts are shown, and associated + pipeline registers/control logic are added up in total area/power of each + component. In general, McPAT does not model the area/overhead of the pad + frame used in a processor die. + +4. How to use the XML interface for McPAT + 4.1 Set up the parameters + Parameters of target designs need to be set in the *.xml file for + entries tagged as "param". McPAT has very detailed parameter settings. + Please remove the structure parameter from the file if you want + to use the default values. Otherwise, the parameters in the xml file + will override the default values. + + 4.2 Pass the statistics + There are two options to get the correct stats: a) the performance + simulator can capture all the stats in detail and pass them to McPAT; + b). Performance simulator can only capture partial stats and pass + them to McPAT, while McPAT can reason about the complete stats using + the partial information and the configuration. Therefore, there is + some overlap among the stats. + + 4.3 Interface XML file structures (PLEASE READ!) + The XML is hierarchical from processor level to micro-architecture + level. McPAT supports both heterogeneous and homogeneous manycore processors. + + 1). For heterogeneous processor setup, each component (core, NoC, cache, + and etc) must have its own instantiations (core0, core1, ..., coreN). + Each instantiation will have different parameters as well as its stats. + Thus, the XML file must have multiple "instantiation" of each type of + heterogeneous components and the corresponding hetero flags must be set + in the XML file. Then the stats in the XML should be the stats of "a" instantiation + (e.g. "a" cores). The reported runtime dynamic is of a single instantiation + (e.g. "a" cores). Since the stats for each (e.g. "a" cores) may be different, + we will see a whole list of (e.g. 
"a" cores) with different dynamic power, + and total power is just a sum of them. + + 2). For homogeneous processors, the same method for heterogeneous can + also be used by treating all homogeneous instantiations as heterogeneous. + However, a preferred approach is to use a single representative for all + the same components (e.g. core0 to represent all cores) and set the + processor to have homogeneous components (e.g. <param name="homogeneous_cores + " value="1"/> ). Thus, the XML file only has one instantiation to represent + all others with the same architectural parameters. The corresponding homo + flags must be set in the XML file. Then, the stats in the XML should be + the aggregated stats of the sum of all instantiations (e.g. aggregated stats + of all cores). In the final results, McPAT will only report a single + instantiation of each type of component, and the reported runtime dynamic power + is the sum of all instantiations of the same type. This approach can run fast + and use much less memory. + +5. Guide for integrating McPAT into performance simulators and bypassing the XML interface + The detailed work flow of McPAT has two phases: the initialization phase and + the computation phase. Specifically, in order to start the initialization phase a + user specifies static configurations, including parameters at all three levels, + namely, architectural, circuit, and technology levels. During the initialization + phase, McPAT will generate the internal chip representation using the configurations + set by the user. + The computation phase of McPAT is called by McPAT or the performance simulator + during simulation to generate runtime power numbers. Before calling McPAT to + compute runtime power numbers, the performance simulator needs to pass the + statistics, namely, the activity factors of each individual components to McPAT + via the XML interface. 
+ The initialization phase is very time-consuming, since it will repeat many + times until valid configurations are found or the possible configurations are + exhausted. To reduce the overhead, a user can let the simulator call McPAT + directly for the computation phase and only call the initialization phase once at the + beginning of simulation. In this case, the XML interface file is bypassed; + please refer to processor.cc to see how the two phases are called. + +6. Sample input files: + This package provides sample XML files for validating target processors. Please find the + enclosed Niagara1.xml (for the Sun Niagara1 processor), Niagara2.xml (for the Sun Niagara2 + processor), Alpha21364.xml (for the Alpha21364 processor), and Xeon.xml (for the Intel + Xeon Tulsa processor). + + Special instructions for using Xeon.xml: + McPAT uses ITRS device types including HP, LSTP, and LOP. Although most + designs follow ITRS projections, there are designs with special technologies. + For example, the 65nm Xeon Tulsa processor uses 1.25 V rather than 1.1V + for the core voltage domain, which results in the changes in threshold voltage, + leakage current density, saturation current, etc., besides the different + supply voltage. We use MASTAR to match the special technology as used in Xeon + core domain. Therefore, in order to generate accurate results of Xeon + Tulsa cores, users need to run make TAR=mcpatXeonCore and use the generated + special executable. The L3 cache and buses must be computed using standard + ITRS technology. + + +==================== +McPAT is in its beginning stage. We are still improving +the tool and refining the code. Please come back to its website +for newer versions. If you have any comments, +questions, or suggestions, please write to us. + +Version history and roadmap + +McPAT Alpha: released Sep. 2009 Experimental release +McPAT Beta (0.6): released Nov. 2009 New code base and technology base +McPAT Beta (0.7): released May. 
2010 Added various new models, + including long channel devices, buses model; together + with bug fixes and extensive code optimization to reduce + memory usage. +McPAT Beta (0.8): released Aug. 2010 Added various new models, + including on-chip 10Gb ethernet units, PCIe, and flash controllers. +Next major release: +McPAT 1.0: including advance power-saving states + +Future releases may include the modeling of embedded low-power +processors as well as vector processors and GPGPUs. + + +Sheng Li +sheng.li@hp.com + + + + diff --git a/ext/mcpat/XML_Parse.cc b/ext/mcpat/XML_Parse.cc new file mode 100644 index 000000000..ae3ee6f17 --- /dev/null +++ b/ext/mcpat/XML_Parse.cc @@ -0,0 +1,1798 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#include <cstdio> +#include <string> + +#include "XML_Parse.h" +#include "xmlParser.h" + +using namespace std; + +void ParseXML::parse(char* filepath) +{ + unsigned int i,j,k,m,n; + unsigned int NumofCom_4; + unsigned int itmp; + //Initialize all structures + ParseXML::initialize(); + + // this open and parse the XML file: + XMLNode xMainNode=XMLNode::openFileHelper(filepath,"component"); //the 'component' in the first layer + + XMLNode xNode2=xMainNode.getChildNode("component"); // the 'component' in the second layer + //get all params in the second layer + itmp=xNode2.nChildNode("param"); + for(i=0; i<itmp; i++) + { + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_cores")==0) {sys.number_of_cores=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L1Directories")==0) {sys.number_of_L1Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L2Directories")==0) {sys.number_of_L2Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L2s")==0) 
{sys.number_of_L2s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Private_L2")==0) {sys.Private_L2=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_L3s")==0) {sys.number_of_L3s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_NoCs")==0) {sys.number_of_NoCs=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_of_dir_levels")==0) {sys.number_of_dir_levels=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"domain_size")==0) {sys.domain_size=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"first_level_dir")==0) {sys.first_level_dir=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_cores")==0) {sys.homogeneous_cores=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"core_tech_node")==0) {sys.core_tech_node=atof(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"target_core_clockrate")==0) {sys.target_core_clockrate=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"target_chip_area")==0) {sys.target_chip_area=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"temperature")==0) 
{sys.temperature=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"number_cache_levels")==0) {sys.number_cache_levels=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"L1_property")==0) {sys.L1_property =atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"L2_property")==0) {sys.L2_property =atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L2s")==0) {sys.homogeneous_L2s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L1Directories")==0) {sys.homogeneous_L1Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L2Directories")==0) {sys.homogeneous_L2Directories=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"L3_property")==0) {sys.L3_property =atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_L3s")==0) {sys.homogeneous_L3s=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_ccs")==0) {sys.homogeneous_ccs=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"homogeneous_NoCs")==0) {sys.homogeneous_NoCs=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Max_area_deviation")==0) 
{sys.Max_area_deviation=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Max_power_deviation")==0) {sys.Max_power_deviation=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"device_type")==0) {sys.device_type=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"longer_channel_device")==0) {sys.longer_channel_device=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_dynamic_power")==0) {sys.opt_dynamic_power=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_lakage_power")==0) {sys.opt_lakage_power=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_clockrate")==0) {sys.opt_clockrate=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"opt_area")==0) {sys.opt_area=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"Embedded")==0) {sys.Embedded=(bool)atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"interconnect_projection_type")==0) {sys.interconnect_projection_type=atoi(xNode2.getChildNode("param",i).getAttribute("value"))==0?0:1;continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"machine_bits")==0) {sys.machine_bits=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if 
(strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"virtual_address_width")==0) {sys.virtual_address_width=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"physical_address_width")==0) {sys.physical_address_width=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"virtual_memory_page_size")==0) {sys.virtual_memory_page_size=atoi(xNode2.getChildNode("param",i).getAttribute("value"));continue;} + } + +// if (sys.Private_L2 && sys.number_of_cores!=sys.number_of_L2s) +// { +// cout<<"Private L2: Number of L2s must equal to Number of Cores"<<endl; +// exit(0); +// } + + itmp=xNode2.nChildNode("stat"); + for(i=0; i<itmp; i++) + { + if (strcmp(xNode2.getChildNode("stat",i).getAttribute("name"),"total_cycles")==0) {sys.total_cycles=atof(xNode2.getChildNode("stat",i).getAttribute("value"));continue;} + + } + + //get the number of components within the second layer + unsigned int NumofCom_3=xNode2.nChildNode("component"); + XMLNode xNode3,xNode4; //define the third-layer(system.core0) and fourth-layer(system.core0.predictor) xnodes + + string strtmp; + char chtmp[60]; + char chtmp1[60]; + chtmp1[0]='\0'; + unsigned int OrderofComponents_3layer=0; + if (NumofCom_3>OrderofComponents_3layer) + { + //___________________________get all system.core0-n________________________________________________ + if (sys.homogeneous_cores==1) OrderofComponents_3layer=0; + else OrderofComponents_3layer=sys.number_of_cores-1; + for (i=0; i<=OrderofComponents_3layer; i++) + { + xNode3=xNode2.getChildNode("component",i); + if (xNode3.isEmpty()==1) { + printf("The value of homogeneous_cores or number_of_cores is not correct!"); + exit(0); + } + else{ + if (strstr(xNode3.getAttribute("name"),"core")!=NULL) + { + { //For cpu0-cpui + //Get all params with system.core? 
+ itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clock_rate")==0) {sys.core[i].clock_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"opt_local")==0) {sys.core[i].opt_local=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"x86")==0) {sys.core[i].x86=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"machine_bits")==0) {sys.core[i].machine_bits=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"virtual_address_width")==0) {sys.core[i].virtual_address_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"physical_address_width")==0) {sys.core[i].physical_address_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_length")==0) {sys.core[i].instruction_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"opcode_width")==0) {sys.core[i].opcode_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"micro_opcode_width")==0) {sys.core[i].micro_opcode_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"machine_type")==0) {sys.core[i].machine_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"internal_datapath_width")==0) {sys.core[i].internal_datapath_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_hardware_threads")==0) {sys.core[i].number_hardware_threads=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"fetch_width")==0) {sys.core[i].fetch_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_instruction_fetch_ports")==0) {sys.core[i].number_instruction_fetch_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"decode_width")==0) {sys.core[i].decode_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"issue_width")==0) {sys.core[i].issue_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_issue_width")==0) {sys.core[i].peak_issue_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"commit_width")==0) {sys.core[i].commit_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"fp_issue_width")==0) {sys.core[i].fp_issue_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"prediction_width")==0) {sys.core[i].prediction_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"pipelines_per_core")==0) + { + 
strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].pipelines_per_core[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].pipelines_per_core[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"pipeline_depth")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].pipeline_depth[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].pipeline_depth[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"FPU")==0) {strcpy(sys.core[i].FPU,xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"divider_multiplier")==0) {strcpy(sys.core[i].divider_multiplier,xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ALU_per_core")==0) {sys.core[i].ALU_per_core=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"FPU_per_core")==0) {sys.core[i].FPU_per_core=atof(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"MUL_per_core")==0) {sys.core[i].MUL_per_core=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_buffer_size")==0) {sys.core[i].instruction_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"decoded_stream_buffer_size")==0) {sys.core[i].decoded_stream_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_window_scheme")==0) {sys.core[i].instruction_window_scheme =atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"instruction_window_size")==0) {sys.core[i].instruction_window_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"fp_instruction_window_size")==0) {sys.core[i].fp_instruction_window_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ROB_size")==0) {sys.core[i].ROB_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"archi_Regs_IRF_size")==0) {sys.core[i].archi_Regs_IRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"archi_Regs_FRF_size")==0) {sys.core[i].archi_Regs_FRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"phy_Regs_IRF_size")==0) {sys.core[i].phy_Regs_IRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"phy_Regs_FRF_size")==0) {sys.core[i].phy_Regs_FRF_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"rename_scheme")==0) {sys.core[i].rename_scheme=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"register_windows_size")==0) {sys.core[i].register_windows_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LSU_order")==0) {strcpy(sys.core[i].LSU_order,xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"store_buffer_size")==0) {sys.core[i].store_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"load_buffer_size")==0) {sys.core[i].load_buffer_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_ports")==0) {sys.core[i].memory_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Dcache_dual_pump")==0) {strcpy(sys.core[i].Dcache_dual_pump,xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"RAS_size")==0) {sys.core[i].RAS_size=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + } + //Get all stats with system.core? 
+ itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_instructions")==0) {sys.core[i].total_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"int_instructions")==0) {sys.core[i].int_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_instructions")==0) {sys.core[i].fp_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"branch_instructions")==0) {sys.core[i].branch_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"branch_mispredictions")==0) {sys.core[i].branch_mispredictions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"committed_instructions")==0) {sys.core[i].committed_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"committed_int_instructions")==0) {sys.core[i].committed_int_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"committed_fp_instructions")==0) {sys.core[i].committed_fp_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_instructions")==0) {sys.core[i].load_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_instructions")==0) 
{sys.core[i].store_instructions=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_cycles")==0) {sys.core[i].total_cycles=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"idle_cycles")==0) {sys.core[i].idle_cycles=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"busy_cycles")==0) {sys.core[i].busy_cycles=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"instruction_buffer_reads")==0) {sys.core[i].instruction_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"instruction_buffer_write")==0) {sys.core[i].instruction_buffer_write=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ROB_reads")==0) {sys.core[i].ROB_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ROB_writes")==0) {sys.core[i].ROB_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"rename_reads")==0) {sys.core[i].rename_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"rename_writes")==0) {sys.core[i].rename_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_rename_reads")==0) {sys.core[i].fp_rename_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_rename_writes")==0) {sys.core[i].fp_rename_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_reads")==0) {sys.core[i].inst_window_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_writes")==0) {sys.core[i].inst_window_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_wakeup_accesses")==0) {sys.core[i].inst_window_wakeup_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"inst_window_selections")==0) {sys.core[i].inst_window_selections=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_inst_window_reads")==0) {sys.core[i].fp_inst_window_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_inst_window_writes")==0) {sys.core[i].fp_inst_window_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fp_inst_window_wakeup_accesses")==0) {sys.core[i].fp_inst_window_wakeup_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_int_regfile_reads")==0) {sys.core[i].archi_int_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_float_regfile_reads")==0) {sys.core[i].archi_float_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_int_regfile_reads")==0) {sys.core[i].phy_int_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_float_regfile_reads")==0) {sys.core[i].phy_float_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_int_regfile_writes")==0) {sys.core[i].archi_int_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"phy_float_regfile_writes")==0) {sys.core[i].archi_float_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_int_regfile_writes")==0) {sys.core[i].phy_int_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"archi_float_regfile_writes")==0) {sys.core[i].phy_float_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"int_regfile_reads")==0) {sys.core[i].int_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"float_regfile_reads")==0) {sys.core[i].float_regfile_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"int_regfile_writes")==0) {sys.core[i].int_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"float_regfile_writes")==0) {sys.core[i].float_regfile_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"windowed_reg_accesses")==0) {sys.core[i].windowed_reg_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"windowed_reg_transports")==0) {sys.core[i].windowed_reg_transports=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"function_calls")==0) {sys.core[i].function_calls=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"context_switches")==0) {sys.core[i].context_switches=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ialu_accesses")==0) {sys.core[i].ialu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fpu_accesses")==0) {sys.core[i].fpu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"mul_accesses")==0) {sys.core[i].mul_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"cdb_alu_accesses")==0) {sys.core[i].cdb_alu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"cdb_mul_accesses")==0) {sys.core[i].cdb_mul_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"cdb_fpu_accesses")==0) {sys.core[i].cdb_fpu_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_buffer_reads")==0) 
{sys.core[i].load_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_buffer_writes")==0) {sys.core[i].load_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"load_buffer_cams")==0) {sys.core[i].load_buffer_cams=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_reads")==0) {sys.core[i].store_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_writes")==0) {sys.core[i].store_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_cams")==0) {sys.core[i].store_buffer_cams=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"store_buffer_forwards")==0) {sys.core[i].store_buffer_forwards=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"main_memory_access")==0) {sys.core[i].main_memory_access=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"main_memory_read")==0) {sys.core[i].main_memory_read=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"main_memory_write")==0) {sys.core[i].main_memory_write=atoi(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"pipeline_duty_cycle")==0) 
{sys.core[i].pipeline_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"IFU_duty_cycle")==0) {sys.core[i].IFU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"BR_duty_cycle")==0) {sys.core[i].BR_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"LSU_duty_cycle")==0) {sys.core[i].LSU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MemManU_I_duty_cycle")==0) {sys.core[i].MemManU_I_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MemManU_D_duty_cycle")==0) {sys.core[i].MemManU_D_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ALU_duty_cycle")==0) {sys.core[i].ALU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MUL_duty_cycle")==0) {sys.core[i].MUL_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"FPU_duty_cycle")==0) {sys.core[i].FPU_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"ALU_cdb_duty_cycle")==0) {sys.core[i].ALU_cdb_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"MUL_cdb_duty_cycle")==0) {sys.core[i].MUL_cdb_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"FPU_cdb_duty_cycle")==0) {sys.core[i].FPU_cdb_duty_cycle=atoi(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + + NumofCom_4=xNode3.nChildNode("component"); //get the number of components within the third layer + for(j=0; j<NumofCom_4; j++) + { + xNode4=xNode3.getChildNode("component",j); + if (strcmp(xNode4.getAttribute("name"),"PBT")==0) + { //find PBT + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.core0.predictor--PBT + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"prediction_width")==0) {sys.core[i].predictor.prediction_width=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"prediction_scheme")==0) {strcpy(sys.core[i].predictor.prediction_scheme,xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"predictor_size")==0) {sys.core[i].predictor.predictor_size=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"predictor_entries")==0) {sys.core[i].predictor.predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"local_predictor_size")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].predictor.local_predictor_size[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].predictor.local_predictor_size[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"local_predictor_entries")==0) 
{sys.core[i].predictor.local_predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"global_predictor_entries")==0) {sys.core[i].predictor.global_predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"global_predictor_bits")==0) {sys.core[i].predictor.global_predictor_bits=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"chooser_predictor_entries")==0) {sys.core[i].predictor.chooser_predictor_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"chooser_predictor_bits")==0) {sys.core[i].predictor.chooser_predictor_bits=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.core0.predictor--PBT + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"predictor_accesses")==0) sys.core[i].predictor.predictor_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value")); + } + } + if (strcmp(xNode4.getAttribute("name"),"itlb")==0) + {//find system.core0.itlb + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.core0.itlb--itlb + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_entries")==0) sys.core[i].itlb.number_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value")); + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in itlb + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].itlb.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].itlb.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].itlb.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].itlb.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + if (strcmp(xNode4.getAttribute("name"),"icache")==0) + {//find system.core0.icache + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.core0.icache--icache + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"icache_config")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].icache.icache_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].icache.icache_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].icache.buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].icache.buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].icache.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].icache.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].icache.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.core[i].icache.replacements=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].icache.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].icache.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].icache.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"miss_buffer_access")==0) {sys.core[i].icache.miss_buffer_access=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.core[i].icache.fill_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.core[i].icache.prefetch_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.core[i].icache.prefetch_buffer_writes=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.core[i].icache.prefetch_buffer_reads=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.core[i].icache.prefetch_buffer_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].icache.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + if (strcmp(xNode4.getAttribute("name"),"dtlb")==0) + {//find system.core0.dtlb + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.core0.dtlb--dtlb + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_entries")==0) sys.core[i].dtlb.number_entries=atoi(xNode4.getChildNode("param",k).getAttribute("value")); + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in dtlb + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].dtlb.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].dtlb.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.core[i].dtlb.write_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].dtlb.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.core[i].dtlb.write_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].dtlb.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.core[i].dtlb.write_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].dtlb.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].dtlb.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.core[i].dtlb.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + + } + } + if (strcmp(xNode4.getAttribute("name"),"dcache")==0) + {//find system.core0.dcache + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.core0.dcache--dcache + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"dcache_config")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].dcache.dcache_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].dcache.dcache_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].dcache.buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + 
sys.core[i].dcache.buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in dcache + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].dcache.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].dcache.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.core[i].dcache.write_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].dcache.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].dcache.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].dcache.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.core[i].dcache.write_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].dcache.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.core[i].dcache.write_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"replacements")==0) 
{sys.core[i].dcache.replacements=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_backs")==0) {sys.core[i].dcache.write_backs=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"miss_buffer_access")==0) {sys.core[i].dcache.miss_buffer_access=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.core[i].dcache.fill_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.core[i].dcache.prefetch_buffer_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.core[i].dcache.prefetch_buffer_writes=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.core[i].dcache.prefetch_buffer_reads=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.core[i].dcache.prefetch_buffer_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"wbb_writes")==0) {sys.core[i].dcache.wbb_writes=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"wbb_reads")==0) {sys.core[i].dcache.wbb_reads=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) 
{sys.core[i].dcache.conflicts=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + + } + } + if (strcmp(xNode4.getAttribute("name"),"BTB")==0) + {//find system.core0.BTB + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.core0.BTB--BTB + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"BTB_config")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.core[i].BTB.BTB_config[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.core[i].BTB.BTB_config[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in BTB + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.core[i].BTB.total_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.core[i].BTB.read_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.core[i].BTB.write_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.core[i].BTB.total_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.core[i].BTB.total_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.core[i].BTB.read_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.core[i].BTB.write_hits=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.core[i].BTB.read_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.core[i].BTB.write_misses=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.core[i].BTB.replacements=atof(xNode4.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + } + } + else { + printf("The value of homogeneous_cores or number_of_cores is not correct!"); + exit(0); + } + } + } + + //__________________________________________Get system.L1Directory0-n____________________________________________ + int w,tmpOrderofComponents_3layer; + w=OrderofComponents_3layer+1; + tmpOrderofComponents_3layer=OrderofComponents_3layer; + if (sys.homogeneous_L1Directories==1) OrderofComponents_3layer=OrderofComponents_3layer+1; + else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L1Directories; + + for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++) + { + xNode3=xNode2.getChildNode("component",w); + if (xNode3.isEmpty()==1) { + printf("The value of homogeneous_L1Directories or number_of_L1Directories is not correct!"); + exit(0); + } + else + { + if (strstr(xNode3.getAttribute("id"),"L1Directory")!=NULL) + { + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.L1Directory + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Dir_config")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + 
strcat(chtmp1,chtmp); + } + else{ + sys.L1Directory[i].Dir_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L1Directory[i].Dir_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L1Directory[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L1Directory[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L1Directory[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L1Directory[i].ports[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L1Directory[i].ports[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L1Directory[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Directory_type")==0) {sys.L1Directory[i].Directory_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"3D_stack")==0) {strcpy(sys.L1Directory[i].threeD_stack,xNode3.getChildNode("param",k).getAttribute("value"));continue;} + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in 
system.L2directorydirectory + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L1Directory[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L1Directory[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L1Directory[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L1Directory[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L1Directory[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L1Directory[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L1Directory[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + } + w=w+1; + } + else { + printf("The value of homogeneous_L1Directories or number_of_L1Directories is not correct!"); + exit(0); + } + } + } + + //__________________________________________Get system.L2Directory0-n____________________________________________ + w=OrderofComponents_3layer+1; + tmpOrderofComponents_3layer=OrderofComponents_3layer; + if (sys.homogeneous_L2Directories==1) OrderofComponents_3layer=OrderofComponents_3layer+1; + else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L2Directories; + + for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++) + { + xNode3=xNode2.getChildNode("component",w); + if 
(xNode3.isEmpty()==1) { + printf("The value of homogeneous_L2Directories or number_of_L2Directories is not correct!"); + exit(0); + } + else + { + if (strstr(xNode3.getAttribute("id"),"L2Directory")!=NULL) + { + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.L2Directory + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Dir_config")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L2Directory[i].Dir_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L2Directory[i].Dir_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L2Directory[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L2Directory[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L2Directory[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Directory_type")==0) {sys.L2Directory[i].Directory_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L2Directory[i].ports[m]=atoi(chtmp1); + m++; + 
chtmp1[0]='\0'; + } + } + sys.L2Directory[i].ports[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L2Directory[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"3D_stack")==0) {strcpy(sys.L2Directory[i].threeD_stack,xNode3.getChildNode("param",k).getAttribute("value"));continue;} + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.L2directorydirectory + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L2Directory[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L2Directory[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L2Directory[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L2Directory[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L2Directory[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L2Directory[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L2Directory[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + } + w=w+1; + } + else { + printf("The value of homogeneous_L2Directories or 
number_of_L2Directories is not correct!"); + exit(0); + } + } + } + + //__________________________________________Get system.L2[0..n]____________________________________________ + w=OrderofComponents_3layer+1; + tmpOrderofComponents_3layer=OrderofComponents_3layer; + if (sys.homogeneous_L2s==1) OrderofComponents_3layer=OrderofComponents_3layer+1; + else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L2s; + + for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++) + { + xNode3=xNode2.getChildNode("component",w); + if (xNode3.isEmpty()==1) { + printf("The value of homogeneous_L2s or number_of_L2s is not correct!"); + exit(0); + } + else + { + if (strstr(xNode3.getAttribute("name"),"L2")!=NULL) + { + { //For L20-L2i + //Get all params with system.L2? + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"L2_config")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L2[i].L2_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L2[i].L2_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L2[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"merged_dir")==0) {sys.L2[i].merged_dir=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L2[i].ports[m]=atoi(chtmp1); + m++; + 
chtmp1[0]='\0'; + } + } + sys.L2[i].ports[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L2[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"threeD_stack")==0) {strcpy(sys.L2[i].threeD_stack,(xNode3.getChildNode("param",k).getAttribute("value")));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L2[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L2[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + } + //Get all stats with system.L2? + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L2[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L2[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L2[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.L2[i].total_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.L2[i].total_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) 
{sys.L2[i].read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.L2[i].write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L2[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L2[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"replacements")==0) {sys.L2[i].replacements=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_backs")==0) {sys.L2[i].write_backs=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"miss_buffer_accesses")==0) {sys.L2[i].miss_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.L2[i].fill_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.L2[i].prefetch_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.L2[i].prefetch_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.L2[i].prefetch_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.L2[i].prefetch_buffer_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_writes")==0) {sys.L2[i].wbb_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_reads")==0) {sys.L2[i].wbb_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L2[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L2[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L2[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L2[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_hits")==0) {sys.L2[i].homenode_read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_hits")==0) {sys.L2[i].homenode_write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_misses")==0) {sys.L2[i].homenode_read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_misses")==0) 
{sys.L2[i].homenode_write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"dir_duty_cycle")==0) {sys.L2[i].dir_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + } + } + w=w+1; + } + else { + printf("The value of homogeneous_L2s or number_of_L2s is not correct!"); + exit(0); + } + } + } + //__________________________________________Get system.L3[0..n]____________________________________________ + w=OrderofComponents_3layer+1; + tmpOrderofComponents_3layer=OrderofComponents_3layer; + if (sys.homogeneous_L3s==1) OrderofComponents_3layer=OrderofComponents_3layer+1; + else OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_L3s; + + for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++) + { + xNode3=xNode2.getChildNode("component",w); + if (xNode3.isEmpty()==1) { + printf("The value of homogeneous_L3s or number_of_L3s is not correct!"); + exit(0); + } + else + { + if (strstr(xNode3.getAttribute("name"),"L3")!=NULL) + { + { //For L30-L3i + //Get all params with system.L3? 
+ itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"L3_config")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L3[i].L3_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L3[i].L3_config[m]=atof(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.L3[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"merged_dir")==0) {sys.L3[i].merged_dir=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.L3[i].ports[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L3[i].ports[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_type")==0) {sys.L3[i].device_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"threeD_stack")==0) {strcpy(sys.L3[i].threeD_stack,(xNode3.getChildNode("param",k).getAttribute("value")));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"buffer_sizes")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + 
sys.L3[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.L3[i].buffer_sizes[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + } + //Get all stats with system.L3? + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) {sys.L3[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_accesses")==0) {sys.L3[i].read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_accesses")==0) {sys.L3[i].write_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_hits")==0) {sys.L3[i].total_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_misses")==0) {sys.L3[i].total_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_hits")==0) {sys.L3[i].read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_hits")==0) {sys.L3[i].write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"read_misses")==0) {sys.L3[i].read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_misses")==0) {sys.L3[i].write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"replacements")==0) 
{sys.L3[i].replacements=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"write_backs")==0) {sys.L3[i].write_backs=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"miss_buffer_accesses")==0) {sys.L3[i].miss_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"fill_buffer_accesses")==0) {sys.L3[i].fill_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_accesses")==0) {sys.L3[i].prefetch_buffer_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_writes")==0) {sys.L3[i].prefetch_buffer_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_reads")==0) {sys.L3[i].prefetch_buffer_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"prefetch_buffer_hits")==0) {sys.L3[i].prefetch_buffer_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_writes")==0) {sys.L3[i].wbb_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"wbb_reads")==0) {sys.L3[i].wbb_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"conflicts")==0) {sys.L3[i].conflicts=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.L3[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L3[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_accesses")==0) {sys.L3[i].homenode_read_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_hits")==0) {sys.L3[i].homenode_read_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_hits")==0) {sys.L3[i].homenode_write_hits=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_read_misses")==0) {sys.L3[i].homenode_read_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"homenode_write_misses")==0) {sys.L3[i].homenode_write_misses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"dir_duty_cycle")==0) {sys.L3[i].dir_duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + } + } + w=w+1; + } + else { + printf("The value of homogeneous_L3s or number_of_L3s is not correct!"); + exit(0); + } + } + } + //__________________________________________Get system.NoC[0..n]____________________________________________ + w=OrderofComponents_3layer+1; + tmpOrderofComponents_3layer=OrderofComponents_3layer; + if (sys.homogeneous_NoCs==1) OrderofComponents_3layer=OrderofComponents_3layer+1; + else 
OrderofComponents_3layer=OrderofComponents_3layer+sys.number_of_NoCs; + + for (i=0; i<(OrderofComponents_3layer-tmpOrderofComponents_3layer); i++) + { + xNode3=xNode2.getChildNode("component",w); + if (xNode3.isEmpty()==1) { + printf("The value of homogeneous_NoCs or number_of_NoCs is not correct!"); + exit(0); + } + else + { + if (strstr(xNode3.getAttribute("name"),"noc")!=NULL) + { + { //For NoC0-NoCi + //Get all params with system.NoC? + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.NoC[i].clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.NoC[i].type=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"topology")==0) {strcpy(sys.NoC[i].topology,(xNode3.getChildNode("param",k).getAttribute("value")));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"horizontal_nodes")==0) {sys.NoC[i].horizontal_nodes=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"vertical_nodes")==0) {sys.NoC[i].vertical_nodes=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"has_global_link")==0) {sys.NoC[i].has_global_link=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"link_throughput")==0) {sys.NoC[i].link_throughput=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"link_latency")==0) {sys.NoC[i].link_latency=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"input_ports")==0) {sys.NoC[i].input_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"output_ports")==0) {sys.NoC[i].output_ports=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"virtual_channel_per_port")==0) {sys.NoC[i].virtual_channel_per_port=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"flit_bits")==0) {sys.NoC[i].flit_bits=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"input_buffer_entries_per_vc")==0) {sys.NoC[i].input_buffer_entries_per_vc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"dual_pump")==0) {sys.NoC[i].dual_pump=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"chip_coverage")==0) {sys.NoC[i].chip_coverage=atof(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"link_routing_over_percentage")==0) {sys.NoC[i].route_over_perc=atof(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"ports_of_input_buffer")==0) + { + strtmp.assign(xNode3.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.NoC[i].ports_of_input_buffer[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.NoC[i].ports_of_input_buffer[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + continue; + } + if 
(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_of_crossbars")==0) {sys.NoC[i].number_of_crossbars=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"crossbar_type")==0) {strcpy(sys.NoC[i].crossbar_type,(xNode3.getChildNode("param",k).getAttribute("value")));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"crosspoint_type")==0) {strcpy(sys.NoC[i].crosspoint_type,(xNode3.getChildNode("param",k).getAttribute("value")));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"arbiter_type")==0) {sys.NoC[i].arbiter_type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + } + NumofCom_4=xNode3.nChildNode("component"); //get the number of components within the third layer + for(j=0; j<NumofCom_4; j++) + { + xNode4=xNode3.getChildNode("component",j); + if (strcmp(xNode4.getAttribute("name"),"xbar0")==0) + { //find PBT + itmp=xNode4.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.XoC0.xbar0--xbar0 + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_of_inputs_of_crossbars")==0) {sys.NoC[i].xbar0.number_of_inputs_of_crossbars=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"number_of_outputs_of_crossbars")==0) {sys.NoC[i].xbar0.number_of_outputs_of_crossbars=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"flit_bits")==0) {sys.NoC[i].xbar0.flit_bits=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"input_buffer_entries_per_port")==0) {sys.NoC[i].xbar0.input_buffer_entries_per_port=atoi(xNode4.getChildNode("param",k).getAttribute("value"));continue;} + if 
(strcmp(xNode4.getChildNode("param",k).getAttribute("name"),"ports_of_input_buffer")==0) + { + strtmp.assign(xNode4.getChildNode("param",k).getAttribute("value")); + m=0; + for(n=0; n<strtmp.length(); n++) + { + if (strtmp[n]!=',') + { + sprintf(chtmp,"%c",strtmp[n]); + strcat(chtmp1,chtmp); + } + else{ + sys.NoC[i].xbar0.ports_of_input_buffer[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + sys.NoC[i].xbar0.ports_of_input_buffer[m]=atoi(chtmp1); + m++; + chtmp1[0]='\0'; + } + } + itmp=xNode4.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.core0.predictor--PBT + if (strcmp(xNode4.getChildNode("stat",k).getAttribute("name"),"predictor_accesses")==0) sys.core[i].predictor.predictor_accesses=atof(xNode4.getChildNode("stat",k).getAttribute("value")); + } + } + } + //Get all stats with system.NoC? + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_accesses")==0) sys.NoC[i].total_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value")); + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) sys.NoC[i].duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value")); + } + } + w=w+1; + } + } + } + //__________________________________________Get system.mem____________________________________________ + if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1; + xNode3=xNode2.getChildNode("component",OrderofComponents_3layer); + if (xNode3.isEmpty()==1) { + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + if (strstr(xNode3.getAttribute("id"),"system.mem")!=NULL) + { + + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.mem + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"mem_tech_node")==0) 
{sys.mem.mem_tech_node=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"device_clock")==0) {sys.mem.device_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_transfer_rate")==0) {sys.mem.peak_transfer_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"capacity_per_channel")==0) {sys.mem.capacity_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0) {sys.mem.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"num_banks_of_DRAM_chip")==0) {sys.mem.num_banks_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"Block_width_of_DRAM_chip")==0) {sys.mem.Block_width_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"output_width_of_DRAM_chip")==0) {sys.mem.output_width_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"page_size_of_DRAM_chip")==0) {sys.mem.page_size_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"burstlength_of_DRAM_chip")==0) {sys.mem.burstlength_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"internal_prefetch_of_DRAM_chip")==0) 
{sys.mem.internal_prefetch_of_DRAM_chip=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.mem + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0) {sys.mem.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0) {sys.mem.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0) {sys.mem.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + else{ + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + //__________________________________________Get system.mc____________________________________________ + if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1; + xNode3=xNode2.getChildNode("component",OrderofComponents_3layer); + if (xNode3.isEmpty()==1) { + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + if (strstr(xNode3.getAttribute("id"),"system.mc")!=NULL) + { + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.mem + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"mc_clock")==0) {sys.mc.mc_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"block_size")==0) {sys.mc.llc_line_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_mcs")==0) {sys.mc.number_mcs=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if 
(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_channels_per_mc")==0) {sys.mc.memory_channels_per_mc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"req_window_size_per_channel")==0) {sys.mc.req_window_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"IO_buffer_size_per_channel")==0) {sys.mc.IO_buffer_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"databus_width")==0) {sys.mc.databus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"addressbus_width")==0) {sys.mc.addressbus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_transfer_rate")==0) {sys.mc.peak_transfer_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0) {sys.mc.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LVDS")==0) {sys.mc.LVDS=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.mc.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"withPHY")==0) {sys.mc.withPHY=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.mendirectory + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0) {sys.mc.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0) {sys.mc.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0) {sys.mc.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + else{ + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + //__________________________________________Get system.niu____________________________________________ + if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1; + xNode3=xNode2.getChildNode("component",OrderofComponents_3layer); + if (xNode3.isEmpty()==1) { + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + if (strstr(xNode3.getAttribute("id"),"system.niu")!=NULL) + { + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.mem + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.niu.clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_units")==0) {sys.niu.number_units=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.niu.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.mendirectory + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) 
{sys.niu.duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_load_perc")==0) {sys.niu.total_load_perc=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + else{ + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + + //__________________________________________Get system.pcie____________________________________________ + if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1; + xNode3=xNode2.getChildNode("component",OrderofComponents_3layer); + if (xNode3.isEmpty()==1) { + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + if (strstr(xNode3.getAttribute("id"),"system.pcie")!=NULL) + { + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.mem + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"clockrate")==0) {sys.pcie.clockrate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_units")==0) {sys.pcie.number_units=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"num_channels")==0) {sys.pcie.num_channels=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.pcie.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"withPHY")==0) {sys.pcie.withPHY=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.mendirectory + if 
(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.pcie.duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_load_perc")==0) {sys.pcie.total_load_perc=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + } + } + else{ + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + //__________________________________________Get system.flashcontroller____________________________________________ + if (OrderofComponents_3layer>0) OrderofComponents_3layer=OrderofComponents_3layer+1; + xNode3=xNode2.getChildNode("component",OrderofComponents_3layer); + if (xNode3.isEmpty()==1) { + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + if (strstr(xNode3.getAttribute("id"),"system.flashc")!=NULL) + { + itmp=xNode3.nChildNode("param"); + for(k=0; k<itmp; k++) + { //get all items of param in system.mem +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"flashc_clock")==0) {sys.flashc.mc_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"block_size")==0) {sys.flashc.llc_line_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_flashcs")==0) {sys.flashc.number_mcs=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_channels_per_flashc")==0) {sys.flashc.memory_channels_per_mc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"req_window_size_per_channel")==0) 
{sys.flashc.req_window_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"IO_buffer_size_per_channel")==0) {sys.flashc.IO_buffer_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"databus_width")==0) {sys.flashc.databus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"addressbus_width")==0) {sys.flashc.addressbus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"peak_transfer_rate")==0) {sys.flashc.peak_transfer_rate=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0) {sys.flashc.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LVDS")==0) {sys.flashc.LVDS=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"type")==0) {sys.flashc.type=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"withPHY")==0) {sys.flashc.withPHY=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;} + + } + itmp=xNode3.nChildNode("stat"); + for(k=0; k<itmp; k++) + { //get all items of stat in system.mendirectory +// if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0) {sys.flashc.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0) 
{sys.flashc.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} +// if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0) {sys.flashc.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"duty_cycle")==0) {sys.flashc.duty_cycle=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + if (strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"total_load_perc")==0) {sys.flashc.total_load_perc=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;} + + } + } + else{ + printf("some value(s) of number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs is/are not correct!"); + exit(0); + } + + } +} +void ParseXML::initialize() //Initialize all +{ + //All number_of_* at the level of 'system' 03/21/2009 + sys.number_of_cores=1; + sys.number_of_L1Directories=1; + sys.number_of_L2Directories=1; + sys.number_of_L2s=1; + sys.Private_L2 = false; + sys.number_of_L3s=1; + sys.number_of_NoCs=1; + // All params at the level of 'system' + //strcpy(sys.homogeneous_cores,"default"); + sys.core_tech_node=1; + sys.target_core_clockrate=1; + sys.target_chip_area=1; + sys.temperature=1; + sys.number_cache_levels=1; + sys.homogeneous_cores=1; + sys.homogeneous_L1Directories=1; + sys.homogeneous_L2Directories=1; + sys.homogeneous_L2s=1; + sys.homogeneous_L3s=1; + sys.homogeneous_NoCs=1; + sys.homogeneous_ccs=1; + + sys.Max_area_deviation=1; + sys.Max_power_deviation=1; + sys.device_type=1; + sys.longer_channel_device =true; + sys.Embedded =false; + sys.opt_dynamic_power=false; + sys.opt_lakage_power=false; + sys.opt_clockrate=true; + sys.opt_area=false; + sys.interconnect_projection_type=1; + int i,j; + for (i=0; i<=63; i++) + { + sys.core[i].clock_rate=1; + sys.core[i].opt_local = true; + sys.core[i].x86 = false; + sys.core[i].machine_bits=1; + sys.core[i].virtual_address_width=1; + 
sys.core[i].physical_address_width=1; + sys.core[i].opcode_width=1; + sys.core[i].micro_opcode_width=1; + //strcpy(sys.core[i].machine_type,"default"); + sys.core[i].internal_datapath_width=1; + sys.core[i].number_hardware_threads=1; + sys.core[i].fetch_width=1; + sys.core[i].number_instruction_fetch_ports=1; + sys.core[i].decode_width=1; + sys.core[i].issue_width=1; + sys.core[i].peak_issue_width=1; + sys.core[i].commit_width=1; + for (j=0; j<20; j++) sys.core[i].pipelines_per_core[j]=1; + for (j=0; j<20; j++) sys.core[i].pipeline_depth[j]=1; + strcpy(sys.core[i].FPU,"default"); + strcpy(sys.core[i]. divider_multiplier,"default"); + sys.core[i].ALU_per_core=1; + sys.core[i].FPU_per_core=1.0; + sys.core[i].MUL_per_core=1; + sys.core[i].instruction_buffer_size=1; + sys.core[i].decoded_stream_buffer_size=1; + //strcpy(sys.core[i].instruction_window_scheme,"default"); + sys.core[i].instruction_window_size=1; + sys.core[i].ROB_size=1; + sys.core[i].archi_Regs_IRF_size=1; + sys.core[i].archi_Regs_FRF_size=1; + sys.core[i].phy_Regs_IRF_size=1; + sys.core[i].phy_Regs_FRF_size=1; + //strcpy(sys.core[i].rename_scheme,"default"); + sys.core[i].register_windows_size=1; + strcpy(sys.core[i].LSU_order,"default"); + sys.core[i].store_buffer_size=1; + sys.core[i].load_buffer_size=1; + sys.core[i].memory_ports=1; + strcpy(sys.core[i].Dcache_dual_pump,"default"); + sys.core[i].RAS_size=1; + //all stats at the level of system.core(0-n) + sys.core[i].total_instructions=1; + sys.core[i].int_instructions=1; + sys.core[i].fp_instructions=1; + sys.core[i].branch_instructions=1; + sys.core[i].branch_mispredictions=1; + sys.core[i].committed_instructions=1; + sys.core[i].load_instructions=1; + sys.core[i].store_instructions=1; + sys.core[i].total_cycles=1; + sys.core[i].idle_cycles=1; + sys.core[i].busy_cycles=1; + sys.core[i].instruction_buffer_reads=1; + sys.core[i].instruction_buffer_write=1; + sys.core[i].ROB_reads=1; + sys.core[i].ROB_writes=1; + sys.core[i].rename_accesses=1; + 
sys.core[i].inst_window_reads=1; + sys.core[i].inst_window_writes=1; + sys.core[i].inst_window_wakeup_accesses=1; + sys.core[i].inst_window_selections=1; + sys.core[i].archi_int_regfile_reads=1; + sys.core[i].archi_float_regfile_reads=1; + sys.core[i].phy_int_regfile_reads=1; + sys.core[i].phy_float_regfile_reads=1; + sys.core[i].windowed_reg_accesses=1; + sys.core[i].windowed_reg_transports=1; + sys.core[i].function_calls=1; + sys.core[i].ialu_accesses=1; + sys.core[i].fpu_accesses=1; + sys.core[i].mul_accesses=1; + sys.core[i].cdb_alu_accesses=1; + sys.core[i].cdb_mul_accesses=1; + sys.core[i].cdb_fpu_accesses=1; + sys.core[i].load_buffer_reads=1; + sys.core[i].load_buffer_writes=1; + sys.core[i].load_buffer_cams=1; + sys.core[i].store_buffer_reads=1; + sys.core[i].store_buffer_writes=1; + sys.core[i].store_buffer_cams=1; + sys.core[i].store_buffer_forwards=1; + sys.core[i].main_memory_access=1; + sys.core[i].main_memory_read=1; + sys.core[i].main_memory_write=1; + sys.core[i].IFU_duty_cycle = 1; + sys.core[i].BR_duty_cycle = 1; + sys.core[i].LSU_duty_cycle = 1; + sys.core[i].MemManU_I_duty_cycle =1; + sys.core[i].MemManU_D_duty_cycle =1; + sys.core[i].ALU_duty_cycle =1; + sys.core[i].MUL_duty_cycle =1; + sys.core[i].FPU_duty_cycle =1; + sys.core[i].ALU_cdb_duty_cycle =1; + sys.core[i].MUL_cdb_duty_cycle =1; + sys.core[i].FPU_cdb_duty_cycle =1; + //system.core?.predictor + sys.core[i].predictor.prediction_width=1; + strcpy(sys.core[i].predictor.prediction_scheme,"default"); + sys.core[i].predictor.predictor_size=1; + sys.core[i].predictor.predictor_entries=1; + sys.core[i].predictor.local_predictor_entries=1; + for (j=0; j<20; j++) sys.core[i].predictor.local_predictor_size[j]=1; + sys.core[i].predictor.global_predictor_entries=1; + sys.core[i].predictor.global_predictor_bits=1; + sys.core[i].predictor.chooser_predictor_entries=1; + sys.core[i].predictor.chooser_predictor_bits=1; + sys.core[i].predictor.predictor_accesses=1; + //system.core?.itlb + 
sys.core[i].itlb.number_entries=1; + sys.core[i].itlb.total_hits=1; + sys.core[i].itlb.total_accesses=1; + sys.core[i].itlb.total_misses=1; + //system.core?.icache + for (j=0; j<20; j++) sys.core[i].icache.icache_config[j]=1; + //strcpy(sys.core[i].icache.buffer_sizes,"default"); + sys.core[i].icache.total_accesses=1; + sys.core[i].icache.read_accesses=1; + sys.core[i].icache.read_misses=1; + sys.core[i].icache.replacements=1; + sys.core[i].icache.read_hits=1; + sys.core[i].icache.total_hits=1; + sys.core[i].icache.total_misses=1; + sys.core[i].icache.miss_buffer_access=1; + sys.core[i].icache.fill_buffer_accesses=1; + sys.core[i].icache.prefetch_buffer_accesses=1; + sys.core[i].icache.prefetch_buffer_writes=1; + sys.core[i].icache.prefetch_buffer_reads=1; + sys.core[i].icache.prefetch_buffer_hits=1; + //system.core?.dtlb + sys.core[i].dtlb.number_entries=1; + sys.core[i].dtlb.total_accesses=1; + sys.core[i].dtlb.read_accesses=1; + sys.core[i].dtlb.write_accesses=1; + sys.core[i].dtlb.write_hits=1; + sys.core[i].dtlb.read_hits=1; + sys.core[i].dtlb.read_misses=1; + sys.core[i].dtlb.write_misses=1; + sys.core[i].dtlb.total_hits=1; + sys.core[i].dtlb.total_misses=1; + //system.core?.dcache + for (j=0; j<20; j++) sys.core[i].dcache.dcache_config[j]=1; + //strcpy(sys.core[i].dcache.buffer_sizes,"default"); + sys.core[i].dcache.total_accesses=1; + sys.core[i].dcache.read_accesses=1; + sys.core[i].dcache.write_accesses=1; + sys.core[i].dcache.total_hits=1; + sys.core[i].dcache.total_misses=1; + sys.core[i].dcache.read_hits=1; + sys.core[i].dcache.write_hits=1; + sys.core[i].dcache.read_misses=1; + sys.core[i].dcache.write_misses=1; + sys.core[i].dcache.replacements=1; + sys.core[i].dcache.write_backs=1; + sys.core[i].dcache.miss_buffer_access=1; + sys.core[i].dcache.fill_buffer_accesses=1; + sys.core[i].dcache.prefetch_buffer_accesses=1; + sys.core[i].dcache.prefetch_buffer_writes=1; + sys.core[i].dcache.prefetch_buffer_reads=1; + 
sys.core[i].dcache.prefetch_buffer_hits=1; + sys.core[i].dcache.wbb_writes=1; + sys.core[i].dcache.wbb_reads=1; + //system.core?.BTB + for (j=0; j<20; j++) sys.core[i].BTB.BTB_config[j]=1; + sys.core[i].BTB.total_accesses=1; + sys.core[i].BTB.read_accesses=1; + sys.core[i].BTB.write_accesses=1; + sys.core[i].BTB.total_hits=1; + sys.core[i].BTB.total_misses=1; + sys.core[i].BTB.read_hits=1; + sys.core[i].BTB.write_hits=1; + sys.core[i].BTB.read_misses=1; + sys.core[i].BTB.write_misses=1; + sys.core[i].BTB.replacements=1; + } + + //system_L1directory + for (i=0; i<=63; i++) + { + for (j=0; j<20; j++) sys.L1Directory[i].Dir_config[j]=1; + for (j=0; j<20; j++) sys.L1Directory[i].buffer_sizes[j]=1; + sys.L1Directory[i].clockrate=1; + //ports[] has 20 elements (0..19): set every one of them; index 20 is one past the end of the array + for (j=0; j<20; j++) sys.L1Directory[i].ports[j]=1; + sys.L1Directory[i].device_type=1; + strcpy(sys.L1Directory[i].threeD_stack,"default"); + sys.L1Directory[i].total_accesses=1; + sys.L1Directory[i].read_accesses=1; + sys.L1Directory[i].write_accesses=1; + sys.L1Directory[i].duty_cycle =1; + } + //system_L2directory + for (i=0; i<=63; i++) + { + for (j=0; j<20; j++) sys.L2Directory[i].Dir_config[j]=1; + for (j=0; j<20; j++) sys.L2Directory[i].buffer_sizes[j]=1; + sys.L2Directory[i].clockrate=1; + //ports[] has 20 elements (0..19): set every one of them; index 20 is one past the end of the array + for (j=0; j<20; j++) sys.L2Directory[i].ports[j]=1; + sys.L2Directory[i].device_type=1; + strcpy(sys.L2Directory[i].threeD_stack,"default"); + sys.L2Directory[i].total_accesses=1; + sys.L2Directory[i].read_accesses=1; + sys.L2Directory[i].write_accesses=1; + sys.L2Directory[i].duty_cycle =1; + } + for (i=0; i<=63; i++) + { + //system_L2 + for (j=0; j<20; j++) sys.L2[i].L2_config[j]=1; + sys.L2[i].clockrate=1; + for (j=0; j<20; j++) sys.L2[i].ports[j]=1; + sys.L2[i].device_type=1; + strcpy(sys.L2[i].threeD_stack,"default"); + for (j=0; j<20; j++) sys.L2[i].buffer_sizes[j]=1; + sys.L2[i].total_accesses=1; + sys.L2[i].read_accesses=1; + sys.L2[i].write_accesses=1; + sys.L2[i].total_hits=1; + sys.L2[i].total_misses=1; + sys.L2[i].read_hits=1; + sys.L2[i].write_hits=1; + 
sys.L2[i].read_misses=1; + sys.L2[i].write_misses=1; + sys.L2[i].replacements=1; + sys.L2[i].write_backs=1; + sys.L2[i].miss_buffer_accesses=1; + sys.L2[i].fill_buffer_accesses=1; + sys.L2[i].prefetch_buffer_accesses=1; + sys.L2[i].prefetch_buffer_writes=1; + sys.L2[i].prefetch_buffer_reads=1; + sys.L2[i].prefetch_buffer_hits=1; + sys.L2[i].wbb_writes=1; + sys.L2[i].wbb_reads=1; + sys.L2[i].duty_cycle =1; + sys.L2[i].merged_dir=false; + sys.L2[i].homenode_read_accesses =1; + sys.L2[i].homenode_write_accesses=1; + sys.L2[i].homenode_read_hits=1; + sys.L2[i].homenode_write_hits=1; + sys.L2[i].homenode_read_misses=1; + sys.L2[i].homenode_write_misses=1; + sys.L2[i].dir_duty_cycle=1; + } + for (i=0; i<=63; i++) + { + //system_L3 + for (j=0; j<20; j++) sys.L3[i].L3_config[j]=1; + sys.L3[i].clockrate=1; + for (j=0; j<20; j++) sys.L3[i].ports[j]=1; + sys.L3[i].device_type=1; + strcpy(sys.L3[i].threeD_stack,"default"); + for (j=0; j<20; j++) sys.L3[i].buffer_sizes[j]=1; + sys.L3[i].total_accesses=1; + sys.L3[i].read_accesses=1; + sys.L3[i].write_accesses=1; + sys.L3[i].total_hits=1; + sys.L3[i].total_misses=1; + sys.L3[i].read_hits=1; + sys.L3[i].write_hits=1; + sys.L3[i].read_misses=1; + sys.L3[i].write_misses=1; + sys.L3[i].replacements=1; + sys.L3[i].write_backs=1; + sys.L3[i].miss_buffer_accesses=1; + sys.L3[i].fill_buffer_accesses=1; + sys.L3[i].prefetch_buffer_accesses=1; + sys.L3[i].prefetch_buffer_writes=1; + sys.L3[i].prefetch_buffer_reads=1; + sys.L3[i].prefetch_buffer_hits=1; + sys.L3[i].wbb_writes=1; + sys.L3[i].wbb_reads=1; + sys.L3[i].duty_cycle =1; + sys.L3[i].merged_dir=false; + sys.L3[i].homenode_read_accesses =1; + sys.L3[i].homenode_write_accesses=1; + sys.L3[i].homenode_read_hits=1; + sys.L3[i].homenode_write_hits=1; + sys.L3[i].homenode_read_misses=1; + sys.L3[i].homenode_write_misses=1; + sys.L3[i].dir_duty_cycle=1; + } + //system_NoC + for (i=0; i<=63; i++) + { + sys.NoC[i].clockrate=1; + sys.NoC[i].type=true; + sys.NoC[i].chip_coverage=1; + 
sys.NoC[i].has_global_link = true; + strcpy(sys.NoC[i].topology,"default"); + sys.NoC[i].horizontal_nodes=1; + sys.NoC[i].vertical_nodes=1; + sys.NoC[i].input_ports=1; + sys.NoC[i].output_ports=1; + sys.NoC[i].virtual_channel_per_port=1; + sys.NoC[i].flit_bits=1; + sys.NoC[i].input_buffer_entries_per_vc=1; + sys.NoC[i].total_accesses=1; + sys.NoC[i].duty_cycle=1; + sys.NoC[i].route_over_perc = 0.5; + for (j=0; j<20; j++) sys.NoC[i].ports_of_input_buffer[j]=1; + sys.NoC[i].number_of_crossbars=1; + strcpy(sys.NoC[i].crossbar_type,"default"); + strcpy(sys.NoC[i].crosspoint_type,"default"); + //system.NoC?.xbar0; + sys.NoC[i].xbar0.number_of_inputs_of_crossbars=1; + sys.NoC[i].xbar0.number_of_outputs_of_crossbars=1; + sys.NoC[i].xbar0.flit_bits=1; + sys.NoC[i].xbar0.input_buffer_entries_per_port=1; + //ports_of_input_buffer[] has 20 elements (0..19): set every one of them; index 20 is one past the end of the array + for (j=0; j<20; j++) sys.NoC[i].xbar0.ports_of_input_buffer[j]=1; + sys.NoC[i].xbar0.crossbar_accesses=1; + } + //system_mem + sys.mem.mem_tech_node=1; + sys.mem.device_clock=1; + sys.mem.capacity_per_channel=1; + sys.mem.number_ranks=1; + sys.mem.peak_transfer_rate =1; + sys.mem.num_banks_of_DRAM_chip=1; + sys.mem.Block_width_of_DRAM_chip=1; + sys.mem.output_width_of_DRAM_chip=1; + sys.mem.page_size_of_DRAM_chip=1; + sys.mem.burstlength_of_DRAM_chip=1; + sys.mem.internal_prefetch_of_DRAM_chip=1; + sys.mem.memory_accesses=1; + sys.mem.memory_reads=1; + sys.mem.memory_writes=1; + //system_mc + sys.mc.mc_clock =1; + sys.mc.number_mcs=1; + sys.mc.peak_transfer_rate =1; + sys.mc.memory_channels_per_mc=1; + sys.mc.number_ranks=1; + sys.mc.req_window_size_per_channel=1; + sys.mc.IO_buffer_size_per_channel=1; + sys.mc.databus_width=1; + sys.mc.addressbus_width=1; + sys.mc.memory_accesses=1; + sys.mc.memory_reads=1; + sys.mc.memory_writes=1; + sys.mc.LVDS=true; + sys.mc.type=1; + //system_niu + sys.niu.clockrate =1; + sys.niu.number_units=1; + sys.niu.type = 1; + sys.niu.duty_cycle =1; + sys.niu.total_load_perc=1; + //system_pcie + sys.pcie.clockrate =1; + sys.pcie.number_units=1; + 
sys.pcie.num_channels=1; + sys.pcie.type = 1; + sys.pcie.withPHY = false; + sys.pcie.duty_cycle =1; + sys.pcie.total_load_perc=1; + //system_flash_controller + sys.flashc.mc_clock =1; + sys.flashc.number_mcs=1; + sys.flashc.peak_transfer_rate =1; + sys.flashc.memory_channels_per_mc=1; + sys.flashc.number_ranks=1; + sys.flashc.req_window_size_per_channel=1; + sys.flashc.IO_buffer_size_per_channel=1; + sys.flashc.databus_width=1; + sys.flashc.addressbus_width=1; + sys.flashc.memory_accesses=1; + sys.flashc.memory_reads=1; + sys.flashc.memory_writes=1; + sys.flashc.LVDS=true; + sys.flashc.withPHY = false; + sys.flashc.type =1; + sys.flashc.duty_cycle =1; + sys.flashc.total_load_perc=1; +} diff --git a/ext/mcpat/XML_Parse.h b/ext/mcpat/XML_Parse.h new file mode 100644 index 000000000..88fd3dac2 --- /dev/null +++ b/ext/mcpat/XML_Parse.h @@ -0,0 +1,591 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef XML_PARSE_H_ +#define XML_PARSE_H_ + + +//#ifdef WIN32 +//#define _CRT_SECURE_NO_DEPRECATE +//#endif + +#include <stdio.h> +#include <string.h> + +#include <iostream> + +#include "xmlParser.h" +using namespace std; + +/* +void myfree(char *t); // {free(t);} +ToXMLStringTool tx,tx2; +*/ +//all subnodes at the level of system.core(0-n) +//cache_policy is added into cache property arrays;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + +typedef struct{ + int prediction_width; + char prediction_scheme[20]; + int predictor_size; + int predictor_entries; + int local_predictor_size[20]; + int local_predictor_entries; + int global_predictor_entries; + int global_predictor_bits; + int chooser_predictor_entries; + int chooser_predictor_bits; + double predictor_accesses; +} predictor_systemcore; +typedef struct{ + int number_entries; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + double total_hits; + double total_accesses; + double total_misses; + double conflicts; +} itlb_systemcore; +typedef 
struct{ + //params + double icache_config[20]; + int buffer_sizes[20]; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + //stats + double total_accesses; + double read_accesses; + double read_misses; + double replacements; + double read_hits; + double total_hits; + double total_misses; + double miss_buffer_access; + double fill_buffer_accesses; + double prefetch_buffer_accesses; + double prefetch_buffer_writes; + double prefetch_buffer_reads; + double prefetch_buffer_hits; + double conflicts; +} icache_systemcore; +typedef struct{ + //params + int number_entries; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + //stats + double total_accesses; + double read_accesses; + double write_accesses; + double write_hits; + double read_hits; + double read_misses; + double write_misses; + double total_hits; + double total_misses; + double conflicts; +} dtlb_systemcore; +typedef struct{ + //params + double dcache_config[20]; + int buffer_sizes[20]; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + //stats + double total_accesses; + double read_accesses; + double write_accesses; + double total_hits; + double total_misses; + double read_hits; + double write_hits; + double read_misses; + double write_misses; + double replacements; + double write_backs; + double miss_buffer_access; + double fill_buffer_accesses; + double prefetch_buffer_accesses; + double prefetch_buffer_writes; + double prefetch_buffer_reads; + double prefetch_buffer_hits; + double wbb_writes; + double wbb_reads; + double conflicts; +} dcache_systemcore; +typedef struct{ + //params + int BTB_config[20]; + //stats + double total_accesses; + double read_accesses; + double write_accesses; + double total_hits; + double total_misses; + double read_hits; + double write_hits; + double read_misses; + double write_misses; + double replacements; +} 
BTB_systemcore; +typedef struct{ + //all params at the level of system.core(0-n) + int clock_rate; + bool opt_local; + bool x86; + int machine_bits; + int virtual_address_width; + int physical_address_width; + int opcode_width; + int micro_opcode_width; + int instruction_length; + int machine_type; + int internal_datapath_width; + int number_hardware_threads; + int fetch_width; + int number_instruction_fetch_ports; + int decode_width; + int issue_width; + int peak_issue_width; + int commit_width; + int pipelines_per_core[20]; + int pipeline_depth[20]; + char FPU[20]; + char divider_multiplier[20]; + int ALU_per_core; + double FPU_per_core; + int MUL_per_core; + int instruction_buffer_size; + int decoded_stream_buffer_size; + int instruction_window_scheme; + int instruction_window_size; + int fp_instruction_window_size; + int ROB_size; + int archi_Regs_IRF_size; + int archi_Regs_FRF_size; + int phy_Regs_IRF_size; + int phy_Regs_FRF_size; + int rename_scheme; + int register_windows_size; + char LSU_order[20]; + int store_buffer_size; + int load_buffer_size; + int memory_ports; + char Dcache_dual_pump[20]; + int RAS_size; + int fp_issue_width; + int prediction_width; + int number_of_BTB; + int number_of_BPT; + + //all stats at the level of system.core(0-n) + double total_instructions; + double int_instructions; + double fp_instructions; + double branch_instructions; + double branch_mispredictions; + double committed_instructions; + double committed_int_instructions; + double committed_fp_instructions; + double load_instructions; + double store_instructions; + double total_cycles; + double idle_cycles; + double busy_cycles; + double instruction_buffer_reads; + double instruction_buffer_write; + double ROB_reads; + double ROB_writes; + double rename_accesses; + double fp_rename_accesses; + double rename_reads; + double rename_writes; + double fp_rename_reads; + double fp_rename_writes; + double inst_window_reads; + double inst_window_writes; + double 
inst_window_wakeup_accesses; + double inst_window_selections; + double fp_inst_window_reads; + double fp_inst_window_writes; + double fp_inst_window_wakeup_accesses; + double fp_inst_window_selections; + double archi_int_regfile_reads; + double archi_float_regfile_reads; + double phy_int_regfile_reads; + double phy_float_regfile_reads; + double phy_int_regfile_writes; + double phy_float_regfile_writes; + double archi_int_regfile_writes; + double archi_float_regfile_writes; + double int_regfile_reads; + double float_regfile_reads; + double int_regfile_writes; + double float_regfile_writes; + double windowed_reg_accesses; + double windowed_reg_transports; + double function_calls; + double context_switches; + double ialu_accesses; + double fpu_accesses; + double mul_accesses; + double cdb_alu_accesses; + double cdb_mul_accesses; + double cdb_fpu_accesses; + double load_buffer_reads; + double load_buffer_writes; + double load_buffer_cams; + double store_buffer_reads; + double store_buffer_writes; + double store_buffer_cams; + double store_buffer_forwards; + double main_memory_access; + double main_memory_read; + double main_memory_write; + double pipeline_duty_cycle; + + double IFU_duty_cycle ; + double BR_duty_cycle ; + double LSU_duty_cycle ; + double MemManU_I_duty_cycle; + double MemManU_D_duty_cycle ; + double ALU_duty_cycle ; + double MUL_duty_cycle ; + double FPU_duty_cycle ; + double ALU_cdb_duty_cycle ; + double MUL_cdb_duty_cycle ; + double FPU_cdb_duty_cycle ; + + //all subnodes at the level of system.core(0-n) + predictor_systemcore predictor; + itlb_systemcore itlb; + icache_systemcore icache; + dtlb_systemcore dtlb; + dcache_systemcore dcache; + BTB_systemcore BTB; + +} system_core; +typedef struct{ + //params + int Directory_type; + double Dir_config[20]; + int buffer_sizes[20]; + int clockrate; + int ports[20]; + int device_type; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + char 
threeD_stack[20]; + //stats + double total_accesses; + double read_accesses; + double write_accesses; + double read_misses; + double write_misses; + double conflicts; + double duty_cycle; +} system_L1Directory; +typedef struct{ + //params + int Directory_type; + double Dir_config[20]; + int buffer_sizes[20]; + int clockrate; + int ports[20]; + int device_type; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + char threeD_stack[20]; + //stats + double total_accesses; + double read_accesses; + double write_accesses; + double read_misses; + double write_misses; + double conflicts; + double duty_cycle; +} system_L2Directory; +typedef struct{ + //params + double L2_config[20]; + int clockrate; + int ports[20]; + int device_type; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + char threeD_stack[20]; + int buffer_sizes[20]; + //stats + double total_accesses; + double read_accesses; + double write_accesses; + double total_hits; + double total_misses; + double read_hits; + double write_hits; + double read_misses; + double write_misses; + double replacements; + double write_backs; + double miss_buffer_accesses; + double fill_buffer_accesses; + double prefetch_buffer_accesses; + double prefetch_buffer_writes; + double prefetch_buffer_reads; + double prefetch_buffer_hits; + double wbb_writes; + double wbb_reads; + double conflicts; + double duty_cycle; + + bool merged_dir; + double homenode_read_accesses; + double homenode_write_accesses; + double homenode_read_hits; + double homenode_write_hits; + double homenode_read_misses; + double homenode_write_misses; + double dir_duty_cycle; +} system_L2; +typedef struct{ + //params + double L3_config[20]; + int clockrate; + int ports[20]; + int device_type; + int cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate + char threeD_stack[20]; + int buffer_sizes[20]; + //stats + 
double total_accesses; + double read_accesses; + double write_accesses; + double total_hits; + double total_misses; + double read_hits; + double write_hits; + double read_misses; + double write_misses; + double replacements; + double write_backs; + double miss_buffer_accesses; + double fill_buffer_accesses; + double prefetch_buffer_accesses; + double prefetch_buffer_writes; + double prefetch_buffer_reads; + double prefetch_buffer_hits; + double wbb_writes; + double wbb_reads; + double conflicts; + double duty_cycle; + + bool merged_dir; + double homenode_read_accesses; + double homenode_write_accesses; + double homenode_read_hits; + double homenode_write_hits; + double homenode_read_misses; + double homenode_write_misses; + double dir_duty_cycle; +} system_L3; +typedef struct{ + //params + int number_of_inputs_of_crossbars; + int number_of_outputs_of_crossbars; + int flit_bits; + int input_buffer_entries_per_port; + int ports_of_input_buffer[20]; + //stats + double crossbar_accesses; +} xbar0_systemNoC; +typedef struct{ + //params + int clockrate; + bool type; + bool has_global_link; + char topology[20]; + int horizontal_nodes; + int vertical_nodes; + int link_throughput; + int link_latency; + int input_ports; + int output_ports; + int virtual_channel_per_port; + int flit_bits; + int input_buffer_entries_per_vc; + int ports_of_input_buffer[20]; + int dual_pump; + int number_of_crossbars; + char crossbar_type[20]; + char crosspoint_type[20]; + xbar0_systemNoC xbar0; + int arbiter_type; + double chip_coverage; + //stats + double total_accesses; + double duty_cycle; + double route_over_perc; +} system_NoC; +typedef struct{ + //params + int mem_tech_node; + int device_clock; + int peak_transfer_rate; + int internal_prefetch_of_DRAM_chip; + int capacity_per_channel; + int number_ranks; + int num_banks_of_DRAM_chip; + int Block_width_of_DRAM_chip; + int output_width_of_DRAM_chip; + int page_size_of_DRAM_chip; + int burstlength_of_DRAM_chip; + //stats + double 
memory_accesses; + double memory_reads; + double memory_writes; +} system_mem; +typedef struct{ + //params + //Common Param for mc and fc + double peak_transfer_rate; + int number_mcs; + bool withPHY; + int type; + + //FCParam + //stats + double duty_cycle; + double total_load_perc; + + //McParam + int mc_clock; + int llc_line_length; + int memory_channels_per_mc; + int number_ranks; + int req_window_size_per_channel; + int IO_buffer_size_per_channel; + int databus_width; + int addressbus_width; + bool LVDS; + + //stats + double memory_accesses; + double memory_reads; + double memory_writes; +} system_mc; + +typedef struct{ + //params + int clockrate; + int number_units; + int type; + //stats + double duty_cycle; + double total_load_perc; +} system_niu; + +typedef struct{ + //params + int clockrate; + int number_units; + int num_channels; + int type; + bool withPHY; + //stats + double duty_cycle; + double total_load_perc; +} system_pcie; + +typedef struct{ + //All number_of_* at the level of 'system' Ying 03/21/2009 + int number_of_cores; + int number_of_L1Directories; + int number_of_L2Directories; + int number_of_L2s; + bool Private_L2; + int number_of_L3s; + int number_of_NoCs; + int number_of_dir_levels; + int domain_size; + int first_level_dir; + // All params at the level of 'system' + int homogeneous_cores; + int homogeneous_L1Directories; + int homogeneous_L2Directories; + double core_tech_node; + int target_core_clockrate; + int target_chip_area; + int temperature; + int number_cache_levels; + int L1_property; + int L2_property; + int homogeneous_L2s; + int L3_property; + int homogeneous_L3s; + int homogeneous_NoCs; + int homogeneous_ccs; + int Max_area_deviation; + int Max_power_deviation; + int device_type; + bool longer_channel_device; + bool Embedded; + bool opt_dynamic_power; + bool opt_lakage_power; + bool opt_clockrate; + bool opt_area; + int interconnect_projection_type; + int machine_bits; + int virtual_address_width; + int physical_address_width; 
+ int virtual_memory_page_size; + double total_cycles; + //system.core(0-n):3rd level + system_core core[64]; + system_L1Directory L1Directory[64]; + system_L2Directory L2Directory[64]; + system_L2 L2[64]; + system_L3 L3[64]; + system_NoC NoC[64]; + system_mem mem; + system_mc mc; + system_mc flashc; + system_niu niu; + system_pcie pcie; +} root_system; + +class ParseXML +{ +public: + void parse(char* filepath); + void initialize(); +public: + root_system sys; +}; + + +#endif /* XML_PARSE_H_ */ + + + + diff --git a/ext/mcpat/Xeon.xml b/ext/mcpat/Xeon.xml new file mode 100644 index 000000000..534210485 --- /dev/null +++ b/ext/mcpat/Xeon.xml @@ -0,0 +1,455 @@ +<?xml version="1.0" ?> +<component id="root" name="root"> + <component id="system" name="system"> + <!--McPAT will skip the components if number is set to 0 --> + <param name="number_of_cores" value="2"/> + <param name="number_of_L1Directories" value="0"/> + <param name="number_of_L2Directories" value="0"/> + <param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports --> + <param name="Private_L2" value="1"/><!--1 Private, 0 shared/coherent --> + <param name="number_of_L3s" value="1"/> <!-- This number means how many L3 clusters --> + <param name="number_of_NoCs" value="1"/> + <param name="homogeneous_cores" value="1"/><!--1 means homo --> + <param name="homogeneous_L2s" value="1"/> + <param name="homogeneous_L1Directorys" value="1"/> + <param name="homogeneous_L2Directorys" value="1"/> + <param name="homogeneous_L3s" value="1"/> + <param name="homogeneous_ccs" value="1"/><!--cache coherece hardware --> + <param name="homogeneous_NoCs" value="1"/> + <param name="core_tech_node" value="65"/><!-- nm --> + <param name="target_core_clockrate" value="3400"/><!--MHz --> + <param name="temperature" value="380"/> <!-- Kelvin --> + <param name="number_cache_levels" value="3"/> + <param name="interconnect_projection_type" value="0"/><!--0: 
aggressive wire technology; 1: conservative wire technology --> + <param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) --> + <param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate --> + <param name="machine_bits" value="64"/> + <param name="virtual_address_width" value="64"/> + <param name="physical_address_width" value="52"/> + <param name="virtual_memory_page_size" value="4096"/> + <!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller + default value is machine_bits, if not set --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!--This page size(B) is completely different from the page size in the Main memory section. This page size is the size of + virtual memory from OS/Archi perspective; the page size in the Main memory section is the actual physical line in a DRAM bank --> + <!-- *********************** cores ******************* --> + <component id="system.core0" name="core0"> + <!-- Core property --> + <param name="clock_rate" value="3400"/> + <!-- for cores with unknown timing, set to 0 to force off the opt flag --> + <param name="opt_local" value="0"/> + <param name="instruction_length" value="32"/> + <param name="opcode_width" value="16"/> + <param name="x86" value="1"/> + <param name="micro_opcode_width" value="8"/> + <param name="machine_type" value="0"/> + <!-- inorder/OoO; 1 inorder; 0 OOO--> + <param name="number_hardware_threads" value="2"/> + <!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor, + it only may be more than one in SMT processors. 
BTB ports always equal fetch ports since + branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.--> + <param name="fetch_width" value="4"/> + <!-- fetch_width determines the size of cachelines of L1 cache block --> + <param name="number_instruction_fetch_ports" value="1"/> + <param name="decode_width" value="4"/> + <!-- decode_width determines the number of ports of the + renaming table (both RAM and CAM) scheme --> + <param name="issue_width" value="4"/> + <param name="peak_issue_width" value="6"/> + <!-- issue_width determines the number of ports of Issue window and other logic + as in the complexity effective processors paper; issue_width==dispatch_width --> + <param name="commit_width" value="4"/> + <!-- commit_width determines the number of ports of register files --> + <param name="fp_issue_width" value="2"/> + <param name="prediction_width" value="1"/> + <!-- number of branch instructions that can be predicted simultaneously--> + <!-- Current version of McPAT does not distinguish int and floating point pipelines + These parameters are reserved for future use.--> + <param name="pipelines_per_core" value="1,1"/> + <!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared--> + <param name="pipeline_depth" value="31,31"/> + <!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops --> + <!-- issue and exe unit--> + <param name="ALU_per_core" value="6"/> + <!-- contains an adder, a shifter, and a logical unit --> + <param name="MUL_per_core" value="1"/> + <!-- For MUL and Div --> + <param name="FPU_per_core" value="2"/> + <!-- buffer between IF and ID stage --> + <param name="instruction_buffer_size" value="32"/> + <!-- buffer between ID and sche/exe stage --> + <param name="decoded_stream_buffer_size" value="16"/> + <param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED--> + <!-- McPAT 
supports 2 types of OoO cores, RS based and physical reg based--> + <param name="instruction_window_size" value="64"/> + <param name="fp_instruction_window_size" value="64"/> + <!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 --> + <param name="ROB_size" value="128"/> + <!-- each in-flight instruction has an entry in ROB --> + <!-- registers --> + <param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR --> + <param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM --> + <!-- if OoO processor, phy_reg number is needed for renaming logic, + renaming logic is for both integer and floating point insts. --> + <param name="phy_Regs_IRF_size" value="256"/> + <param name="phy_Regs_FRF_size" value="256"/> + <!-- rename logic --> + <param name="rename_scheme" value="0"/> + <!-- can be RAM based(0) or CAM based(1) rename scheme + RAM-based scheme will have free list, status table; + CAM-based scheme has the valid bit in the data field of the CAM + both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions; + Detailed RAT Implementation see TR --> + <param name="register_windows_size" value="0"/> + <!-- how many windows in the windowed register file, Sun processors; + no register windowing is used when this number is 0 --> + <!-- In OoO cores, loads and stores can be issued either inorder(Pentium Pro) or (OoO)out-of-order(Alpha), + They will always try to execute out-of-order though. 
--> + <param name="LSU_order" value="inorder"/> + <param name="store_buffer_size" value="96"/> + <!-- By default, in-order cores do not have load buffers --> + <param name="load_buffer_size" value="48"/> + <!-- number of ports refers to sustainable concurrent memory accesses --> + <param name="memory_ports" value="2"/> + <!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer + as well as the ports of Dcache which is connected to LSU --> + <!-- dual-pumped Dcache can be used to save the extra read/write ports --> + <param name="RAS_size" value="64"/> + <!-- general stats, defines simulation periods;require total, idle, and busy cycles for sanity check --> + <!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops --> + <stat name="total_instructions" value="400000"/> + <stat name="int_instructions" value="200000"/> + <stat name="fp_instructions" value="100000"/> + <stat name="branch_instructions" value="100000"/> + <stat name="branch_mispredictions" value="0"/> + <stat name="load_instructions" value="0"/> + <stat name="store_instructions" value="50000"/> + <stat name="committed_instructions" value="400000"/> + <stat name="committed_int_instructions" value="200000"/> + <stat name="committed_fp_instructions" value="100000"/> + <stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous --> + <!-- the following cycle stats are used for heterogeneous cores only, + please ignore them if homogeneous cores --> + <stat name="total_cycles" value="100000"/> + <stat name="idle_cycles" value="0"/> + <stat name="busy_cycles" value="100000"/> + <!-- instruction buffer stats --> + <!-- ROB stats, both RS and Phy based OoOs have ROB + performance simulator should capture the difference on accesses, + otherwise, McPAT has to guess based on number of committed instructions. 
--> + <stat name="ROB_reads" value="400000"/> + <stat name="ROB_writes" value="400000"/> + <!-- RAT accesses --> + <stat name="rename_reads" value="800000"/> <!--lookup in renaming logic --> + <stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic --> + <stat name="fp_rename_reads" value="200000"/> + <stat name="fp_rename_writes" value="100000"/> + <!-- decode and rename stage use this, should be total ic - nop --> + <!-- Inst window stats --> + <stat name="inst_window_reads" value="400000"/> + <stat name="inst_window_writes" value="400000"/> + <stat name="inst_window_wakeup_accesses" value="800000"/> + <stat name="fp_inst_window_reads" value="200000"/> + <stat name="fp_inst_window_writes" value="200000"/> + <stat name="fp_inst_window_wakeup_accesses" value="400000"/> + <!-- RF accesses --> + <stat name="int_regfile_reads" value="600000"/> + <stat name="float_regfile_reads" value="100000"/> + <stat name="int_regfile_writes" value="300000"/> + <stat name="float_regfile_writes" value="50000"/> + <!-- accesses to the working reg --> + <stat name="function_calls" value="5"/> + <stat name="context_switches" value="260343"/> + <!-- Number of window switches (number of function calls and returns)--> + <!-- Alu stats by default, the processor has one FPU that includes the divider and + multiplier. The fpu accesses should include accesses to multiplier and divider --> + <stat name="ialu_accesses" value="300000"/> + <stat name="fpu_accesses" value="100000"/> + <stat name="mul_accesses" value="200000"/> + <stat name="cdb_alu_accesses" value="300000"/> + <stat name="cdb_mul_accesses" value="200000"/> + <stat name="cdb_fpu_accesses" value="100000"/> + <!-- multiple cycle accesses should be counted multiple times, + otherwise, McPAT can use internal counter for different floating point instructions + to get final accesses. 
But that needs detailed info for floating point inst mix --> + <!-- currently the performance simulator should + make sure all the numbers are final numbers, + including the explicit read/write accesses, + and the implicite accesses such as replacements and etc. + Future versions of McPAT may be able to reason the implicite access + based on param and stats of last level cache + The same rule applies to all cache access stats too! --> + <!-- following is AF for max power computation. + Do not change them, unless you understand them--> + <stat name="IFU_duty_cycle" value="1"/> + <stat name="LSU_duty_cycle" value="0.5"/> + <stat name="MemManU_I_duty_cycle" value="1"/> + <stat name="MemManU_D_duty_cycle" value="0.5"/> + <stat name="ALU_duty_cycle" value="1"/> + <stat name="MUL_duty_cycle" value="0.3"/> + <stat name="FPU_duty_cycle" value="0.3"/> + <stat name="ALU_cdb_duty_cycle" value="1"/> + <stat name="MUL_cdb_duty_cycle" value="0.3"/> + <stat name="FPU_cdb_duty_cycle" value="0.3"/> + <param name="number_of_BPT" value="2"/> + <component id="system.core0.predictor" name="PBT"> + <!-- branch predictor; tournament predictor see Alpha implementation --> + <param name="local_predictor_size" value="10,3"/> + <param name="local_predictor_entries" value="1024"/> + <param name="global_predictor_entries" value="4096"/> + <param name="global_predictor_bits" value="2"/> + <param name="chooser_predictor_entries" value="4096"/> + <param name="chooser_predictor_bits" value="2"/> + <!-- These parameters can be combined like below in next version + <param name="load_predictor" value="10,3,1024"/> + <param name="global_predictor" value="4096,2"/> + <param name="predictor_chooser" value="4096,2"/> + --> + </component> + <component id="system.core0.itlb" name="itlb"> + <param name="number_entries" value="128"/> + <stat name="total_accesses" value="200000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + <!-- there is no write requests to itlb although 
writes happen to itlb after miss, + which is actually a replacement --> + </component> + <component id="system.core0.icache" name="icache"> + <!-- there is no write requests to itlb although writes happen to it after miss, + which is actually a replacement --> + <param name="icache_config" value="131072,32,8,1,8,3,32,0"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy, --> + <!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate --> + <param name="buffer_sizes" value="16, 16, 16,0"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="200000"/> + <stat name="read_misses" value="0"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dtlb" name="dtlb"> + <param name="number_entries" value="128"/><!--dual threads--> + <stat name="total_accesses" value="400000"/> + <stat name="total_misses" value="4"/> + <stat name="conflicts" value="0"/> + </component> + <component id="system.core0.dcache" name="dcache"> + <!-- all the buffer related are optional --> + <param name="dcache_config" value="16384,16,4,1, 3,3, 16,1 "/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + </component> + <param name="number_of_BTB" value="2"/> + <component id="system.core0.BTB" name="BTB"> + <!-- all the buffer related are optional --> + <param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 --> + <!-- the parameters are capacity,block_width,associativity,bank, 
throughput w.r.t. core clock, latency w.r.t. core clock,--> + <stat name="read_accesses" value="400000"/> <!--See IFU code for guideline --> + <stat name="write_accesses" value="0"/> + </component> + </component> + <component id="system.L1Directory0" name="L1Directory0"> + <param name="Directory_type" value="0"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="4096,2,0,1,100,100, 8"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. the invalidates can be considered as writes --> + <stat name="read_accesses" value="800000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="20"/> + </component> + <component id="system.L2Directory0" name="L2Directory0"> + <param name="Directory_type" value="1"/> + <!--0 cam based shadowed tag. 1 directory cache --> + <param name="Dir_config" value="1048576,16,16,1,2, 100"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,--> + <param name="buffer_sizes" value="8, 8, 8, 8"/> + <!-- all the buffer related are optional --> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw search ports --> + <param name="device_type" value="0"/> + <!-- altough there are multiple access types, + Performance simulator needs to cast them into reads or writes + e.g. 
the invalidates can be considered as writes --> + <stat name="read_accesses" value="58824"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="100"/> + </component> + <component id="system.L20" name="L20"> + <!-- all the buffer related are optional --> + <param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> + <!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy --> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <param name="clockrate" value="3400"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <stat name="read_accesses" value="200000"/> + <stat name="write_accesses" value="27276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> + +<!--**********************************************************************--> +<component id="system.L30" name="L30"> + <param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/> + <!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. 
core clock,--> + <param name="clockrate" value="850"/> + <param name="ports" value="1,1,1"/> + <!-- number of r, w, and rw ports --> + <param name="device_type" value="0"/> + <param name="buffer_sizes" value="16, 16, 16, 16"/> + <!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> + <stat name="read_accesses" value="11824"/> + <stat name="write_accesses" value="11276"/> + <stat name="read_misses" value="1632"/> + <stat name="write_misses" value="183"/> + <stat name="conflicts" value="0"/> + <stat name="duty_cycle" value="1.0"/> + </component> +<!--**********************************************************************--> + <component id="system.NoC0" name="noc0"> + <param name="clockrate" value="3400"/> + <param name="type" value="0"/> + <!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus + at each time only one node can send req --> + <param name="horizontal_nodes" value="1"/> + <param name="vertical_nodes" value="1"/> + <param name="has_global_link" value="0"/> + <!-- 1 has global link, 0 does not have global link --> + <param name="link_throughput" value="1"/><!--w.r.t clock --> + <param name="link_latency" value="1"/><!--w.r.t clock --> + <!-- througput >= latency --> + <!-- Router architecture --> + <param name="input_ports" value="1"/> + <param name="output_ports" value="1"/> + <!-- For bus the I/O ports should be 1 --> + <param name="flit_bits" value="256"/> + <param name="chip_coverage" value="1"/> + <!-- When multiple NOC present, one NOC will cover part of the whole chip. + chip_coverage <=1 --> + <param name="link_routing_over_percentage" value="0.5"/> + <!-- Links can route over other components or occupy whole area. 
+ by default, 50% of the NoC global links routes over other + components --> + <stat name="total_accesses" value="100000"/> + <!-- This is the number of total accesses within the whole network not for each router --> + <stat name="duty_cycle" value="1"/> + </component> +<!--**********************************************************************--> + <component id="system.mem" name="mem"> + <!-- Main memory property --> + <param name="mem_tech_node" value="32"/> + <param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB --> + <param name="peak_transfer_rate" value="6400"/><!--MB/S--> + <param name="internal_prefetch_of_DRAM_chip" value="4"/> + <!-- 2 for DDR, 4 for DDR2, 8 for DDR3...--> + <!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property --> + <!-- above numbers can be easily found from Wikipedia --> + <param name="capacity_per_channel" value="4096"/> <!-- MB --> + <!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank + Current McPAT assumes single DIMMs are used.--> + <param name="number_ranks" value="2"/> + <param name="num_banks_of_DRAM_chip" value="8"/> + <param name="Block_width_of_DRAM_chip" value="64"/> <!-- B --> + <param name="output_width_of_DRAM_chip" value="8"/> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip--> + <param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 --> + <param name="burstlength_of_DRAM_chip" value="8"/> + <stat name="memory_accesses" value="1052"/> + <stat name="memory_reads" value="1052"/> + <stat name="memory_writes" value="1052"/> + </component> + <component id="system.mc" name="mc"> + <!-- Memeory controllers are for DDR(2,3...) DIMMs --> + <!-- current version of McPAT uses published values for base parameters of memory controller + improvments on MC will be added in later versions. 
--> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> + <param name="peak_transfer_rate" value="3200"/><!--MB/S--> + <param name="block_size" value="64"/><!--B--> + <param name="number_mcs" value="0"/> + <!-- current McPAT only supports homogeneous memory controllers --> + <param name="memory_channels_per_mc" value="1"/> + <param name="number_ranks" value="2"/> + <param name="withPHY" value="0"/> + <!-- # of ranks of each channel--> + <param name="req_window_size_per_channel" value="32"/> + <param name="IO_buffer_size_per_channel" value="32"/> + <param name="databus_width" value="128"/> + <param name="addressbus_width" value="51"/> + <!-- McPAT will add the control bus width to the addressbus width automatically --> + <stat name="memory_accesses" value="33333"/> + <stat name="memory_reads" value="16667"/> + <stat name="memory_writes" value="16667"/> + <!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates + the average power per MC or per channel. This is sufficient for most applications. + Further breakdown can easily be added in later versions. --> + </component> +<!--**********************************************************************--> + <component id="system.niu" name="niu"> + <!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller --> + <!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+ the lower bound of the clock rate of a 10Gb MAC is 150MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates + the average power per NIC or per channel. This is sufficient for most applications. --> + </component> +<!--**********************************************************************--> + <component id="system.pcie" name="pcie"> + <!-- On chip PCIe controller, including Phy--> + <!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. + the lower bound of the clock rate of the PCIe per-lane logic is 120MHz --> + <param name="type" value="0"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="clockrate" value="350"/> + <param name="number_units" value="0"/> + <param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth --> + <!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates + the average power per PCIe controller or per channel. This is sufficient for most applications. 
--> + </component> +<!--**********************************************************************--> + <component id="system.flashc" name="flashc"> + <param name="number_flashcs" value="0"/> + <param name="type" value="1"/> <!-- 1: low power; 0 high performance --> + <param name="withPHY" value="1"/> + <param name="peak_transfer_rate" value="200"/><!--Per controller sustainable reak rate MB/S --> + <stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 --> + <stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth --> + <!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate + the average power per fc or per channel. This is sufficent for most application --> + </component> +<!--**********************************************************************--> + + </component> +</component> + diff --git a/ext/mcpat/arch_const.h b/ext/mcpat/arch_const.h new file mode 100644 index 000000000..b0dfeaa39 --- /dev/null +++ b/ext/mcpat/arch_const.h @@ -0,0 +1,276 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef ARCH_CONST_H_ +#define ARCH_CONST_H_ + +typedef struct{ + unsigned int capacity; + unsigned int assoc;//fully + unsigned int blocksize; +} array_inputs; + +//Do Not change, unless you want to bypass the XML interface and do not care about the default values. +//Global parameters +const int number_of_cores = 8; +const int number_of_L2s = 1; +const int number_of_L3s = 1; +const int number_of_NoCs = 1; + +const double archi_F_sz_nm = 90.0; +const unsigned int dev_type = 0; +const double CLOCKRATE = 1.2*1e9; +const double AF = 0.5; +//const bool inorder = true; +const bool embedded = false; //NEW + +const bool homogeneous_cores = true; +const bool temperature = 360; +const int number_cache_levels = 3; +const int L1_property = 0; //private 0; coherent 1, shared 2. 
const int L2_property = 2;
const bool homogeneous_L2s = true;
// Was declared `const bool`, which collapsed the value 2 to true (1).
// It uses the same 0/1/2 encoding as L1_property/L2_property, so it is an int.
const int L3_property = 2;
const bool homogeneous_L3s = true;
const double Max_area_deviation = 50;
const double Max_dynamic_deviation =50; //New
const int opt_dynamic_power = 1;
const int opt_lakage_power = 0;   // (sic) identifier spelling kept for existing users
const int opt_area = 0;
const int interconnect_projection_type = 0;

//******************************Core Parameters
// NOTE(review): `inorder` is not defined as a preprocessor macro in this
// header (the `const bool inorder` declaration is commented out), and an
// undefined identifier evaluates to 0 inside #if.  Unless a macro named
// `inorder` is defined externally, the #else branch is the one compiled.
#if (inorder)
const int opcode_length = 8;//Niagara
const int reg_length = 5;//Niagara
const int instruction_length = 32;//Niagara
const int data_width = 64;
#else
const int opcode_length = 8;//16;//Niagara
const int reg_length = 7;//Niagara
const int instruction_length = 32;//Niagara
const int data_width = 64;
#endif


//Caches
// Each structure below is described by a size / assoc / blocksize triple.
//itlb
const int itlbsize=512;
const int itlbassoc=0;//fully
const int itlbblocksize=8;
//icache
const int icachesize=32768;
const int icacheassoc=4;
const int icacheblocksize=32;
//dtlb
const int dtlbsize=512;
const int dtlbassoc=0;//fully
const int dtlbblocksize=8;
//dcache
const int dcachesize=32768;
const int dcacheassoc=4;
const int dcacheblocksize=32;
const int dcache_write_buffers=8;

//cache controllers
//IB,
const int numIBEntries = 64;
const int IBsize = 64;//2*4*instruction_length/8*2;
const int IBassoc = 0;//In Niagara it is still fully associ
const int IBblocksize = 4;

//IFB and MIL should have the same parameters CAM
const int IFBsize=128;//
const int IFBassoc=0;//In Niagara it is still fully associ
const int IFBblocksize=4;




const int icache_write_buffers=8;

//register file RAM
const int regfilesize=5760;
const int regfileassoc=1;
const int regfileblocksize=18;
//regwin RAM
const int regwinsize=256;
const int regwinassoc=1;
const int regwinblocksize=8;



//store buffer, lsq
const int lsqsize=512;
const int lsqassoc=0;
const int lsqblocksize=8;

//data fill queue RAM
const int dfqsize=1024;
const int dfqassoc=1;
const int dfqblocksize=16;

//outside the cores
//L2 cache bank
const int l2cachesize=262144;
const int l2cacheassoc=16;
const int l2cacheblocksize=64;

//L2 directory
const int l2dirsize=1024;
const int l2dirassoc=0;
const int l2dirblocksize=2;

//crossbar
//PCX
const int PCX_NUMBER_INPUT_PORTS_CROSSBAR = 8;
const int PCX_NUMBER_OUTPUT_PORTS_CROSSBAR = 9;
const int PCX_NUMBER_SIGNALS_PER_PORT_CROSSBAR =144;
//PCX buffer RAM
const int pcx_buffersize=1024;
const int pcx_bufferassoc=1;
const int pcx_bufferblocksize=32;
const int pcx_numbuffer=5;
//pcx arbiter
const int pcx_arbsize=128;
const int pcx_arbassoc=1;
const int pcx_arbblocksize=2;
const int pcx_numarb=5;

//CPX
const int CPX_NUMBER_INPUT_PORTS_CROSSBAR = 5;
const int CPX_NUMBER_OUTPUT_PORTS_CROSSBAR = 8;
const int CPX_NUMBER_SIGNALS_PER_PORT_CROSSBAR =150;
//CPX buffer RAM
const int cpx_buffersize=1024;
const int cpx_bufferassoc=1;
const int cpx_bufferblocksize=32;
const int cpx_numbuffer=8;
//cpx arbiter
const int cpx_arbsize=128;
const int cpx_arbassoc=1;
const int cpx_arbblocksize=2;
const int cpx_numarb=8;





// Out-of-order core structure sizes.
const int numPhysFloatRegs=256;
const int numPhysIntRegs=32;
const int numROBEntries=192;
const int umRobs=1;   // NOTE(review): looks like a typo for "numRobs"; name kept for compatibility

const int BTBEntries=4096;
const int BTBTagSize=16;
const int LFSTSize=1024;
const int LQEntries=32;
const int RASSize=16;
const int SQEntries=32;
const int SSITSize=1024;
const int activity=0;
const int backComSize=5;
const int cachePorts=200;
const int choiceCtrBits=2;
const int choicePredictorSize=8192;


// Pipeline stage widths.
const int commitWidth=8;
const int decodeWidth=8;
const int dispatchWidth=8;
const int fetchWidth=8;
const int issueWidth=1;
const int renameWidth=8;
//what is this forwardComSize=5??

// Global-history branch predictor defaults.
const int globalCtrBits=2;
const int globalHistoryBits=13;
const int globalPredictorSize=8192;



// Local-history branch predictor defaults.
const int localCtrBits=2;
const int localHistoryBits=11;
const int localHistoryTableSize=2048;
const int localPredictorSize=2048;

// Output driver transistor widths.  Each value is written as <width> * 0.09;
// the trailing comment on every line keeps an alternative expression in
// terms of LSCALE, which is not defined in this header.
const double Woutdrvnandn =30 *0.09;//(24.0 * LSCALE)
const double Woutdrvnandp =12.5 *0.09;//(10.0 * LSCALE)
const double Woutdrvnorn =7.5*0.09;//(6.0 * LSCALE)
const double Woutdrvnorp =50 * 0.09;// (40.0 * LSCALE)
const double Woutdrivern =60*0.09;//(48.0 * LSCALE)
const double Woutdriverp =100 * 0.09;//(80.0 * LSCALE)

// NOTE(review): the two comment blocks below look like key=value settings
// copied from a performance-simulator configuration and kept for reference;
// none of them is compiled.
/*
smtCommitPolicy=RoundRobin
smtFetchPolicy=SingleThread
smtIQPolicy=Partitioned
smtIQThreshold=100
smtLSQPolicy=Partitioned
smtLSQThreshold=100
smtNumFetchingThreads=1
smtROBPolicy=Partitioned
smtROBThreshold=100
squashWidth=8
*/

/*
prefetch_access=false
prefetch_cache_check_push=true
prefetch_data_accesses_only=false
prefetch_degree=1
prefetch_latency=10000
prefetch_miss=false
prefetch_past_page=false
prefetch_policy=none
prefetch_serial_squash=false
prefetch_use_cpu_id=true
prefetcher_size=100
prioritizeRequests=false
repl=Null


split=false
split_size=0
subblock_size=0
tgts_per_mshr=20
trace_addr=0
two_queue=false

cpu_side=system.cpu0.dcache_port
mem_side=system.tol2bus.port[2]
*/

//[system.cpu0.dtb]
//type=AlphaDT


#endif /* ARCH_CONST_H_ */
diff --git a/ext/mcpat/array.cc b/ext/mcpat/array.cc
new file mode 100644
index 000000000..975f82fad
--- /dev/null
+++ b/ext/mcpat/array.cc
@@ -0,0 +1,302 @@
/*****************************************************************************
 * McPAT
 * SOFTWARE LICENSE AGREEMENT
 * Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#define GLOBALVAR +#include <cassert> +#include <cmath> +#include <iostream> + +#include "area.h" +#include "array.h" +#include "decoder.h" +#include "globalvar.h" +#include "parameter.h" + +using namespace std; + +ArrayST::ArrayST(const InputParameter *configure_interface, + string _name, + enum Device_ty device_ty_, + bool opt_local_, + enum Core_type core_ty_, + bool 
_is_default) +:l_ip(*configure_interface), + name(_name), + device_ty(device_ty_), + opt_local(opt_local_), + core_ty(core_ty_), + is_default(_is_default) + { + + if (l_ip.cache_sz<64) l_ip.cache_sz=64; + l_ip.error_checking();//not only do the error checking but also fill some missing parameters + optimize_array(); + +} + + +void ArrayST::compute_base_power() + { + //l_ip.out_w =l_ip.line_sz*8; + local_result=cacti_interface(&l_ip); + + } + +void ArrayST::optimize_array() +{ + list<uca_org_t > candidate_solutions(0); + list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter; + + uca_org_t * temp_res = 0; + local_result.valid=false; + + double throughput=l_ip.throughput, latency=l_ip.latency; + double area_efficiency_threshold = 20.0; + bool throughput_overflow=true, latency_overflow=true; + compute_base_power(); + + if ((local_result.cycle_time - throughput) <= 1e-10 ) + throughput_overflow=false; + if ((local_result.access_time - latency)<= 1e-10) + latency_overflow=false; + + if (opt_for_clk && opt_local) + { + if (throughput_overflow || latency_overflow) + { + l_ip.ed=0; + + l_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied. + l_ip.cycle_time_wt = 1000; + + l_ip.area_wt = 10;//Fixed number, This is used to exhaustive search for individual components. + l_ip.dynamic_power_wt = 10;//Fixed number, This is used to exhaustive search for individual components. + l_ip.leakage_power_wt = 10; + + l_ip.delay_dev = 1000000;//Fixed number, make sure timing can be satisfied. + l_ip.cycle_time_dev = 100; + + l_ip.area_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components. + l_ip.dynamic_power_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components. 
+ l_ip.leakage_power_dev = 1000000; + + throughput_overflow=true; //Reset overflow flag before start optimization iterations + latency_overflow=true; + + temp_res = &local_result; //Clean up the result for optimized for ED^2P + temp_res->cleanup(); + } + + + while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10 + { + compute_base_power(); + + l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration + + // from best area to worst area -->worst timing to best timing + if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)|| + (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0)) + { //if no satisfiable solution is found,the most aggressive one is left + candidate_solutions.push_back(local_result); + //output_data_csv(candidate_solutions.back()); + if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10)) + //ensure stop opt not because of cam + { + throughput_overflow=false; + latency_overflow=false; + } + + } + else + { + //TODO: whether checking the partial satisfied results too, or just change the mark??? + if ((local_result.cycle_time - throughput) <= 1e-10) + throughput_overflow=false; + if ((local_result.access_time - latency)<= 1e-10) + latency_overflow=false; + + if (l_ip.cycle_time_dev > 10) + { //if not >10 local_result is the last result, it cannot be cleaned up + temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up + temp_res->cleanup(); + } + } +// l_ip.cycle_time_dev-=10; +// l_ip.delay_dev-=10; + + } + + + if (l_ip.assoc > 0) + { + //For array structures except CAM and FA, Give warning but still provide a result with best timing found + if (throughput_overflow==true) + cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." 
<< endl; + if (latency_overflow==true) + cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl; + } + +// else +// { +// /*According to "Content-Addressable Memory (CAM) Circuits and +// Architectures": A Tutorial and Survey +// by Kostas Pagiamtzis et al. +// CAM structures can be heavily pipelined and use look-ahead techniques, +// therefore timing can be relaxed. But McPAT does not model the advanced +// techniques. If continue optimizing, the area efficiency will be too low +// */ +// //For CAM and FA, stop opt if area efficiency is too low +// if (throughput_overflow==true) +// cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name +// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl; +// if (latency_overflow==true) +// cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name +// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl; +// } + + //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time; + double min_dynamic_energy=BIGNUM; + if (candidate_solutions.empty()==false) + { + local_result.valid=true; + for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter) + + { + if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic) + { + min_dynamic_energy = (candidate_iter)->power.readOp.dynamic; + min_dynamic_energy_iter = candidate_iter; + local_result = *(min_dynamic_energy_iter); + //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match. 
+ + } + else + { + candidate_iter->cleanup() ; + } + + } + + + } + candidate_solutions.clear(); + } + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + + double macro_layout_overhead = g_tp.macro_layout_overhead; + double chip_PR_overhead = g_tp.chip_layout_overhead; + double total_overhead = macro_layout_overhead*chip_PR_overhead; + local_result.area *= total_overhead; + + //maintain constant power density + double pppm_t[4] = {total_overhead,1,1,total_overhead}; + + double sckRation = g_tp.sckt_co_eff; + local_result.power.readOp.dynamic *= sckRation; + local_result.power.writeOp.dynamic *= sckRation; + local_result.power.searchOp.dynamic *= sckRation; + local_result.power.readOp.leakage *= l_ip.nbanks; + local_result.power.readOp.longer_channel_leakage = + local_result.power.readOp.leakage*long_channel_device_reduction; + local_result.power = local_result.power* pppm_t; + + local_result.data_array2->power.readOp.dynamic *= sckRation; + local_result.data_array2->power.writeOp.dynamic *= sckRation; + local_result.data_array2->power.searchOp.dynamic *= sckRation; + local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; + local_result.data_array2->power.readOp.longer_channel_leakage = + local_result.data_array2->power.readOp.leakage*long_channel_device_reduction; + local_result.data_array2->power = local_result.data_array2->power* pppm_t; + + + if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) + { + local_result.tag_array2->power.readOp.dynamic *= sckRation; + local_result.tag_array2->power.writeOp.dynamic *= sckRation; + local_result.tag_array2->power.searchOp.dynamic *= sckRation; + local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; + local_result.tag_array2->power.readOp.longer_channel_leakage = + local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction; + local_result.tag_array2->power = local_result.tag_array2->power* pppm_t; + } + + +} + +void 
ArrayST::leakage_feedback(double temperature) +{ + // Update the temperature. l_ip is already set and error-checked in the creator function. + l_ip.temp = (unsigned int)round(temperature/10.0)*10; + + // This corresponds to cacti_interface() in the initialization process. Leakage power is updated here. + reconfigure(&l_ip,&local_result); + + // Scale the power values. This is part of ArrayST::optimize_array(). + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + + double macro_layout_overhead = g_tp.macro_layout_overhead; + double chip_PR_overhead = g_tp.chip_layout_overhead; + double total_overhead = macro_layout_overhead*chip_PR_overhead; + + double pppm_t[4] = {total_overhead,1,1,total_overhead}; + + double sckRation = g_tp.sckt_co_eff; + local_result.power.readOp.dynamic *= sckRation; + local_result.power.writeOp.dynamic *= sckRation; + local_result.power.searchOp.dynamic *= sckRation; + local_result.power.readOp.leakage *= l_ip.nbanks; + local_result.power.readOp.longer_channel_leakage = local_result.power.readOp.leakage*long_channel_device_reduction; + local_result.power = local_result.power* pppm_t; + + local_result.data_array2->power.readOp.dynamic *= sckRation; + local_result.data_array2->power.writeOp.dynamic *= sckRation; + local_result.data_array2->power.searchOp.dynamic *= sckRation; + local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; + local_result.data_array2->power.readOp.longer_channel_leakage = local_result.data_array2->power.readOp.leakage*long_channel_device_reduction; + local_result.data_array2->power = local_result.data_array2->power* pppm_t; + + if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) + { + local_result.tag_array2->power.readOp.dynamic *= sckRation; + local_result.tag_array2->power.writeOp.dynamic *= sckRation; + local_result.tag_array2->power.searchOp.dynamic *= sckRation; + local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; + 
local_result.tag_array2->power.readOp.longer_channel_leakage = local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction; + local_result.tag_array2->power = local_result.tag_array2->power* pppm_t; + } +} + +ArrayST:: ~ArrayST() +{ + local_result.cleanup(); +} diff --git a/ext/mcpat/array.h b/ext/mcpat/array.h new file mode 100644 index 000000000..8c6124d46 --- /dev/null +++ b/ext/mcpat/array.h @@ -0,0 +1,101 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef ARRAY_H_ +#define ARRAY_H_ + +#include <iostream> +#include <string> + +#include "basic_components.h" +#include "cacti_interface.h" +#include "component.h" +#include "const.h" +#include "parameter.h" + +using namespace std; + +class ArrayST :public Component{ + public: + ArrayST(){}; + ArrayST(const InputParameter *configure_interface, string _name, enum Device_ty device_ty_, bool opt_local_=true, enum Core_type core_ty_=Inorder, bool _is_default=true); + + InputParameter l_ip; + string name; + enum Device_ty device_ty; + bool opt_local; + enum Core_type core_ty; + bool is_default; + uca_org_t local_result; + + statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + + virtual void optimize_array(); + virtual void compute_base_power(); + virtual ~ArrayST(); + + void leakage_feedback(double temperature); +}; + +class InstCache :public Component{ +public: + ArrayST* caches; + ArrayST* missb; + ArrayST* ifb; + ArrayST* prefetchb; + powerDef power_t;//temp value holder for both (max) power and runtime power + InstCache(){caches=0;missb=0;ifb=0;prefetchb=0;}; + ~InstCache(){ + if (caches) {//caches->local_result.cleanup(); + delete caches; caches=0;} + if (missb) {//missb->local_result.cleanup(); + delete missb; missb=0;} + if (ifb) {//ifb->local_result.cleanup(); + delete ifb; ifb=0;} + if (prefetchb) 
{//prefetchb->local_result.cleanup(); + delete prefetchb; prefetchb=0;} + }; +}; + +class DataCache :public InstCache{ +public: + ArrayST* wbb; + DataCache(){wbb=0;}; + ~DataCache(){ + if (wbb) {//wbb->local_result.cleanup(); + delete wbb; wbb=0;} + }; +}; + +#endif /* TLB_H_ */ diff --git a/ext/mcpat/basic_components.cc b/ext/mcpat/basic_components.cc new file mode 100644 index 000000000..f288d7479 --- /dev/null +++ b/ext/mcpat/basic_components.cc @@ -0,0 +1,127 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include <cassert> +#include <cmath> +#include <iostream> + +#include "basic_components.h" + +double longer_channel_device_reduction( + enum Device_ty device_ty, + enum Core_type core_ty) +{ + + double longer_channel_device_percentage_core; + double longer_channel_device_percentage_uncore; + double longer_channel_device_percentage_llc; + + double long_channel_device_reduction; + + longer_channel_device_percentage_llc = 1.0; + longer_channel_device_percentage_uncore = 0.82; + if (core_ty==OOO) + { + longer_channel_device_percentage_core = 0.56;//0.54 Xeon Tulsa //0.58 Nehelam + //longer_channel_device_percentage_uncore = 0.76;//0.85 Nehelam + + } + else + { + longer_channel_device_percentage_core = 0.8;//0.8;//Niagara + //longer_channel_device_percentage_uncore = 0.9;//Niagara + } + + if (device_ty==Core_device) + { + long_channel_device_reduction = (1- longer_channel_device_percentage_core) + + longer_channel_device_percentage_core * g_tp.peri_global.long_channel_leakage_reduction; + } + else if (device_ty==Uncore_device) + { + long_channel_device_reduction = (1- longer_channel_device_percentage_uncore) + + longer_channel_device_percentage_uncore * g_tp.peri_global.long_channel_leakage_reduction; + } + else if (device_ty==LLC_device) + { + long_channel_device_reduction = (1- longer_channel_device_percentage_llc) + + 
longer_channel_device_percentage_llc * g_tp.peri_global.long_channel_leakage_reduction; + } + else + { + cout<<"unknown device category"<<endl; + exit(0); + } + + return long_channel_device_reduction; +} + +statsComponents operator+(const statsComponents & x, const statsComponents & y) +{ + statsComponents z; + + z.access = x.access + y.access; + z.hit = x.hit + y.hit; + z.miss = x.miss + y.miss; + + return z; +} + +statsComponents operator*(const statsComponents & x, double const * const y) +{ + statsComponents z; + + z.access = x.access*y[0]; + z.hit = x.hit*y[1]; + z.miss = x.miss*y[2]; + + return z; +} + +statsDef operator+(const statsDef & x, const statsDef & y) +{ + statsDef z; + + z.readAc = x.readAc + y.readAc; + z.writeAc = x.writeAc + y.writeAc; + z.searchAc = x.searchAc + y.searchAc; + return z; +} + +statsDef operator*(const statsDef & x, double const * const y) +{ + statsDef z; + + z.readAc = x.readAc*y; + z.writeAc = x.writeAc*y; + z.searchAc = x.searchAc*y; + return z; +} diff --git a/ext/mcpat/basic_components.h b/ext/mcpat/basic_components.h new file mode 100644 index 000000000..ce3e639cd --- /dev/null +++ b/ext/mcpat/basic_components.h @@ -0,0 +1,265 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef BASIC_COMPONENTS_H_ +#define BASIC_COMPONENTS_H_ + +#include <vector> + +#include "XML_Parse.h" +#include "parameter.h" + +const double cdb_overhead = 1.1; + +enum FU_type { + FPU, + ALU, + MUL +}; + +enum Core_type { + OOO, + Inorder +}; + +enum Renaming_type { + RAMbased, + CAMbased +}; + +enum Scheduler_type { + PhysicalRegFile, + ReservationStation +}; + +enum 
cache_level { + L2, + L3, + L1Directory, + L2Directory +}; + +enum MemoryCtrl_type { + MC, //memory controller + FLASHC //flash controller +}; + +enum Dir_type { + ST,//shadowed tag + DC,//directory cache + SBT,//static bank tag + NonDir + +}; + +enum Cache_policy { + Write_through, + Write_back +}; + +enum Device_ty { + Core_device, + Uncore_device, + LLC_device +}; + +class statsComponents +{ + public: + double access; + double hit; + double miss; + + statsComponents() : access(0), hit(0), miss(0) {} + statsComponents(const statsComponents & obj) { *this = obj; } + statsComponents & operator=(const statsComponents & rhs) + { + access = rhs.access; + hit = rhs.hit; + miss = rhs.miss; + return *this; + } + void reset() { access = 0; hit = 0; miss = 0;} + + friend statsComponents operator+(const statsComponents & x, const statsComponents & y); + friend statsComponents operator*(const statsComponents & x, double const * const y); +}; + +class statsDef +{ + public: + statsComponents readAc; + statsComponents writeAc; + statsComponents searchAc; + + statsDef() : readAc(), writeAc(),searchAc() { } + void reset() { readAc.reset(); writeAc.reset();searchAc.reset();} + + friend statsDef operator+(const statsDef & x, const statsDef & y); + friend statsDef operator*(const statsDef & x, double const * const y); +}; + +double longer_channel_device_reduction( + enum Device_ty device_ty=Core_device, + enum Core_type core_ty=Inorder); + +class CoreDynParam { +public: + CoreDynParam(){}; + CoreDynParam(ParseXML *XML_interface, int ithCore_); + // :XML(XML_interface), + // ithCore(ithCore_) + // core_ty(inorder), + // rm_ty(CAMbased), + // scheu_ty(PhysicalRegFile), + // clockRate(1e9),//1GHz + // arch_ireg_width(32), + // arch_freg_width(32), + // phy_ireg_width(128), + // phy_freg_width(128), + // perThreadState(8), + // globalCheckpoint(32), + // instructionLength(32){}; + //ParseXML * XML; + bool opt_local; + bool x86; + bool Embedded; + enum Core_type core_ty; + enum 
Renaming_type rm_ty; + enum Scheduler_type scheu_ty; + double clockRate,executionTime; + int arch_ireg_width, arch_freg_width, phy_ireg_width, phy_freg_width; + int num_IRF_entry, num_FRF_entry, num_ifreelist_entries, num_ffreelist_entries; + int fetchW, decodeW,issueW,peak_issueW, commitW,peak_commitW, predictionW, fp_issueW, fp_decodeW; + int perThreadState, globalCheckpoint, instruction_length, pc_width, opcode_length, micro_opcode_length; + int num_hthreads, pipeline_stages, fp_pipeline_stages, num_pipelines, num_fp_pipelines; + int num_alus, num_muls; + double num_fpus; + int int_data_width, fp_data_width,v_address_width, p_address_width; + double pipeline_duty_cycle, total_cycles, busy_cycles, idle_cycles; + bool regWindowing,multithreaded; + double pppm_lkg_multhread[4]; + double IFU_duty_cycle,BR_duty_cycle,LSU_duty_cycle,MemManU_I_duty_cycle, + MemManU_D_duty_cycle, ALU_duty_cycle,MUL_duty_cycle, + FPU_duty_cycle, ALU_cdb_duty_cycle,MUL_cdb_duty_cycle, + FPU_cdb_duty_cycle; + ~CoreDynParam(){}; +}; + +class CacheDynParam { +public: + CacheDynParam(){}; + CacheDynParam(ParseXML *XML_interface, int ithCache_); + string name; + enum Dir_type dir_ty; + double clockRate,executionTime; + double capacity, blockW, assoc, nbanks; + double throughput, latency; + double duty_cycle, dir_duty_cycle; + //double duty_cycle; + int missb_size, fu_size, prefetchb_size, wbb_size; + ~CacheDynParam(){}; +}; + +class MCParam { +public: + MCParam(){}; + MCParam(ParseXML *XML_interface, int ithCache_); + string name; + double clockRate,num_mcs, peakDataTransferRate, num_channels; + // double mcTEPowerperGhz; + // double mcPHYperGbit; + // double area; + int llcBlockSize, dataBusWidth, addressBusWidth; + int opcodeW; + int memAccesses; + int memRank; + int type; + double frontend_duty_cycle, duty_cycle, perc_load; + double executionTime, reads, writes; + bool LVDS, withPHY; + + ~MCParam(){}; +}; + +class NoCParam { +public: + NoCParam(){}; + NoCParam(ParseXML *XML_interface, int 
ithCache_); + string name; + double clockRate; + int flit_size; + int input_ports, output_ports, min_ports, global_linked_ports; + int virtual_channel_per_port,input_buffer_entries_per_vc; + int horizontal_nodes,vertical_nodes, total_nodes; + double executionTime, total_access, link_throughput,link_latency, + duty_cycle, chip_coverage, route_over_perc; + bool has_global_link, type; + + ~NoCParam(){}; +}; + +class ProcParam { +public: + ProcParam(){}; + ProcParam(ParseXML *XML_interface, int ithCache_); + string name; + int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir,numMC, numMCChannel; + bool homoCore, homoL2, homoL3, homoNOC, homoL1Dir, homoL2Dir; + + ~ProcParam(){}; +}; + +class NIUParam { +public: + NIUParam(){}; + NIUParam(ParseXML *XML_interface, int ithCache_); + string name; + double clockRate; + int num_units; + int type; + double duty_cycle, perc_load; + ~NIUParam(){}; +}; + +class PCIeParam { +public: + PCIeParam(){}; + PCIeParam(ParseXML *XML_interface, int ithCache_); + string name; + double clockRate; + int num_channels, num_units; + bool withPHY; + int type; + double duty_cycle, perc_load; + ~PCIeParam(){}; +}; +#endif /* BASIC_COMPONENTS_H_ */ diff --git a/ext/mcpat/cacti/README b/ext/mcpat/cacti/README new file mode 100644 index 000000000..de429d2bb --- /dev/null +++ b/ext/mcpat/cacti/README @@ -0,0 +1,94 @@ +----------------------------------------------------------- + ____ _ ____ _____ ___ __ ____ + / ___| / \ / ___|_ _|_ _| / /_ | ___| + | | / _ \| | | | | | | '_ \ |___ \ + | |___ / ___ \ |___ | | | | | (_) | ___) | + \____/_/ \_\____| |_| |___| \___(_)____/ + + + A Tool to Model Caches/Memories +----------------------------------------------------------- + +CACTI is an analytical tool that takes a set of cache/memory para- +meters as input and calculates its access time, power, cycle +time, and area. +CACTI was originally developed by Dr. Jouppi and Dr. Wilton +in 1993 and since then it has undergone five major +revisions. 
+ +List of features (version 1-6.5): +=============================== +The following is the list of features supported by the tool. + +* Power, delay, area, and cycle time model for + direct mapped caches + set-associative caches + fully associative caches + Embedded DRAM memories + Commodity DRAM memories + +* Support for modeling multi-ported uniform cache access (UCA) + and multi-banked, multi-ported non-uniform cache access (NUCA). + +* Leakage power calculation that also considers the operating + temperature of the cache. + +* Router power model. + +* Interconnect model with different delay, power, and area + properties, including a low-swing wire model. + +* An interface to perform trade-off analysis involving power, delay, + area, and bandwidth. + +* All process-specific values used by the tool are obtained + from ITRS, and the tool currently supports the 90nm, 65nm, 45nm, + and 32nm technology nodes. + +Version 6.5 has a new C++ code base and includes numerous bug fixes. +CACTI 5.3 and 6.0 activate an entire row of mats to read/write a single +block of data. This technique improves reliability at the cost of +power. CACTI 6.5 activates the minimum number of mats, just enough to retrieve +a block, to minimize power. + +How to use the tool? +==================== +Prior versions of CACTI take input parameters such as cache +size and technology node as a set of command line arguments. +To avoid a long list of command line arguments, +CACTI 6.5 lets users specify their cache model in a more +detailed manner by using a config file (cache.cfg). + +-> define the cache model using cache.cfg +-> run the "cacti" binary <./cacti -infile cache.cfg> + +CACTI 6.5 also provides a command line interface similar to earlier versions +of CACTI. 
The command line interface can be used as follows: + +./cacti cache_size line_size associativity rw_ports excl_read_ports excl_write_ports + single_ended_read_ports search_ports banks tech_node output_width specific_tag tag_width + access_mode cache main_mem obj_func_delay obj_func_dynamic_power obj_func_leakage_power + obj_func_cycle_time obj_func_area dev_func_delay dev_func_dynamic_power dev_func_leakage_power + dev_func_area dev_func_cycle_time ed_ed2_none temp wt data_arr_ram_cell_tech_flavor_in + data_arr_peri_global_tech_flavor_in tag_arr_ram_cell_tech_flavor_in tag_arr_peri_global_tech_flavor_in + interconnect_projection_type_in wire_inside_mat_type_in wire_outside_mat_type_in + REPEATERS_IN_HTREE_SEGMENTS_in VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in + BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in PAGE_SIZE_BITS_in BURST_LENGTH_in + INTERNAL_PREFETCH_WIDTH_in force_wiretype wiretype force_config ndwl ndbl nspd ndcm + ndsam1 ndsam2 ecc + +For complete documentation of the tool, please refer to the CACTI-5.3 and 6.0 +technical reports and the following paper, +"Optimizing NUCA Organizations and Wiring Alternatives for +Large Caches With CACTI 6.0", that appears in MICRO 2007. + +We are still improving the tool and refining the code. If you +have any comments, questions, or suggestions, please write to +us. + +Naveen Muralimanohar Jung Ho Ahn Sheng Li +naveen.muralimanohar@hp.com gajh@snu.ac.kr sheng.li@hp.com + + + + diff --git a/ext/mcpat/cacti/Ucache.cc b/ext/mcpat/cacti/Ucache.cc new file mode 100644 index 000000000..f3e1227df --- /dev/null +++ b/ext/mcpat/cacti/Ucache.cc @@ -0,0 +1,916 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#include <pthread.h> + +#include <algorithm> +#include <cmath> +#include <ctime> +#include <iostream> +#include <list> + +#include "Ucache.h" +#include "area.h" +#include "bank.h" +#include "basic_circuit.h" +#include "component.h" +#include "const.h" +#include "decoder.h" +#include "parameter.h" +#include "subarray.h" +#include "uca.h" + +using namespace std; + +const uint32_t 
nthreads = NTHREADS; + + +void min_values_t::update_min_values(const min_values_t * val) +{ + min_delay = (min_delay > val->min_delay) ? val->min_delay : min_delay; + min_dyn = (min_dyn > val->min_dyn) ? val->min_dyn : min_dyn; + min_leakage = (min_leakage > val->min_leakage) ? val->min_leakage : min_leakage; + min_area = (min_area > val->min_area) ? val->min_area : min_area; + min_cyc = (min_cyc > val->min_cyc) ? val->min_cyc : min_cyc; +} + + + +void min_values_t::update_min_values(const uca_org_t & res) +{ + min_delay = (min_delay > res.access_time) ? res.access_time : min_delay; + min_dyn = (min_dyn > res.power.readOp.dynamic) ? res.power.readOp.dynamic : min_dyn; + min_leakage = (min_leakage > res.power.readOp.leakage) ? res.power.readOp.leakage : min_leakage; + min_area = (min_area > res.area) ? res.area : min_area; + min_cyc = (min_cyc > res.cycle_time) ? res.cycle_time : min_cyc; +} + +void min_values_t::update_min_values(const nuca_org_t * res) +{ + min_delay = (min_delay > res->nuca_pda.delay) ? res->nuca_pda.delay : min_delay; + min_dyn = (min_dyn > res->nuca_pda.power.readOp.dynamic) ? res->nuca_pda.power.readOp.dynamic : min_dyn; + min_leakage = (min_leakage > res->nuca_pda.power.readOp.leakage) ? res->nuca_pda.power.readOp.leakage : min_leakage; + min_area = (min_area > res->nuca_pda.area.get_area()) ? res->nuca_pda.area.get_area() : min_area; + min_cyc = (min_cyc > res->nuca_pda.cycle_time) ? res->nuca_pda.cycle_time : min_cyc; +} + +void min_values_t::update_min_values(const mem_array * res) +{ + min_delay = (min_delay > res->access_time) ? res->access_time : min_delay; + min_dyn = (min_dyn > res->power.readOp.dynamic) ? res->power.readOp.dynamic : min_dyn; + min_leakage = (min_leakage > res->power.readOp.leakage) ? res->power.readOp.leakage : min_leakage; + min_area = (min_area > res->area) ? res->area : min_area; + min_cyc = (min_cyc > res->cycle_time) ? 
res->cycle_time : min_cyc; +} + + + +void * calc_time_mt_wrapper(void * void_obj) +{ + calc_time_mt_wrapper_struct * calc_obj = (calc_time_mt_wrapper_struct *) void_obj; + uint32_t tid = calc_obj->tid; + list<mem_array *> & data_arr = calc_obj->data_arr; + list<mem_array *> & tag_arr = calc_obj->tag_arr; + bool is_tag = calc_obj->is_tag; + bool pure_ram = calc_obj->pure_ram; + bool pure_cam = calc_obj->pure_cam; + bool is_main_mem = calc_obj->is_main_mem; + double Nspd_min = calc_obj->Nspd_min; + min_values_t * data_res = calc_obj->data_res; + min_values_t * tag_res = calc_obj->tag_res; + + data_arr.clear(); + data_arr.push_back(new mem_array); + tag_arr.clear(); + tag_arr.push_back(new mem_array); + + uint32_t Ndwl_niter = _log2(MAXDATAN) + 1; + uint32_t Ndbl_niter = _log2(MAXDATAN) + 1; + uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1; + uint32_t niter = Ndwl_niter * Ndbl_niter * Ndcm_niter; + + + bool is_valid_partition; + int wt_min, wt_max; + + if (g_ip->force_wiretype) { + if (g_ip->wt == 0) { + wt_min = Low_swing; + wt_max = Low_swing; + } + else { + wt_min = Global; + wt_max = Low_swing-1; + } + } + else { + wt_min = Global; + wt_max = Low_swing; + } + + for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2) + { + for (int wr = wt_min; wr <= wt_max; wr++) + { + for (uint32_t iter = tid; iter < niter; iter += nthreads) + { + // reconstruct Ndwl, Ndbl, Ndcm + unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter)); + unsigned int Ndbl = 1 << ((iter / (Ndcm_niter))%Ndbl_niter); + unsigned int Ndcm = 1 << (iter % Ndcm_niter); + for(unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX; Ndsam_lev_1 *= 2) + { + for(unsigned int Ndsam_lev_2 = 1; Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2) + { + //for debuging + if (g_ip->force_cache_config && is_tag == false) + { + wr = g_ip->wt; + Ndwl = g_ip->ndwl; + Ndbl = g_ip->ndbl; + Ndcm = g_ip->ndcm; + if(g_ip->nspd != 0) { + Nspd = g_ip->nspd; + } + if(g_ip->ndsam1 != 0) { + Ndsam_lev_1 = g_ip->ndsam1; + 
Ndsam_lev_2 = g_ip->ndsam2; + } + } + + if (is_tag == true) + { + is_valid_partition = calculate_time(is_tag, pure_ram, pure_cam, Nspd, Ndwl, + Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, + tag_arr.back(), 0, NULL, NULL, + is_main_mem); + } + // If it's a fully-associative cache, the data array partition parameters are identical to that of + // the tag array, so compute data array partition properties also here. + if (is_tag == false || g_ip->fully_assoc) + { + is_valid_partition = calculate_time(is_tag/*false*/, pure_ram, pure_cam, Nspd, Ndwl, + Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, + data_arr.back(), 0, NULL, NULL, + is_main_mem); + } + + if (is_valid_partition) + { + if (is_tag == true) + { + tag_arr.back()->wt = (enum Wire_type) wr; + tag_res->update_min_values(tag_arr.back()); + tag_arr.push_back(new mem_array); + } + if (is_tag == false || g_ip->fully_assoc) + { + data_arr.back()->wt = (enum Wire_type) wr; + data_res->update_min_values(data_arr.back()); + data_arr.push_back(new mem_array); + } + } + + if (g_ip->force_cache_config && is_tag == false) + { + wr = wt_max; + iter = niter; + if(g_ip->nspd != 0) { + Nspd = MAXDATASPD; + } + if (g_ip->ndsam1 != 0) { + Ndsam_lev_1 = MAX_COL_MUX+1; + Ndsam_lev_2 = MAX_COL_MUX+1; + } + } + } + } + } + } + } + + delete data_arr.back(); + delete tag_arr.back(); + data_arr.pop_back(); + tag_arr.pop_back(); + + pthread_exit(NULL); +} + + + +bool calculate_time( + bool is_tag, + int pure_ram, + bool pure_cam, + double Nspd, + unsigned int Ndwl, + unsigned int Ndbl, + unsigned int Ndcm, + unsigned int Ndsam_lev_1, + unsigned int Ndsam_lev_2, + mem_array *ptr_array, + int flag_results_populate, + results_mem_array *ptr_results, + uca_org_t *ptr_fin_res, + bool is_main_mem) +{ + DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem); + + if (dyn_p.is_valid == false) + { + return false; + } + + UCA * uca = new UCA(dyn_p); + + + if (flag_results_populate) + { //For the final 
solution, populate the ptr_results data structure -- TODO: copy only necessary variables + } + else + { + int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir; + int num_mats = uca->bank.dp.num_mats; + bool is_fa = uca->bank.dp.fully_assoc; + bool pure_cam = uca->bank.dp.pure_cam; + ptr_array->Ndwl = Ndwl; + ptr_array->Ndbl = Ndbl; + ptr_array->Nspd = Nspd; + ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing; + ptr_array->Ndsam_lev_1 = Ndsam_lev_1; + ptr_array->Ndsam_lev_2 = Ndsam_lev_2; + ptr_array->access_time = uca->access_time; + ptr_array->cycle_time = uca->cycle_time; + ptr_array->multisubbank_interleave_cycle_time = uca->multisubbank_interleave_cycle_time; + ptr_array->area_ram_cells = uca->area_all_dataramcells; + ptr_array->area = uca->area.get_area(); + ptr_array->height = uca->area.h; + ptr_array->width = uca->area.w; + ptr_array->mat_height = uca->bank.mat.area.h; + ptr_array->mat_length = uca->bank.mat.area.w; + ptr_array->subarray_height = uca->bank.mat.subarray.area.h; + ptr_array->subarray_length = uca->bank.mat.subarray.area.w; + ptr_array->power = uca->power; + ptr_array->delay_senseamp_mux_decoder = + MAX(uca->delay_array_to_sa_mux_lev_1_decoder, + uca->delay_array_to_sa_mux_lev_2_decoder); + ptr_array->delay_before_subarray_output_driver = uca->delay_before_subarray_output_driver; + ptr_array->delay_from_subarray_output_driver_to_output = uca->delay_from_subarray_out_drv_to_out; + + ptr_array->delay_route_to_bank = uca->htree_in_add->delay; + ptr_array->delay_input_htree = uca->bank.htree_in_add->delay; + ptr_array->delay_row_predecode_driver_and_block = uca->bank.mat.r_predec->delay; + ptr_array->delay_row_decoder = uca->bank.mat.row_dec->delay; + ptr_array->delay_bitlines = uca->bank.mat.delay_bitline; + ptr_array->delay_matchlines = uca->bank.mat.delay_matchchline; + ptr_array->delay_sense_amp = uca->bank.mat.delay_sa; + ptr_array->delay_subarray_output_driver = uca->bank.mat.delay_subarray_out_drv_htree; + ptr_array->delay_dout_htree = 
uca->bank.htree_out_data->delay; + ptr_array->delay_comparator = uca->bank.mat.delay_comparator; + + ptr_array->all_banks_height = uca->area.h; + ptr_array->all_banks_width = uca->area.w; + ptr_array->area_efficiency = uca->area_all_dataramcells * 100 / (uca->area.get_area()); + + ptr_array->power_routing_to_bank = uca->power_routing_to_bank; + ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power; + ptr_array->power_data_input_htree = uca->bank.htree_in_data->power; +// cout<<"power_data_input_htree"<<uca->bank.htree_in_data->power.readOp.leakage<<endl; + ptr_array->power_data_output_htree = uca->bank.htree_out_data->power; +// cout<<"power_data_output_htree"<<uca->bank.htree_out_data->power.readOp.leakage<<endl; + ptr_array->power_row_predecoder_drivers = uca->bank.mat.r_predec->driver_power; + ptr_array->power_row_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_row_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_row_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_row_predecoder_blocks = uca->bank.mat.r_predec->block_power; + ptr_array->power_row_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_row_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_row_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders; + ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_bit_mux_predecoder_drivers = uca->bank.mat.b_mux_predec->driver_power; + ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir; + 
ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_bit_mux_predecoder_blocks = uca->bank.mat.b_mux_predec->block_power; + ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_bit_mux_decoders = uca->bank.mat.power_bit_mux_decoders; + ptr_array->power_bit_mux_decoders.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bit_mux_decoders.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bit_mux_decoders.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_senseamp_mux_lev_1_predecoder_drivers = uca->bank.mat.sa_mux_lev_1_predec->driver_power; + ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_senseamp_mux_lev_1_predecoder_blocks = uca->bank.mat.sa_mux_lev_1_predec->block_power; + ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_senseamp_mux_lev_1_decoders = uca->bank.mat.power_sa_mux_lev_1_decoders; + ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_senseamp_mux_lev_2_predecoder_drivers = 
uca->bank.mat.sa_mux_lev_2_predec->driver_power; + ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_senseamp_mux_lev_2_predecoder_blocks = uca->bank.mat.sa_mux_lev_2_predec->block_power; + ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_senseamp_mux_lev_2_decoders = uca->bank.mat.power_sa_mux_lev_2_decoders; + ptr_array->power_senseamp_mux_lev_2_decoders .readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_2_decoders .writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_senseamp_mux_lev_2_decoders .searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_bitlines = uca->bank.mat.power_bitline; + ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_sense_amps = uca->bank.mat.power_sa; + ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_prechg_eq_drivers = uca->bank.mat.power_bl_precharge_eq_drv; + ptr_array->power_prechg_eq_drivers.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_prechg_eq_drivers.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_prechg_eq_drivers.searchOp.dynamic *= num_act_mats_hor_dir; + + 
ptr_array->power_output_drivers_at_subarray = uca->bank.mat.power_subarray_out_drv; + ptr_array->power_output_drivers_at_subarray.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *= num_act_mats_hor_dir; + + ptr_array->power_comparators = uca->bank.mat.power_comparator; + ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir; + ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir; + +// cout << " num of mats: " << dyn_p.num_mats << endl; + if (is_fa || pure_cam) + { + ptr_array->power_htree_in_search = uca->bank.htree_in_search->power; +// cout<<"power_htree_in_search"<<uca->bank.htree_in_search->power.readOp.leakage<<endl; + ptr_array->power_htree_out_search = uca->bank.htree_out_search->power; +// cout<<"power_htree_out_search"<<uca->bank.htree_out_search->power.readOp.leakage<<endl; + ptr_array->power_searchline = uca->bank.mat.power_searchline; +// cout<<"power_searchlineh"<<uca->bank.mat.power_searchline.readOp.leakage<<endl; + ptr_array->power_searchline.searchOp.dynamic *= num_mats; + ptr_array->power_searchline_precharge = uca->bank.mat.power_searchline_precharge; + ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats; + ptr_array->power_matchlines = uca->bank.mat.power_matchline; + ptr_array->power_matchlines.searchOp.dynamic *= num_mats; + ptr_array->power_matchline_precharge = uca->bank.mat.power_matchline_precharge; + ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats; + ptr_array->power_matchline_to_wordline_drv = uca->bank.mat.power_ml_to_ram_wl_drv; +// cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.power_matchline.searchOp.leakage<<endl; + } + + ptr_array->activate_energy = uca->activate_energy; + ptr_array->read_energy = uca->read_energy; + ptr_array->write_energy = 
uca->write_energy; + ptr_array->precharge_energy = uca->precharge_energy; + ptr_array->refresh_power = uca->refresh_power; + ptr_array->leak_power_subbank_closed_page = uca->leak_power_subbank_closed_page; + ptr_array->leak_power_subbank_open_page = uca->leak_power_subbank_open_page; + ptr_array->leak_power_request_and_reply_networks = uca->leak_power_request_and_reply_networks; + + ptr_array->precharge_delay = uca->precharge_delay; + + +// cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.<<endl; +// +// if (!(is_fa || pure_cam)) +// { +// cout << " num of cols: " << dyn_p.num_c_subarray << endl; +// } +// else if (is_fa) +// { +// cout << " num of cols: " << dyn_p.tag_num_c_subarray+ dyn_p.data_num_c_subarray<< endl; +// } else +// cout << " num of cols: " << dyn_p.tag_num_c_subarray<< endl; +// cout << uca->bank.mat.subarray.get_total_cell_area()<<endl; + } + + + delete uca; + return true; +} + + + +bool check_uca_org(uca_org_t & u, min_values_t *minval) +{ + if (((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev) { + return false; + } + if (((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 > + g_ip->dynamic_power_dev) { + return false; + } + if (((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 > + g_ip->leakage_power_dev) { + return false; + } + if (((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 > + g_ip->cycle_time_dev) { + return false; + } + if (((u.area - minval->min_area)/minval->min_area)*100 > + g_ip->area_dev) { + return false; + } + return true; +} + +bool check_mem_org(mem_array & u, const min_values_t *minval) +{ + if (((u.access_time - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev) { + return false; + } + if (((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 > + g_ip->dynamic_power_dev) { + return false; + } + if (((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 > + g_ip->leakage_power_dev) { + return false; + } + if 
(((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 > + g_ip->cycle_time_dev) { + return false; + } + if (((u.area - minval->min_area)/minval->min_area)*100 > + g_ip->area_dev) { + return false; + } + return true; +} + + + + +void find_optimal_uca(uca_org_t *res, min_values_t * minval, list<uca_org_t> & ulist) +{ + double cost = 0; + double min_cost = BIGNUM; + float d, a, dp, lp, c; + + dp = g_ip->dynamic_power_wt; + lp = g_ip->leakage_power_wt; + a = g_ip->area_wt; + d = g_ip->delay_wt; + c = g_ip->cycle_time_wt; + + if (ulist.empty() == true) + { + cout << "ERROR: no valid cache organizations found" << endl; + exit(0); + } + + for (list<uca_org_t>::iterator niter = ulist.begin(); niter != ulist.end(); niter++) + { + if (g_ip->ed == 1) + { + cost = ((niter)->access_time/minval->min_delay) * ((niter)->power.readOp.dynamic/minval->min_dyn); + if (min_cost > cost) + { + min_cost = cost; + *res = (*(niter)); + } + } + else if (g_ip->ed == 2) + { + cost = ((niter)->access_time/minval->min_delay)* + ((niter)->access_time/minval->min_delay)* + ((niter)->power.readOp.dynamic/minval->min_dyn); + if (min_cost > cost) + { + min_cost = cost; + *res = (*(niter)); + } + } + else + { + /* + * check whether the current organization + * meets the input deviation constraints + */ + bool v = check_uca_org(*niter, minval); + //if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling + + if (v) + { + cost = (d * ((niter)->access_time/minval->min_delay) + + c * ((niter)->cycle_time/minval->min_cyc) + + dp * ((niter)->power.readOp.dynamic/minval->min_dyn) + + lp * ((niter)->power.readOp.leakage/minval->min_leakage) + + a * ((niter)->area/minval->min_area)); + //fprintf(stderr, "cost = %g\n", cost); + + if (min_cost > cost) { + min_cost = cost; + *res = (*(niter)); + niter = ulist.erase(niter); + if (niter!=ulist.begin()) + niter--; + } + } + else { + niter = ulist.erase(niter); + if (niter!=ulist.begin()) + niter--; + } + } + } + + if 
(min_cost == BIGNUM) + { + cout << "ERROR: no cache organizations met optimization criteria" << endl; + exit(0); + } +} + + + +void filter_tag_arr(const min_values_t * min, list<mem_array *> & list) +{ + double cost = BIGNUM; + double cur_cost; + double wt_delay = g_ip->delay_wt, wt_dyn = g_ip->dynamic_power_wt, wt_leakage = g_ip->leakage_power_wt, wt_cyc = g_ip->cycle_time_wt, wt_area = g_ip->area_wt; + mem_array * res = NULL; + + if (list.empty() == true) + { + cout << "ERROR: no valid tag organizations found" << endl; + exit(1); + } + + + while (list.empty() != true) + { + bool v = check_mem_org(*list.back(), min); + if (v) + { + cur_cost = wt_delay * (list.back()->access_time/min->min_delay) + + wt_dyn * (list.back()->power.readOp.dynamic/min->min_dyn) + + wt_leakage * (list.back()->power.readOp.leakage/min->min_leakage) + + wt_area * (list.back()->area/min->min_area) + + wt_cyc * (list.back()->cycle_time/min->min_cyc); + } + else + { + cur_cost = BIGNUM; + } + if (cur_cost < cost) + { + if (res != NULL) + { + delete res; + } + cost = cur_cost; + res = list.back(); + } + else + { + delete list.back(); + } + list.pop_back(); + } + if(!res) + { + cout << "ERROR: no valid tag organizations found" << endl; + exit(0); + } + + list.push_back(res); +} + + + +void filter_data_arr(list<mem_array *> & curr_list) +{ + if (curr_list.empty() == true) + { + cout << "ERROR: no valid data array organizations found" << endl; + exit(1); + } + + list<mem_array *>::iterator iter; + + for (iter = curr_list.begin(); iter != curr_list.end(); ++iter) + { + mem_array * m = *iter; + + if (m == NULL) exit(1); + + if(((m->access_time - m->arr_min->min_delay)/m->arr_min->min_delay > 0.5) && + ((m->power.readOp.dynamic - m->arr_min->min_dyn)/m->arr_min->min_dyn > 0.5)) + { + delete m; + iter = curr_list.erase(iter); + iter --; + } + } +} + + + +/* + * Performs exhaustive search across different sub-array sizes, + * wire types and aspect ratios to find an optimal UCA organization + * 1. 
First different valid tag array organizations are calculated + * and stored in tag_arr array + * 2. The exhaustive search is repeated to find valid data array + * organizations and stored in data_arr array + * 3. Cache area, delay, power, and cycle time for different + * cache organizations are calculated based on the + * above results + * 4. Cache model with least cost is picked from sol_list + */ +void solve(uca_org_t *fin_res) +{ + bool is_dram = false; + int pure_ram = g_ip->pure_ram; + bool pure_cam = g_ip->pure_cam; + + init_tech_params(g_ip->F_sz_um, false); + + + list<mem_array *> tag_arr (0); + list<mem_array *> data_arr(0); + list<mem_array *>::iterator miter; + list<uca_org_t> sol_list(1, uca_org_t()); + + fin_res->tag_array.access_time = 0; + fin_res->tag_array.Ndwl = 0; + fin_res->tag_array.Ndbl = 0; + fin_res->tag_array.Nspd = 0; + fin_res->tag_array.deg_bl_muxing = 0; + fin_res->tag_array.Ndsam_lev_1 = 0; + fin_res->tag_array.Ndsam_lev_2 = 0; + + + // distribute calculate_time() execution to multiple threads + calc_time_mt_wrapper_struct * calc_array = new calc_time_mt_wrapper_struct[nthreads]; + pthread_t threads[nthreads]; + + for (uint32_t t = 0; t < nthreads; t++) + { + calc_array[t].tid = t; + calc_array[t].pure_ram = pure_ram; + calc_array[t].pure_cam = pure_cam; + calc_array[t].data_res = new min_values_t(); + calc_array[t].tag_res = new min_values_t(); + } + + bool is_tag; + uint32_t ram_cell_tech_type; + + // If it's a cache, first calculate the area, delay and power for all tag array partitions. 
+ if (!(pure_ram||pure_cam||g_ip->fully_assoc)) + { //cache + is_tag = true; + ram_cell_tech_type = g_ip->tag_arr_ram_cell_tech_type; + is_dram = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram)); + init_tech_params(g_ip->F_sz_um, is_tag); + + for (uint32_t t = 0; t < nthreads; t++) + { + calc_array[t].is_tag = is_tag; + calc_array[t].is_main_mem = false; + calc_array[t].Nspd_min = 0.125; + pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t]))); + } + + for (uint32_t t = 0; t < nthreads; t++) + { + pthread_join(threads[t], NULL); + } + + for (uint32_t t = 0; t < nthreads; t++) + { + calc_array[t].data_arr.sort(mem_array::lt); + data_arr.merge(calc_array[t].data_arr, mem_array::lt); + calc_array[t].tag_arr.sort(mem_array::lt); + tag_arr.merge(calc_array[t].tag_arr, mem_array::lt); + } + } + + + // calculate the area, delay and power for all data array partitions (for cache or plain RAM). +// if (!g_ip->fully_assoc) +// {//in the new cacti, cam, fully_associative cache are processed as single array in the data portion + is_tag = false; + ram_cell_tech_type = g_ip->data_arr_ram_cell_tech_type; + is_dram = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram)); + init_tech_params(g_ip->F_sz_um, is_tag); + + for (uint32_t t = 0; t < nthreads; t++) + { + calc_array[t].is_tag = is_tag; + calc_array[t].is_main_mem = g_ip->is_main_mem; + if (!(pure_cam||g_ip->fully_assoc)) + { + calc_array[t].Nspd_min = (double)(g_ip->out_w)/(double)(g_ip->block_sz*8); + } + else + { + calc_array[t].Nspd_min = 1; + } + + pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t]))); + } + + for (uint32_t t = 0; t < nthreads; t++) + { + pthread_join(threads[t], NULL); + } + + data_arr.clear(); + for (uint32_t t = 0; t < nthreads; t++) + { + calc_array[t].data_arr.sort(mem_array::lt); + data_arr.merge(calc_array[t].data_arr, mem_array::lt); + } +// } + + + min_values_t * d_min = new min_values_t(); + 
min_values_t * t_min = new min_values_t(); + min_values_t * cache_min = new min_values_t(); + + for (uint32_t t = 0; t < nthreads; t++) + { + d_min->update_min_values(calc_array[t].data_res); + t_min->update_min_values(calc_array[t].tag_res); + } + + for (miter = data_arr.begin(); miter != data_arr.end(); miter++) + { + (*miter)->arr_min = d_min; + } + + + //cout << data_arr.size() << "\t" << tag_arr.size() <<" before\n"; + filter_data_arr(data_arr); + if(!(pure_ram||pure_cam||g_ip->fully_assoc)) + { + filter_tag_arr(t_min, tag_arr); + } + //cout << data_arr.size() << "\t" << tag_arr.size() <<" after\n"; + + + if (pure_ram||pure_cam||g_ip->fully_assoc) + { + for (miter = data_arr.begin(); miter != data_arr.end(); miter++) + { + uca_org_t & curr_org = sol_list.back(); + curr_org.tag_array2 = NULL; + curr_org.data_array2 = (*miter); + + curr_org.find_delay(); + curr_org.find_energy(); + curr_org.find_area(); + curr_org.find_cyc(); + + //update min values for the entire cache + cache_min->update_min_values(curr_org); + + sol_list.push_back(uca_org_t()); + } + } + else + { + while (tag_arr.empty() != true) + { + mem_array * arr_temp = (tag_arr.back()); + //delete tag_arr.back(); + tag_arr.pop_back(); + + for (miter = data_arr.begin(); miter != data_arr.end(); miter++) + { + uca_org_t & curr_org = sol_list.back(); + curr_org.tag_array2 = arr_temp; + curr_org.data_array2 = (*miter); + + curr_org.find_delay(); + curr_org.find_energy(); + curr_org.find_area(); + curr_org.find_cyc(); + + //update min values for the entire cache + cache_min->update_min_values(curr_org); + + sol_list.push_back(uca_org_t()); + } + } + } + + sol_list.pop_back(); + + find_optimal_uca(fin_res, cache_min, sol_list); + + sol_list.clear(); + + for (miter = data_arr.begin(); miter != data_arr.end(); ++miter) + { + if (*miter != fin_res->data_array2) + { + delete *miter; + } + } + data_arr.clear(); + + for (uint32_t t = 0; t < nthreads; t++) + { + delete calc_array[t].data_res; + delete 
calc_array[t].tag_res; + } + + delete [] calc_array; + delete cache_min; + delete d_min; + delete t_min; +} + +void update(uca_org_t *fin_res) +{ + if(fin_res->tag_array2) + { + init_tech_params(g_ip->F_sz_um,true); + DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam, fin_res->tag_array2->Nspd, fin_res->tag_array2->Ndwl, fin_res->tag_array2->Ndbl, fin_res->tag_array2->Ndcm, fin_res->tag_array2->Ndsam_lev_1, fin_res->tag_array2->Ndsam_lev_2, g_ip->is_main_mem); + if(tag_arr_dyn_p.is_valid) + { + UCA * tag_arr = new UCA(tag_arr_dyn_p); + fin_res->tag_array2->power = tag_arr->power; + } + else + { + cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl; + exit(1); + } + } + init_tech_params(g_ip->F_sz_um,false); + DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam, fin_res->data_array2->Nspd, fin_res->data_array2->Ndwl, fin_res->data_array2->Ndbl, fin_res->data_array2->Ndcm, fin_res->data_array2->Ndsam_lev_1, fin_res->data_array2->Ndsam_lev_2, g_ip->is_main_mem); + if(data_arr_dyn_p.is_valid) + { + UCA * data_arr = new UCA(data_arr_dyn_p); + fin_res->data_array2->power = data_arr->power; + } + else + { + cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl; + exit(1); + } + + fin_res->find_energy(); +} + diff --git a/ext/mcpat/cacti/Ucache.h b/ext/mcpat/cacti/Ucache.h new file mode 100644 index 000000000..20985fff1 --- /dev/null +++ b/ext/mcpat/cacti/Ucache.h @@ -0,0 +1,115 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#ifndef __UCACHE_H__ +#define __UCACHE_H__ + +#include <list> + +#include "area.h" +#include "nuca.h" +#include "router.h" + +class min_values_t +{ + public: + double min_delay; + double min_dyn; + double min_leakage; + double min_area; + double min_cyc; + + min_values_t() : min_delay(BIGNUM), min_dyn(BIGNUM), min_leakage(BIGNUM), min_area(BIGNUM), min_cyc(BIGNUM) { } + + void 
update_min_values(const min_values_t * val); + void update_min_values(const uca_org_t & res); + void update_min_values(const nuca_org_t * res); + void update_min_values(const mem_array * res); +}; + + + +struct solution +{ + int tag_array_index; + int data_array_index; + list<mem_array *>::iterator tag_array_iter; + list<mem_array *>::iterator data_array_iter; + double access_time; + double cycle_time; + double area; + double efficiency; + powerDef total_power; +}; + + + +bool calculate_time( + bool is_tag, + int pure_ram, + bool pure_cam, + double Nspd, + unsigned int Ndwl, + unsigned int Ndbl, + unsigned int Ndcm, + unsigned int Ndsam_lev_1, + unsigned int Ndsam_lev_2, + mem_array *ptr_array, + int flag_results_populate, + results_mem_array *ptr_results, + uca_org_t *ptr_fin_res, + bool is_main_mem); +void update(uca_org_t *fin_res); + +void solve(uca_org_t *fin_res); +void init_tech_params(double tech, bool is_tag); + + +struct calc_time_mt_wrapper_struct +{ + uint32_t tid; + bool is_tag; + bool pure_ram; + bool pure_cam; + bool is_main_mem; + double Nspd_min; + + min_values_t * data_res; + min_values_t * tag_res; + + list<mem_array *> data_arr; + list<mem_array *> tag_arr; +}; + +void *calc_time_mt_wrapper(void * void_obj); + +#endif diff --git a/ext/mcpat/cacti/arbiter.cc b/ext/mcpat/cacti/arbiter.cc new file mode 100644 index 000000000..6664abf13 --- /dev/null +++ b/ext/mcpat/cacti/arbiter.cc @@ -0,0 +1,130 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include "arbiter.h" + +Arbiter::Arbiter( + double n_req, + double flit_size_, + double output_len, + TechnologyParameter::DeviceType *dt + ):R(n_req), flit_size(flit_size_), + o_len (output_len), deviceType(dt) +{ + min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_; + Vdd = dt->Vdd; + double technology = g_ip->F_sz_um; + NTn1 = 13.5*technology/2; + PTn1 = 
76*technology/2; + NTn2 = 13.5*technology/2; + PTn2 = 76*technology/2; + NTi = 12.5*technology/2; + PTi = 25*technology/2; + NTtr = 10*technology/2; /*Transmission gate's nmos tr. length*/ + PTtr = 20*technology/2; /* pmos tr. length*/ +} + +Arbiter::~Arbiter(){} + +double +Arbiter::arb_req() { + double temp = ((R-1)*(2*gate_C(NTn1, 0)+gate_C(PTn1, 0)) + 2*gate_C(NTn2, 0) + + gate_C(PTn2, 0) + gate_C(NTi, 0) + gate_C(PTi, 0) + + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def)); + return temp; +} + +double +Arbiter::arb_pri() { + double temp = 2*(2*gate_C(NTn1, 0)+gate_C(PTn1, 0)); /* switching capacitance + of flip-flop is ignored */ + return temp; +} + + +double +Arbiter::arb_grant() { + double temp = drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2 + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def) + crossbar_ctrline(); + return temp; +} + +double +Arbiter::arb_int() { + double temp = (drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2 + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def) + + 2*gate_C(NTn2, 0) + gate_C(PTn2, 0)); + return temp; +} + +void +Arbiter::compute_power() { + power.readOp.dynamic = (R*arb_req()*Vdd*Vdd/2 + R*arb_pri()*Vdd*Vdd/2 + + arb_grant()*Vdd*Vdd + arb_int()*0.5*Vdd*Vdd); + double nor1_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor); + double nor2_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor); + double not_leak = cmos_Isub_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv); + double nor1_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor); + double nor2_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor); + double not_leak_gate = cmos_Ig_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv); + power.readOp.leakage = (nor1_leak + nor2_leak + not_leak)*Vdd; //FIXME include priority table leakage + power.readOp.gate_leakage = nor1_leak_gate*Vdd + nor2_leak_gate*Vdd + not_leak_gate*Vdd; +} + +double //wire 
cap with triple spacing +Arbiter::Cw3(double length) { + Wire wc(g_ip->wt, length, 1, 3, 3); + double temp = (wc.wire_cap(length,true)); + return temp; +} + +double +Arbiter::crossbar_ctrline() { + double temp = (Cw3(o_len * 1e-6 /* m */) + + drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def) + + gate_C(NTi, 0) + gate_C(PTi, 0)); + return temp; +} + +double +Arbiter::transmission_buf_ctrcap() { + double temp = gate_C(NTtr, 0)+gate_C(PTtr, 0); + return temp; +} + + +void Arbiter::print_arbiter() +{ + cout << "\nArbiter Stats (" << R << " input arbiter" << ")\n\n"; + cout << "Flit size : " << flit_size << " bits" << endl; + cout << "Dynamic Power : " << power.readOp.dynamic*1e9 << " (nJ)" << endl; + cout << "Leakage Power : " << power.readOp.leakage*1e3 << " (mW)" << endl; +} + + diff --git a/ext/mcpat/cacti/arbiter.h b/ext/mcpat/cacti/arbiter.h new file mode 100644 index 000000000..32ada92c2 --- /dev/null +++ b/ext/mcpat/cacti/arbiter.h @@ -0,0 +1,79 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef __ARBITER__ +#define __ARBITER__ + +#include <assert.h> + +#include <iostream> + +#include "basic_circuit.h" +#include "cacti_interface.h" +#include "component.h" +#include "mat.h" +#include "parameter.h" +#include "wire.h" + +class Arbiter : public Component +{ + public: + Arbiter( + double Req, + double flit_sz, + double output_len, + TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)); + ~Arbiter(); + + void print_arbiter(); + double arb_req(); + double arb_pri(); + double arb_grant(); + double arb_int(); + void compute_power(); + double Cw3(double len); + double crossbar_ctrline(); + double transmission_buf_ctrcap(); + + + + private: + double NTn1, PTn1, NTn2, PTn2, R, PTi, NTi; + double flit_size; + double NTtr, PTtr; + double o_len; + TechnologyParameter::DeviceType *deviceType; + double TriS1, TriS2; + double min_w_pmos, Vdd; + +}; + +#endif diff --git a/ext/mcpat/cacti/area.cc b/ext/mcpat/cacti/area.cc new file mode 100644 index 000000000..14ea4a9d7 --- /dev/null +++ b/ext/mcpat/cacti/area.cc @@ -0,0 +1,47 @@ 
+/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <cassert> +#include <cmath> +#include <iostream> + +#include "area.h" +#include "basic_circuit.h" +#include "component.h" +#include "decoder.h" +#include "parameter.h" + +using namespace std; + + + diff --git a/ext/mcpat/cacti/area.h b/ext/mcpat/cacti/area.h new file mode 100644 index 000000000..7705e6250 --- /dev/null +++ b/ext/mcpat/cacti/area.h @@ -0,0 +1,71 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
// Rectangular footprint of a component.
//
// The footprint is normally described by its width `w` and height `h`.
// A component whose shape is unknown can instead record a bare area value
// through set_area(); that stored value is reported only while both `w`
// and `h` are still exactly zero.
class Area
{
  public:
    double w;   // width
    double h;   // height

    Area() : w(0), h(0), area(0) { }

    // --- accessors ---
    double get_w() const { return w; }
    double get_h() const { return h; }

    // Returns w*h as soon as either dimension is non-zero; otherwise
    // falls back to the value last passed to set_area().
    double get_area() const
    {
        const bool has_dimensions = (w != 0 || h != 0);
        return has_dimensions ? w * h : area;
    }

    // --- mutators ---
    void set_w(double w_) { w = w_; }
    void set_h(double h_) { h = h_; }
    void set_area(double a_) { area = a_; }

  private:
    double area;   // explicit area, used only while w == 0 and h == 0
};
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <iostream> + +#include "bank.h" + +Bank::Bank(const DynamicParameter & dyn_p): + dp(dyn_p), mat(dp), + num_addr_b_mat(dyn_p.number_addr_bits_mat), + num_mats_hor_dir(dyn_p.num_mats_h_dir), num_mats_ver_dir(dyn_p.num_mats_v_dir) +{ + int RWP; + int ERP; + int EWP; + int SCHP; + + if (dp.use_inp_params) + { + RWP = dp.num_rw_ports; + ERP = dp.num_rd_ports; + EWP = 
dp.num_wr_ports; + SCHP = dp.num_search_ports; + } + else + { + RWP = g_ip->num_rw_ports; + ERP = g_ip->num_rd_ports; + EWP = g_ip->num_wr_ports; + SCHP = g_ip->num_search_ports; + } + + int total_addrbits = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP); + int datainbits = dp.num_di_b_bank_per_port * (RWP + EWP); + int dataoutbits = dp.num_do_b_bank_per_port * (RWP + ERP); + int searchinbits; + int searchoutbits; + + if (dp.fully_assoc || dp.pure_cam) + { + datainbits = dp.num_di_b_bank_per_port * (RWP + EWP); + dataoutbits = dp.num_do_b_bank_per_port * (RWP + ERP); + searchinbits = dp.num_si_b_bank_per_port * SCHP; + searchoutbits = dp.num_so_b_bank_per_port * SCHP; + } + + if (!(dp.fully_assoc || dp.pure_cam)) + { + if (g_ip->fast_access && dp.is_tag == false) + { + dataoutbits *= g_ip->data_assoc; + } + + htree_in_add = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree); + htree_in_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree); + htree_out_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree); + +// htree_out_data = new Htree2 (g_ip->wt,(double) 100, (double)100, +// total_addrbits, datainbits, 0,dataoutbits,0, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree); + + area.w = htree_in_data->area.w; + area.h = htree_in_data->area.h; + } + else + { + htree_in_add = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits, searchinbits,dataoutbits,searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree); + htree_in_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, 
num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree); + htree_out_data = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree); + htree_in_search = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits, num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree,true, true); + htree_out_search = new Htree2 (g_ip->wt,(double) mat.area.w, (double)mat.area.h, + total_addrbits, datainbits,searchinbits, dataoutbits, searchoutbits,num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree,true); + + area.w = htree_in_data->area.w; + area.h = htree_in_data->area.h; + } + + num_addr_b_row_dec = _log2(mat.subarray.num_rows); + num_addr_b_routed_to_mat_for_act = num_addr_b_row_dec; + num_addr_b_routed_to_mat_for_rd_or_wr = num_addr_b_mat - num_addr_b_row_dec; +} + + + +Bank::~Bank() +{ + delete htree_in_add; + delete htree_out_data; + delete htree_in_data; + if (dp.fully_assoc || dp.pure_cam) + { + delete htree_in_search; + delete htree_out_search; + } +} + + + +double Bank::compute_delays(double inrisetime) +{ + return mat.compute_delays(inrisetime); +} + + + +void Bank::compute_power_energy() +{ + mat.compute_power_energy(); + + if (!(dp.fully_assoc || dp.pure_cam)) + { + power.readOp.dynamic += mat.power.readOp.dynamic * dp.num_act_mats_hor_dir; + power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats; + power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats; + + power.readOp.dynamic += htree_in_add->power.readOp.dynamic; + power.readOp.dynamic += htree_out_data->power.readOp.dynamic; + + power.readOp.leakage += htree_in_add->power.readOp.leakage; + power.readOp.leakage += htree_in_data->power.readOp.leakage; + power.readOp.leakage += htree_out_data->power.readOp.leakage; + power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage; + 
power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage; + power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage; + } + else + { + + power.readOp.dynamic += mat.power.readOp.dynamic ;//for fa and cam num_act_mats_hor_dir is 1 for plain r/w + power.readOp.leakage += mat.power.readOp.leakage * dp.num_mats; + power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats; + + power.searchOp.dynamic += mat.power.searchOp.dynamic * dp.num_mats; + power.searchOp.dynamic += mat.power_bl_precharge_eq_drv.searchOp.dynamic + + mat.power_sa.searchOp.dynamic + + mat.power_bitline.searchOp.dynamic + + mat.power_subarray_out_drv.searchOp.dynamic+ + mat.ml_to_ram_wl_drv->power.readOp.dynamic; + + power.readOp.dynamic += htree_in_add->power.readOp.dynamic; + power.readOp.dynamic += htree_out_data->power.readOp.dynamic; + + power.searchOp.dynamic += htree_in_search->power.searchOp.dynamic; + power.searchOp.dynamic += htree_out_search->power.searchOp.dynamic; + + power.readOp.leakage += htree_in_add->power.readOp.leakage; + power.readOp.leakage += htree_in_data->power.readOp.leakage; + power.readOp.leakage += htree_out_data->power.readOp.leakage; + power.readOp.leakage += htree_in_search->power.readOp.leakage; + power.readOp.leakage += htree_out_search->power.readOp.leakage; + + + power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage; + power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage; + power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage; + power.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage; + power.readOp.gate_leakage += htree_out_search->power.readOp.gate_leakage; + + } + +} + diff --git a/ext/mcpat/cacti/bank.h b/ext/mcpat/cacti/bank.h new file mode 100755 index 000000000..153609ab0 --- /dev/null +++ b/ext/mcpat/cacti/bank.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE 
AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
// One memory bank: a Mat plus the H-tree networks that route address,
// data and (for CAM-style arrays) search signals to and from it.
class Bank : public Component
{
  public:
    // Builds the mat and H-trees from `dyn_p`.  Only a reference to
    // `dyn_p` is kept, so it must outlive the Bank.
    Bank(const DynamicParameter & dyn_p);
    ~Bank();
    // Forwards to mat.compute_delays().
    double compute_delays(double inrisetime);  // return outrisetime
    // Accumulates mat and H-tree read/search energy and leakage into
    // this component's power.
    void compute_power_energy();

    // NOTE(review): the class holds raw owning pointers but declares no
    // copy constructor or assignment operator; copying a Bank looks
    // unsafe (double delete) -- confirm it is never copied.
    const DynamicParameter & dp;
    Mat mat;
    Htree2 *htree_in_add;      // address-in H-tree
    Htree2 *htree_in_data;     // data-in H-tree; the bank's area is taken from it
    Htree2 *htree_out_data;    // data-out H-tree
    // The two search H-trees are created by the constructor only when
    // dp.fully_assoc || dp.pure_cam.
    Htree2 *htree_in_search;
    Htree2 *htree_out_search;

    int num_addr_b_mat;        // copied from dyn_p.number_addr_bits_mat
    int num_mats_hor_dir;      // copied from dyn_p.num_mats_h_dir
    int num_mats_ver_dir;      // copied from dyn_p.num_mats_v_dir

    int num_addr_b_row_dec;                     // _log2(mat.subarray.num_rows)
    int num_addr_b_routed_to_mat_for_act;       // equals num_addr_b_row_dec
    int num_addr_b_routed_to_mat_for_rd_or_wr;  // num_addr_b_mat - num_addr_b_row_dec
};
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + + +#include <cassert> +#include <cmath> +#include <iostream> + +#include "basic_circuit.h" +#include "parameter.h" + +uint32_t _log2(uint64_t num) +{ + uint32_t log2 = 0; + + if (num == 0) + { + std::cerr << "log0?" 
<< std::endl; + exit(1); + } + + while (num > 1) + { + num = (num >> 1); + log2++; + } + + return log2; +} + + +bool is_pow2(int64_t val) +{ + if (val <= 0) + { + return false; + } + else if (val == 1) + { + return true; + } + else + { + return (_log2(val) != _log2(val-1)); + } +} + + +int powers (int base, int n) +{ + int i, p; + + p = 1; + for (i = 1; i <= n; ++i) + p *= base; + return p; +} + +/*----------------------------------------------------------------------*/ + +double logtwo (double x) +{ + assert(x > 0); + return ((double) (log (x) / log (2.0))); +} + +/*----------------------------------------------------------------------*/ + + +double gate_C( + double width, + double wirelength, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + const TechnologyParameter::DeviceType * dt; + + if (_is_dram && _is_cell) + { + dt = &g_tp.dram_acc; //DRAM cell access transistor + } + else if (_is_dram && _is_wl_tr) + { + dt = &g_tp.dram_wl; //DRAM wordline transistor + } + else if (!_is_dram && _is_cell) + { + dt = &g_tp.sram_cell; // SRAM cell access transistor + } + else + { + dt = &g_tp.peri_global; + } + + return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire; +} + + +// returns gate capacitance in Farads +// actually this function is the same as gate_C() now +double gate_C_pass( + double width, // gate width in um (length is Lphy_periph_global) + double wirelength, // poly wire length going to gate in lambda + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + // v5.0 + const TechnologyParameter::DeviceType * dt; + + if ((_is_dram) && (_is_cell)) + { + dt = &g_tp.dram_acc; //DRAM cell access transistor + } + else if ((_is_dram) && (_is_wl_tr)) + { + dt = &g_tp.dram_wl; //DRAM wordline transistor + } + else if ((!_is_dram) && _is_cell) + { + dt = &g_tp.sram_cell; // SRAM cell access transistor + } + else + { + dt = &g_tp.peri_global; + } + + return (dt->C_g_ideal + dt->C_overlap + 3*dt->C_fringe)*width + dt->l_phy*Cpolywire; 
+} + + + +double drain_C_( + double width, + int nchannel, + int stack, + int next_arg_thresh_folding_width_or_height_cell, + double fold_dimension, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + double w_folded_tr; + const TechnologyParameter::DeviceType * dt; + + if ((_is_dram) && (_is_cell)) + { + dt = &g_tp.dram_acc; // DRAM cell access transistor + } + else if ((_is_dram) && (_is_wl_tr)) + { + dt = &g_tp.dram_wl; // DRAM wordline transistor + } + else if ((!_is_dram) && _is_cell) + { + dt = &g_tp.sram_cell; // SRAM cell access transistor + } + else + { + dt = &g_tp.peri_global; + } + + double c_junc_area = dt->C_junc; + double c_junc_sidewall = dt->C_junc_sidewall; + double c_fringe = 2*dt->C_fringe; + double c_overlap = 2*dt->C_overlap; + double drain_C_metal_connecting_folded_tr = 0; + + // determine the width of the transistor after folding (if it is getting folded) + if (next_arg_thresh_folding_width_or_height_cell == 0) + { // interpret fold_dimension as the the folding width threshold + // i.e. the value of transistor width above which the transistor gets folded + w_folded_tr = fold_dimension; + } + else + { // interpret fold_dimension as the height of the cell that this transistor is part of. 
+ double h_tr_region = fold_dimension - 2 * g_tp.HPOWERRAIL; + // TODO : w_folded_tr must come from Component::compute_gate_area() + double ratio_p_to_n = 2.0 / (2.0 + 1.0); + if (nchannel) + { + w_folded_tr = (1 - ratio_p_to_n) * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS); + } + else + { + w_folded_tr = ratio_p_to_n * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS); + } + } + int num_folded_tr = (int) (ceil(width / w_folded_tr)); + + if (num_folded_tr < 2) + { + w_folded_tr = width; + } + + double total_drain_w = (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) + // only for drain + (stack - 1) * g_tp.spacing_poly_to_poly; + double drain_h_for_sidewall = w_folded_tr; + double total_drain_height_for_cap_wrt_gate = w_folded_tr + 2 * w_folded_tr * (stack - 1); + if (num_folded_tr > 1) + { + total_drain_w += (num_folded_tr - 2) * (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) + + (num_folded_tr - 1) * ((stack - 1) * g_tp.spacing_poly_to_poly); + + if (num_folded_tr%2 == 0) + { + drain_h_for_sidewall = 0; + } + total_drain_height_for_cap_wrt_gate *= num_folded_tr; + drain_C_metal_connecting_folded_tr = g_tp.wire_local.C_per_um * total_drain_w; + } + + double drain_C_area = c_junc_area * total_drain_w * w_folded_tr; + double drain_C_sidewall = c_junc_sidewall * (drain_h_for_sidewall + 2 * total_drain_w); + double drain_C_wrt_gate = (c_fringe + c_overlap) * total_drain_height_for_cap_wrt_gate; + + return (drain_C_area + drain_C_sidewall + drain_C_wrt_gate + drain_C_metal_connecting_folded_tr); +} + + +double tr_R_on( + double width, + int nchannel, + int stack, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + const TechnologyParameter::DeviceType * dt; + + if ((_is_dram) && (_is_cell)) + { + dt = &g_tp.dram_acc; //DRAM cell access transistor + } + else if ((_is_dram) && (_is_wl_tr)) + { + dt = &g_tp.dram_wl; //DRAM wordline transistor + } + else if ((!_is_dram) && _is_cell) + { + dt = &g_tp.sram_cell; // SRAM cell access transistor + } + else 
+ { + dt = &g_tp.peri_global; + } + + double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on; + return (stack * restrans / width); +} + + +/* This routine operates in reverse: given a resistance, it finds + * the transistor width that would have this R. It is used in the + * data wordline to estimate the wordline driver size. */ + +// returns width in um +double R_to_w( + double res, + int nchannel, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + const TechnologyParameter::DeviceType * dt; + + if ((_is_dram) && (_is_cell)) + { + dt = &g_tp.dram_acc; //DRAM cell access transistor + } + else if ((_is_dram) && (_is_wl_tr)) + { + dt = &g_tp.dram_wl; //DRAM wordline transistor + } + else if ((!_is_dram) && (_is_cell)) + { + dt = &g_tp.sram_cell; // SRAM cell access transistor + } + else + { + dt = &g_tp.peri_global; + } + + double restrans = (nchannel) ? dt->R_nch_on : dt->R_pch_on; + return (restrans / res); +} + + +double pmos_to_nmos_sz_ratio( + bool _is_dram, + bool _is_wl_tr) +{ + double p_to_n_sizing_ratio; + if ((_is_dram) && (_is_wl_tr)) + { //DRAM wordline transistor + p_to_n_sizing_ratio = g_tp.dram_wl.n_to_p_eff_curr_drv_ratio; + } + else + { //DRAM or SRAM all other transistors + p_to_n_sizing_ratio = g_tp.peri_global.n_to_p_eff_curr_drv_ratio; + } + return p_to_n_sizing_ratio; +} + + +// "Timing Models for MOS Circuits" by Mark Horowitz, 1984 +double horowitz( + double inputramptime, // input rise time + double tf, // time constant of gate + double vs1, // threshold voltage + double vs2, // threshold voltage + int rise) // whether input rises or fall +{ + if (inputramptime == 0 && vs1 == vs2) + { + return tf * (vs1 < 1 ? 
-log(vs1) : log(vs1)); + } + double a, b, td; + + a = inputramptime / tf; + if (rise == RISE) + { + b = 0.5; + td = tf * sqrt(log(vs1)*log(vs1) + 2*a*b*(1.0 - vs1)) + tf*(log(vs1) - log(vs2)); + } + else + { + b = 0.4; + td = tf * sqrt(log(1.0 - vs1)*log(1.0 - vs1) + 2*a*b*(vs1)) + tf*(log(1.0 - vs1) - log(1.0 - vs2)); + } + return (td); +} + +double cmos_Ileak( + double nWidth, + double pWidth, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + TechnologyParameter::DeviceType * dt; + + if ((!_is_dram)&&(_is_cell)) + { //SRAM cell access transistor + dt = &(g_tp.sram_cell); + } + else if ((_is_dram)&&(_is_wl_tr)) + { //DRAM wordline transistor + dt = &(g_tp.dram_wl); + } + else + { //DRAM or SRAM all other transistors + dt = &(g_tp.peri_global); + } + return nWidth*dt->I_off_n + pWidth*dt->I_off_p; +} + + +double simplified_nmos_leakage( + double nwidth, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + TechnologyParameter::DeviceType * dt; + + if ((!_is_dram)&&(_is_cell)) + { //SRAM cell access transistor + dt = &(g_tp.sram_cell); + } + else if ((_is_dram)&&(_is_wl_tr)) + { //DRAM wordline transistor + dt = &(g_tp.dram_wl); + } + else + { //DRAM or SRAM all other transistors + dt = &(g_tp.peri_global); + } + return nwidth * dt->I_off_n; +} + +int factorial(int n, int m) +{ + int fa = m, i; + for (i=m+1; i<=n; i++) + fa *=i; + return fa; +} + +int combination(int n, int m) +{ + int ret; + ret = factorial(n, m+1) / factorial(n - m); + return ret; +} + +double simplified_pmos_leakage( + double pwidth, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + TechnologyParameter::DeviceType * dt; + + if ((!_is_dram)&&(_is_cell)) + { //SRAM cell access transistor + dt = &(g_tp.sram_cell); + } + else if ((_is_dram)&&(_is_wl_tr)) + { //DRAM wordline transistor + dt = &(g_tp.dram_wl); + } + else + { //DRAM or SRAM all other transistors + dt = &(g_tp.peri_global); + } + return pwidth * dt->I_off_p; +} + +double cmos_Ig_n( + double nWidth, + bool _is_dram, + 
bool _is_cell, + bool _is_wl_tr) +{ + TechnologyParameter::DeviceType * dt; + + if ((!_is_dram)&&(_is_cell)) + { //SRAM cell access transistor + dt = &(g_tp.sram_cell); + } + else if ((_is_dram)&&(_is_wl_tr)) + { //DRAM wordline transistor + dt = &(g_tp.dram_wl); + } + else + { //DRAM or SRAM all other transistors + dt = &(g_tp.peri_global); + } + return nWidth*dt->I_g_on_n; +} + +double cmos_Ig_p( + double pWidth, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr) +{ + TechnologyParameter::DeviceType * dt; + + if ((!_is_dram)&&(_is_cell)) + { //SRAM cell access transistor + dt = &(g_tp.sram_cell); + } + else if ((_is_dram)&&(_is_wl_tr)) + { //DRAM wordline transistor + dt = &(g_tp.dram_wl); + } + else + { //DRAM or SRAM all other transistors + dt = &(g_tp.peri_global); + } + return pWidth*dt->I_g_on_p; +} + +double cmos_Isub_leakage( + double nWidth, + double pWidth, + int fanin, + enum Gate_type g_type, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr, + enum Half_net_topology topo) +{ + assert (fanin>=1); + double nmos_leak = simplified_nmos_leakage(nWidth, _is_dram, _is_cell, _is_wl_tr); + double pmos_leak = simplified_pmos_leakage(pWidth, _is_dram, _is_cell, _is_wl_tr); + double Isub=0; + int num_states; + int num_off_tx; + + num_states = int(pow(2.0, fanin)); + + switch (g_type) + { + case nmos: + if (fanin==1) + { + Isub = nmos_leak/num_states; + } + else + { + if (topo==parallel) + { + Isub=nmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. 
The possibility of this state is 1/num_states + } + else + { + for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power + { + //Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx))); + Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx); + } + Isub /=num_states; + } + + } + break; + case pmos: + if (fanin==1) + { + Isub = pmos_leak/num_states; + } + else + { + if (topo==parallel) + { + Isub=pmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states + } + else + { + for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power + { + //Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx))); + Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx); + } + Isub /=num_states; + } + + } + break; + case inv: + Isub = (nmos_leak + pmos_leak)/2; + break; + case nand: + Isub += fanin*pmos_leak;//the pullup network + for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pulldown network + { + //Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx))); + Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx); + } + Isub /=num_states; + break; + case nor: + for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pullup network + { + //Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx))); + Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx); + } + Isub += fanin*nmos_leak;//the pulldown network + Isub /=num_states; + break; + case tri: + Isub += (nmos_leak + 
pmos_leak)/2;//enabled + Isub += nmos_leak*UNI_LEAK_STACK_FACTOR; //disabled upper bound of leakage power + Isub /=2; + break; + case tg: + Isub = (nmos_leak + pmos_leak)/2; + break; + default: + assert(0); + break; + } + + return Isub; +} + + +double cmos_Ig_leakage( + double nWidth, + double pWidth, + int fanin, + enum Gate_type g_type, + bool _is_dram, + bool _is_cell, + bool _is_wl_tr, + enum Half_net_topology topo) +{ + assert (fanin>=1); + double nmos_leak = cmos_Ig_n(nWidth, _is_dram, _is_cell, _is_wl_tr); + double pmos_leak = cmos_Ig_p(pWidth, _is_dram, _is_cell, _is_wl_tr); + double Ig_on=0; + int num_states; + int num_on_tx; + + num_states = int(pow(2.0, fanin)); + + switch (g_type) + { + case nmos: + if (fanin==1) + { + Ig_on = nmos_leak/num_states; + } + else + { + if (topo==parallel) + { + for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++) + { + Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx; + } + } + else + { + Ig_on += nmos_leak * fanin;//pull down network when all TXs are on. + //num_on_tx is the number of on tx + for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1] + { + Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated. + } + Ig_on /=num_states; + } + } + break; + case pmos: + if (fanin==1) + { + Ig_on = pmos_leak/num_states; + } + else + { + if (topo==parallel) + { + for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++) + { + Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx; + } + } + else + { + Ig_on += pmos_leak * fanin;//pull down network when all TXs are on. + //num_on_tx is the number of on tx + for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1] + { + Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated. 
+                }
+                Ig_on /=num_states;
+            }
+        }
+        break;
+
+    case inv:
+        Ig_on = (nmos_leak + pmos_leak)/2;
+        break;
+    case nand:
+        //pull up network
+        for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
+        {
+            Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+        }
+
+        //pull down network
+        Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
+        //num_on_tx is the number of on tx
+        for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+        {
+            Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+        }
+        Ig_on /=num_states;
+        break;
+    case nor:
+        // num_on_tx is the number of on tx in pull up network
+        Ig_on += pmos_leak * fanin;//pull up network when all TXs are on.
+        for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)
+        {
+            Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
+
+        }
+        //pull down network
+        for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
+        {
+            Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+        }
+        Ig_on /=num_states;
+        break;
+    case tri:
+        Ig_on += (2*nmos_leak + 2*pmos_leak)/2;//enabled
+        Ig_on += (nmos_leak + pmos_leak)/2; //disabled upper bound of leakage power
+        Ig_on /=2;
+        break;
+    case tg:
+        Ig_on = (nmos_leak + pmos_leak)/2;
+        break;
+    default:
+        assert(0);
+        break;
+    }
+
+    return Ig_on;
+}
+
+/*
+ * shortcircuit_simple: closed-form estimate of the short-circuit cost of one
+ * gate. The locals keep their historical p_ prefix, but per the original
+ * author the quantity "is actually energy".
+ *
+ *   vt, vdd              voltages; they enter only as (vdd - vt) and vt/vdd
+ *   velocity_index       exponent parameter of the fit (enters squared)
+ *   c_in, c_out          capacitances; fanout is c_out/c_in
+ *   w_nmos, w_pmos       accepted for interface compatibility, unused here
+ *   i_on_n, i_on_p       on-currents of the gate being evaluated
+ *   i_on_n_in, i_on_p_in on-currents the above are normalised by
+ *                        (presumably the driving stage -- TODO confirm)
+ *
+ * Returns the mean of the discharge and charge estimates. No argument is
+ * validated: zero c_in, vdd, i_on_n, i_on_n_in, i_on_p_in or velocity_index
+ * lead to a division by zero in floating point.
+ */
+double shortcircuit_simple(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd)
+{
+    const double fo_n = i_on_n/i_on_n_in;
+    const double fo_p = i_on_p/i_on_p_in;
+    const double fanout = c_out/c_in;
+    const double beta_ratio = i_on_p/i_on_n;
+    const double vt_to_vdd_ratio = vt/vdd;
+
+    // Factor shared by the charge and discharge expressions.
+    // NOTE(review): an earlier variant of this formula used
+    // (0.5 - vt_to_vdd_ratio) as the cubed term; the current one subtracts a
+    // dimensionless ratio from a voltage. Kept as found -- confirm intent.
+    const double shape = pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio);
+
+    // The coefficient is 10/3. It must be written with floating-point
+    // operands: the previous integer expression 10/3 evaluated to 3 and
+    // under-estimated both terms by 10%.
+    const double coeff = 10.0/3.0;
+
+    const double p_short_circuit_discharge_low = coeff*shape*c_in*vdd*vdd*fo_p*fo_p/fanout/beta_ratio;
+    const double p_short_circuit_charge_low = coeff*shape*c_in*vdd*vdd*fo_n*fo_n/fanout*beta_ratio;
+
+    // Only the low-fanout expressions are used. The high-fanout expressions
+    //   pow((vdd-vt)-vt_to_vdd_ratio, 1.5)*c_in*vdd*vdd*fo_{p,n}/10
+    //       /pow(2, 3*vt_to_vdd_ratio+2*velocity_index)
+    // and their harmonic-mean combination with the low-fanout terms are
+    // intentionally disabled (original note: the harmonic mean cannot be
+    // applied to the simple formulas).
+    return (p_short_circuit_discharge_low + p_short_circuit_charge_low)/2;
+}
+
+/*
+ * shortcircuit: more detailed short-circuit model with the same parameter
+ * list as shortcircuit_simple().
+ *
+ * NOTE(review): as written this function ALWAYS returns 0. It computes
+ * p_short_circuit_discharge but returns p_short_circuit, which is initialised
+ * to 0 and never assigned. The behaviour is left untouched on purpose:
+ * g_v_alpha and h_v_alpha raise (1 - velocity_index) and
+ * (1 - 2*velocity_index) to non-integer powers, which is NaN whenever
+ * velocity_index > 1, so returning the computed value would need the
+ * formula reviewed first (those bases look like they were meant to use
+ * vt_to_vdd_ratio -- confirm against the model's reference).
+ * c_out, w_nmos, w_pmos are unused; fanout is fixed to 1.
+ */
+double shortcircuit(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd)
+{
+
+    double p_short_circuit=0, p_short_circuit_discharge; //this is actually energy
+    double fo_n, fo_p, fanout, beta_ratio, vt_to_vdd_ratio;
+    double f_alpha, k_v, e, g_v_alpha, h_v_alpha;
+
+    fo_n = i_on_n/i_on_n_in;
+    fo_p = i_on_p/i_on_p_in;
+    fanout = 1;
+    beta_ratio = i_on_p/i_on_n;
+    vt_to_vdd_ratio = vt/vdd;
+    e = 2.71828;
+    f_alpha = 1/(velocity_index+2) -velocity_index/(2*(velocity_index+3)) +velocity_index/(velocity_index+4)*(velocity_index/2-1);
+    k_v = 0.9/0.8+(vdd-vt)/0.8*log(10*(vdd-vt)/e);
+    g_v_alpha = (velocity_index + 1)*pow((1-velocity_index),velocity_index)*pow((1-velocity_index),velocity_index/2)/f_alpha/pow((1-velocity_index-velocity_index),(velocity_index/2+velocity_index+2));
+    h_v_alpha = pow(2, velocity_index)*(velocity_index+1)*pow((1-velocity_index),velocity_index)/pow((1-velocity_index-velocity_index),(velocity_index+1));
+
+    // Computed but not returned -- see the NOTE(review) above.
+    p_short_circuit_discharge = k_v*vdd*vdd*c_in*fo_p*fo_p/((vdd-vt)*g_v_alpha*fanout*beta_ratio/2/k_v + h_v_alpha*fo_p);
+    return (p_short_circuit);
+}
diff --git a/ext/mcpat/cacti/basic_circuit.h b/ext/mcpat/cacti/basic_circuit.h
new file mode 100644
index 000000000..aaab6c0ea
--- /dev/null
+++ b/ext/mcpat/cacti/basic_circuit.h
@@ -0,0 +1,248 @@
+/*****************************************************************************
+ * McPAT/CACTI
+ * SOFTWARE LICENSE AGREEMENT
+ * Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __BASIC_CIRCUIT_H__
+#define __BASIC_CIRCUIT_H__
+
+#include "cacti_interface.h"
+#include "const.h"
+
+using namespace std;
+
+#define UNI_LEAK_STACK_FACTOR 0.43
+
+// Small integer helpers; their definitions are not part of this header.
+int powers (int base, int n);
+bool is_pow2(int64_t val);
+uint32_t _log2(uint64_t num);
+int factorial(int n, int m = 1);
+int combination(int n, int m);
+
+// PRINTDW(a): with DBG defined the macro expands to "; a;" (the argument is
+// emitted as a statement); otherwise it expands to a lone ";" and the
+// argument is discarded. Note the ';' directly after the parameter list is
+// part of the replacement text, and the trailing backslash continues the
+// definition onto the following line.
+//#define DBG
+#ifdef DBG
+    #define PRINTDW(a);\
+    a;
+#else
+    #define PRINTDW(a);\
+
+#endif
+
+
+enum Wire_placement {
+    outside_mat,
+    inside_mat,
+    local_wires
+};
+
+
+
+enum Htree_type {
+    Add_htree,
+    Data_in_htree,
+    Data_out_htree,
+    Search_in_htree,
+    Search_out_htree,
+};
+
+// Gate kinds accepted by cmos_Isub_leakage() / cmos_Ig_leakage().
+enum Gate_type {
+    nmos,
+    pmos,
+    inv,
+    nand,
+    nor,
+    tri,
+    tg
+};
+
+// How the transistors of one half-network are connected; the leakage
+// functions below default to series.
+enum Half_net_topology {
+    parallel,
+    series
+};
+
+double logtwo (double x);
+
+// Capacitance / resistance helpers. The three trailing flags select the
+// device flavour and all default to false.
+double gate_C(
+    double width,
+    double wirelength,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double gate_C_pass(
+    double width,
+    double wirelength,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double drain_C_(
+    double width,
+    int nchannel,
+    int stack,
+    int next_arg_thresh_folding_width_or_height_cell,
+    double fold_dimension,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double tr_R_on(
+    double width,
+    int nchannel,
+    int stack,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double R_to_w(
+    double res,
+    int nchannel,
+    bool _is_dram = false,
+    bool _is_sram = false,
+    bool _is_wl_tr = false);
+
+double horowitz (
+    double inputramptime,
+    double tf,
+    double vs1,
+    double vs2,
+    int rise);
+
+double pmos_to_nmos_sz_ratio(
+    bool _is_dram = false,
+    bool _is_wl_tr = false);
+
+// Leakage helpers.
+double simplified_nmos_leakage(
+    double nwidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+double simplified_pmos_leakage(
+    double pwidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+
+double cmos_Ileak(
+    double nWidth,
+    double pWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+double cmos_Ig_n(
+    double nWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr= false);
+
+double cmos_Ig_p(
+    double pWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr= false);
+
+
+double cmos_Isub_leakage(
+    double nWidth,
+    double pWidth,
+    int fanin,
+    enum Gate_type g_type,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false,
+    enum Half_net_topology topo = series);
+
+double cmos_Ig_leakage(
+    double nWidth,
+    double pWidth,
+    int fanin,
+    enum Gate_type g_type,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false,
+    enum Half_net_topology topo = series);
+
+// Short-circuit estimates; both take the same eleven doubles.
+// NOTE(review): the definition of shortcircuit() in basic_circuit.cc returns
+// a variable that is initialised to 0 and never assigned.
+double shortcircuit(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd);
+
+double shortcircuit_simple(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd);
+//set power point product mask; strictly speaking this is not real point product
+// Writes a, b, c, d into pppv[0..3]; the caller must supply at least four
+// elements. Each coefficient defaults to 1.
+inline void set_pppm(
+    double * pppv,
+    double a=1,
+    double b=1,
+    double c=1,
+    double d=1
+    ){
+    pppv[0]= a;
+    pppv[1]= b;
+    pppv[2]= c;
+    pppv[3]= d;
+
+}
+
+// Writes a, b, c into sppv[0..2].
+// NOTE(review): parameter d is accepted but never stored, unlike set_pppm()
+// above. Confirm whether sppv is meant to have a fourth element before
+// adding "sppv[3]= d;".
+inline void set_sppm(
+    double * sppv,
+    double a=1,
+    double b=1,
+    double c=1,
+    double d=1
+    ){
+    sppv[0]= a;
+    sppv[1]= b;
+    sppv[2]= c;
+}
+
+#endif
diff --git a/ext/mcpat/cacti/batch_tests b/ext/mcpat/cacti/batch_tests
new file mode 100755
index 000000000..45a03898e
--- /dev/null
+++ b/ext/mcpat/cacti/batch_tests
@@ -0,0 +1,41 @@
+rm -rf ./out.csv
+./cacti 8192 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16384 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 32768 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 65536 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 131072 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 262144 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 524288 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 1048576 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 2097152 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 4194304 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 8388608 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 8192 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 16384 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 32768 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 65536 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 131072 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 262144 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 524288 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 1048576 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 2097152 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 4194304 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 8388608 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 
1 1 +./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1 +./cacti 8192 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 16384 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 32768 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 65536 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 131072 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 262144 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 524288 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 1048576 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 2097152 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 4194304 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 8388608 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 2097152 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 4194304 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 8388608 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 +./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1 diff --git a/ext/mcpat/cacti/cache.cfg b/ext/mcpat/cacti/cache.cfg new file mode 100755 index 
000000000..03de34a13 --- /dev/null +++ b/ext/mcpat/cacti/cache.cfg @@ -0,0 +1,175 @@ +# Cache size +//-size (bytes) 2048 +//-size (bytes) 4096 +//-size (bytes) 32768 +//-size (bytes) 262144 +//-size (bytes) 1048576 +//-size (bytes) 2097152 +//-size (bytes) 4194304 +//-size (bytes) 8388608 +//-size (bytes) 16777216 +//-size (bytes) 33554432 +//-size (bytes) 134217728 +//-size (bytes) 67108864 +-size (bytes) 1073741824 + +# Line size +//-block size (bytes) 8 +-block size (bytes) 64 + +# To model Fully Associative cache, set associativity to zero +//-associativity 0 +//-associativity 2 +//-associativity 4 +-associativity 8 +//-associativity 16 + +-read-write port 1 +-exclusive read port 0 +-exclusive write port 0 +-single ended read ports 0 + +# Multiple banks connected using a bus +-UCA bank count 1 +-technology (u) 0.022 +//-technology (u) 0.040 +//-technology (u) 0.032 +//-technology (u) 0.090 + +# following three parameters are meaningful only for main memories + +-page size (bits) 8192 +-burst length 8 +-internal prefetch width 8 + +# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram) +-Data array cell type - "itrs-hp" +//-Data array cell type - "itrs-lstp" +//-Data array cell type - "itrs-lop" + +# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop) +-Data array peripheral type - "itrs-hp" +//-Data array peripheral type - "itrs-lstp" +//-Data array peripheral type - "itrs-lop" + +# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram) +-Tag array cell type - "itrs-hp" +//-Tag array cell type - "itrs-lstp" +//-Tag array cell type - "itrs-lop" + +# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop) +-Tag array peripheral type - "itrs-hp" +//-Tag array peripheral type - "itrs-lstp" +//-Tag array peripheral type - "itrs-lop + +# Bus width include data bits and address bits required by the decoder 
+//-output/input bus width 16 +-output/input bus width 512 + +// 300-400 in steps of 10 +-operating temperature (K) 360 + +# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file) +# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report) +-cache type "cache" +//-cache type "ram" +//-cache type "main memory" + +# to model special structure like branch target buffers, directory, etc. +# change the tag size parameter +# if you want cacti to calculate the tagbits, set the tag size to "default" +-tag size (b) "default" +//-tag size (b) 22 + +# fast - data and tag access happen in parallel +# sequential - data array is accessed after accessing the tag array +# normal - data array lookup and tag access happen in parallel +# final data block is broadcasted in data array h-tree +# after getting the signal from the tag array +//-access mode (normal, sequential, fast) - "fast" +-access mode (normal, sequential, fast) - "normal" +//-access mode (normal, sequential, fast) - "sequential" + + +# DESIGN OBJECTIVE for UCA (or banks in NUCA) +-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0 + +# Percentage deviation from the minimum value +# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization +# that compromises at most 10% delay. +# NOTE: Try reasonable values for % deviation. Inconsistent deviation +# percentage values will not produce any valid organizations. For example, +# 0:0:100:100:100 will try to identify an organization that has both +# least delay and dynamic power. Since such an organization is not possible, CACTI will +# throw an error. 
Refer to the CACTI-6 technical report for more details +-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000 + +# Objective for NUCA +-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100 +-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000 + +# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for +# the energy-delay or energy-delay-squared product +# Note: Optimize tag will disable the weight and deviate values mentioned above +# Set it to NONE to let the weight and deviate values determine the +# appropriate cache configuration +//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED" +-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2" +//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE" + +-Cache model (NUCA, UCA) - "UCA" +//-Cache model (NUCA, UCA) - "NUCA" + +# In order for CACTI to find the optimal NUCA bank value the following +# variable should be assigned 0. +-NUCA bank count 0 + +# NOTE: for NUCA, the network frequency is set to a default value of +# 5GHz in time.c. CACTI automatically +# calculates the maximum possible frequency and downgrades this value if necessary + +# By default CACTI considers both full-swing and low-swing +# wires to find an optimal configuration. However, it is possible to +# restrict the search space by changing the signalling from "default" to +# "fullswing" or "lowswing" type. 
+//-Wire signalling (fullswing, lowswing, default) - "Global_10" +-Wire signalling (fullswing, lowswing, default) - "default" +//-Wire signalling (fullswing, lowswing, default) - "lowswing" + +//-Wire inside mat - "global" +-Wire inside mat - "semi-global" +//-Wire outside mat - "global" +-Wire outside mat - "semi-global" + +//-Interconnect projection - "conservative" +-Interconnect projection - "aggressive" + +# Contention in network (which is a function of core count and cache level) is one of +# the critical factor used for deciding the optimal bank count value +# core count can be 4, 8, or 16 +//-Core count 4 +-Core count 8 +//-Core count 16 +-Cache level (L2/L3) - "L3" + +-Add ECC - "true" + +//-Print level (DETAILED, CONCISE) - "CONCISE" +-Print level (DETAILED, CONCISE) - "DETAILED" + +# for debugging +//-Print input parameters - "true" +-Print input parameters - "false" +# force CACTI to model the cache with the +# following Ndbl, Ndwl, Nspd, Ndsam, +# and Ndcm values +//-Force cache config - "true" +-Force cache config - "false" +-Ndwl 1 +-Ndbl 1 +-Nspd 0 +-Ndcm 1 +-Ndsam1 0 +-Ndsam2 0 + + diff --git a/ext/mcpat/cacti/cacti.i b/ext/mcpat/cacti/cacti.i new file mode 100644 index 000000000..796413872 --- /dev/null +++ b/ext/mcpat/cacti/cacti.i @@ -0,0 +1,8 @@ +%module cacti +%{ +/* Includes the header in the wrapper code */ +#include "cacti_interface.h" +%} + +/* Parse the header file to generate wrappers */ +%include "cacti_interface.h"
\ No newline at end of file diff --git a/ext/mcpat/cacti/cacti.mk b/ext/mcpat/cacti/cacti.mk new file mode 100644 index 000000000..4d6de8db8 --- /dev/null +++ b/ext/mcpat/cacti/cacti.mk @@ -0,0 +1,51 @@ +TARGET = cacti +SHELL = /bin/sh +.PHONY: all depend clean +.SUFFIXES: .cc .o + +ifndef NTHREADS + NTHREADS = 8 +endif + + +LIBS = +INCS = -lm + +ifeq ($(TAG),dbg) + DBG = -Wall + OPT = -ggdb -g -O0 -DNTHREADS=1 -gstabs+ +else + DBG = + OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) +endif + +#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) +CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) +CXX = g++ -m32 +CC = gcc -m32 + +SRCS = area.cc bank.cc mat.cc main.cc Ucache.cc io.cc technology.cc basic_circuit.cc parameter.cc \ + decoder.cc component.cc uca.cc subarray.cc wire.cc htree2.cc \ + cacti_interface.cc router.cc nuca.cc crossbar.cc arbiter.cc + +OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS)) +PYTHONLIB_SRCS = $(patsubst main.cc, ,$(SRCS)) obj_$(TAG)/cacti_wrap.cc +PYTHONLIB_OBJS = $(patsubst %.cc,%.o,$(PYTHONLIB_SRCS)) +INCLUDES = -I /usr/include/python2.4 -I /usr/lib/python2.4/config + +all: obj_$(TAG)/$(TARGET) + cp -f obj_$(TAG)/$(TARGET) $(TARGET) + +obj_$(TAG)/$(TARGET) : $(OBJS) + $(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread + +#obj_$(TAG)/%.o : %.cc +# $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $< + +obj_$(TAG)/%.o : %.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ + +clean: + -rm -f *.o _cacti.so cacti.py $(TARGET) + + diff --git a/ext/mcpat/cacti/cacti_interface.cc b/ext/mcpat/cacti/cacti_interface.cc new file mode 100644 index 000000000..b6d0d13de --- /dev/null +++ b/ext/mcpat/cacti/cacti_interface.cc @@ -0,0 +1,173 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include <pthread.h>
+
+#include <algorithm>
+#include <cmath>
+#include <ctime>
+#include <iostream>
+
+#include "Ucache.h"
+#include "area.h"
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "const.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+// Strict ordering of two array organisations: compares, in this order,
+// Nspd, Ndwl, Ndbl, deg_bl_muxing, Ndsam_lev_1 and Ndsam_lev_2, and returns
+// true when m1 comes first. Equal organisations yield false.
+bool mem_array::lt(const mem_array * m1, const mem_array * m2)
+{
+    if (m1->Nspd < m2->Nspd) return true;
+    else if (m1->Nspd > m2->Nspd) return false;
+    else if (m1->Ndwl < m2->Ndwl) return true;
+    else if (m1->Ndwl > m2->Ndwl) return false;
+    else if (m1->Ndbl < m2->Ndbl) return true;
+    else if (m1->Ndbl > m2->Ndbl) return false;
+    else if (m1->deg_bl_muxing < m2->deg_bl_muxing) return true;
+    else if (m1->deg_bl_muxing > m2->deg_bl_muxing) return false;
+    else if (m1->Ndsam_lev_1 < m2->Ndsam_lev_1) return true;
+    else if (m1->Ndsam_lev_1 > m2->Ndsam_lev_1) return false;
+    else if (m1->Ndsam_lev_2 < m2->Ndsam_lev_2) return true;
+    else return false;
+}
+
+
+
+// Sets access_time from the tag and data arrays according to the access
+// mode selected in the global input parameters (g_ip).
+void uca_org_t::find_delay()
+{
+    mem_array * data_arr = data_array2;
+    mem_array * tag_arr = tag_array2;
+
+    // check whether it is a regular cache or scratch ram
+    if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
+    {
+        access_time = data_arr->access_time;
+    }
+    // Both tag and data lookup happen in parallel
+    // and the entire set is sent over the data array h-tree without
+    // waiting for the way-select signal --TODO add the corresponding
+    // power overhead Nav
+    else if (g_ip->fast_access == true)
+    {
+        access_time = MAX(tag_arr->access_time, data_arr->access_time);
+    }
+    // Tag is accessed first. On a hit, way-select signal along with the
+    // address is sent to read/write the appropriate block in the data
+    // array
+    else if (g_ip->is_seq_acc == true)
+    {
+        access_time = tag_arr->access_time + data_arr->access_time;
+    }
+    // Normal access: tag array access and data array access happen in parallel.
+    // But, the data array will wait for the way-select and transfer only the
+    // appropriate block over the h-tree.
+    else
+    {
+        access_time = MAX(tag_arr->access_time + data_arr->delay_senseamp_mux_decoder,
+                          data_arr->delay_before_subarray_output_driver) +
+                      data_arr->delay_from_subarray_output_driver_to_output;
+    }
+}
+
+
+
+// Sets power: data array only for pure RAM / pure CAM / fully associative
+// structures, data + tag otherwise.
+void uca_org_t::find_energy()
+{
+    if (!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc))//(g_ip->is_cache)
+        power = data_array2->power + tag_array2->power;
+    else
+        power = data_array2->power;
+}
+
+
+
+// Sets cache_ht, cache_len and area. With a tag array the two arrays are
+// placed side by side: heights are maxed, widths are added.
+void uca_org_t::find_area()
+{
+    if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)//(g_ip->is_cache == false)
+    {
+        cache_ht = data_array2->height;
+        cache_len = data_array2->width;
+    }
+    else
+    {
+        cache_ht = MAX(tag_array2->height, data_array2->height);
+        cache_len = tag_array2->width + data_array2->width;
+    }
+    area = cache_ht * cache_len;
+}
+
+// For tag-less structures whose data-array area efficiency is below 20%,
+// shrinks both dimensions by sqrt(0.2 / efficiency) and recomputes area.
+void uca_org_t::adjust_area()
+{
+    double area_adjust;
+    if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
+    {
+        if (data_array2->area_efficiency/100.0<0.2)
+        {
+            //area_adjust = sqrt(area/(area*(data_array2->area_efficiency/100.0)/0.2));
+            area_adjust = sqrt(0.2/(data_array2->area_efficiency/100.0));
+            cache_ht = cache_ht/area_adjust;
+            cache_len = cache_len/area_adjust;
+        }
+    }
+    area = cache_ht * cache_len;
+}
+
+// Sets cycle_time: the data array's value alone, or the slower of the tag
+// and data arrays when a tag array is present.
+void uca_org_t::find_cyc()
+{
+    if ((g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc))//(g_ip->is_cache == false)
+    {
+        cycle_time = data_array2->cycle_time;
+    }
+    else
+    {
+        cycle_time = MAX(tag_array2->cycle_time,
+                         data_array2->cycle_time);
+    }
+}
+
+uca_org_t :: uca_org_t()
+:tag_array2(0),
+ data_array2(0)
+{
+
+}
+
+// Releases the data and tag arrays held by this object.
+// The pointers are reset after deletion so that calling cleanup() again on
+// the same object is harmless instead of a double delete. Deleting a null
+// pointer is a no-op, so no guard is needed.
+// NOTE(review): other uca_org_t copies that share these pointers are not
+// affected and still must not be cleaned up a second time.
+void uca_org_t :: cleanup()
+{
+    delete data_array2;
+    data_array2 = 0;
+    delete tag_array2;
+    tag_array2 = 0;
+}
diff --git a/ext/mcpat/cacti/cacti_interface.h b/ext/mcpat/cacti/cacti_interface.h
new file mode 100644
index 000000000..f37596554
--- /dev/null
+++ b/ext/mcpat/cacti/cacti_interface.h
@@ -0,0 +1,633 @@
+/*****************************************************************************
+ * McPAT/CACTI
+ * SOFTWARE LICENSE AGREEMENT
+ * 
Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __CACTI_INTERFACE_H__
+#define __CACTI_INTERFACE_H__
+
+#include <iostream>
+#include <list>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "const.h"
+
+using namespace std;
+
+
+class min_values_t;
+class mem_array;
+class uca_org_t;
+
+
+// Five-component power/energy record. All components start at 0; reset()
+// returns them to 0. Copying goes through operator=, which copies every
+// component.
+class powerComponents
+{
+  public:
+    double dynamic;
+    double leakage;
+    double gate_leakage;
+    double short_circuit;
+    double longer_channel_leakage;
+
+    powerComponents() : dynamic(0), leakage(0), gate_leakage(0), short_circuit(0), longer_channel_leakage(0) { }
+    powerComponents(const powerComponents & obj) { *this = obj; }
+    powerComponents & operator=(const powerComponents & rhs)
+    {
+      dynamic = rhs.dynamic;
+      leakage = rhs.leakage;
+      gate_leakage = rhs.gate_leakage;
+      short_circuit = rhs.short_circuit;
+      longer_channel_leakage = rhs.longer_channel_leakage;
+      return *this;
+    }
+    void reset() { dynamic = 0; leakage = 0; gate_leakage = 0; short_circuit = 0;longer_channel_leakage = 0;}
+
+    // Defined elsewhere. The second operand of operator* is a pointer to
+    // doubles (presumably a per-component scaling mask -- TODO confirm).
+    friend powerComponents operator+(const powerComponents & x, const powerComponents & y);
+    friend powerComponents operator*(const powerComponents & x, double const * const y);
+};
+
+
+
+// One powerComponents record per operation type (read, write, search).
+class powerDef
+{
+  public:
+    powerComponents readOp;
+    powerComponents writeOp;
+    powerComponents searchOp;//Sheng: for CAM and FA
+
+    powerDef() : readOp(), writeOp(), searchOp() { }
+    void reset() { readOp.reset(); writeOp.reset(); searchOp.reset();}
+
+    friend powerDef operator+(const powerDef & x, const powerDef & y);
+    friend powerDef operator*(const powerDef & x, double const * const y);
+};
+
+enum Wire_type
+{
+    Global /* global wires with repeaters */,
+    Global_5 /* 5% delay penalty */,
+    Global_10 /* 10% delay penalty */,
+    Global_20 /* 20% delay penalty */,
+    Global_30 /* 30% delay penalty */,
+    Low_swing /* differential low power wires with high area overhead */,
+    Semi_global /* mid-level wires with repeaters*/,
+    Transmission /* transmission lines with high area overhead */,
+    Optical /* optical wires */,
+    Invalid_wtype
+};
+
+
+
+// User-supplied configuration. The class declares no constructor, so the
+// data members are not initialised by this header; parse_cfg() takes the
+// path of a configuration file (presumably it fills the members -- its
+// definition is not in this header).
+class InputParameter
+{
+  public:
+    void parse_cfg(const string & infile);
+
+    bool error_checking(); // return false if the input parameters are problematic
+    void display_ip();
+
+    unsigned int cache_sz; // in bytes
+    unsigned int line_sz;
+    unsigned int assoc;
+    unsigned int nbanks;
+    unsigned int out_w;// == nr_bits_out
+    bool specific_tag;
+    unsigned int tag_w;
+    unsigned int access_mode;
+    unsigned int obj_func_dyn_energy;
+    unsigned int obj_func_dyn_power;
+    unsigned int obj_func_leak_power;
+    unsigned int obj_func_cycle_t;
+
+    double F_sz_nm; // feature size in nm
+    double F_sz_um; // feature size in um
+    unsigned int num_rw_ports;
+    unsigned int num_rd_ports;
+    unsigned int num_wr_ports;
+    unsigned int num_se_rd_ports; // number of single ended read ports
+    unsigned int num_search_ports; // Sheng: number of search ports for CAM
+    bool is_main_mem;
+    bool is_cache;
+    bool pure_ram;
+    bool pure_cam;
+    bool rpters_in_htree; // if there are repeaters in htree segment
+    unsigned int ver_htree_wires_over_array;
+    unsigned int broadcast_addr_din_over_ver_htrees;
+    unsigned int temp;
+
+    unsigned int ram_cell_tech_type;
+    unsigned int peri_global_tech_type;
+    unsigned int data_arr_ram_cell_tech_type;
+    unsigned int data_arr_peri_global_tech_type;
+    unsigned int tag_arr_ram_cell_tech_type;
+    unsigned int tag_arr_peri_global_tech_type;
+
+    unsigned int burst_len;
+    unsigned int int_prefetch_w;
+    unsigned int page_sz_bits;
+
+    unsigned int ic_proj_type; // interconnect_projection_type
+    unsigned int wire_is_mat_type; // wire_inside_mat_type
+    unsigned int wire_os_mat_type; // wire_outside_mat_type
+    enum Wire_type wt;
+    int force_wiretype;
+    bool print_input_args;
+    unsigned int nuca_cache_sz; // TODO
+    int ndbl, ndwl, nspd, ndsam1, ndsam2, ndcm;
+    bool force_cache_config;
+
+    int cache_level;
+    int cores;
+    int nuca_bank_count;
+    int force_nuca_bank;
+
+    // Optimisation weights (_wt) and allowed deviations (_dev) per metric,
+    // with separate _nuca variants.
+    int delay_wt, dynamic_power_wt, leakage_power_wt,
+        cycle_time_wt, area_wt;
+    int delay_wt_nuca, dynamic_power_wt_nuca, leakage_power_wt_nuca,
+        cycle_time_wt_nuca, area_wt_nuca;
+
+    int delay_dev, dynamic_power_dev, leakage_power_dev,
+        cycle_time_dev, area_dev;
+    int delay_dev_nuca, dynamic_power_dev_nuca, leakage_power_dev_nuca,
+        cycle_time_dev_nuca, area_dev_nuca;
+    int ed; //ED or ED2 optimization
+    int nuca;
+
+    bool fast_access;
+    unsigned int block_sz; // bytes
+    unsigned int tag_assoc;
+    unsigned int data_assoc;
+    bool is_seq_acc;
+    bool fully_assoc;
+    unsigned int nsets; // == number_of_sets
+    int print_detail;
+
+
+    bool add_ecc_b_;
+    //parameters for design constraint
+    double throughput;
+    double latency;
+    bool pipelinable;
+    int pipeline_stages;
+    int per_stage_vector;
+    bool with_clock_grid;
+};
+
+
+typedef struct{
+    int Ndwl;
+    int Ndbl;
+    double Nspd;
+    int deg_bl_muxing;
+    int Ndsam_lev_1;
+    int Ndsam_lev_2;
+    int number_activated_mats_horizontal_direction;
+    int number_subbanks;
+    int page_size_in_bits;
+    double delay_route_to_bank;
+    double delay_crossbar;
+    double delay_addr_din_horizontal_htree;
+    double delay_addr_din_vertical_htree;
+    double delay_row_predecode_driver_and_block;
+    double delay_row_decoder;
+    double delay_bitlines;
+    double delay_sense_amp;
+    double delay_subarray_output_driver;
+    double delay_bit_mux_predecode_driver_and_block;
+    double delay_bit_mux_decoder;
+    double 
delay_senseamp_mux_lev_1_predecode_driver_and_block; + double delay_senseamp_mux_lev_1_decoder; + double delay_senseamp_mux_lev_2_predecode_driver_and_block; + double delay_senseamp_mux_lev_2_decoder; + double delay_input_htree; + double delay_output_htree; + double delay_dout_vertical_htree; + double delay_dout_horizontal_htree; + double delay_comparator; + double access_time; + double cycle_time; + double multisubbank_interleave_cycle_time; + double delay_request_network; + double delay_inside_mat; + double delay_reply_network; + double trcd; + double cas_latency; + double precharge_delay; + powerDef power_routing_to_bank; + powerDef power_addr_input_htree; + powerDef power_data_input_htree; + powerDef power_data_output_htree; + powerDef power_addr_horizontal_htree; + powerDef power_datain_horizontal_htree; + powerDef power_dataout_horizontal_htree; + powerDef power_addr_vertical_htree; + powerDef power_datain_vertical_htree; + powerDef power_row_predecoder_drivers; + powerDef power_row_predecoder_blocks; + powerDef power_row_decoders; + powerDef power_bit_mux_predecoder_drivers; + powerDef power_bit_mux_predecoder_blocks; + powerDef power_bit_mux_decoders; + powerDef power_senseamp_mux_lev_1_predecoder_drivers; + powerDef power_senseamp_mux_lev_1_predecoder_blocks; + powerDef power_senseamp_mux_lev_1_decoders; + powerDef power_senseamp_mux_lev_2_predecoder_drivers; + powerDef power_senseamp_mux_lev_2_predecoder_blocks; + powerDef power_senseamp_mux_lev_2_decoders; + powerDef power_bitlines; + powerDef power_sense_amps; + powerDef power_prechg_eq_drivers; + powerDef power_output_drivers_at_subarray; + powerDef power_dataout_vertical_htree; + powerDef power_comparators; + powerDef power_crossbar; + powerDef total_power; + double area; + double all_banks_height; + double all_banks_width; + double bank_height; + double bank_width; + double subarray_memory_cell_area_height; + double subarray_memory_cell_area_width; + double mat_height; + double mat_width; + double 
routing_area_height_within_bank; + double routing_area_width_within_bank; + double area_efficiency; +// double perc_power_dyn_routing_to_bank; +// double perc_power_dyn_addr_horizontal_htree; +// double perc_power_dyn_datain_horizontal_htree; +// double perc_power_dyn_dataout_horizontal_htree; +// double perc_power_dyn_addr_vertical_htree; +// double perc_power_dyn_datain_vertical_htree; +// double perc_power_dyn_row_predecoder_drivers; +// double perc_power_dyn_row_predecoder_blocks; +// double perc_power_dyn_row_decoders; +// double perc_power_dyn_bit_mux_predecoder_drivers; +// double perc_power_dyn_bit_mux_predecoder_blocks; +// double perc_power_dyn_bit_mux_decoders; +// double perc_power_dyn_senseamp_mux_lev_1_predecoder_drivers; +// double perc_power_dyn_senseamp_mux_lev_1_predecoder_blocks; +// double perc_power_dyn_senseamp_mux_lev_1_decoders; +// double perc_power_dyn_senseamp_mux_lev_2_predecoder_drivers; +// double perc_power_dyn_senseamp_mux_lev_2_predecoder_blocks; +// double perc_power_dyn_senseamp_mux_lev_2_decoders; +// double perc_power_dyn_bitlines; +// double perc_power_dyn_sense_amps; +// double perc_power_dyn_prechg_eq_drivers; +// double perc_power_dyn_subarray_output_drivers; +// double perc_power_dyn_dataout_vertical_htree; +// double perc_power_dyn_comparators; +// double perc_power_dyn_crossbar; +// double perc_power_dyn_spent_outside_mats; +// double perc_power_leak_routing_to_bank; +// double perc_power_leak_addr_horizontal_htree; +// double perc_power_leak_datain_horizontal_htree; +// double perc_power_leak_dataout_horizontal_htree; +// double perc_power_leak_addr_vertical_htree; +// double perc_power_leak_datain_vertical_htree; +// double perc_power_leak_row_predecoder_drivers; +// double perc_power_leak_row_predecoder_blocks; +// double perc_power_leak_row_decoders; +// double perc_power_leak_bit_mux_predecoder_drivers; +// double perc_power_leak_bit_mux_predecoder_blocks; +// double perc_power_leak_bit_mux_decoders; +// double 
perc_power_leak_senseamp_mux_lev_1_predecoder_drivers; +// double perc_power_leak_senseamp_mux_lev_1_predecoder_blocks; +// double perc_power_leak_senseamp_mux_lev_1_decoders; +// double perc_power_leak_senseamp_mux_lev_2_predecoder_drivers; +// double perc_power_leak_senseamp_mux_lev_2_predecoder_blocks; +// double perc_power_leak_senseamp_mux_lev_2_decoders; +// double perc_power_leak_bitlines; +// double perc_power_leak_sense_amps; +// double perc_power_leak_prechg_eq_drivers; +// double perc_power_leak_subarray_output_drivers; +// double perc_power_leak_dataout_vertical_htree; +// double perc_power_leak_comparators; +// double perc_power_leak_crossbar; +// double perc_leak_mats; +// double perc_active_mats; + double refresh_power; + double dram_refresh_period; + double dram_array_availability; + double dyn_read_energy_from_closed_page; + double dyn_read_energy_from_open_page; + double leak_power_subbank_closed_page; + double leak_power_subbank_open_page; + double leak_power_request_and_reply_networks; + double activate_energy; + double read_energy; + double write_energy; + double precharge_energy; +} results_mem_array; + + +class uca_org_t +{ + public: + mem_array * tag_array2; + mem_array * data_array2; + double access_time; + double cycle_time; + double area; + double area_efficiency; + powerDef power; + double leak_power_with_sleep_transistors_in_mats; + double cache_ht; + double cache_len; + char file_n[100]; + double vdd_periph_global; + bool valid; + results_mem_array tag_array; + results_mem_array data_array; + + uca_org_t(); + void find_delay(); + void find_energy(); + void find_area(); + void find_cyc(); + void adjust_area();//for McPAT only to adjust routing overhead + void cleanup(); + ~uca_org_t(){}; +}; + +void reconfigure(InputParameter *local_interface, uca_org_t *fin_res); + +uca_org_t cacti_interface(const string & infile_name); +//McPAT's plain interface, please keep !!! 
+uca_org_t cacti_interface(InputParameter * const local_interface); +//McPAT's plain interface, please keep !!! +uca_org_t init_interface(InputParameter * const local_interface); +//McPAT's plain interface, please keep !!! +uca_org_t cacti_interface( + int cache_size, + int line_size, + int associativity, + int rw_ports, + int excl_read_ports, + int excl_write_ports, + int single_ended_read_ports, + int search_ports, + int banks, + double tech_node, + int output_width, + int specific_tag, + int tag_width, + int access_mode, + int cache, + int main_mem, + int obj_func_delay, + int obj_func_dynamic_power, + int obj_func_leakage_power, + int obj_func_cycle_time, + int obj_func_area, + int dev_func_delay, + int dev_func_dynamic_power, + int dev_func_leakage_power, + int dev_func_area, + int dev_func_cycle_time, + int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate + int temp, + int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing + int data_arr_ram_cell_tech_flavor_in, + int data_arr_peri_global_tech_flavor_in, + int tag_arr_ram_cell_tech_flavor_in, + int tag_arr_peri_global_tech_flavor_in, + int interconnect_projection_type_in, + int wire_inside_mat_type_in, + int wire_outside_mat_type_in, + int REPEATERS_IN_HTREE_SEGMENTS_in, + int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in, + int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in, + int PAGE_SIZE_BITS_in, + int BURST_LENGTH_in, + int INTERNAL_PREFETCH_WIDTH_in, + int force_wiretype, + int wiretype, + int force_config, + int ndwl, + int ndbl, + int nspd, + int ndcm, + int ndsam1, + int ndsam2, + int ecc); +// int cache_size, +// int line_size, +// int associativity, +// int rw_ports, +// int excl_read_ports, +// int excl_write_ports, +// int single_ended_read_ports, +// int banks, +// double tech_node, +// int output_width, +// int specific_tag, +// int tag_width, +// int access_mode, +// int cache, +// int main_mem, +// int obj_func_delay, 
+// int obj_func_dynamic_power, +// int obj_func_leakage_power, +// int obj_func_area, +// int obj_func_cycle_time, +// int dev_func_delay, +// int dev_func_dynamic_power, +// int dev_func_leakage_power, +// int dev_func_area, +// int dev_func_cycle_time, +// int temp, +// int data_arr_ram_cell_tech_flavor_in, +// int data_arr_peri_global_tech_flavor_in, +// int tag_arr_ram_cell_tech_flavor_in, +// int tag_arr_peri_global_tech_flavor_in, +// int interconnect_projection_type_in, +// int wire_inside_mat_type_in, +// int wire_outside_mat_type_in, +// int REPEATERS_IN_HTREE_SEGMENTS_in, +// int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in, +// int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in, +//// double MAXAREACONSTRAINT_PERC_in, +//// double MAXACCTIMECONSTRAINT_PERC_in, +//// double MAX_PERC_DIFF_IN_DELAY_FROM_BEST_DELAY_REPEATER_SOLUTION_in, +// int PAGE_SIZE_BITS_in, +// int BURST_LENGTH_in, +// int INTERNAL_PREFETCH_WIDTH_in); + +//Naveen's interface +uca_org_t cacti_interface( + int cache_size, + int line_size, + int associativity, + int rw_ports, + int excl_read_ports, + int excl_write_ports, + int single_ended_read_ports, + int banks, + double tech_node, + int page_sz, + int burst_length, + int pre_width, + int output_width, + int specific_tag, + int tag_width, + int access_mode, //0 normal, 1 seq, 2 fast + int cache, //scratch ram or cache + int main_mem, + int obj_func_delay, + int obj_func_dynamic_power, + int obj_func_leakage_power, + int obj_func_area, + int obj_func_cycle_time, + int dev_func_delay, + int dev_func_dynamic_power, + int dev_func_leakage_power, + int dev_func_area, + int dev_func_cycle_time, + int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate + int temp, + int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing + int data_arr_ram_cell_tech_flavor_in, + int data_arr_peri_global_tech_flavor_in, + int tag_arr_ram_cell_tech_flavor_in, + int 
tag_arr_peri_global_tech_flavor_in, + int interconnect_projection_type_in, // 0 - aggressive, 1 - normal + int wire_inside_mat_type_in, + int wire_outside_mat_type_in, + int is_nuca, // 0 - UCA, 1 - NUCA + int core_count, + int cache_level, // 0 - L2, 1 - L3 + int nuca_bank_count, + int nuca_obj_func_delay, + int nuca_obj_func_dynamic_power, + int nuca_obj_func_leakage_power, + int nuca_obj_func_area, + int nuca_obj_func_cycle_time, + int nuca_dev_func_delay, + int nuca_dev_func_dynamic_power, + int nuca_dev_func_leakage_power, + int nuca_dev_func_area, + int nuca_dev_func_cycle_time, + int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported + int p_input); + +class mem_array +{ + public: + int Ndcm; + int Ndwl; + int Ndbl; + double Nspd; + int deg_bl_muxing; + int Ndsam_lev_1; + int Ndsam_lev_2; + double access_time; + double cycle_time; + double multisubbank_interleave_cycle_time; + double area_ram_cells; + double area; + powerDef power; + double delay_senseamp_mux_decoder; + double delay_before_subarray_output_driver; + double delay_from_subarray_output_driver_to_output; + double height; + double width; + + double mat_height; + double mat_length; + double subarray_length; + double subarray_height; + + double delay_route_to_bank, + delay_input_htree, + delay_row_predecode_driver_and_block, + delay_row_decoder, + delay_bitlines, + delay_sense_amp, + delay_subarray_output_driver, + delay_dout_htree, + delay_comparator, + delay_matchlines; + + double all_banks_height, + all_banks_width, + area_efficiency; + + powerDef power_routing_to_bank; + powerDef power_addr_input_htree; + powerDef power_data_input_htree; + powerDef power_data_output_htree; + powerDef power_htree_in_search; + powerDef power_htree_out_search; + powerDef power_row_predecoder_drivers; + powerDef power_row_predecoder_blocks; + powerDef power_row_decoders; + powerDef power_bit_mux_predecoder_drivers; + powerDef power_bit_mux_predecoder_blocks; + powerDef 
power_bit_mux_decoders; + powerDef power_senseamp_mux_lev_1_predecoder_drivers; + powerDef power_senseamp_mux_lev_1_predecoder_blocks; + powerDef power_senseamp_mux_lev_1_decoders; + powerDef power_senseamp_mux_lev_2_predecoder_drivers; + powerDef power_senseamp_mux_lev_2_predecoder_blocks; + powerDef power_senseamp_mux_lev_2_decoders; + powerDef power_bitlines; + powerDef power_sense_amps; + powerDef power_prechg_eq_drivers; + powerDef power_output_drivers_at_subarray; + powerDef power_dataout_vertical_htree; + powerDef power_comparators; + + powerDef power_cam_bitline_precharge_eq_drv; + powerDef power_searchline; + powerDef power_searchline_precharge; + powerDef power_matchlines; + powerDef power_matchline_precharge; + powerDef power_matchline_to_wordline_drv; + + min_values_t *arr_min; + enum Wire_type wt; + + // dram stats + double activate_energy, read_energy, write_energy, precharge_energy, + refresh_power, leak_power_subbank_closed_page, leak_power_subbank_open_page, + leak_power_request_and_reply_networks; + + double precharge_delay; + + static bool lt(const mem_array * m1, const mem_array * m2); +}; + + +#endif diff --git a/ext/mcpat/cacti/component.cc b/ext/mcpat/cacti/component.cc new file mode 100644 index 000000000..733108407 --- /dev/null +++ b/ext/mcpat/cacti/component.cc @@ -0,0 +1,236 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + + +#include <cassert> +#include <cmath> +#include <iostream> + +#include "bank.h" +#include "component.h" +#include "decoder.h" + +using namespace std; + + + +Component::Component() + :area(), power(), rt_power(),delay(0) +{ +} + + + +Component::~Component() +{ +} + + + +double Component::compute_diffusion_width(int num_stacked_in, int num_folded_tr) +{ + double w_poly = 
g_ip->F_sz_um; + double spacing_poly_to_poly = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact; + double total_diff_w = 2 * spacing_poly_to_poly + // for both source and drain + num_stacked_in * w_poly + + (num_stacked_in - 1) * g_tp.spacing_poly_to_poly; + + if (num_folded_tr > 1) + { + total_diff_w += (num_folded_tr - 2) * 2 * spacing_poly_to_poly + + (num_folded_tr - 1) * num_stacked_in * w_poly + + (num_folded_tr - 1) * (num_stacked_in - 1) * g_tp.spacing_poly_to_poly; + } + + return total_diff_w; +} + + + +double Component::compute_gate_area( + int gate_type, + int num_inputs, + double w_pmos, + double w_nmos, + double h_gate) +{ + if (w_pmos <= 0.0 || w_nmos <= 0.0) + { + return 0.0; + } + + double w_folded_pmos, w_folded_nmos; + int num_folded_pmos, num_folded_nmos; + double total_ndiff_w, total_pdiff_w; + Area gate; + + double h_tr_region = h_gate - 2 * g_tp.HPOWERRAIL; + double ratio_p_to_n = w_pmos / (w_pmos + w_nmos); + + if (ratio_p_to_n >= 1 || ratio_p_to_n <= 0) + { + return 0.0; + } + + w_folded_pmos = (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS) * ratio_p_to_n; + w_folded_nmos = (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS) * (1 - ratio_p_to_n); + assert(w_folded_pmos > 0); + + num_folded_pmos = (int) (ceil(w_pmos / w_folded_pmos)); + num_folded_nmos = (int) (ceil(w_nmos / w_folded_nmos)); + + switch (gate_type) + { + case INV: + total_ndiff_w = compute_diffusion_width(1, num_folded_nmos); + total_pdiff_w = compute_diffusion_width(1, num_folded_pmos); + break; + + case NOR: + total_ndiff_w = compute_diffusion_width(1, num_inputs * num_folded_nmos); + total_pdiff_w = compute_diffusion_width(num_inputs, num_folded_pmos); + break; + + case NAND: + total_ndiff_w = compute_diffusion_width(num_inputs, num_folded_nmos); + total_pdiff_w = compute_diffusion_width(1, num_inputs * num_folded_pmos); + break; + default: + cout << "Unknown gate type: " << gate_type << endl; + exit(1); + } + + gate.w = MAX(total_ndiff_w, total_pdiff_w); + + if 
(w_folded_nmos > w_nmos) + { + //means that the height of the gate can + //be made smaller than the input height specified, so calculate the height of the gate. + gate.h = w_nmos + w_pmos + g_tp.MIN_GAP_BET_P_AND_N_DIFFS + 2 * g_tp.HPOWERRAIL; + } + else + { + gate.h = h_gate; + } + return gate.get_area(); +} + + + +double Component::compute_tr_width_after_folding( + double input_width, + double threshold_folding_width) +{//This is actually the width of the cell not the width of a device. +//The width of a cell and the width of a device is orthogonal. + if (input_width <= 0) + { + return 0; + } + + int num_folded_tr = (int) (ceil(input_width / threshold_folding_width)); + double spacing_poly_to_poly = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact; + double width_poly = g_ip->F_sz_um; + double total_diff_width = num_folded_tr * width_poly + (num_folded_tr + 1) * spacing_poly_to_poly; + + return total_diff_width; +} + + + +double Component::height_sense_amplifier(double pitch_sense_amp) +{ + // compute the height occupied by all PMOS transistors + double h_pmos_tr = compute_tr_width_after_folding(g_tp.w_sense_p, pitch_sense_amp) * 2 + + compute_tr_width_after_folding(g_tp.w_iso, pitch_sense_amp) + + 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS; + + // compute the height occupied by all NMOS transistors + double h_nmos_tr = compute_tr_width_after_folding(g_tp.w_sense_n, pitch_sense_amp) * 2 + + compute_tr_width_after_folding(g_tp.w_sense_en, pitch_sense_amp) + + 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS; + + // compute total height by considering gap between the p and n diffusion areas + return h_pmos_tr + h_nmos_tr + g_tp.MIN_GAP_BET_P_AND_N_DIFFS; +} + + + +int Component::logical_effort( + int num_gates_min, + double g, + double F, + double * w_n, + double * w_p, + double C_load, + double p_to_n_sz_ratio, + bool is_dram_, + bool is_wl_tr_, + double max_w_nmos) +{ + int num_gates = (int) (log(F) / log(fopt)); + + // check if num_gates is odd. 
if so, add 1 to make it even + num_gates+= (num_gates % 2) ? 1 : 0; + num_gates = MAX(num_gates, num_gates_min); + + // recalculate the effective fanout of each stage + double f = pow(F, 1.0 / num_gates); + int i = num_gates - 1; + double C_in = C_load / f; + w_n[i] = (1.0 / (1.0 + p_to_n_sz_ratio)) * C_in / gate_C(1, 0, is_dram_, false, is_wl_tr_); + w_n[i] = MAX(w_n[i], g_tp.min_w_nmos_); + w_p[i] = p_to_n_sz_ratio * w_n[i]; + + if (w_n[i] > max_w_nmos) + { + double C_ld = gate_C((1 + p_to_n_sz_ratio) * max_w_nmos, 0, is_dram_, false, is_wl_tr_); + F = g * C_ld / gate_C(w_n[0] + w_p[0], 0, is_dram_, false, is_wl_tr_); + num_gates = (int) (log(F) / log(fopt)) + 1; + num_gates+= (num_gates % 2) ? 1 : 0; + num_gates = MAX(num_gates, num_gates_min); + f = pow(F, 1.0 / (num_gates - 1)); + i = num_gates - 1; + w_n[i] = max_w_nmos; + w_p[i] = p_to_n_sz_ratio * w_n[i]; + } + + for (i = num_gates - 2; i >= 1; i--) + { + w_n[i] = MAX(w_n[i+1] / f, g_tp.min_w_nmos_); + w_p[i] = p_to_n_sz_ratio * w_n[i]; + } + + assert(num_gates <= MAX_NUMBER_GATES_STAGE); + return num_gates; +} + diff --git a/ext/mcpat/cacti/component.h b/ext/mcpat/cacti/component.h new file mode 100644 index 000000000..75e2cb075 --- /dev/null +++ b/ext/mcpat/cacti/component.h @@ -0,0 +1,84 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#ifndef __COMPONENT_H__ +#define __COMPONENT_H__ + +#include "area.h" +#include "parameter.h" + +using namespace std; + +class Crossbar; +class Bank; + +class Component +{ + public: + Component(); + ~Component(); + + Area area; + powerDef power,rt_power; + double delay; + double cycle_time; + + double compute_gate_area( + int gate_type, + int num_inputs, + double w_pmos, + 
double w_nmos, + double h_gate); + + double compute_tr_width_after_folding(double input_width, double threshold_folding_width); + double height_sense_amplifier(double pitch_sense_amp); + + protected: + int logical_effort( + int num_gates_min, + double g, + double F, + double * w_n, + double * w_p, + double C_load, + double p_to_n_sz_ratio, + bool is_dram_, + bool is_wl_tr_, + double max_w_nmos); + + private: + double compute_diffusion_width(int num_stacked_in, int num_folded_tr); +}; + +#endif + diff --git a/ext/mcpat/cacti/const.h b/ext/mcpat/cacti/const.h new file mode 100644 index 000000000..aef7d019b --- /dev/null +++ b/ext/mcpat/cacti/const.h @@ -0,0 +1,270 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef __CONST_H__ +#define __CONST_H__ + +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* The following are things you might want to change + * when compiling + */ + +/* + * Address bits in a word, and number of output bits from the cache + */ + +/* +was: #define ADDRESS_BITS 32 +now: I'm using 42 bits as in the Power4, +since that's bigger then the 36 bits on the Pentium 4 +and 40 bits on the Opteron +*/ +const int ADDRESS_BITS = 42; + +/*dt: In addition to the tag bits, the tags also include 1 valid bit, 1 dirty bit, 2 bits for a 4-state + cache coherency protocoll (MESI), 1 bit for MRU (change this to log(ways) for full LRU). 
+ So in total we have 1 + 1 + 2 + 1 = 5 */ +const int EXTRA_TAG_BITS = 5; + +/* limits on the various N parameters */ + +const unsigned int MAXDATAN = 512; // maximum for Ndwl and Ndbl +const unsigned int MAXSUBARRAYS = 1048576; // maximum subarrays for data and tag arrays +const unsigned int MAXDATASPD = 256; // maximum for Nspd +const unsigned int MAX_COL_MUX = 256; + + + +#define ROUTER_TYPES 3 +#define WIRE_TYPES 6 + +const double Cpolywire = 0; + + +/* Threshold voltages (as a proportion of Vdd) + If you don't know them, set all values to 0.5 */ +#define VTHFA1 0.452 +#define VTHFA2 0.304 +#define VTHFA3 0.420 +#define VTHFA4 0.413 +#define VTHFA5 0.405 +#define VTHFA6 0.452 +#define VSINV 0.452 +#define VTHCOMPINV 0.437 +#define VTHMUXNAND 0.548 // TODO : this constant must be revisited +#define VTHEVALINV 0.452 +#define VTHSENSEEXTDRV 0.438 + + +//WmuxdrvNANDn and WmuxdrvNANDp are no longer being used but it's part of the old +//delay_comparator function which we are using exactly as it used to be, so just setting these to 0 +const double WmuxdrvNANDn = 0; +const double WmuxdrvNANDp = 0; + + +/*===================================================================*/ +/* + * The following are things you probably wouldn't want to change. + */ + +#define BIGNUM 1e30 +#define INF 9999999 +#define MAX(a,b) (((a)>(b))?(a):(b)) +#define MIN(a,b) (((a)<(b))?(a):(b)) + +/* Used to communicate with the horowitz model */ +#define RISE 1 +#define FALL 0 +#define NCH 1 +#define PCH 0 + + +#define EPSILON 0.5 //v4.1: This constant is being used in order to fix floating point -> integer +//conversion problems that were occuring within CACTI. Typical problem that was occuring was +//that with different compilers a floating point number like 3.0 would get represented as either +//2.9999....or 3.00000001 and then the integer part of the floating point number (3.0) would +//be computed differently depending on the compiler. 
What we are doing now is to replace +//int (x) with (int) (x+EPSILON) where EPSILON is 0.5. This would fix such problems. Note that +//this works only when x is an integer >= 0. +/* + * Sheng thinks this is more a solution to solve the simple truncate problem + * (http://www.cs.tut.fi/~jkorpela/round.html) rather than the problem mentioned above. + * Unfortunately, this solution causes nasty bugs (different results when using O0 and O3). + * Moreover, round is not correct in CACTI since when an extra fraction of bit/line is needed, + * we need to provide a complete bit/line even the fraction is just 0.01. + * So, in later version than 6.5 we use (int)ceil() to get double to int conversion. + */ + +#define EPSILON2 0.1 +#define EPSILON3 0.6 + + +#define MINSUBARRAYROWS 16 //For simplicity in modeling, for the row decoding structure, we assume +//that each row predecode block is composed of at least one 2-4 decoder. When the outputs from the +//row predecode blocks are combined this means that there are at least 4*4=16 row decode outputs +#define MAXSUBARRAYROWS 262144 //Each row predecode block produces a max of 2^9 outputs. So +//the maximum number of row decode outputs will be 2^9*2^9 +#define MINSUBARRAYCOLS 2 +#define MAXSUBARRAYCOLS 262144 + + +#define INV 0 +#define NOR 1 +#define NAND 2 + + +#define NUMBER_TECH_FLAVORS 4 + +#define NUMBER_INTERCONNECT_PROJECTION_TYPES 2 //aggressive and conservative +//0 = Aggressive projections, 1 = Conservative projections +#define NUMBER_WIRE_TYPES 4 //local, semi-global and global +//1 = 'Semi-global' wire type, 2 = 'Global' wire type + + +const int dram_cell_tech_flavor = 3; + + +#define VBITSENSEMIN 0.08 //minimum bitline sense voltage is fixed to be 80 mV. 
+ +#define fopt 4.0 + +#define INPUT_WIRE_TO_INPUT_GATE_CAP_RATIO 0 +#define BUFFER_SEPARATION_LENGTH_MULTIPLIER 1 +#define NUMBER_MATS_PER_REDUNDANT_MAT 8 + +#define NUMBER_STACKED_DIE_LAYERS 1 + +// this variable can be set to carry out solution optimization for +// a maximum area allocation. +#define STACKED_DIE_LAYER_ALLOTED_AREA_mm2 0 //6.24 //6.21//71.5 + +// this variable can also be employed when solution optimization +// with maximum area allocation is carried out. +#define MAX_PERCENT_AWAY_FROM_ALLOTED_AREA 50 + +// this variable can also be employed when solution optimization +// with maximum area allocation is carried out. +#define MIN_AREA_EFFICIENCY 20 + +// this variable can be employed when solution with a desired +// aspect ratio is required. +#define STACKED_DIE_LAYER_ASPECT_RATIO 1 + +// this variable can be employed when solution with a desired +// aspect ratio is required. +#define MAX_PERCENT_AWAY_FROM_ASPECT_RATIO 101 + +// this variable can be employed to carry out solution optimization +// for a certain target random cycle time. +#define TARGET_CYCLE_TIME_ns 1000000000 + +#define NUMBER_PIPELINE_STAGES 4 + +// this can be used to model the length of interconnect +// between a bank and a crossbar +#define LENGTH_INTERCONNECT_FROM_BANK_TO_CROSSBAR 0 //3791 // 2880//micron + +#define IS_CROSSBAR 0 +#define NUMBER_INPUT_PORTS_CROSSBAR 8 +#define NUMBER_OUTPUT_PORTS_CROSSBAR 8 +#define NUMBER_SIGNALS_PER_PORT_CROSSBAR 256 + + +#define MAT_LEAKAGE_REDUCTION_DUE_TO_SLEEP_TRANSISTORS_FACTOR 1 +#define LEAKAGE_REDUCTION_DUE_TO_LONG_CHANNEL_HP_TRANSISTORS_FACTOR 1 + +#define PAGE_MODE 0 + +#define MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA 60 +// We are actually not using this variable in the CACTI code. We just want to acknowledge that +// this current should be multiplied by the DDR(n) system VDD value to compute the standby power +// consumed during precharge. 
+ + +const double VDD_STORAGE_LOSS_FRACTION_WORST = 0.125; +const double CU_RESISTIVITY = 0.022; //ohm-micron +const double BULK_CU_RESISTIVITY = 0.018; //ohm-micron +const double PERMITTIVITY_FREE_SPACE = 8.854e-18; //F/micron + +const static uint32_t sram_num_cells_wl_stitching_ = 16; +const static uint32_t dram_num_cells_wl_stitching_ = 64; +const static uint32_t comm_dram_num_cells_wl_stitching_ = 256; +const static double num_bits_per_ecc_b_ = 8.0; + +const double bit_to_byte = 8.0; + +#define MAX_NUMBER_GATES_STAGE 20 +#define MAX_NUMBER_HTREE_NODES 20 +#define NAND2_LEAK_STACK_FACTOR 0.2 +#define NAND3_LEAK_STACK_FACTOR 0.2 +#define NOR2_LEAK_STACK_FACTOR 0.2 +#define INV_LEAK_STACK_FACTOR 0.5 +#define MAX_NUMBER_ARRAY_PARTITIONS 1000000 + +// abbreviations used in this project +// ---------------------------------- +// +// num : number +// rw : read/write +// rd : read +// wr : write +// se : single-ended +// sz : size +// F : feature +// w : width +// h : height or horizontal +// v : vertical or velocity + + +enum ram_cell_tech_type_num +{ + itrs_hp = 0, + itrs_lstp = 1, + itrs_lop = 2, + lp_dram = 3, + comm_dram = 4 +}; + +const double pppm[4] = {1,1,1,1}; +const double pppm_lkg[4] = {0,1,1,0}; +const double pppm_dyn[4] = {1,0,0,0}; +const double pppm_Isub[4] = {0,1,0,0}; +const double pppm_Ig[4] = {0,0,1,0}; +const double pppm_sc[4] = {0,0,0,1}; + + + +#endif diff --git a/ext/mcpat/cacti/contention.dat b/ext/mcpat/cacti/contention.dat new file mode 100755 index 000000000..826553e7e --- /dev/null +++ b/ext/mcpat/cacti/contention.dat @@ -0,0 +1,126 @@ +l34c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l34c64l2b: 9 11 19 29 43 62 81 102 +l34c64l4b: 6 8 12 17 24 29 39 47 +l34c64l8b: 7 8 10 14 18 22 25 30 +l34c64l16b: 7 7 9 12 14 17 20 24 +l34c64l32b: 7 7 9 12 14 17 20 24 -r +l34c64l64b: 7 7 9 12 14 17 20 24 -r +l34c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l34c128l2b: 4 10 19 30 44 64 82 103 +l34c128l4b: 3 6 11 17 24 31 38 47 +l34c128l8b: 3 5 9 13 
17 21 25 29 +l34c128l16b: 4 5 7 10 13 16 19 22 +l34c128l32b: 4 5 7 10 13 16 19 22 -r +l34c128l64b: 4 5 7 10 13 16 19 22 -r +l34c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l34c256l2b: 3 10 19 30 44 63 82 103 +l34c256l4b: 3 6 11 17 24 31 38 47 +l34c256l8b: 2 5 8 12 16 20 24 29 +l34c256l16b: 2 4 7 9 12 15 18 21 +l34c256l32b: 2 4 7 9 12 15 18 21 -r +l34c256l64b: 2 4 7 9 12 15 18 21 -r +l38c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l38c64l2b: 57 59 77 90 137 187 219 245 +l38c64l4b: 35 40 48 56 43 61 80 101 +l38c64l8b: 18 27 41 45 52 58 58 58 -r +l38c64l16b: 16 17 19 35 40 49 53 53 -r +l38c64l32b: 15 15 17 19 22 25 30 30 -r +l38c64l64b: 15 15 17 19 22 25 30 30 -r +l38c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l38c128l2b: 38 50 78 93 139 188 220 245 +l38c128l4b: 29 37 46 56 43 61 81 102 +l38c128l8b: 16 30 39 44 50 57 57 57 -r +l38c128l16b: 14 16 19 33 40 47 52 52 -r +l38c128l32b: 14 15 17 20 23 27 31 31 -r +l38c128l64b: 14 15 17 20 23 27 31 31 -r +l38c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l38c256l2b: 35 50 78 94 139 188 220 246 +l38c256l4b: 28 36 45 55 55 61 81 102 +l38c256l8b: 17 30 38 43 50 57 57 57 -r +l38c256l16b: 15 17 21 32 40 47 51 51 +l38c256l32b: 15 17 19 21 24 29 33 33 +l38c256l64b: 15 17 19 21 24 29 33 33 -r +l316c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l316c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l316c64l4b: 34 35 78 126 178 220 252 274 +l316c64l8b: 9 11 23 43 62 87 105 130 +l316c64l16b: 7 9 13 23 33 45 56 67 +l316c64l32b: 5 6 7 10 13 19 25 30 +l316c64l64b: 4 5 6 8 10 14 18 21 +l316c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l316c128l2b: 25 131 243 1000 1000 1000 1000 1000 +l316c128l4b: 8 28 79 127 179 221 253 274 +l316c128l8b: 4 9 22 43 62 88 106 131 +l316c128l16b: 4 6 11 21 32 44 55 67 +l316c128l32b: 4 6 11 12 12 18 24 29 +l316c128l64b: 2 3 5 7 9 13 17 21 +l316c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l316c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l316c256l4b: 5 28 80 128 180 221 253 274 
+l316c256l8b: 3 8 22 43 63 88 107 131 +l316c256l16b: 2 5 11 21 32 44 55 67 +l316c256l32b: 2 3 5 8 12 18 24 29 +l316c256l64b: 2 3 4 6 9 13 17 21 +l24c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l24c64l2b: 10 12 24 41 60 86 105 122 +l24c64l4b: 5 7 13 20 29 38 47 56 +l24c64l8b: 5 6 9 14 18 24 29 35 +l24c64l16b: 4 5 7 10 12 16 19 22 +l24c64l32b: 5 5 6 8 10 12 14 17 +l24c64l64b: 5 5 6 8 10 12 14 16 +l24c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l24c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l24c128l4b: 3 7 13 20 29 38 47 57 +l24c128l8b: 3 5 9 13 18 23 29 35 +l24c128l16b: 3 4 6 9 12 15 19 22 +l24c128l32b: 3 4 5 7 9 11 14 16 +l24c128l64b: 1000 1000 1000 1000 1000 1000 1000 1000 +l24c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l24c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l24c256l4b: 2 6 13 20 29 38 47 57 +l24c256l8b: 2 4 8 13 18 23 28 35 +l24c256l16b: 2 3 6 8 11 15 18 22 +l24c256l32b: 2 3 5 6 8 11 14 16 +l24c256l64b: 1000 1000 1000 1000 1000 1000 1000 1000 +l28c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l28c64l2b: 46 52 117 157 188 225 246 261 +l28c64l4b: 19 25 39 54 96 107 120 150 +l28c64l8b: 9 12 21 30 39 47 58 79 +l28c64l16b: 8 9 11 16 25 32 37 42 +l28c64l32b: 7 8 9 11 14 19 23 28 +l28c64l64b: 7 7 8 10 12 14 18 22 +l28c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l28c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l28c128l4b: 12 22 39 54 98 108 130 151 +l28c128l8b: 7 12 21 30 39 48 59 80 +l28c128l16b: 6 8 11 16 24 31 37 42 +l28c128l32b: 6 7 9 11 14 19 24 28 +l28c128l64b: 6 7 9 11 14 19 24 28 +l28c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l28c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l28c256l4b: 12 22 39 54 100 108 130 152 +l28c256l8b: 7 12 21 30 39 48 59 81 +l28c256l16b: 6 8 11 16 24 31 37 42 +l28c256l32b: 6 7 9 11 14 19 24 28 +l28c256l64b: 6 7 9 11 14 19 24 28 +l216c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l216c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l216c64l4b: 34 35 78 126 178 220 252 274 
+l216c64l8b: 9 11 23 43 62 87 105 130 +l216c64l16b: 7 9 13 23 33 45 56 67 +l216c64l32b: 5 6 7 10 13 19 25 30 +l216c64l64b: 4 5 6 8 10 14 18 21 +l216c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l216c128l2b: 25 131 243 1000 1000 1000 1000 1000 +l216c128l4b: 8 28 79 127 179 221 253 274 +l216c128l8b: 4 9 22 43 62 88 106 131 +l216c128l16b: 4 6 11 21 32 44 55 67 +l216c128l32b: 4 6 11 12 12 18 24 29 +l216c128l64b: 2 3 5 7 9 13 17 21 +l216c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000 +l216c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000 +l216c256l4b: 5 28 80 128 180 221 253 274 +l216c256l8b: 3 8 22 43 63 88 107 131 +l216c256l16b: 2 5 11 21 32 44 55 67 +l216c256l32b: 2 3 5 8 12 18 24 29 +l216c256l64b: 2 3 4 6 9 13 17 21 diff --git a/ext/mcpat/cacti/crossbar.cc b/ext/mcpat/cacti/crossbar.cc new file mode 100644 index 000000000..a3d8532d5 --- /dev/null +++ b/ext/mcpat/cacti/crossbar.cc @@ -0,0 +1,161 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include "crossbar.h" + +#define ASPECT_THRESHOLD .8 +#define ADJ 1 + +Crossbar::Crossbar( + double n_inp_, + double n_out_, + double flit_size_, + TechnologyParameter::DeviceType *dt + ):n_inp(n_inp_), n_out(n_out_), flit_size(flit_size_), deviceType(dt) +{ + min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_; + Vdd = dt->Vdd; + CB_ADJ = 1; +} + +Crossbar::~Crossbar(){} + +double Crossbar::output_buffer() +{ + + //Wire winit(4, 4); + double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch; + Wire w1(g_ip->wt, l_eff); + //double s1 = w1.repeater_size *l_eff*ADJ/w1.repeater_spacing; + double s1 = w1.repeater_size * (l_eff <w1.repeater_spacing? 
l_eff *ADJ/w1.repeater_spacing : ADJ); + double pton_size = deviceType->n_to_p_eff_curr_drv_ratio; + // the model assumes input capacitance of the wire driver = input capacitance of nand + nor = input cap of the driver transistor + TriS1 = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size); + TriS2 = s1; //driver transistor + + if (TriS1 < 1) + TriS1 = 1; + + double input_cap = gate_C(TriS1*(2*min_w_pmos + g_tp.min_w_nmos_), 0) + + gate_C(TriS1*(min_w_pmos + 2*g_tp.min_w_nmos_), 0); +// input_cap += drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + +// drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 + +// gate_C(TriS2*g_tp.min_w_nmos_, 0)+ +// drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 + +// drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + +// gate_C(TriS2*min_w_pmos, 0); + tri_int_cap = drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 + + gate_C(TriS2*g_tp.min_w_nmos_, 0)+ + drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 + + drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(TriS2*min_w_pmos, 0); + double output_cap = drain_C_(TriS2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(TriS2*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def); + double ctr_cap = gate_C(TriS2 *(min_w_pmos + g_tp.min_w_nmos_), 0); + + tri_inp_cap = input_cap; + tri_out_cap = output_cap; + tri_ctr_cap = ctr_cap; + return input_cap + output_cap + ctr_cap; +} + +void Crossbar::compute_power() +{ + + Wire winit(4, 4); + double tri_cap = output_buffer(); + assert(tri_cap > 0); + //area of a tristate logic + double g_area = compute_gate_area(INV, 1, TriS2*g_tp.min_w_nmos_, TriS2*min_w_pmos, g_tp.cell_h_def); + g_area *= 2; // to model area of output transistors + g_area += compute_gate_area (NAND, 2, TriS1*2*g_tp.min_w_nmos_, TriS1*min_w_pmos, g_tp.cell_h_def); + g_area += compute_gate_area (NOR, 2, TriS1*g_tp.min_w_nmos_, TriS1*2*min_w_pmos, 
g_tp.cell_h_def); + double width /*per tristate*/ = g_area/(CB_ADJ * g_tp.cell_h_def); + // effective no. of tristate buffers that need to be laid side by side + int ntri = (int)ceil(g_tp.cell_h_def/(g_tp.wire_outside_mat.pitch)); + double wire_len = MAX(width*ntri*n_out, flit_size*g_tp.wire_outside_mat.pitch*n_out); + Wire w1(g_ip->wt, wire_len); + + area.w = wire_len; + area.h = g_tp.wire_outside_mat.pitch*n_inp*flit_size * CB_ADJ; + Wire w2(g_ip->wt, area.h); + + double aspect_ratio_cb = (area.h/area.w)*(n_out/n_inp); + if (aspect_ratio_cb > 1) aspect_ratio_cb = 1/aspect_ratio_cb; + + if (aspect_ratio_cb < ASPECT_THRESHOLD) { + if (n_out > 2 && n_inp > 2) { + CB_ADJ+=0.2; + //cout << "CB ADJ " << CB_ADJ << endl; + if (CB_ADJ < 4) { + this->compute_power(); + } + } + } + + + + power.readOp.dynamic = (w1.power.readOp.dynamic + w2.power.readOp.dynamic + (tri_inp_cap * n_out + tri_out_cap * n_inp + tri_ctr_cap + tri_int_cap) * Vdd*Vdd)*flit_size; + power.readOp.leakage = n_inp * n_out * flit_size * ( + cmos_Isub_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+ + cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+ + cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+ + w1.power.readOp.leakage + w2.power.readOp.leakage); + power.readOp.gate_leakage = n_inp * n_out * flit_size * ( + cmos_Ig_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+ + cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+ + cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+ + w1.power.readOp.gate_leakage + w2.power.readOp.gate_leakage); + + // delay calculation + double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch; + Wire wdriver(g_ip->wt, l_eff); + double res = g_tp.wire_outside_mat.R_per_um * (area.w+area.h) + tr_R_on(g_tp.min_w_nmos_*wdriver.repeater_size, NCH, 1); + double cap = g_tp.wire_outside_mat.C_per_um * (area.w + area.h) + n_out*tri_inp_cap + 
n_inp*tri_out_cap; + delay = horowitz(w1.signal_rise_time(), res*cap, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE); + + /* NOTE(review): the statement below is parsed as a declaration of a function named wreset returning Wire (most vexing parse); it constructs no Wire object. Presumably it was meant to undo the effect of `Wire winit(4, 4)` at the top of this function -- confirm against the Wire constructors before changing it. */ + Wire wreset(); +} + +void Crossbar::print_crossbar() +{ + cout << "\nCrossbar Stats (" << n_inp << "x" << n_out << ")\n\n"; + cout << "Flit size : " << flit_size << " bits" << endl; + cout << "Width : " << area.w << " u" << endl; + cout << "Height : " << area.h << " u" << endl; + cout << "Dynamic Power : " << power.readOp.dynamic*1e9 * MIN(n_inp, n_out) << " (nJ)" << endl; + cout << "Leakage Power : " << power.readOp.leakage*1e3 << " (mW)" << endl; + cout << "Gate Leakage Power : " << power.readOp.gate_leakage*1e3 << " (mW)" << endl; + cout << "Crossbar Delay : " << delay*1e12 << " ps\n"; +} + + diff --git a/ext/mcpat/cacti/crossbar.h b/ext/mcpat/cacti/crossbar.h new file mode 100644 index 000000000..3b926517c --- /dev/null +++ b/ext/mcpat/cacti/crossbar.h @@ -0,0 +1,85 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission.
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#ifndef __CROSSBAR__ +#define __CROSSBAR__ + +#include <assert.h> + +#include <iostream> + +#include "basic_circuit.h" +#include "cacti_interface.h" +#include "component.h" +#include "mat.h" +#include "parameter.h" +#include "wire.h" + +class Crossbar : public Component +{ + public: + Crossbar( + double in, + double out, + double flit_sz, + TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)); + ~Crossbar(); + + void print_crossbar(); + double output_buffer(); + void compute_power(); + + double n_inp, n_out; + double flit_size; + double tri_inp_cap, tri_out_cap, tri_ctr_cap, tri_int_cap; + + private: + double CB_ADJ; + /* + * Adjust factor of the height of the cross-point (tri-state buffer) cell (layout) in crossbar + * buffer is adjusted to get an aspect ratio of whole cross bar close to one; + * when adjust the ratio, the number of wires route over the tri-state buffers does not change, + * however, the effective wiring pitch changes. 
Specifically, since CB_ADJ will increase + * during the adjust, the tri-state buffer will become taller and thiner, and the effective wiring pitch + * will increase. As a result, the height of the crossbar (area.h) will increase. + */ + + TechnologyParameter::DeviceType *deviceType; + double TriS1, TriS2; + double min_w_pmos, Vdd; + +}; + + + + +#endif diff --git a/ext/mcpat/cacti/decoder.cc b/ext/mcpat/cacti/decoder.cc new file mode 100644 index 000000000..0de6f6157 --- /dev/null +++ b/ext/mcpat/cacti/decoder.cc @@ -0,0 +1,1577 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <cassert> +#include <cmath> +#include <iostream> + +#include "area.h" +#include "decoder.h" +#include "parameter.h" + +using namespace std; + + +Decoder::Decoder( + int _num_dec_signals, + bool flag_way_select, + double _C_ld_dec_out, + double _R_wire_dec_out, + bool fully_assoc_, + bool is_dram_, + bool is_wl_tr_, + const Area & cell_) +:exist(false), + C_ld_dec_out(_C_ld_dec_out), + R_wire_dec_out(_R_wire_dec_out), + num_gates(0), num_gates_min(2), + delay(0), + //power(), + fully_assoc(fully_assoc_), is_dram(is_dram_), + is_wl_tr(is_wl_tr_), cell(cell_) +{ + + for (int i = 0; i < MAX_NUMBER_GATES_STAGE; i++) + { + w_dec_n[i] = 0; + w_dec_p[i] = 0; + } + + /* + * _num_dec_signals is the number of decoded signal as output + * num_addr_bits_dec is the number of signal to be decoded + * as the decoders input. 
+ */ + int num_addr_bits_dec = _log2(_num_dec_signals); + + if (num_addr_bits_dec < 4) + { + if (flag_way_select) + { + exist = true; + num_in_signals = 2; + } + else + { + num_in_signals = 0; + } + } + else + { + exist = true; + + if (flag_way_select) + { + num_in_signals = 3; + } + else + { + num_in_signals = 2; + } + } + + assert(cell.h>0); + assert(cell.w>0); + // the height of a row-decoder-driver cell is fixed to be 4 * cell.h; + //area.h = 4 * cell.h; + area.h = g_tp.h_dec * cell.h; + + compute_widths(); + compute_area(); +} + + + +void Decoder::compute_widths() +{ + double F; + double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram, is_wl_tr); + double gnand2 = (2 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio); + double gnand3 = (3 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio); + + if (exist) + { + if (num_in_signals == 2 || fully_assoc) + { + w_dec_n[0] = 2 * g_tp.min_w_nmos_; + w_dec_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F = gnand2; + } + else + { + w_dec_n[0] = 3 * g_tp.min_w_nmos_; + w_dec_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F = gnand3; + } + + F *= C_ld_dec_out / (gate_C(w_dec_n[0], 0, is_dram, false, is_wl_tr) + + gate_C(w_dec_p[0], 0, is_dram, false, is_wl_tr)); + num_gates = logical_effort( + num_gates_min, + num_in_signals == 2 ? 
gnand2 : gnand3, + F, + w_dec_n, + w_dec_p, + C_ld_dec_out, + p_to_n_sz_ratio, + is_dram, + is_wl_tr, + g_tp.max_w_nmos_dec); + } +} + + + +/* Computes the decoder driver's width (area.w) and its subthreshold / gate leakage power by summing the first-stage NAND gate and every following inverter stage; does nothing when the decoder does not exist. */ +void Decoder::compute_area() +{ + double cumulative_area = 0; + double cumulative_curr = 0; // cumulative subthreshold leakage current + double cumulative_curr_Ig = 0; // cumulative gate leakage current + + if (exist) + { // First check if this decoder exists + if (num_in_signals == 2) + { + cumulative_area = compute_gate_area(NAND, 2, w_dec_p[0], w_dec_n[0], area.h); + cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram); + cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram); + } + else if (num_in_signals == 3) + { + cumulative_area = compute_gate_area(NAND, 3, w_dec_p[0], w_dec_n[0], area.h); + cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);; + cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram); + } + + for (int i = 1; i < num_gates; i++) + { + cumulative_area += compute_gate_area(INV, 1, w_dec_p[i], w_dec_n[i], area.h); + cumulative_curr += cmos_Isub_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram); + /* accumulate gate leakage over every inverter stage, exactly like the area and subthreshold terms above */ + cumulative_curr_Ig += cmos_Ig_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram); + } + power.readOp.leakage = cumulative_curr * g_tp.peri_global.Vdd; + power.readOp.gate_leakage = cumulative_curr_Ig * g_tp.peri_global.Vdd; + + area.w = (cumulative_area / area.h); + } +} + + + +double Decoder::compute_delays(double inrisetime) +{ + if (exist) + { + double ret_val = 0; // outrisetime + int i; + double rd, tf, this_delay, c_load, c_intrinsic, Vpp; + double Vdd = g_tp.peri_global.Vdd; + + if ((is_wl_tr) && (is_dram)) + { + Vpp = g_tp.vpp; + } + else if (is_wl_tr) + { + Vpp = g_tp.sram_cell.Vdd; + } + else + { + Vpp = g_tp.peri_global.Vdd; + } + + // first check whether a decoder is required at all + rd = tr_R_on(w_dec_n[0], NCH, num_in_signals, is_dram, false, is_wl_tr); + c_load = gate_C(w_dec_n[1] + w_dec_p[1], 0.0, is_dram, false, is_wl_tr);
+ c_intrinsic = drain_C_(w_dec_p[0], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) * num_in_signals + + drain_C_(w_dec_n[0], NCH, num_in_signals, 1, area.h, is_dram, false, is_wl_tr); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay += this_delay; + inrisetime = this_delay / (1.0 - 0.5); + power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd; + + for (i = 1; i < num_gates - 1; ++i) + { + rd = tr_R_on(w_dec_n[i], NCH, 1, is_dram, false, is_wl_tr); + c_load = gate_C(w_dec_p[i+1] + w_dec_n[i+1], 0.0, is_dram, false, is_wl_tr); + c_intrinsic = drain_C_(w_dec_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) + + drain_C_(w_dec_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay += this_delay; + inrisetime = this_delay / (1.0 - 0.5); + power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd; + } + + // add delay of final inverter that drives the wordline + i = num_gates - 1; + c_load = C_ld_dec_out; + rd = tr_R_on(w_dec_n[i], NCH, 1, is_dram, false, is_wl_tr); + c_intrinsic = drain_C_(w_dec_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) + + drain_C_(w_dec_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr); + tf = rd * (c_intrinsic + c_load) + R_wire_dec_out * c_load / 2; + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay += this_delay; + ret_val = this_delay / (1.0 - 0.5); + power.readOp.dynamic += c_load * Vpp * Vpp + c_intrinsic * Vdd * Vdd; + + return ret_val; + } + else + { + return 0.0; + } +} + +void Decoder::leakage_feedback(double temperature) +{ + double cumulative_curr = 0; // cumulative leakage current + double cumulative_curr_Ig = 0; // cumulative leakage current + + if (exist) + { // First check if this decoder exists + if (num_in_signals == 2) + { + cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram); + cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], 
w_dec_p[0], 2, nand,is_dram); + } + else if (num_in_signals == 3) + { + cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);; + cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram); + } + + for (int i = 1; i < num_gates; i++) + { + cumulative_curr += cmos_Isub_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram); + /* accumulate gate leakage over every inverter stage, matching the subthreshold term above, so the total covers the NAND stage plus all inverters */ + cumulative_curr_Ig += cmos_Ig_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram); + } + + power.readOp.leakage = cumulative_curr * g_tp.peri_global.Vdd; + power.readOp.gate_leakage = cumulative_curr_Ig * g_tp.peri_global.Vdd; + } +} + +PredecBlk::PredecBlk( + int num_dec_signals, + Decoder * dec_, + double C_wire_predec_blk_out, + double R_wire_predec_blk_out_, + int num_dec_per_predec, + bool is_dram, + bool is_blk1) + :dec(dec_), + exist(false), + number_input_addr_bits(0), + C_ld_predec_blk_out(0), + R_wire_predec_blk_out(0), + branch_effort_nand2_gate_output(1), + branch_effort_nand3_gate_output(1), + flag_two_unique_paths(false), + flag_L2_gate(0), + number_inputs_L1_gate(0), + number_gates_L1_nand2_path(0), + number_gates_L1_nand3_path(0), + number_gates_L2(0), + min_number_gates_L1(2), + min_number_gates_L2(2), + num_L1_active_nand2_path(0), + num_L1_active_nand3_path(0), + delay_nand2_path(0), + delay_nand3_path(0), + power_nand2_path(), + power_nand3_path(), + power_L2(), + is_dram_(is_dram) +{ + int branch_effort_predec_out; + double C_ld_dec_gate; + int num_addr_bits_dec = _log2(num_dec_signals); + int blk1_num_input_addr_bits = (num_addr_bits_dec + 1) / 2; + int blk2_num_input_addr_bits = num_addr_bits_dec - blk1_num_input_addr_bits; + + w_L1_nand2_n[0] = 0; + w_L1_nand2_p[0] = 0; + w_L1_nand3_n[0] = 0; + w_L1_nand3_p[0] = 0; + + if (is_blk1 == true) + { + if (num_addr_bits_dec <= 0) + { + return; + } + else if (num_addr_bits_dec < 4) + { + // Just one predecoder block is required with NAND2 gates. No decoder required.
+ // The first level of predecoding directly drives the decoder output load + exist = true; + number_input_addr_bits = num_addr_bits_dec; + R_wire_predec_blk_out = dec->R_wire_dec_out; + C_ld_predec_blk_out = dec->C_ld_dec_out; + } + else + { + exist = true; + number_input_addr_bits = blk1_num_input_addr_bits; + branch_effort_predec_out = (1 << blk2_num_input_addr_bits); + C_ld_dec_gate = num_dec_per_predec * gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_, false, false); + R_wire_predec_blk_out = R_wire_predec_blk_out_; + C_ld_predec_blk_out = branch_effort_predec_out * C_ld_dec_gate + C_wire_predec_blk_out; + } + } + else + { + if (num_addr_bits_dec >= 4) + { + exist = true; + number_input_addr_bits = blk2_num_input_addr_bits; + branch_effort_predec_out = (1 << blk1_num_input_addr_bits); + C_ld_dec_gate = num_dec_per_predec * gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_, false, false); + R_wire_predec_blk_out = R_wire_predec_blk_out_; + C_ld_predec_blk_out = branch_effort_predec_out * C_ld_dec_gate + C_wire_predec_blk_out; + } + } + + compute_widths(); + compute_area(); +} + + + +void PredecBlk::compute_widths() +{ + double F, c_load_nand3_path, c_load_nand2_path; + double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_); + double gnand2 = (2 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio); + double gnand3 = (3 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio); + + if (exist == false) return; + + + switch (number_input_addr_bits) + { + case 1: + flag_two_unique_paths = false; + number_inputs_L1_gate = 2; + flag_L2_gate = 0; + break; + case 2: + flag_two_unique_paths = false; + number_inputs_L1_gate = 2; + flag_L2_gate = 0; + break; + case 3: + flag_two_unique_paths = false; + number_inputs_L1_gate = 3; + flag_L2_gate = 0; + break; + case 4: + flag_two_unique_paths = false; + number_inputs_L1_gate = 2; + flag_L2_gate = 2; + branch_effort_nand2_gate_output = 4; + break; + case 5: + flag_two_unique_paths = true; + flag_L2_gate = 2; + 
branch_effort_nand2_gate_output = 8; + branch_effort_nand3_gate_output = 4; + break; + case 6: + flag_two_unique_paths = false; + number_inputs_L1_gate = 3; + flag_L2_gate = 2; + branch_effort_nand3_gate_output = 8; + break; + case 7: + flag_two_unique_paths = true; + flag_L2_gate = 3; + branch_effort_nand2_gate_output = 32; + branch_effort_nand3_gate_output = 16; + break; + case 8: + flag_two_unique_paths = true; + flag_L2_gate = 3; + branch_effort_nand2_gate_output = 64; + branch_effort_nand3_gate_output = 32; + break; + case 9: + flag_two_unique_paths = false; + number_inputs_L1_gate = 3; + flag_L2_gate = 3; + branch_effort_nand3_gate_output = 64; + break; + default: + assert(0); + break; + } + + // find the number of gates and sizing in second level of predecoder (if there is a second level) + if (flag_L2_gate) + { + if (flag_L2_gate == 2) + { // 2nd level is a NAND2 gate + w_L2_n[0] = 2 * g_tp.min_w_nmos_; + F = gnand2; + } + else + { // 2nd level is a NAND3 gate + w_L2_n[0] = 3 * g_tp.min_w_nmos_; + F = gnand3; + } + w_L2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F *= C_ld_predec_blk_out / (gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_)); + number_gates_L2 = logical_effort( + min_number_gates_L2, + flag_L2_gate == 2 ? gnand2 : gnand3, + F, + w_L2_n, + w_L2_p, + C_ld_predec_blk_out, + p_to_n_sz_ratio, + is_dram_, false, + g_tp.max_w_nmos_); + + // Now find the number of gates and widths in first level of predecoder + if ((flag_two_unique_paths)||(number_inputs_L1_gate == 2)) + { // Whenever flag_two_unique_paths is true, it means first level of decoder employs + // both NAND2 and NAND3 gates. 
Or when number_inputs_L1_gate is 2, it means + // a NAND2 gate is used in the first level of the predecoder + c_load_nand2_path = branch_effort_nand2_gate_output * + (gate_C(w_L2_n[0], 0, is_dram_) + + gate_C(w_L2_p[0], 0, is_dram_)); + w_L1_nand2_n[0] = 2 * g_tp.min_w_nmos_; + w_L1_nand2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F = gnand2 * c_load_nand2_path / + (gate_C(w_L1_nand2_n[0], 0, is_dram_) + + gate_C(w_L1_nand2_p[0], 0, is_dram_)); + number_gates_L1_nand2_path = logical_effort( + min_number_gates_L1, + gnand2, + F, + w_L1_nand2_n, + w_L1_nand2_p, + c_load_nand2_path, + p_to_n_sz_ratio, + is_dram_, false, + g_tp.max_w_nmos_); + } + + //Now find widths of gates along path in which first gate is a NAND3 + if ((flag_two_unique_paths)||(number_inputs_L1_gate == 3)) + { // Whenever flag_two_unique_paths is TRUE, it means first level of decoder employs + // both NAND2 and NAND3 gates. Or when number_inputs_L1_gate is 3, it means + // a NAND3 gate is used in the first level of the predecoder + c_load_nand3_path = branch_effort_nand3_gate_output * + (gate_C(w_L2_n[0], 0, is_dram_) + + gate_C(w_L2_p[0], 0, is_dram_)); + w_L1_nand3_n[0] = 3 * g_tp.min_w_nmos_; + w_L1_nand3_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F = gnand3 * c_load_nand3_path / + (gate_C(w_L1_nand3_n[0], 0, is_dram_) + + gate_C(w_L1_nand3_p[0], 0, is_dram_)); + number_gates_L1_nand3_path = logical_effort( + min_number_gates_L1, + gnand3, + F, + w_L1_nand3_n, + w_L1_nand3_p, + c_load_nand3_path, + p_to_n_sz_ratio, + is_dram_, false, + g_tp.max_w_nmos_); + } + } + else + { // find number of gates and widths in first level of predecoder block when there is no second level + if (number_inputs_L1_gate == 2) + { + w_L1_nand2_n[0] = 2 * g_tp.min_w_nmos_; + w_L1_nand2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F = gnand2*C_ld_predec_blk_out / + (gate_C(w_L1_nand2_n[0], 0, is_dram_) + + gate_C(w_L1_nand2_p[0], 0, is_dram_)); + number_gates_L1_nand2_path = logical_effort( + min_number_gates_L1, + 
gnand2, + F, + w_L1_nand2_n, + w_L1_nand2_p, + C_ld_predec_blk_out, + p_to_n_sz_ratio, + is_dram_, false, + g_tp.max_w_nmos_); + } + else if (number_inputs_L1_gate == 3) + { + w_L1_nand3_n[0] = 3 * g_tp.min_w_nmos_; + w_L1_nand3_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_; + F = gnand3*C_ld_predec_blk_out / + (gate_C(w_L1_nand3_n[0], 0, is_dram_) + + gate_C(w_L1_nand3_p[0], 0, is_dram_)); + number_gates_L1_nand3_path = logical_effort( + min_number_gates_L1, + gnand3, + F, + w_L1_nand3_n, + w_L1_nand3_p, + C_ld_predec_blk_out, + p_to_n_sz_ratio, + is_dram_, false, + g_tp.max_w_nmos_); + } + } +} + + + +void PredecBlk::compute_area() +{ + if (exist) + { // First check whether a predecoder block is needed + int num_L1_nand2 = 0; + int num_L1_nand3 = 0; + int num_L2 = 0; + double tot_area_L1_nand3 =0; + double leak_L1_nand3 =0; + double gate_leak_L1_nand3 =0; + + double tot_area_L1_nand2 = compute_gate_area(NAND, 2, w_L1_nand2_p[0], w_L1_nand2_n[0], g_tp.cell_h_def); + double leak_L1_nand2 = cmos_Isub_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_); + double gate_leak_L1_nand2 = cmos_Ig_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_); + if (number_inputs_L1_gate != 3) { + tot_area_L1_nand3 = 0; + leak_L1_nand3 = 0; + gate_leak_L1_nand3 =0; + } + else { + tot_area_L1_nand3 = compute_gate_area(NAND, 3, w_L1_nand3_p[0], w_L1_nand3_n[0], g_tp.cell_h_def); + leak_L1_nand3 = cmos_Isub_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand); + gate_leak_L1_nand3 = cmos_Ig_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand); + } + + switch (number_input_addr_bits) + { + case 1: //2 NAND2 gates + num_L1_nand2 = 2; + num_L2 = 0; + num_L1_active_nand2_path =1; + num_L1_active_nand3_path =0; + break; + case 2: //4 NAND2 gates + num_L1_nand2 = 4; + num_L2 = 0; + num_L1_active_nand2_path =1; + num_L1_active_nand3_path =0; + break; + case 3: //8 NAND3 gates + num_L1_nand3 = 8; + num_L2 = 0; + num_L1_active_nand2_path =0; + num_L1_active_nand3_path =1; + break; + 
case 4: //4 + 4 NAND2 gates + num_L1_nand2 = 8; + num_L2 = 16; + num_L1_active_nand2_path =2; + num_L1_active_nand3_path =0; + break; + case 5: //4 NAND2 gates, 8 NAND3 gates + num_L1_nand2 = 4; + num_L1_nand3 = 8; + num_L2 = 32; + num_L1_active_nand2_path =1; + num_L1_active_nand3_path =1; + break; + case 6: //8 + 8 NAND3 gates + num_L1_nand3 = 16; + num_L2 = 64; + num_L1_active_nand2_path =0; + num_L1_active_nand3_path =2; + break; + case 7: //4 + 4 NAND2 gates, 8 NAND3 gates + num_L1_nand2 = 8; + num_L1_nand3 = 8; + num_L2 = 128; + num_L1_active_nand2_path =2; + num_L1_active_nand3_path =1; + break; + case 8: //4 NAND2 gates, 8 + 8 NAND3 gates + num_L1_nand2 = 4; + num_L1_nand3 = 16; + num_L2 = 256; + num_L1_active_nand2_path =2; + num_L1_active_nand3_path =2; + break; + case 9: //8 + 8 + 8 NAND3 gates + num_L1_nand3 = 24; + num_L2 = 512; + num_L1_active_nand2_path =0; + num_L1_active_nand3_path =3; + break; + default: + break; + } + + for (int i = 1; i < number_gates_L1_nand2_path; ++i) + { + tot_area_L1_nand2 += compute_gate_area(INV, 1, w_L1_nand2_p[i], w_L1_nand2_n[i], g_tp.cell_h_def); + leak_L1_nand2 += cmos_Isub_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_); + gate_leak_L1_nand2 += cmos_Ig_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_); + } + tot_area_L1_nand2 *= num_L1_nand2; + leak_L1_nand2 *= num_L1_nand2; + gate_leak_L1_nand2 *= num_L1_nand2; + + for (int i = 1; i < number_gates_L1_nand3_path; ++i) + { + tot_area_L1_nand3 += compute_gate_area(INV, 1, w_L1_nand3_p[i], w_L1_nand3_n[i], g_tp.cell_h_def); + leak_L1_nand3 += cmos_Isub_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_); + gate_leak_L1_nand3 += cmos_Ig_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_); + } + tot_area_L1_nand3 *= num_L1_nand3; + leak_L1_nand3 *= num_L1_nand3; + gate_leak_L1_nand3 *= num_L1_nand3; + + double cumulative_area_L1 = tot_area_L1_nand2 + tot_area_L1_nand3; + double cumulative_area_L2 = 0.0; + double leakage_L2 = 0.0; 
+ double gate_leakage_L2 = 0.0; + + if (flag_L2_gate == 2) + { + cumulative_area_L2 = compute_gate_area(NAND, 2, w_L2_p[0], w_L2_n[0], g_tp.cell_h_def); + leakage_L2 = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_); + gate_leakage_L2 = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_); + } + else if (flag_L2_gate == 3) + { + cumulative_area_L2 = compute_gate_area(NAND, 3, w_L2_p[0], w_L2_n[0], g_tp.cell_h_def); + leakage_L2 = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_); + gate_leakage_L2 = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_); + } + + for (int i = 1; i < number_gates_L2; ++i) + { + cumulative_area_L2 += compute_gate_area(INV, 1, w_L2_p[i], w_L2_n[i], g_tp.cell_h_def); + leakage_L2 += cmos_Isub_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_); + gate_leakage_L2 += cmos_Ig_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_); + } + cumulative_area_L2 *= num_L2; + leakage_L2 *= num_L2; + gate_leakage_L2 *= num_L2; + + power_nand2_path.readOp.leakage = leak_L1_nand2 * g_tp.peri_global.Vdd; + power_nand3_path.readOp.leakage = leak_L1_nand3 * g_tp.peri_global.Vdd; + power_L2.readOp.leakage = leakage_L2 * g_tp.peri_global.Vdd; + area.set_area(cumulative_area_L1 + cumulative_area_L2); + power_nand2_path.readOp.gate_leakage = gate_leak_L1_nand2 * g_tp.peri_global.Vdd; + power_nand3_path.readOp.gate_leakage = gate_leak_L1_nand3 * g_tp.peri_global.Vdd; + power_L2.readOp.gate_leakage = gate_leakage_L2 * g_tp.peri_global.Vdd; + } +} + + + +pair<double, double> PredecBlk::compute_delays( + pair<double, double> inrisetime) // <nand2, nand3> +{ + pair<double, double> ret_val; + ret_val.first = 0; // outrisetime_nand2_path + ret_val.second = 0; // outrisetime_nand3_path + + double inrisetime_nand2_path = inrisetime.first; + double inrisetime_nand3_path = inrisetime.second; + int i; + double rd, c_load, c_intrinsic, tf, this_delay; + double Vdd = g_tp.peri_global.Vdd; + + // TODO: following delay calculation part can be greatly 
simplified. + // first check whether a predecoder block is required + if (exist) + { + //Find delay in first level of predecoder block + //First find delay in path + if ((flag_two_unique_paths) || (number_inputs_L1_gate == 2)) + { + //First gate is a NAND2 gate + rd = tr_R_on(w_L1_nand2_n[0], NCH, 2, is_dram_); + c_load = gate_C(w_L1_nand2_n[1] + w_L1_nand2_p[1], 0.0, is_dram_); + c_intrinsic = 2 * drain_C_(w_L1_nand2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand2_n[0], NCH, 2, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + inrisetime_nand2_path = this_delay / (1.0 - 0.5); + power_nand2_path.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd; + + //Add delays of all but the last inverter in the chain + for (i = 1; i < number_gates_L1_nand2_path - 1; ++i) + { + rd = tr_R_on(w_L1_nand2_n[i], NCH, 1, is_dram_); + c_load = gate_C(w_L1_nand2_n[i+1] + w_L1_nand2_p[i+1], 0.0, is_dram_); + c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + inrisetime_nand2_path = this_delay / (1.0 - 0.5); + power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + + //Add delay of the last inverter + i = number_gates_L1_nand2_path - 1; + rd = tr_R_on(w_L1_nand2_n[i], NCH, 1, is_dram_); + if (flag_L2_gate) + { + c_load = branch_effort_nand2_gate_output*(gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_)); + c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += 
this_delay; + inrisetime_nand2_path = this_delay / (1.0 - 0.5); + power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + else + { //First level directly drives decoder output load + c_load = C_ld_predec_blk_out; + c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2; + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + ret_val.first = this_delay / (1.0 - 0.5); + power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + } + + if ((flag_two_unique_paths) || (number_inputs_L1_gate == 3)) + { //Check if the number of gates in the first level is more than 1. + //First gate is a NAND3 gate + rd = tr_R_on(w_L1_nand3_n[0], NCH, 3, is_dram_); + c_load = gate_C(w_L1_nand3_n[1] + w_L1_nand3_p[1], 0.0, is_dram_); + c_intrinsic = 3 * drain_C_(w_L1_nand3_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand3_n[0], NCH, 3, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + inrisetime_nand3_path = this_delay / (1.0 - 0.5); + power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + + //Add delays of all but the last inverter in the chain + for (i = 1; i < number_gates_L1_nand3_path - 1; ++i) + { + rd = tr_R_on(w_L1_nand3_n[i], NCH, 1, is_dram_); + c_load = gate_C(w_L1_nand3_n[i+1] + w_L1_nand3_p[i+1], 0.0, is_dram_); + c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + inrisetime_nand3_path = this_delay / (1.0 - 0.5); + power_nand3_path.readOp.dynamic 
+= (c_intrinsic + c_load) * Vdd * Vdd; + } + + //Add delay of the last inverter + i = number_gates_L1_nand3_path - 1; + rd = tr_R_on(w_L1_nand3_n[i], NCH, 1, is_dram_); + if (flag_L2_gate) + { + c_load = branch_effort_nand3_gate_output*(gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_)); + c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + inrisetime_nand3_path = this_delay / (1.0 - 0.5); + power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + else + { //First level directly drives decoder output load + c_load = C_ld_predec_blk_out; + c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2; + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + ret_val.second = this_delay / (1.0 - 0.5); + power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + } + + // Find delay through second level + if (flag_L2_gate) + { + if (flag_L2_gate == 2) + { + rd = tr_R_on(w_L2_n[0], NCH, 2, is_dram_); + c_load = gate_C(w_L2_n[1] + w_L2_p[1], 0.0, is_dram_); + c_intrinsic = 2 * drain_C_(w_L2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L2_n[0], NCH, 2, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + inrisetime_nand2_path = this_delay / (1.0 - 0.5); + power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + else + { // flag_L2_gate = 3 + rd = tr_R_on(w_L2_n[0], NCH, 3, is_dram_); + c_load = gate_C(w_L2_n[1] + w_L2_p[1], 0.0, is_dram_); + 
c_intrinsic = 3 * drain_C_(w_L2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L2_n[0], NCH, 3, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + inrisetime_nand3_path = this_delay / (1.0 - 0.5); + power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + + for (i = 1; i < number_gates_L2 - 1; ++i) + { + rd = tr_R_on(w_L2_n[i], NCH, 1, is_dram_); + c_load = gate_C(w_L2_n[i+1] + w_L2_p[i+1], 0.0, is_dram_); + c_intrinsic = drain_C_(w_L2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load); + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + inrisetime_nand2_path = this_delay / (1.0 - 0.5); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + inrisetime_nand3_path = this_delay / (1.0 - 0.5); + power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + + //Add delay of final inverter that drives the wordline decoders + i = number_gates_L2 - 1; + c_load = C_ld_predec_blk_out; + rd = tr_R_on(w_L2_n[i], NCH, 1, is_dram_); + c_intrinsic = drain_C_(w_L2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(w_L2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2; + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + ret_val.first = this_delay / (1.0 - 0.5); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + ret_val.second = this_delay / (1.0 - 0.5); + power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; + } + } + + delay = (ret_val.first > ret_val.second) ? 
ret_val.first : ret_val.second; + return ret_val; +} + +void PredecBlk::leakage_feedback(double temperature) +{ + if (exist) + { // First check whether a predecoder block is needed + int num_L1_nand2 = 0; + int num_L1_nand3 = 0; + int num_L2 = 0; + double leak_L1_nand3 =0; + double gate_leak_L1_nand3 =0; + + double leak_L1_nand2 = cmos_Isub_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_); + double gate_leak_L1_nand2 = cmos_Ig_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_); + if (number_inputs_L1_gate != 3) { + leak_L1_nand3 = 0; + gate_leak_L1_nand3 =0; + } + else { + leak_L1_nand3 = cmos_Isub_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand); + gate_leak_L1_nand3 = cmos_Ig_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand); + } + + switch (number_input_addr_bits) + { + case 1: //2 NAND2 gates + num_L1_nand2 = 2; + num_L2 = 0; + num_L1_active_nand2_path =1; + num_L1_active_nand3_path =0; + break; + case 2: //4 NAND2 gates + num_L1_nand2 = 4; + num_L2 = 0; + num_L1_active_nand2_path =1; + num_L1_active_nand3_path =0; + break; + case 3: //8 NAND3 gates + num_L1_nand3 = 8; + num_L2 = 0; + num_L1_active_nand2_path =0; + num_L1_active_nand3_path =1; + break; + case 4: //4 + 4 NAND2 gates + num_L1_nand2 = 8; + num_L2 = 16; + num_L1_active_nand2_path =2; + num_L1_active_nand3_path =0; + break; + case 5: //4 NAND2 gates, 8 NAND3 gates + num_L1_nand2 = 4; + num_L1_nand3 = 8; + num_L2 = 32; + num_L1_active_nand2_path =1; + num_L1_active_nand3_path =1; + break; + case 6: //8 + 8 NAND3 gates + num_L1_nand3 = 16; + num_L2 = 64; + num_L1_active_nand2_path =0; + num_L1_active_nand3_path =2; + break; + case 7: //4 + 4 NAND2 gates, 8 NAND3 gates + num_L1_nand2 = 8; + num_L1_nand3 = 8; + num_L2 = 128; + num_L1_active_nand2_path =2; + num_L1_active_nand3_path =1; + break; + case 8: //4 NAND2 gates, 8 + 8 NAND3 gates + num_L1_nand2 = 4; + num_L1_nand3 = 16; + num_L2 = 256; + num_L1_active_nand2_path =2; + num_L1_active_nand3_path =2; + break; + case 9: //8 
+ 8 + 8 NAND3 gates + num_L1_nand3 = 24; + num_L2 = 512; + num_L1_active_nand2_path =0; + num_L1_active_nand3_path =3; + break; + default: + break; + } + + for (int i = 1; i < number_gates_L1_nand2_path; ++i) + { + leak_L1_nand2 += cmos_Isub_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_); + gate_leak_L1_nand2 += cmos_Ig_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_); + } + leak_L1_nand2 *= num_L1_nand2; + gate_leak_L1_nand2 *= num_L1_nand2; + + for (int i = 1; i < number_gates_L1_nand3_path; ++i) + { + leak_L1_nand3 += cmos_Isub_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_); + gate_leak_L1_nand3 += cmos_Ig_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_); + } + leak_L1_nand3 *= num_L1_nand3; + gate_leak_L1_nand3 *= num_L1_nand3; + + double leakage_L2 = 0.0; + double gate_leakage_L2 = 0.0; + + if (flag_L2_gate == 2) + { + leakage_L2 = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_); + gate_leakage_L2 = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_); + } + else if (flag_L2_gate == 3) + { + leakage_L2 = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_); + gate_leakage_L2 = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_); + } + + for (int i = 1; i < number_gates_L2; ++i) + { + leakage_L2 += cmos_Isub_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_); + gate_leakage_L2 += cmos_Ig_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_); + } + leakage_L2 *= num_L2; + gate_leakage_L2 *= num_L2; + + power_nand2_path.readOp.leakage = leak_L1_nand2 * g_tp.peri_global.Vdd; + power_nand3_path.readOp.leakage = leak_L1_nand3 * g_tp.peri_global.Vdd; + power_L2.readOp.leakage = leakage_L2 * g_tp.peri_global.Vdd; + + power_nand2_path.readOp.gate_leakage = gate_leak_L1_nand2 * g_tp.peri_global.Vdd; + power_nand3_path.readOp.gate_leakage = gate_leak_L1_nand3 * g_tp.peri_global.Vdd; + power_L2.readOp.gate_leakage = gate_leakage_L2 * g_tp.peri_global.Vdd; + } +} + +PredecBlkDrv::PredecBlkDrv( + int 
way_select_, + PredecBlk * blk_, + bool is_dram) + :flag_driver_exists(0), + number_gates_nand2_path(0), + number_gates_nand3_path(0), + min_number_gates(2), + num_buffers_driving_1_nand2_load(0), + num_buffers_driving_2_nand2_load(0), + num_buffers_driving_4_nand2_load(0), + num_buffers_driving_2_nand3_load(0), + num_buffers_driving_8_nand3_load(0), + num_buffers_nand3_path(0), + c_load_nand2_path_out(0), + c_load_nand3_path_out(0), + r_load_nand2_path_out(0), + r_load_nand3_path_out(0), + delay_nand2_path(0), + delay_nand3_path(0), + power_nand2_path(), + power_nand3_path(), + blk(blk_), dec(blk->dec), + is_dram_(is_dram), + way_select(way_select_) +{ + for (int i = 0; i < MAX_NUMBER_GATES_STAGE; i++) + { + width_nand2_path_n[i] = 0; + width_nand2_path_p[i] = 0; + width_nand3_path_n[i] = 0; + width_nand3_path_p[i] = 0; + } + + number_input_addr_bits = blk->number_input_addr_bits; + + if (way_select > 1) + { + flag_driver_exists = 1; + number_input_addr_bits = way_select; + if (dec->num_in_signals == 2) + { + c_load_nand2_path_out = gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_); + num_buffers_driving_2_nand2_load = number_input_addr_bits; + } + else if (dec->num_in_signals == 3) + { + c_load_nand3_path_out = gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_); + num_buffers_driving_2_nand3_load = number_input_addr_bits; + } + } + else if (way_select == 0) + { + if (blk->exist) + { + flag_driver_exists = 1; + } + } + + compute_widths(); + compute_area(); +} + + + +void PredecBlkDrv::compute_widths() +{ + // The predecode block driver accepts as input the address bits from the h-tree network. For + // each addr bit it then generates addr and addrbar as outputs. For now ignore the effect of + // inversion to generate addrbar and simply treat addrbar as addr. 
+ + double F; + double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_); + + if (flag_driver_exists) + { + double C_nand2_gate_blk = gate_C(blk->w_L1_nand2_n[0] + blk->w_L1_nand2_p[0], 0, is_dram_); + double C_nand3_gate_blk = gate_C(blk->w_L1_nand3_n[0] + blk->w_L1_nand3_p[0], 0, is_dram_); + + if (way_select == 0) + { + if (blk->number_input_addr_bits == 1) + { //2 NAND2 gates + num_buffers_driving_2_nand2_load = 1; + c_load_nand2_path_out = 2 * C_nand2_gate_blk; + } + else if (blk->number_input_addr_bits == 2) + { //4 NAND2 gates one 2-4 decoder + num_buffers_driving_4_nand2_load = 2; + c_load_nand2_path_out = 4 * C_nand2_gate_blk; + } + else if (blk->number_input_addr_bits == 3) + { //8 NAND3 gates one 3-8 decoder + num_buffers_driving_8_nand3_load = 3; + c_load_nand3_path_out = 8 * C_nand3_gate_blk; + } + else if (blk->number_input_addr_bits == 4) + { //4 + 4 NAND2 gates two 2-4 decoder + num_buffers_driving_4_nand2_load = 4; + c_load_nand2_path_out = 4 * C_nand2_gate_blk; + } + else if (blk->number_input_addr_bits == 5) + { //4 NAND2 gates, 8 NAND3 gates one 2-4 decoder and one 3-8 decoder + num_buffers_driving_4_nand2_load = 2; + num_buffers_driving_8_nand3_load = 3; + c_load_nand2_path_out = 4 * C_nand2_gate_blk; + c_load_nand3_path_out = 8 * C_nand3_gate_blk; + } + else if (blk->number_input_addr_bits == 6) + { //8 + 8 NAND3 gates two 3-8 decoder + num_buffers_driving_8_nand3_load = 6; + c_load_nand3_path_out = 8 * C_nand3_gate_blk; + } + else if (blk->number_input_addr_bits == 7) + { //4 + 4 NAND2 gates, 8 NAND3 gates two 2-4 decoder and one 3-8 decoder + num_buffers_driving_4_nand2_load = 4; + num_buffers_driving_8_nand3_load = 3; + c_load_nand2_path_out = 4 * C_nand2_gate_blk; + c_load_nand3_path_out = 8 * C_nand3_gate_blk; + } + else if (blk->number_input_addr_bits == 8) + { //4 NAND2 gates, 8 + 8 NAND3 gates one 2-4 decoder and two 3-8 decoder + num_buffers_driving_4_nand2_load = 2; + num_buffers_driving_8_nand3_load = 6; + c_load_nand2_path_out = 4 
* C_nand2_gate_blk; + c_load_nand3_path_out = 8 * C_nand3_gate_blk; + } + else if (blk->number_input_addr_bits == 9) + { //8 + 8 + 8 NAND3 gates three 3-8 decoder + num_buffers_driving_8_nand3_load = 9; + c_load_nand3_path_out = 8 * C_nand3_gate_blk; + } + } + + if ((blk->flag_two_unique_paths) || + (blk->number_inputs_L1_gate == 2) || + (number_input_addr_bits == 0) || + ((way_select)&&(dec->num_in_signals == 2))) + { //this means that way_select is driving NAND2 in decoder. + width_nand2_path_n[0] = g_tp.min_w_nmos_; + width_nand2_path_p[0] = p_to_n_sz_ratio * width_nand2_path_n[0]; + F = c_load_nand2_path_out / gate_C(width_nand2_path_n[0] + width_nand2_path_p[0], 0, is_dram_); + number_gates_nand2_path = logical_effort( + min_number_gates, + 1, + F, + width_nand2_path_n, + width_nand2_path_p, + c_load_nand2_path_out, + p_to_n_sz_ratio, + is_dram_, false, g_tp.max_w_nmos_); + } + + if ((blk->flag_two_unique_paths) || + (blk->number_inputs_L1_gate == 3) || + ((way_select)&&(dec->num_in_signals == 3))) + { //this means that way_select is driving NAND3 in decoder. 
+ width_nand3_path_n[0] = g_tp.min_w_nmos_; + width_nand3_path_p[0] = p_to_n_sz_ratio * width_nand3_path_n[0]; + F = c_load_nand3_path_out / gate_C(width_nand3_path_n[0] + width_nand3_path_p[0], 0, is_dram_); + number_gates_nand3_path = logical_effort( + min_number_gates, + 1, + F, + width_nand3_path_n, + width_nand3_path_p, + c_load_nand3_path_out, + p_to_n_sz_ratio, + is_dram_, false, g_tp.max_w_nmos_); + } + } +} + + + +void PredecBlkDrv::compute_area() +{ + double area_nand2_path = 0; + double area_nand3_path = 0; + double leak_nand2_path = 0; + double leak_nand3_path = 0; + double gate_leak_nand2_path = 0; + double gate_leak_nand3_path = 0; + + if (flag_driver_exists) + { // first check whether a predecoder block driver is needed + for (int i = 0; i < number_gates_nand2_path; ++i) + { + area_nand2_path += compute_gate_area(INV, 1, width_nand2_path_p[i], width_nand2_path_n[i], g_tp.cell_h_def); + leak_nand2_path += cmos_Isub_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_); + gate_leak_nand2_path += cmos_Ig_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_); + } + area_nand2_path *= (num_buffers_driving_1_nand2_load + + num_buffers_driving_2_nand2_load + + num_buffers_driving_4_nand2_load); + leak_nand2_path *= (num_buffers_driving_1_nand2_load + + num_buffers_driving_2_nand2_load + + num_buffers_driving_4_nand2_load); + gate_leak_nand2_path *= (num_buffers_driving_1_nand2_load + + num_buffers_driving_2_nand2_load + + num_buffers_driving_4_nand2_load); + + for (int i = 0; i < number_gates_nand3_path; ++i) + { + area_nand3_path += compute_gate_area(INV, 1, width_nand3_path_p[i], width_nand3_path_n[i], g_tp.cell_h_def); + leak_nand3_path += cmos_Isub_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_); + gate_leak_nand3_path += cmos_Ig_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_); + } + area_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load); + 
leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load); + gate_leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load); + + power_nand2_path.readOp.leakage = leak_nand2_path * g_tp.peri_global.Vdd; + power_nand3_path.readOp.leakage = leak_nand3_path * g_tp.peri_global.Vdd; + power_nand2_path.readOp.gate_leakage = gate_leak_nand2_path * g_tp.peri_global.Vdd; + power_nand3_path.readOp.gate_leakage = gate_leak_nand3_path * g_tp.peri_global.Vdd; + area.set_area(area_nand2_path + area_nand3_path); + } +} + + + +pair<double, double> PredecBlkDrv::compute_delays( + double inrisetime_nand2_path, + double inrisetime_nand3_path) +{ + pair<double, double> ret_val; + ret_val.first = 0; // outrisetime_nand2_path + ret_val.second = 0; // outrisetime_nand3_path + int i; + double rd, c_gate_load, c_load, c_intrinsic, tf, this_delay; + double Vdd = g_tp.peri_global.Vdd; + + if (flag_driver_exists) + { + for (i = 0; i < number_gates_nand2_path - 1; ++i) + { + rd = tr_R_on(width_nand2_path_n[i], NCH, 1, is_dram_); + c_gate_load = gate_C(width_nand2_path_p[i+1] + width_nand2_path_n[i+1], 0.0, is_dram_); + c_intrinsic = drain_C_(width_nand2_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(width_nand2_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_gate_load); + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + inrisetime_nand2_path = this_delay / (1.0 - 0.5); + power_nand2_path.readOp.dynamic += (c_gate_load + c_intrinsic) * 0.5 * Vdd * Vdd; + } + + // Final inverter drives the predecoder block or the decoder output load + if (number_gates_nand2_path != 0) + { + i = number_gates_nand2_path - 1; + rd = tr_R_on(width_nand2_path_n[i], NCH, 1, is_dram_); + c_intrinsic = drain_C_(width_nand2_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(width_nand2_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + c_load = 
c_load_nand2_path_out; + tf = rd * (c_intrinsic + c_load) + r_load_nand2_path_out*c_load/ 2; + this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE); + delay_nand2_path += this_delay; + ret_val.first = this_delay / (1.0 - 0.5); + power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * 0.5 * Vdd * Vdd; +// cout<< "c_intrinsic = " << c_intrinsic << "c_load" << c_load <<endl; + } + + for (i = 0; i < number_gates_nand3_path - 1; ++i) + { + rd = tr_R_on(width_nand3_path_n[i], NCH, 1, is_dram_); + c_gate_load = gate_C(width_nand3_path_p[i+1] + width_nand3_path_n[i+1], 0.0, is_dram_); + c_intrinsic = drain_C_(width_nand3_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(width_nand3_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + tf = rd * (c_intrinsic + c_gate_load); + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + inrisetime_nand3_path = this_delay / (1.0 - 0.5); + power_nand3_path.readOp.dynamic += (c_gate_load + c_intrinsic) * 0.5 * Vdd * Vdd; + } + + // Final inverter drives the predecoder block or the decoder output load + if (number_gates_nand3_path != 0) + { + i = number_gates_nand3_path - 1; + rd = tr_R_on(width_nand3_path_n[i], NCH, 1, is_dram_); + c_intrinsic = drain_C_(width_nand3_path_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) + + drain_C_(width_nand3_path_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_); + c_load = c_load_nand3_path_out; + tf = rd*(c_intrinsic + c_load) + r_load_nand3_path_out*c_load / 2; + this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE); + delay_nand3_path += this_delay; + ret_val.second = this_delay / (1.0 - 0.5); + power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * 0.5 * Vdd * Vdd; + } + } + return ret_val; +} + + +double PredecBlkDrv::get_rdOp_dynamic_E(int num_act_mats_hor_dir) +{ + return (num_addr_bits_nand2_path()*power_nand2_path.readOp.dynamic + + num_addr_bits_nand3_path()*power_nand3_path.readOp.dynamic) * 
num_act_mats_hor_dir; +} + + + +Predec::Predec( + PredecBlkDrv * drv1_, + PredecBlkDrv * drv2_) +:blk1(drv1_->blk), blk2(drv2_->blk), drv1(drv1_), drv2(drv2_) +{ + driver_power.readOp.leakage = drv1->power_nand2_path.readOp.leakage + + drv1->power_nand3_path.readOp.leakage + + drv2->power_nand2_path.readOp.leakage + + drv2->power_nand3_path.readOp.leakage; + block_power.readOp.leakage = blk1->power_nand2_path.readOp.leakage + + blk1->power_nand3_path.readOp.leakage + + blk1->power_L2.readOp.leakage + + blk2->power_nand2_path.readOp.leakage + + blk2->power_nand3_path.readOp.leakage + + blk2->power_L2.readOp.leakage; + power.readOp.leakage = driver_power.readOp.leakage + block_power.readOp.leakage; + + driver_power.readOp.gate_leakage = drv1->power_nand2_path.readOp.gate_leakage + + drv1->power_nand3_path.readOp.gate_leakage + + drv2->power_nand2_path.readOp.gate_leakage + + drv2->power_nand3_path.readOp.gate_leakage; + block_power.readOp.gate_leakage = blk1->power_nand2_path.readOp.gate_leakage + + blk1->power_nand3_path.readOp.gate_leakage + + blk1->power_L2.readOp.gate_leakage + + blk2->power_nand2_path.readOp.gate_leakage + + blk2->power_nand3_path.readOp.gate_leakage + + blk2->power_L2.readOp.gate_leakage; + power.readOp.gate_leakage = driver_power.readOp.gate_leakage + block_power.readOp.gate_leakage; +} + +void PredecBlkDrv::leakage_feedback(double temperature) +{ + double leak_nand2_path = 0; + double leak_nand3_path = 0; + double gate_leak_nand2_path = 0; + double gate_leak_nand3_path = 0; + + if (flag_driver_exists) + { // first check whether a predecoder block driver is needed + for (int i = 0; i < number_gates_nand2_path; ++i) + { + leak_nand2_path += cmos_Isub_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_); + gate_leak_nand2_path += cmos_Ig_leakage(width_nand2_path_n[i], width_nand2_path_p[i], 1, inv,is_dram_); + } + leak_nand2_path *= (num_buffers_driving_1_nand2_load + + num_buffers_driving_2_nand2_load + + 
num_buffers_driving_4_nand2_load); + gate_leak_nand2_path *= (num_buffers_driving_1_nand2_load + + num_buffers_driving_2_nand2_load + + num_buffers_driving_4_nand2_load); + + for (int i = 0; i < number_gates_nand3_path; ++i) + { + leak_nand3_path += cmos_Isub_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_); + gate_leak_nand3_path += cmos_Ig_leakage(width_nand3_path_n[i], width_nand3_path_p[i], 1, inv,is_dram_); + } + leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load); + gate_leak_nand3_path *= (num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load); + + power_nand2_path.readOp.leakage = leak_nand2_path * g_tp.peri_global.Vdd; + power_nand3_path.readOp.leakage = leak_nand3_path * g_tp.peri_global.Vdd; + power_nand2_path.readOp.gate_leakage = gate_leak_nand2_path * g_tp.peri_global.Vdd; + power_nand3_path.readOp.gate_leakage = gate_leak_nand3_path * g_tp.peri_global.Vdd; + } +} + +double Predec::compute_delays(double inrisetime) +{ + // TODO: Jung Ho thinks that predecoder block driver locates between decoder and predecoder block. 
+  pair<double, double> tmp_pair1, tmp_pair2;
+  tmp_pair1 = drv1->compute_delays(inrisetime, inrisetime);
+  tmp_pair1 = blk1->compute_delays(tmp_pair1);
+  tmp_pair2 = drv2->compute_delays(inrisetime, inrisetime);
+  tmp_pair2 = blk2->compute_delays(tmp_pair2);
+  tmp_pair1 = get_max_delay_before_decoder(tmp_pair1, tmp_pair2);
+
+  driver_power.readOp.dynamic =
+    drv1->num_addr_bits_nand2_path() * drv1->power_nand2_path.readOp.dynamic +
+    drv1->num_addr_bits_nand3_path() * drv1->power_nand3_path.readOp.dynamic +
+    drv2->num_addr_bits_nand2_path() * drv2->power_nand2_path.readOp.dynamic +
+    drv2->num_addr_bits_nand3_path() * drv2->power_nand3_path.readOp.dynamic;
+  // NOTE(review): the two blk2 path terms below are scaled by blk1's num_L1_active_* counts -- confirm this is intended and not a copy-paste slip.
+  block_power.readOp.dynamic =
+    blk1->power_nand2_path.readOp.dynamic*blk1->num_L1_active_nand2_path +
+    blk1->power_nand3_path.readOp.dynamic*blk1->num_L1_active_nand3_path +
+    blk1->power_L2.readOp.dynamic +
+    blk2->power_nand2_path.readOp.dynamic*blk1->num_L1_active_nand2_path +
+    blk2->power_nand3_path.readOp.dynamic*blk1->num_L1_active_nand3_path +
+    blk2->power_L2.readOp.dynamic;
+
+  power.readOp.dynamic = driver_power.readOp.dynamic + block_power.readOp.dynamic;
+
+  delay = tmp_pair1.first;
+  return tmp_pair1.second;
+}
+
+// Re-runs leakage_feedback(temperature) on both drivers and both blocks, then re-sums their leakage and gate leakage into driver_power, block_power and power.
+void Predec::leakage_feedback(double temperature)
+{
+  drv1->leakage_feedback(temperature);
+  drv2->leakage_feedback(temperature);
+  blk1->leakage_feedback(temperature);
+  blk2->leakage_feedback(temperature);
+  // Leakage: drivers contribute their nand2 + nand3 paths; blocks additionally contribute power_L2.
+  driver_power.readOp.leakage = drv1->power_nand2_path.readOp.leakage +
+                                drv1->power_nand3_path.readOp.leakage +
+                                drv2->power_nand2_path.readOp.leakage +
+                                drv2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage = blk1->power_nand2_path.readOp.leakage +
+                               blk1->power_nand3_path.readOp.leakage +
+                               blk1->power_L2.readOp.leakage +
+                               blk2->power_nand2_path.readOp.leakage +
+                               blk2->power_nand3_path.readOp.leakage +
+                               blk2->power_L2.readOp.leakage;
+  power.readOp.leakage = driver_power.readOp.leakage + block_power.readOp.leakage;
+  // Gate leakage: same summation pattern as above.
+  driver_power.readOp.gate_leakage = drv1->power_nand2_path.readOp.gate_leakage +
+                                     drv1->power_nand3_path.readOp.gate_leakage +
+                                     drv2->power_nand2_path.readOp.gate_leakage +
+                                     drv2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage = blk1->power_nand2_path.readOp.gate_leakage +
+                                    blk1->power_nand3_path.readOp.gate_leakage +
+                                    blk1->power_L2.readOp.gate_leakage +
+                                    blk2->power_nand2_path.readOp.gate_leakage +
+                                    blk2->power_nand3_path.readOp.gate_leakage +
+                                    blk2->power_L2.readOp.gate_leakage;
+  power.readOp.gate_leakage = driver_power.readOp.gate_leakage + block_power.readOp.gate_leakage;
+}
+// Picks the largest of the four driver+block path delays (drv1+blk1 and drv2+blk2, nand2 and nand3 paths).
+// returns <delay, risetime>: that delay, plus the matching entry of input_pair1/input_pair2 (.first for a nand2 path, .second for a nand3 path)
+pair<double, double> Predec::get_max_delay_before_decoder(
+    pair<double, double> input_pair1,
+    pair<double, double> input_pair2)
+{
+  pair<double, double> ret_val;
+  double delay;
+  // Start with the drv1/blk1 nand2 path as the current maximum.
+  delay = drv1->delay_nand2_path + blk1->delay_nand2_path;
+  ret_val.first = delay;
+  ret_val.second = input_pair1.first;
+  delay = drv1->delay_nand3_path + blk1->delay_nand3_path;
+  if (ret_val.first < delay)
+  {
+    ret_val.first = delay;
+    ret_val.second = input_pair1.second;
+  }
+  delay = drv2->delay_nand2_path + blk2->delay_nand2_path;
+  if (ret_val.first < delay)
+  {
+    ret_val.first = delay;
+    ret_val.second = input_pair2.first;
+  }
+  delay = drv2->delay_nand3_path + blk2->delay_nand3_path;
+  if (ret_val.first < delay)
+  {
+    ret_val.first = delay;
+    ret_val.second = input_pair2.second;
+  }
+
+  return ret_val;
+}
+
+
+// Stores the gate/wire load parameters, zeroes the per-stage width arrays, then sizes the chain via compute_widths().
+Driver::Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram)
+:number_gates(0),
+  min_number_gates(2),
+  c_gate_load(c_gate_load_),
+  c_wire_load(c_wire_load_),
+  r_wire_load(r_wire_load_),
+  delay(0),
+  power(),
+  is_dram_(is_dram)
+{
+  for (int i = 0; i < MAX_NUMBER_GATES_STAGE; i++)
+  {
+    width_n[i] = 0;
+    width_p[i] = 0;
+  }
+
+  compute_widths();
+}
+
+// Sets a minimum-width first stage, then calls logical_effort() with the width arrays and total load; its return value becomes number_gates.
+void Driver::compute_widths()
+{
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_);
+  double c_load = c_gate_load + c_wire_load;
+  width_n[0] = g_tp.min_w_nmos_;
+  width_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+
+  double F = c_load / gate_C(width_n[0] + width_p[0], 0, is_dram_); // total load relative to the first stage's gate capacitance
+  number_gates = logical_effort(
+      min_number_gates,
+      1,
+      F,
+      width_n,
+      width_p,
+      c_load,
+      p_to_n_sz_ratio,
+      is_dram_, false,
+      g_tp.max_w_nmos_);
+}
+
+
+// Adds every stage's delay to `delay` and its dynamic/leakage terms to `power` (both accumulated with +=, not reset here); returns this_delay / (1.0 - 0.5) of the last stage.
+double Driver::compute_delay(double inrisetime)
+{
+  int i;
+  double rd, c_load, c_intrinsic, tf;
+  double this_delay = 0;
+  // All stages except the last: each one drives the gate capacitance of stage i+1.
+  for (i = 0; i < number_gates - 1; ++i)
+  {
+    rd = tr_R_on(width_n[i], NCH, 1, is_dram_);
+    c_load = gate_C(width_n[i+1] + width_p[i+1], 0.0, is_dram_);
+    c_intrinsic = drain_C_(width_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                  drain_C_(width_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    tf = rd * (c_intrinsic + c_load);
+    this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+    delay += this_delay;
+    inrisetime = this_delay / (1.0 - 0.5); // fed to horowitz() for the next stage
+    power.readOp.dynamic += (c_intrinsic + c_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+    power.readOp.leakage += cmos_Isub_leakage(width_n[i], width_p[i], 1, inv, is_dram_) *g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage += cmos_Ig_leakage(width_n[i], width_p[i], 1, inv, is_dram_)* g_tp.peri_global.Vdd;
+  }
+  // Last stage: drives c_gate_load + c_wire_load, and tf gains the r_wire_load term.
+  i = number_gates - 1;
+  c_load = c_gate_load + c_wire_load;
+  rd = tr_R_on(width_n[i], NCH, 1, is_dram_);
+  c_intrinsic = drain_C_(width_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                drain_C_(width_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+  tf = rd * (c_intrinsic + c_load) + r_wire_load * (c_wire_load / 2 + c_gate_load);
+  this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+  delay += this_delay;
+  power.readOp.dynamic += (c_intrinsic + c_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+  power.readOp.leakage += cmos_Isub_leakage(width_n[i], width_p[i], 1, inv, is_dram_) * g_tp.peri_global.Vdd;
+  power.readOp.gate_leakage += cmos_Ig_leakage(width_n[i], width_p[i], 1, inv, is_dram_)* g_tp.peri_global.Vdd;
+
+  return this_delay / (1.0 - 0.5);
+}
+
diff --git a/ext/mcpat/cacti/decoder.h
b/ext/mcpat/cacti/decoder.h new file mode 100644 index 000000000..35631e84b --- /dev/null +++ b/ext/mcpat/cacti/decoder.h @@ -0,0 +1,247 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __DECODER_H__
+#define __DECODER_H__
+
+#include <vector>
+
+#include "area.h"
+#include "component.h"
+#include "parameter.h"
+
+using namespace std;
+
+// Decoder component: holds per-stage widths (w_dec_n / w_dec_p), gate counts and a delay. Member functions are only declared here.
+class Decoder : public Component
+{
+ public:
+  Decoder(
+      int _num_dec_signals,
+      bool flag_way_select,
+      double _C_ld_dec_out,
+      double _R_wire_dec_out,
+      bool fully_assoc_,
+      bool is_dram_,
+      bool is_wl_tr_,
+      const Area & cell_);
+
+  bool exist;
+  int num_in_signals;
+  double C_ld_dec_out;
+  double R_wire_dec_out;
+  int num_gates;
+  int num_gates_min;
+  double w_dec_n[MAX_NUMBER_GATES_STAGE];
+  double w_dec_p[MAX_NUMBER_GATES_STAGE];
+  double delay;
+  //powerDef power;
+  bool fully_assoc;
+  bool is_dram;
+  bool is_wl_tr;
+  const Area & cell; // reference member: the Area passed to the constructor must outlive this object -- presumably; confirm against the constructor definition
+
+
+  void compute_widths();
+  void compute_area();
+  double compute_delays(double inrisetime); // return outrisetime
+
+  void leakage_feedback(double temperature);
+};
+
+
+// Predecode block: keeps separate widths, delays and powerDef entries for a nand2 path and a nand3 path, plus L2-stage widths and power_L2.
+class PredecBlk : public Component
+{
+ public:
+  PredecBlk(
+      int num_dec_signals,
+      Decoder * dec,
+      double C_wire_predec_blk_out,
+      double R_wire_predec_blk_out,
+      int num_dec_per_predec,
+      bool is_dram_,
+      bool is_blk1);
+
+  Decoder * dec;
+  bool exist;
+  int number_input_addr_bits;
+  double C_ld_predec_blk_out;
+  double R_wire_predec_blk_out;
+  int branch_effort_nand2_gate_output;
+  int branch_effort_nand3_gate_output;
+  bool flag_two_unique_paths;
+  int flag_L2_gate;
+  int number_inputs_L1_gate;
+  int number_gates_L1_nand2_path;
+  int number_gates_L1_nand3_path;
+  int number_gates_L2;
+  int min_number_gates_L1;
+  int min_number_gates_L2;
+  int num_L1_active_nand2_path; // multiplies power_nand2_path dynamic energy when Predec sums block_power
+  int num_L1_active_nand3_path; // multiplies power_nand3_path dynamic energy when Predec sums block_power
+  double w_L1_nand2_n[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand2_p[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand3_n[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand3_p[MAX_NUMBER_GATES_STAGE];
+  double w_L2_n[MAX_NUMBER_GATES_STAGE];
+  double w_L2_p[MAX_NUMBER_GATES_STAGE];
+  double delay_nand2_path;
+  double delay_nand3_path;
+  powerDef power_nand2_path;
+  powerDef power_nand3_path;
+  powerDef power_L2;
+
+  bool is_dram_;
+
+  void compute_widths();
+  void compute_area();
+
+  void leakage_feedback(double temperature);
+
+  pair<double, double> compute_delays(pair<double, double> inrisetime); // <nand2, nand3>
+  // return <outrise_nand2, outrise_nand3>
+};
+
+// Driver stage in front of a PredecBlk: per-path (nand2 / nand3) gate counts, loads, widths, delays and power.
+class PredecBlkDrv : public Component
+{
+ public:
+  PredecBlkDrv(
+      int way_select,
+      PredecBlk * blk_,
+      bool is_dram);
+
+  int flag_driver_exists;
+  int number_input_addr_bits;
+  int number_gates_nand2_path;
+  int number_gates_nand3_path;
+  int min_number_gates;
+  int num_buffers_driving_1_nand2_load;
+  int num_buffers_driving_2_nand2_load;
+  int num_buffers_driving_4_nand2_load;
+  int num_buffers_driving_2_nand3_load;
+  int num_buffers_driving_8_nand3_load;
+  int num_buffers_nand3_path;
+  double c_load_nand2_path_out;
+  double c_load_nand3_path_out;
+  double r_load_nand2_path_out;
+  double r_load_nand3_path_out;
+  double width_nand2_path_n[MAX_NUMBER_GATES_STAGE];
+  double width_nand2_path_p[MAX_NUMBER_GATES_STAGE];
+  double width_nand3_path_n[MAX_NUMBER_GATES_STAGE];
+  double width_nand3_path_p[MAX_NUMBER_GATES_STAGE];
+  double delay_nand2_path;
+  double delay_nand3_path;
+  powerDef power_nand2_path;
+  powerDef power_nand3_path;
+
+  PredecBlk * blk;
+  Decoder * dec;
+  bool is_dram_;
+  int way_select;
+
+  void compute_widths();
+  void compute_area();
+
+  void leakage_feedback(double temperature);
+
+
+  pair<double, double> compute_delays(
+      double inrisetime_nand2_path,
+      double inrisetime_nand3_path);  // return <outrise_nand2, outrise_nand3>
+  // The two helpers below sum the num_buffers_driving_* counters of one path; Predec uses them to weight that path's dynamic power.
+  inline int num_addr_bits_nand2_path()
+  {
+    return num_buffers_driving_1_nand2_load +
+           num_buffers_driving_2_nand2_load +
+           num_buffers_driving_4_nand2_load;
+  }
+  inline int num_addr_bits_nand3_path()
+  {
+    return num_buffers_driving_2_nand3_load +
+           num_buffers_driving_8_nand3_load;
+  }
+  double get_rdOp_dynamic_E(int num_act_mats_hor_dir);
+};
+
+
+// Combines two PredecBlkDrv / PredecBlk pairs and exposes their summed block_power and driver_power.
+class Predec : public Component
+{
+ public:
+  Predec(
+      PredecBlkDrv * drv1,
+      PredecBlkDrv * drv2);
+
+  double compute_delays(double inrisetime);  // return outrisetime
+
+  void leakage_feedback(double temperature);
+  PredecBlk * blk1;
+  PredecBlk * blk2;
+  PredecBlkDrv * drv1;
+  PredecBlkDrv * drv2;
+
+  powerDef block_power;  // sum over blk1 and blk2 (nand2 path, nand3 path, L2)
+  powerDef driver_power; // sum over drv1 and drv2 (nand2 path, nand3 path)
+
+ private:
+  // returns <delay, risetime>
+  pair<double, double> get_max_delay_before_decoder(
+      pair<double, double> input_pair1,
+      pair<double, double> input_pair2);
+};
+
+
+// Multi-stage driver for a gate + wire load: compute_widths() sizes the stages, compute_delay() accumulates delay and power.
+class Driver : public Component
+{
+ public:
+  Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram);
+
+  int number_gates;     // set by compute_widths() from logical_effort()
+  int min_number_gates; // initialised to 2 by the constructor
+  double width_n[MAX_NUMBER_GATES_STAGE];
+  double width_p[MAX_NUMBER_GATES_STAGE];
+  double c_gate_load;
+  double c_wire_load;
+  double r_wire_load;
+  double delay;         // accumulated (+=) by compute_delay()
+  powerDef power;       // accumulated (+=) by compute_delay()
+  bool is_dram_;
+
+  void compute_widths();
+  double compute_delay(double inrisetime);
+};
+
+
+#endif
diff --git a/ext/mcpat/cacti/htree2.cc b/ext/mcpat/cacti/htree2.cc
new file mode 100644
index 000000000..817ea6a7c
--- /dev/null
+++ b/ext/mcpat/cacti/htree2.cc
@@ -0,0 +1,641 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <cassert>
+#include <iostream>
+
+#include "htree2.h"
+#include "wire.h"
+// Stores the geometry and bus-width parameters, then runs in_htree() or out_htree() according to htree_type; power_bit keeps the power before readOp.dynamic is scaled by init_wire_bw.
+Htree2::Htree2(
+    enum Wire_type wire_model, double mat_w, double mat_h,
+    int a_bits, int d_inbits, int search_data_in, int d_outbits, int search_data_out, int bl, int wl, enum Htree_type htree_type,
+    bool uca_tree_, bool search_tree_, TechnologyParameter::DeviceType *dt)
+ :in_rise_time(0), out_rise_time(0),
+  tree_type(htree_type), mat_width(mat_w), mat_height(mat_h),
+  add_bits(a_bits), data_in_bits(d_inbits), search_data_in_bits(search_data_in),data_out_bits(d_outbits),
+  search_data_out_bits(search_data_out), ndbl(bl), ndwl(wl),
+  uca_tree(uca_tree_), search_tree(search_tree_), wt(wire_model), deviceType(dt)
+{
+  assert(ndbl >= 2 && ndwl >= 2); // smaller values are rejected; the fallback for them is the disabled code below
+
+//  if (ndbl == 1 && ndwl == 1)
+//  {
+//    delay = 0;
+//    power.readOp.dynamic = 0;
+//    power.readOp.leakage = 0;
+//    area.w = mat_w;
+//    area.h = mat_h;
+//    return;
+//  }
+//  if (ndwl == 1) ndwl++;
+//  if (ndbl == 1) ndbl++;
+
+  max_unpipelined_link_delay = 0; //TODO
+  min_w_nmos = g_tp.min_w_nmos_;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;
+  // Pick the bus width for this tree type and build it: input-style trees use in_htree(), output-style trees use out_htree().
+  switch (htree_type)
+  {
+    case Add_htree:
+      wire_bw = init_wire_bw = add_bits;
+      in_htree();
+      break;
+    case Data_in_htree:
+      wire_bw = init_wire_bw = data_in_bits;
+      in_htree();
+      break;
+    case Data_out_htree:
+      wire_bw = init_wire_bw = data_out_bits;
+      out_htree();
+      break;
+    case Search_in_htree:
+      wire_bw = init_wire_bw = search_data_in_bits;//in_search_tree is broad cast, out_htree is not.
+      in_htree();
+      break;
+    case Search_out_htree:
+      wire_bw = init_wire_bw = search_data_out_bits;
+      out_htree();
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  power_bit = power; // snapshot taken before the dynamic energy is multiplied by the bus width
+  power.readOp.dynamic *= init_wire_bw;
+
+  assert(power.readOp.dynamic >= 0);
+  assert(power.readOp.leakage >= 0);
+}
+
+
+// Adds one NAND stage to delay and power. s1 and s2 are the sizes computed by the caller from the Wire repeater sizes; l_eff is the length used to build the local Wire whose out_rise_time feeds horowitz().
+// nand gate sizing calculation
+void Htree2::input_nand(double s1, double s2, double l_eff)
+{
+  Wire w1(wt, l_eff);
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // input capacitance of a repeater  = input capacitance of nand.
+  double nsize = s1*(1 + pton_size)/(2 + pton_size);
+  nsize = (nsize < 1) ? 1 : nsize; // clamp to at least 1
+
+  double tc = 2*tr_R_on(nsize*min_w_nmos, NCH, 1) *
+    (drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)*2
+     + 2 * gate_C(s2*(min_w_nmos + min_w_pmos), 0));
+  delay+= horowitz (w1.out_rise_time, tc,
+      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+  power.readOp.dynamic += 0.5 *
+    (2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+  // Same capacitance expression as readOp.dynamic above, additionally scaled by the current wire_bw.
+  power.searchOp.dynamic += 0.5 *
+    (2*drain_C_(pton_size * nsize*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(nsize*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + 2*gate_C(s2*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd * wire_bw ;
+  power.readOp.leakage += (wire_bw*cmos_Isub_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
+  power.readOp.gate_leakage += (wire_bw*cmos_Ig_leakage(min_w_nmos*(nsize*2), min_w_pmos * nsize * 2, 2, nand))*deviceType->Vdd;
+}
+
+
+// Adds the tristate output buffer's delay, its dynamic energy (nand, not, nor, output transistor) and its leakage to this Htree2.
+// tristate buffer model consisting of not, nand, nor, and driver transistors
+void Htree2::output_buffer(double s1, double s2, double l_eff)
+{
+  Wire w1(wt, l_eff);
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // input capacitance of repeater = input capacitance of nand + nor.
+  double size = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
+  double s_eff =  //stage eff of a repeater in a wire
+    (gate_C(s2*(min_w_nmos + min_w_pmos), 0) + w1.wire_cap(l_eff*1e-6,true))/
+    gate_C(s2*(min_w_nmos + min_w_pmos), 0);
+  double tr_size = gate_C(s1*(min_w_nmos + min_w_pmos), 0) * 1/2/(s_eff*gate_C(min_w_pmos, 0));
+  size = (size < 1) ? 1 : size; // clamp to at least 1 (tr_size above is computed from s1 and s_eff, not from size)
+
+  double res_nor = 2*tr_R_on(size*min_w_pmos, PCH, 1);
+  double res_ptrans = tr_R_on(tr_size*min_w_nmos, NCH, 1);
+  double cap_nand_out = drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+                        drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
+                        gate_C(tr_size*min_w_pmos, 0);
+  double cap_ptrans_out = 2 *(drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+                              drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)) +
+                          gate_C(s1*(min_w_nmos + min_w_pmos), 0);
+
+  double tc = res_nor * cap_nand_out + (res_nor + res_ptrans) * cap_ptrans_out;
+
+  // Each gate below adds its energy twice: once to readOp.dynamic and once, scaled by init_wire_bw, to searchOp.dynamic.
+  delay += horowitz (w1.out_rise_time, tc,
+      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+
+  //nand
+  power.readOp.dynamic += 0.5 *
+    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + gate_C(tr_size*(min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+  power.searchOp.dynamic += 0.5 *
+    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + gate_C(tr_size*(min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //not
+  power.readOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+  power.searchOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //nor
+  power.readOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+  power.searchOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //output transistor
+  power.readOp.dynamic += 0.5 *
+    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
+     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+  power.searchOp.dynamic += 0.5 *
+    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
+     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+  // NOTE(review): the two branches below contain the same six statements; only the trailing commented-out line differs. Confirm whether they were meant to diverge.
+  if(uca_tree) {
+    power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+    //power.readOp.gate_leakage *=;
+  }
+  else {
+    power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+    //power.readOp.gate_leakage *=deviceType->Vdd*wire_bw;
+  }
+}
+
+
+// Sets area.h / area.w, then iterates over the h horizontal and v vertical levels, halving len / ht per level and adding each level's Wire delay and power (plus an input_nand() stage when uca_tree is false).
+/* calculates the input h-tree delay/power
+ * A nand gate is used at each node to
+ * limit the signal
+ * The area of an unbalanced htree (rows != columns)
+ * depends on how data is traversed.
+ * In the following function, if ( no. of rows < no. of columns),
+ * then data first traverse in excess hor. links until vertical
+ * and horizontal nodes are same.
+ * If no. of rows is bigger, then data traverse in
+ * a hor. link followed by a ver. link in a repeated
+ * fashion (similar to a balanced tree) until there are no
+ * hor. links left. After this it goes through the remaining vertical
+ * links.
+ */
+  void
+Htree2::in_htree()
+{
+  //temp var
+  double s1 = 0, s2 = 0, s3 = 0;
+  double l_eff = 0;
+  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
+  double len = 0, ht = 0;
+  int option = 0; // 0: one horizontal link, 1: horizontal + vertical link, 2: one vertical link (set per iteration below)
+
+  int h = (int) _log2(ndwl/2); // horizontal nodes
+  int v = (int) _log2(ndbl/2); // vertical nodes
+  double len_temp;
+  double ht_temp;
+  if (uca_tree)
+  {//Sheng: this computation do not consider the wires that route from edge to middle.
+    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,h))))/2;
+    len_temp = (mat_width*ndwl/2 +
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,v))))/2;
+  }
+  else
+  {
+    if (ndwl == ndbl) {
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else if (ndwl > ndbl) {
+      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits +
+            (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
+          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else {
+      double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
+    }
+  }
+  // Record the area and reset the accumulators. NOTE(review): power.readOp.gate_leakage is not reset here, although out_htree() resets it -- confirm.
+  area.h = ht_temp * 2;
+  area.w = len_temp * 2;
+  delay = 0;
+  power.readOp.dynamic = 0;
+  power.readOp.leakage = 0;
+  power.searchOp.dynamic =0;
+  len = len_temp;
+  ht  = ht_temp/2;
+
+  while (v > 0 || h > 0)
+  {
+    if (wtemp1) delete wtemp1; // release the previous iteration's wires (all three pointers start out null)
+    if (wtemp2) delete wtemp2;
+    if (wtemp3) delete wtemp3;
+
+    if (h > v)
+    {
+      //the iteration considers only one horizontal link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, len/2);  // ver
+      len_temp = len;
+      len /= 2;
+      wtemp3 = 0;
+      h--;
+      option = 0;
+    }
+    else if (v>0 && h>0)
+    {
+      //considers one horizontal link and one vertical link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, ht);  // ver
+      wtemp3 = new Wire(wt, len/2);  // next hor
+      len_temp = len;
+      ht_temp = ht;
+      len /= 2;
+      ht  /= 2;
+      v--;
+      h--;
+      option = 1;
+    }
+    else
+    {
+      // considers only one vertical link
+      assert(h == 0);
+      wtemp1 = new Wire(wt, ht); // ver
+      wtemp2 = new Wire(wt, ht/2);  // hor
+      ht_temp = ht;
+      ht /= 2;
+      wtemp3 = 0;
+      v--;
+      option = 2;
+    }
+    // First link of this iteration: wtemp1's delay and power, with leakage scaled by the current wire_bw.
+    delay += wtemp1->delay;
+    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*wire_bw;
+    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
+    if ((uca_tree == false && option == 2) || search_tree==true)
+    {
+      wire_bw*=2;  // wire bandwidth doubles only for vertical branches
+    }
+
+    if (uca_tree == false)
+    {
+      if (len_temp > wtemp1->repeater_spacing)
+      {
+        s1 = wtemp1->repeater_size;
+        l_eff = wtemp1->repeater_spacing;
+      }
+      else
+      {
+        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
+        l_eff = len_temp;
+      }
+
+      if (ht_temp > wtemp2->repeater_spacing) // NOTE(review): ht_temp is not updated in the option == 0 branch above, so an earlier value is tested here -- confirm
+      {
+        s2 = wtemp2->repeater_size;
+      }
+      else
+      {
+        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size; // NOTE(review): the condition tests ht_temp but this scales by len_temp -- confirm
+      }
+      // first level
+      input_nand(s1, s2, l_eff);
+    }
+
+
+    if (option != 1)
+    {
+      continue; // only option 1 has a second (vertical) link to account for
+    }
+
+    // second level
+    delay += wtemp2->delay;
+    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*wire_bw;
+    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    // NOTE(review): both branches below add wtemp2's leakage and gate leakage a second time (already added just above) -- confirm this is intended.
+    if (uca_tree)
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    }
+    else
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+      wire_bw*=2;
+
+      if (ht_temp > wtemp3->repeater_spacing)
+      {
+        s3    = wtemp3->repeater_size;
+        l_eff = wtemp3->repeater_spacing;
+      }
+      else
+      {
+        s3    = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
+        l_eff = ht_temp;
+      }
+
+      input_nand(s2, s3, l_eff);
+    }
+  }
+
+  if (wtemp1) delete wtemp1;
+  if (wtemp2) delete wtemp2;
+  if (wtemp3) delete wtemp3;
+}
+
+
+// Output-side counterpart of in_htree(): same area formulas and level loop, but each node calls output_buffer() and searchOp.dynamic is scaled by init_wire_bw.
+/* a tristate buffer is used to handle fan-ins
+ * The area of an unbalanced htree (rows != columns)
+ * depends on how data is traversed.
+ * In the following function, if ( no. of rows < no. of columns),
+ * then data first traverse in excess hor. links until vertical
+ * and horizontal nodes are same.
+ * If no. of rows is bigger, then data traverse in
+ * a hor. link followed by a ver. link in a repeated
+ * fashion (similar to a balanced tree) until there are no
+ * hor. links left. After this it goes through the remaining vertical
+ * links.
+ */
+void Htree2::out_htree()
+{
+  //temp var
+  double s1 = 0, s2 = 0, s3 = 0;
+  double l_eff = 0;
+  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
+  double len = 0, ht = 0;
+  int option = 0; // 0: one horizontal link, 1: horizontal + vertical link, 2: one vertical link (set per iteration below)
+
+  int h = (int) _log2(ndwl/2);
+  int v = (int) _log2(ndbl/2);
+  double len_temp;
+  double ht_temp;
+  if (uca_tree)
+  {
+    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,h))))/2;
+    len_temp = (mat_width*ndwl/2 +
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,v))))/2;
+  }
+  else
+  {
+    if (ndwl == ndbl) {
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits+ (search_data_in_bits + search_data_out_bits)) * (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+
+    }
+    else if (ndwl > ndbl) {
+      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
+          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else {
+      double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
+    }
+  }
+  area.h = ht_temp * 2;
+  area.w = len_temp * 2;
+  delay = 0;
+  power.readOp.dynamic = 0;
+  power.readOp.leakage = 0;
+  power.readOp.gate_leakage = 0; // NOTE(review): power.searchOp.dynamic is not reset here, although in_htree() resets it -- confirm
+  //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+  len = len_temp;
+  ht = ht_temp/2;
+
+  while (v > 0 || h > 0)
+  { //finds delay/power of each link in the tree
+    if (wtemp1) delete wtemp1; // release the previous iteration's wires (all three pointers start out null)
+    if (wtemp2) delete wtemp2;
+    if (wtemp3) delete wtemp3;
+
+    if(h > v) {
+      //the iteration considers only one horizontal link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, len/2);  // ver
+      len_temp = len;
+      len /= 2;
+      wtemp3 = 0;
+      h--;
+      option = 0;
+    }
+    else if (v>0 && h>0) {
+      //considers one horizontal link and one vertical link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, ht);  // ver
+      wtemp3 = new Wire(wt, len/2);  // next hor
+      len_temp = len;
+      ht_temp = ht;
+      len /= 2;
+      ht  /= 2;
+      v--;
+      h--;
+      option = 1;
+    }
+    else {
+      // considers only one vertical link
+      assert(h == 0);
+      wtemp1 = new Wire(wt, ht); // hor -- NOTE(review): built from ht in the "one vertical link" branch; in_htree() tags the same statement "ver"
+      wtemp2 = new Wire(wt, ht/2);  // ver -- NOTE(review): in_htree() tags the same statement "hor"
+      ht_temp = ht;
+      ht /= 2;
+      wtemp3 = 0;
+      v--;
+      option = 2;
+    }
+    delay += wtemp1->delay;
+    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*init_wire_bw; // scaled by init_wire_bw here; in_htree() uses the running wire_bw
+    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    if ((uca_tree == false && option == 2) || search_tree==true)
+    {
+      wire_bw*=2;
+    }
+
+    if (uca_tree == false)
+    {
+      if (len_temp > wtemp1->repeater_spacing)
+      {
+        s1 = wtemp1->repeater_size;
+        l_eff = wtemp1->repeater_spacing;
+      }
+      else
+      {
+        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
+        l_eff = len_temp;
+      }
+      if (ht_temp > wtemp2->repeater_spacing) // NOTE(review): ht_temp is not updated in the option == 0 branch above, so an earlier value is tested here -- confirm
+      {
+        s2 = wtemp2->repeater_size;
+      }
+      else
+      {
+        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size; // NOTE(review): the condition tests ht_temp but this scales by len_temp -- confirm
+      }
+      // first level
+      output_buffer(s1, s2, l_eff);
+    }
+
+
+    if (option != 1)
+    {
+      continue; // only option 1 has a second (vertical) link to account for
+    }
+
+    // second level
+    delay += wtemp2->delay;
+    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*init_wire_bw;
+    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    if (uca_tree) // NOTE(review): both branches add wtemp2's leakage and gate leakage a second time (already added just above) -- confirm
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    }
+    else
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+      wire_bw*=2;
+
+      if (ht_temp > wtemp3->repeater_spacing)
+      {
+        s3    = wtemp3->repeater_size;
+        l_eff = wtemp3->repeater_spacing;
+      }
+      else
+      {
+        s3    = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
+        l_eff = ht_temp;
+      }
+
+      output_buffer(s2, s3, l_eff);
+    }
+    //cout<<"power.readOp.leakage"<<power.readOp.leakage<<endl;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    //cout<<"wtemp2->power.readOp.gate_leakage"<<wtemp2->power.readOp.gate_leakage<<endl;
+  }
+
+  if (wtemp1) delete wtemp1;
+  if (wtemp2) delete wtemp2;
+  if (wtemp3) delete wtemp3;
+}
+
diff --git a/ext/mcpat/cacti/htree2.h b/ext/mcpat/cacti/htree2.h
new file mode 100644
index 000000000..053e43a27
--- /dev/null
+++ b/ext/mcpat/cacti/htree2.h
@@ -0,0 +1,97 @@
/*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/


#ifndef __HTREE2_H__
#define __HTREE2_H__

#include "assert.h"
#include "basic_circuit.h"
#include "cacti_interface.h"
#include "component.h"
#include "parameter.h"
#include "subarray.h"
#include "wire.h"

// leakage power includes the entire htree in a bank (when uca_tree == false)
// leakage power includes only the part to one bank when uca_tree == true

// H-tree interconnect model, derived from Component.
// NOTE(review): the member definitions live in the matching .cc file; the
// per-member notes below are taken from names and existing comments only
// and should be confirmed against that implementation.
class Htree2 : public Component
{
  public:
    // wire_model         : wire type used for the tree links
    // mat_w, mat_h       : mat width / height
    // add, data_in, search_data_in, data_out, search_data_out
    //                    : bus widths in bits (presumably stored in the
    //                      *_bits members below -- TODO confirm)
    // bl, wl             : presumably Ndbl / Ndwl -- TODO confirm
    // h_type             : kind of tree to build
    // uca_tree_, search_tree_ : see the uca_tree / search_tree members
    // dt                 : device type; defaults to the global peripheral
    //                      device parameters
    Htree2(enum Wire_type wire_model,
        double mat_w, double mat_h, int add, int data_in, int search_data_in, int data_out, int search_data_out, int bl, int wl,
        enum Htree_type h_type, bool uca_tree_ = false, bool search_tree_ = false,
        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
    ~Htree2() {};

    void in_htree();
    void out_htree();

    // repeaters only at h-tree nodes
    void limited_in_htree();
    void limited_out_htree();
    void input_nand(double s1, double s2, double l);
    void output_buffer(double s1, double s2, double l);

    double in_rise_time, out_rise_time;

    // Stores rt as the input rise time; no other state is touched.
    void set_in_rise_time(double rt)
    {
      in_rise_time = rt;
    }

    double max_unpipelined_link_delay;
    powerDef power_bit;


  private:
    double wire_bw;
    double init_wire_bw;  // bus width at root
    enum Htree_type tree_type;
    double htree_hnodes;
    double htree_vnodes;
    double mat_width;
    double mat_height;
    int add_bits, data_in_bits,search_data_in_bits,data_out_bits, search_data_out_bits;
    int ndbl, ndwl;
    bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
    bool search_tree;

    enum Wire_type wt;
    double min_w_nmos;
    double min_w_pmos;

    TechnologyParameter::DeviceType *deviceType;

};

#endif
diff --git a/ext/mcpat/cacti/io.cc b/ext/mcpat/cacti/io.cc
new file mode 100644
index 000000000..56725ab7c
--- /dev/null
+++ b/ext/mcpat/cacti/io.cc
@@ -0,0 +1,2350 @@
/*****************************************************************************
 *                                McPAT/CACTI
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <fstream> +#include <iostream> +#include <sstream> + +#include "Ucache.h" +#include "arbiter.h" +#include "area.h" +#include "basic_circuit.h" +#include "crossbar.h" +#include "io.h" +#include "nuca.h" +#include "parameter.h" +//#include "highradix.h" + +using namespace std; + + +/* Parses "cache.cfg" file */ + void +InputParameter::parse_cfg(const string & in_file) +{ + FILE *fp = fopen(in_file.c_str(), "r"); + char line[5000]; + char jk[5000]; + char temp_var[5000]; + + if(!fp) { + cout << in_file << " is missing!\n"; + exit(-1); + } + + while(fscanf(fp, "%[^\n]\n", line) != EOF) { + + if (!strncmp("-size", line, strlen("-size"))) { + sscanf(line, "-size %[(:-~)*]%u", jk, &(cache_sz)); + continue; + } + + if (!strncmp("-page size", line, strlen("-page size"))) { + sscanf(line, "-page size %[(:-~)*]%u", jk, &(page_sz_bits)); + continue; + } + + if (!strncmp("-burst length", line, strlen("-burst length"))) { + sscanf(line, "-burst %[(:-~)*]%u", jk, &(burst_len)); + continue; + } + + if (!strncmp("-internal prefetch width", line, strlen("-internal prefetch width"))) { + sscanf(line, "-internal prefetch %[(:-~)*]%u", jk, &(int_prefetch_w)); + continue; + } + + if (!strncmp("-block", line, strlen("-block"))) { + sscanf(line, "-block size (bytes) %d", &(line_sz)); + continue; + } + + if (!strncmp("-associativity", line, 
strlen("-associativity"))) { + sscanf(line, "-associativity %d", &(assoc)); + continue; + } + + if (!strncmp("-read-write", line, strlen("-read-write"))) { + sscanf(line, "-read-write port %d", &(num_rw_ports)); + continue; + } + + if (!strncmp("-exclusive read", line, strlen("exclusive read"))) { + sscanf(line, "-exclusive read port %d", &(num_rd_ports)); + continue; + } + + if(!strncmp("-exclusive write", line, strlen("-exclusive write"))) { + sscanf(line, "-exclusive write port %d", &(num_wr_ports)); + continue; + } + + if (!strncmp("-single ended", line, strlen("-single ended"))) { + sscanf(line, "-single %[(:-~)*]%d", jk, + &(num_se_rd_ports)); + continue; + } + + if (!strncmp("-search", line, strlen("-search"))) { + sscanf(line, "-search port %d", &(num_search_ports)); + continue; + } + + if (!strncmp("-UCA bank", line, strlen("-UCA bank"))) { + sscanf(line, "-UCA bank%[((:-~)| )*]%d", jk, &(nbanks)); + continue; + } + + if (!strncmp("-technology", line, strlen("-technology"))) { + sscanf(line, "-technology (u) %lf", &(F_sz_um)); + F_sz_nm = F_sz_um*1000; + continue; + } + + if (!strncmp("-output/input", line, strlen("-output/input"))) { + sscanf(line, "-output/input bus %[(:-~)*]%d", jk, &(out_w)); + continue; + } + + if (!strncmp("-operating temperature", line, strlen("-operating temperature"))) { + sscanf(line, "-operating temperature %[(:-~)*]%d", jk, &(temp)); + continue; + } + + if (!strncmp("-cache type", line, strlen("-cache type"))) { + sscanf(line, "-cache type%[^\"]\"%[^\"]\"", jk, temp_var); + + if (!strncmp("cache", temp_var, sizeof("cache"))) { + is_cache = true; + } + else + { + is_cache = false; + } + + if (!strncmp("main memory", temp_var, sizeof("main memory"))) { + is_main_mem = true; + } + else { + is_main_mem = false; + } + + if (!strncmp("cam", temp_var, sizeof("cam"))) { + pure_cam = true; + } + else { + pure_cam = false; + } + + if (!strncmp("ram", temp_var, sizeof("ram"))) { + pure_ram = true; + } + else { + if (!is_main_mem) + 
pure_ram = false; + else + pure_ram = true; + } + + continue; + } + + + if (!strncmp("-tag size", line, strlen("-tag size"))) { + sscanf(line, "-tag size%[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("default", temp_var, sizeof("default"))) { + specific_tag = false; + tag_w = 42; /* the acutal value is calculated + * later based on the cache size, bank count, and associativity + */ + } + else { + specific_tag = true; + sscanf(line, "-tag size (b) %d", &(tag_w)); + } + continue; + } + + if (!strncmp("-access mode", line, strlen("-access mode"))) { + sscanf(line, "-access %[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("fast", temp_var, strlen("fast"))) { + access_mode = 2; + } + else if (!strncmp("sequential", temp_var, strlen("sequential"))) { + access_mode = 1; + } + else if(!strncmp("normal", temp_var, strlen("normal"))) { + access_mode = 0; + } + else { + cout << "ERROR: Invalid access mode!\n"; + exit(0); + } + continue; + } + + if (!strncmp("-Data array cell type", line, strlen("-Data array cell type"))) { + sscanf(line, "-Data array cell type %[^\"]\"%[^\"]\"", jk, temp_var); + + if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) { + data_arr_ram_cell_tech_type = 0; + } + else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) { + data_arr_ram_cell_tech_type = 1; + } + else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) { + data_arr_ram_cell_tech_type = 2; + } + else if(!strncmp("lp-dram", temp_var, strlen("lp-dram"))) { + data_arr_ram_cell_tech_type = 3; + } + else if(!strncmp("comm-dram", temp_var, strlen("comm-dram"))) { + data_arr_ram_cell_tech_type = 4; + } + else { + cout << "ERROR: Invalid type!\n"; + exit(0); + } + continue; + } + + if (!strncmp("-Data array peripheral type", line, strlen("-Data array peripheral type"))) { + sscanf(line, "-Data array peripheral type %[^\"]\"%[^\"]\"", jk, temp_var); + + if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) { + data_arr_peri_global_tech_type = 0; + } + else if(!strncmp("itrs-lstp", 
temp_var, strlen("itrs-lstp"))) { + data_arr_peri_global_tech_type = 1; + } + else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) { + data_arr_peri_global_tech_type = 2; + } + else { + cout << "ERROR: Invalid type!\n"; + exit(0); + } + continue; + } + + if (!strncmp("-Tag array cell type", line, strlen("-Tag array cell type"))) { + sscanf(line, "-Tag array cell type %[^\"]\"%[^\"]\"", jk, temp_var); + + if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) { + tag_arr_ram_cell_tech_type = 0; + } + else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) { + tag_arr_ram_cell_tech_type = 1; + } + else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) { + tag_arr_ram_cell_tech_type = 2; + } + else if(!strncmp("lp-dram", temp_var, strlen("lp-dram"))) { + tag_arr_ram_cell_tech_type = 3; + } + else if(!strncmp("comm-dram", temp_var, strlen("comm-dram"))) { + tag_arr_ram_cell_tech_type = 4; + } + else { + cout << "ERROR: Invalid type!\n"; + exit(0); + } + continue; + } + + if (!strncmp("-Tag array peripheral type", line, strlen("-Tag array peripheral type"))) { + sscanf(line, "-Tag array peripheral type %[^\"]\"%[^\"]\"", jk, temp_var); + + if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) { + tag_arr_peri_global_tech_type = 0; + } + else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) { + tag_arr_peri_global_tech_type = 1; + } + else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) { + tag_arr_peri_global_tech_type = 2; + } + else { + cout << "ERROR: Invalid type!\n"; + exit(0); + } + continue; + } + if(!strncmp("-design", line, strlen("-design"))) { + sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk, + &(delay_wt), &(dynamic_power_wt), + &(leakage_power_wt), + &(cycle_time_wt), &(area_wt)); + continue; + } + + if(!strncmp("-deviate", line, strlen("-deviate"))) { + sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk, + &(delay_dev), &(dynamic_power_dev), + &(leakage_power_dev), + &(cycle_time_dev), &(area_dev)); + continue; + } + + 
if(!strncmp("-Optimize", line, strlen("-Optimize"))) { + sscanf(line, "-Optimize %[^\"]\"%[^\"]\"", jk, temp_var); + + if(!strncmp("ED^2", temp_var, strlen("ED^2"))) { + ed = 2; + } + else if(!strncmp("ED", temp_var, strlen("ED"))) { + ed = 1; + } + else { + ed = 0; + } + } + + if(!strncmp("-NUCAdesign", line, strlen("-NUCAdesign"))) { + sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk, + &(delay_wt_nuca), &(dynamic_power_wt_nuca), + &(leakage_power_wt_nuca), + &(cycle_time_wt_nuca), &(area_wt_nuca)); + continue; + } + + if(!strncmp("-NUCAdeviate", line, strlen("-NUCAdeviate"))) { + sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk, + &(delay_dev_nuca), &(dynamic_power_dev_nuca), + &(leakage_power_dev_nuca), + &(cycle_time_dev_nuca), &(area_dev_nuca)); + continue; + } + + if(!strncmp("-Cache model", line, strlen("-cache model"))) { + sscanf(line, "-Cache model %[^\"]\"%[^\"]\"", jk, temp_var); + + if (!strncmp("UCA", temp_var, strlen("UCA"))) { + nuca = 0; + } + else { + nuca = 1; + } + continue; + } + + if(!strncmp("-NUCA bank", line, strlen("-NUCA bank"))) { + sscanf(line, "-NUCA bank count %d", &(nuca_bank_count)); + + if (nuca_bank_count != 0) { + force_nuca_bank = 1; + } + continue; + } + + if(!strncmp("-Wire inside mat", line, strlen("-Wire inside mat"))) { + sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var); + + if (!strncmp("global", temp_var, strlen("global"))) { + wire_is_mat_type = 2; + continue; + } + else if (!strncmp("local", temp_var, strlen("local"))) { + wire_is_mat_type = 0; + continue; + } + else { + wire_is_mat_type = 1; + continue; + } + } + + if(!strncmp("-Wire outside mat", line, strlen("-Wire outside mat"))) { + sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var); + + if (!strncmp("global", temp_var, strlen("global"))) { + wire_os_mat_type = 2; + } + else { + wire_os_mat_type = 1; + } + continue; + } + + if(!strncmp("-Interconnect projection", line, strlen("-Interconnect projection"))) { + sscanf(line, "-Interconnect 
projection%[^\"]\"%[^\"]\"", jk, temp_var); + + if (!strncmp("aggressive", temp_var, strlen("aggressive"))) { + ic_proj_type = 0; + } + else { + ic_proj_type = 1; + } + continue; + } + + if(!strncmp("-Wire signalling", line, strlen("-wire signalling"))) { + sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var); + + if (!strncmp("default", temp_var, strlen("default"))) { + force_wiretype = 0; + wt = Global; + } + else if (!(strncmp("Global_10", temp_var, strlen("Global_10")))) { + force_wiretype = 1; + wt = Global_10; + } + else if (!(strncmp("Global_20", temp_var, strlen("Global_20")))) { + force_wiretype = 1; + wt = Global_20; + } + else if (!(strncmp("Global_30", temp_var, strlen("Global_30")))) { + force_wiretype = 1; + wt = Global_30; + } + else if (!(strncmp("Global_5", temp_var, strlen("Global_5")))) { + force_wiretype = 1; + wt = Global_5; + } + else if (!(strncmp("Global", temp_var, strlen("Global")))) { + force_wiretype = 1; + wt = Global; + } + else { + wt = Low_swing; + force_wiretype = 1; + } + continue; + } + + + + if(!strncmp("-Core", line, strlen("-Core"))) { + sscanf(line, "-Core count %d\n", &(cores)); + if (cores > 16) { + printf("No. 
of cores should be less than 16!\n"); + } + continue; + } + + if(!strncmp("-Cache level", line, strlen("-Cache level"))) { + sscanf(line, "-Cache l%[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("L2", temp_var, strlen("L2"))) { + cache_level = 0; + } + else { + cache_level = 1; + } + } + + if(!strncmp("-Print level", line, strlen("-Print level"))) { + sscanf(line, "-Print l%[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("DETAILED", temp_var, strlen("DETAILED"))) { + print_detail = 1; + } + else { + print_detail = 0; + } + + } + if(!strncmp("-Add ECC", line, strlen("-Add ECC"))) { + sscanf(line, "-Add ECC %[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("true", temp_var, strlen("true"))) { + add_ecc_b_ = true; + } + else { + add_ecc_b_ = false; + } + } + + if(!strncmp("-Print input parameters", line, strlen("-Print input parameters"))) { + sscanf(line, "-Print input %[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("true", temp_var, strlen("true"))) { + print_input_args = true; + } + else { + print_input_args = false; + } + } + + if(!strncmp("-Force cache config", line, strlen("-Force cache config"))) { + sscanf(line, "-Force cache %[^\"]\"%[^\"]\"", jk, temp_var); + if (!strncmp("true", temp_var, strlen("true"))) { + force_cache_config = true; + } + else { + force_cache_config = false; + } + } + + if(!strncmp("-Ndbl", line, strlen("-Ndbl"))) { + sscanf(line, "-Ndbl %d\n", &(ndbl)); + continue; + } + if(!strncmp("-Ndwl", line, strlen("-Ndwl"))) { + sscanf(line, "-Ndwl %d\n", &(ndwl)); + continue; + } + if(!strncmp("-Nspd", line, strlen("-Nspd"))) { + sscanf(line, "-Nspd %d\n", &(nspd)); + continue; + } + if(!strncmp("-Ndsam1", line, strlen("-Ndsam1"))) { + sscanf(line, "-Ndsam1 %d\n", &(ndsam1)); + continue; + } + if(!strncmp("-Ndsam2", line, strlen("-Ndsam2"))) { + sscanf(line, "-Ndsam2 %d\n", &(ndsam2)); + continue; + } + if(!strncmp("-Ndcm", line, strlen("-Ndcm"))) { + sscanf(line, "-Ndcm %d\n", &(ndcm)); + continue; + } + + } + rpters_in_htree = true; + 
fclose(fp); +} + + void +InputParameter::display_ip() +{ + cout << "Cache size : " << cache_sz << endl; + cout << "Block size : " << line_sz << endl; + cout << "Associativity : " << assoc << endl; + cout << "Read only ports : " << num_rd_ports << endl; + cout << "Write only ports : " << num_wr_ports << endl; + cout << "Read write ports : " << num_rw_ports << endl; + cout << "Single ended read ports : " << num_se_rd_ports << endl; + if (fully_assoc||pure_cam) + { + cout << "Search ports : " << num_search_ports << endl; + } + cout << "Cache banks (UCA) : " << nbanks << endl; + cout << "Technology : " << F_sz_um << endl; + cout << "Temperature : " << temp << endl; + cout << "Tag size : " << tag_w << endl; + if (is_cache) { + cout << "array type : " << "Cache" << endl; + } + if (pure_ram) { + cout << "array type : " << "Scratch RAM" << endl; + } + if (pure_cam) + { + cout << "array type : " << "CAM" << endl; + } + cout << "Model as memory : " << is_main_mem << endl; + cout << "Access mode : " << access_mode << endl; + cout << "Data array cell type : " << data_arr_ram_cell_tech_type << endl; + cout << "Data array peripheral type : " << data_arr_peri_global_tech_type << endl; + cout << "Tag array cell type : " << tag_arr_ram_cell_tech_type << endl; + cout << "Tag array peripheral type : " << tag_arr_peri_global_tech_type << endl; + cout << "Optimization target : " << ed << endl; + cout << "Design objective (UCA wt) : " << delay_wt << " " + << dynamic_power_wt << " " << leakage_power_wt << " " << cycle_time_wt + << " " << area_wt << endl; + cout << "Design objective (UCA dev) : " << delay_dev << " " + << dynamic_power_dev << " " << leakage_power_dev << " " << cycle_time_dev + << " " << area_dev << endl; + if (nuca) + { + cout << "Cores : " << cores << endl; + + + cout << "Design objective (NUCA wt) : " << delay_wt_nuca << " " + << dynamic_power_wt_nuca << " " << leakage_power_wt_nuca << " " << cycle_time_wt_nuca + << " " << area_wt_nuca << endl; + cout << "Design 
objective (NUCA dev) : " << delay_dev_nuca << " " + << dynamic_power_dev_nuca << " " << leakage_power_dev_nuca << " " << cycle_time_dev_nuca + << " " << area_dev_nuca << endl; + } + cout << "Cache model : " << nuca << endl; + cout << "Nuca bank : " << nuca_bank_count << endl; + cout << "Wire inside mat : " << wire_is_mat_type << endl; + cout << "Wire outside mat : " << wire_os_mat_type << endl; + cout << "Interconnect projection : " << ic_proj_type << endl; + cout << "Wire signalling : " << force_wiretype << endl; + cout << "Print level : " << print_detail << endl; + cout << "ECC overhead : " << add_ecc_b_ << endl; + cout << "Page size : " << page_sz_bits << endl; + cout << "Burst length : " << burst_len << endl; + cout << "Internal prefetch width : " << int_prefetch_w << endl; + cout << "Force cache config : " << g_ip->force_cache_config << endl; + if (g_ip->force_cache_config) { + cout << "Ndwl : " << g_ip->ndwl << endl; + cout << "Ndbl : " << g_ip->ndbl << endl; + cout << "Nspd : " << g_ip->nspd << endl; + cout << "Ndcm : " << g_ip->ndcm << endl; + cout << "Ndsam1 : " << g_ip->ndsam1 << endl; + cout << "Ndsam2 : " << g_ip->ndsam2 << endl; + } +} + + + +powerComponents operator+(const powerComponents & x, const powerComponents & y) +{ + powerComponents z; + + z.dynamic = x.dynamic + y.dynamic; + z.leakage = x.leakage + y.leakage; + z.gate_leakage = x.gate_leakage + y.gate_leakage; + z.short_circuit = x.short_circuit + y.short_circuit; + z.longer_channel_leakage = x.longer_channel_leakage + y.longer_channel_leakage; + + return z; +} + +powerComponents operator*(const powerComponents & x, double const * const y) +{ + powerComponents z; + + z.dynamic = x.dynamic*y[0]; + z.leakage = x.leakage*y[1]; + z.gate_leakage = x.gate_leakage*y[2]; + z.short_circuit = x.short_circuit*y[3]; + z.longer_channel_leakage = x.longer_channel_leakage*y[1];//longer channel leakage has the same behavior as normal leakage + + return z; +} + + +powerDef operator+(const powerDef & x, const 
powerDef & y) +{ + powerDef z; + + z.readOp = x.readOp + y.readOp; + z.writeOp = x.writeOp + y.writeOp; + z.searchOp = x.searchOp + y.searchOp; + return z; +} + +powerDef operator*(const powerDef & x, double const * const y) +{ + powerDef z; + + z.readOp = x.readOp*y; + z.writeOp = x.writeOp*y; + z.searchOp = x.searchOp*y; + return z; +} + +uca_org_t cacti_interface(const string & infile_name) +{ + + uca_org_t fin_res; + //uca_org_t result; + fin_res.valid = false; + + g_ip = new InputParameter(); + g_ip->parse_cfg(infile_name); + if(!g_ip->error_checking()) + exit(0); + if (g_ip->print_input_args) + g_ip->display_ip(); + + init_tech_params(g_ip->F_sz_um, false); + Wire winit; // Do not delete this line. It initializes wires. + + +// For HighRadix Only +// //// Wire wirea(g_ip->wt, 1000); +// //// wirea.print_wire(); +// //// cout << "Wire Area " << wirea.area.get_area() << " sq. u" << endl; +// // winit.print_wire(); +// // +// HighRadix *hr; +// hr = new HighRadix(); +// hr->compute_power(); +// hr->print_router(); +// exit(0); +// +// double sub_switch_sz = 2; +// double rows = 32; +// for (int i=0; i<6; i++) { +// sub_switch_sz = pow(2, i); +// rows = 64/sub_switch_sz; +// hr = new HighRadix(sub_switch_sz, rows, .8/* freq */, 64, 2, 64, 0.7); +// hr->compute_power(); +// hr->print_router(); +// delete hr; +// } +// // HighRadix yarc; +// // yarc.compute_power(); +// // yarc.print_router(); +// winit.print_wire(); +// exit(0); +// For HighRadix Only End + + if (g_ip->nuca == 1) + { + Nuca n(&g_tp.peri_global); + n.sim_nuca(); + } + g_ip->display_ip(); + solve(&fin_res); + + output_UCA(&fin_res); + output_data_csv(fin_res); + + delete (g_ip); + return fin_res; +} + +//cacti6.5's plain interface, please keep !!! 
+uca_org_t cacti_interface( + int cache_size, + int line_size, + int associativity, + int rw_ports, + int excl_read_ports, + int excl_write_ports, + int single_ended_read_ports, + int banks, + double tech_node, // in nm + int page_sz, + int burst_length, + int pre_width, + int output_width, + int specific_tag, + int tag_width, + int access_mode, //0 normal, 1 seq, 2 fast + int cache, //scratch ram or cache + int main_mem, + int obj_func_delay, + int obj_func_dynamic_power, + int obj_func_leakage_power, + int obj_func_area, + int obj_func_cycle_time, + int dev_func_delay, + int dev_func_dynamic_power, + int dev_func_leakage_power, + int dev_func_area, + int dev_func_cycle_time, + int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate + int temp, + int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing + int data_arr_ram_cell_tech_flavor_in, // 0-4 + int data_arr_peri_global_tech_flavor_in, + int tag_arr_ram_cell_tech_flavor_in, + int tag_arr_peri_global_tech_flavor_in, + int interconnect_projection_type_in, // 0 - aggressive, 1 - normal + int wire_inside_mat_type_in, + int wire_outside_mat_type_in, + int is_nuca, // 0 - UCA, 1 - NUCA + int core_count, + int cache_level, // 0 - L2, 1 - L3 + int nuca_bank_count, + int nuca_obj_func_delay, + int nuca_obj_func_dynamic_power, + int nuca_obj_func_leakage_power, + int nuca_obj_func_area, + int nuca_obj_func_cycle_time, + int nuca_dev_func_delay, + int nuca_dev_func_dynamic_power, + int nuca_dev_func_leakage_power, + int nuca_dev_func_area, + int nuca_dev_func_cycle_time, + int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported + int p_input) +{ + g_ip = new InputParameter(); + g_ip->add_ecc_b_ = true; + + g_ip->data_arr_ram_cell_tech_type = data_arr_ram_cell_tech_flavor_in; + g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in; + g_ip->tag_arr_ram_cell_tech_type = 
tag_arr_ram_cell_tech_flavor_in; + g_ip->tag_arr_peri_global_tech_type = tag_arr_peri_global_tech_flavor_in; + + g_ip->ic_proj_type = interconnect_projection_type_in; + g_ip->wire_is_mat_type = wire_inside_mat_type_in; + g_ip->wire_os_mat_type = wire_outside_mat_type_in; + g_ip->burst_len = burst_length; + g_ip->int_prefetch_w = pre_width; + g_ip->page_sz_bits = page_sz; + + g_ip->cache_sz = cache_size; + g_ip->line_sz = line_size; + g_ip->assoc = associativity; + g_ip->nbanks = banks; + g_ip->out_w = output_width; + g_ip->specific_tag = specific_tag; + if (tag_width == 0) { + g_ip->tag_w = 42; + } + else { + g_ip->tag_w = tag_width; + } + + g_ip->access_mode = access_mode; + g_ip->delay_wt = obj_func_delay; + g_ip->dynamic_power_wt = obj_func_dynamic_power; + g_ip->leakage_power_wt = obj_func_leakage_power; + g_ip->area_wt = obj_func_area; + g_ip->cycle_time_wt = obj_func_cycle_time; + g_ip->delay_dev = dev_func_delay; + g_ip->dynamic_power_dev = dev_func_dynamic_power; + g_ip->leakage_power_dev = dev_func_leakage_power; + g_ip->area_dev = dev_func_area; + g_ip->cycle_time_dev = dev_func_cycle_time; + g_ip->ed = ed_ed2_none; + + switch(wt) { + case (0): + g_ip->force_wiretype = 0; + g_ip->wt = Global; + break; + case (1): + g_ip->force_wiretype = 1; + g_ip->wt = Global; + break; + case (2): + g_ip->force_wiretype = 1; + g_ip->wt = Global_5; + break; + case (3): + g_ip->force_wiretype = 1; + g_ip->wt = Global_10; + break; + case (4): + g_ip->force_wiretype = 1; + g_ip->wt = Global_20; + break; + case (5): + g_ip->force_wiretype = 1; + g_ip->wt = Global_30; + break; + case (6): + g_ip->force_wiretype = 1; + g_ip->wt = Low_swing; + break; + default: + cout << "Unknown wire type!\n"; + exit(0); + } + + g_ip->delay_wt_nuca = nuca_obj_func_delay; + g_ip->dynamic_power_wt_nuca = nuca_obj_func_dynamic_power; + g_ip->leakage_power_wt_nuca = nuca_obj_func_leakage_power; + g_ip->area_wt_nuca = nuca_obj_func_area; + g_ip->cycle_time_wt_nuca = nuca_obj_func_cycle_time; + 
g_ip->delay_dev_nuca = dev_func_delay; + g_ip->dynamic_power_dev_nuca = nuca_dev_func_dynamic_power; + g_ip->leakage_power_dev_nuca = nuca_dev_func_leakage_power; + g_ip->area_dev_nuca = nuca_dev_func_area; + g_ip->cycle_time_dev_nuca = nuca_dev_func_cycle_time; + g_ip->nuca = is_nuca; + g_ip->nuca_bank_count = nuca_bank_count; + if(nuca_bank_count > 0) { + g_ip->force_nuca_bank = 1; + } + g_ip->cores = core_count; + g_ip->cache_level = cache_level; + + g_ip->temp = temp; + + g_ip->F_sz_nm = tech_node; + g_ip->F_sz_um = tech_node / 1000; + g_ip->is_main_mem = (main_mem != 0) ? true : false; + g_ip->is_cache = (cache != 0) ? true : false; + g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false; + + g_ip->num_rw_ports = rw_ports; + g_ip->num_rd_ports = excl_read_ports; + g_ip->num_wr_ports = excl_write_ports; + g_ip->num_se_rd_ports = single_ended_read_ports; + g_ip->print_detail = 1; + g_ip->nuca = 0; + + g_ip->wt = Global_5; + g_ip->force_cache_config = false; + g_ip->force_wiretype = false; + g_ip->print_input_args = p_input; + + + uca_org_t fin_res; + fin_res.valid = false; + + if (g_ip->error_checking() == false) exit(0); + if (g_ip->print_input_args) + g_ip->display_ip(); + init_tech_params(g_ip->F_sz_um, false); + Wire winit; // Do not delete this line. It initializes wires. + + if (g_ip->nuca == 1) + { + Nuca n(&g_tp.peri_global); + n.sim_nuca(); + } + solve(&fin_res); + + output_UCA(&fin_res); + + delete (g_ip); + return fin_res; +} + +//McPAT's plain interface, please keep !!! 
/* Plain (argument-list) entry point used by McPAT.
 *
 * Allocates the global g_ip, copies the arguments into it, validates it
 * (exit(0) on failure), initialises the technology parameters, always
 * prints the parameters, solves the organisation, prints the result and
 * its CSV form, and deletes g_ip before returning.
 *
 * cache selects the array kind: 1 cache, 0 plain RAM, 2 CAM.
 * The wire type comes from force_wiretype / wiretype; the wt argument is
 * not read anywhere in this function.
 */
uca_org_t cacti_interface(
    int cache_size,
    int line_size,
    int associativity,
    int rw_ports,
    int excl_read_ports,// para5
    int excl_write_ports,
    int single_ended_read_ports,
    int search_ports,
    int banks,
    double tech_node,//para10
    int output_width,
    int specific_tag,
    int tag_width,
    int access_mode,
    int cache, //para15
    int main_mem,
    int obj_func_delay,
    int obj_func_dynamic_power,
    int obj_func_leakage_power,
    int obj_func_cycle_time, //para20
    int obj_func_area,
    int dev_func_delay,
    int dev_func_dynamic_power,
    int dev_func_leakage_power,
    int dev_func_area, //para25
    int dev_func_cycle_time,
    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
    int temp,
    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
    int data_arr_ram_cell_tech_flavor_in,//para30
    int data_arr_peri_global_tech_flavor_in,
    int tag_arr_ram_cell_tech_flavor_in,
    int tag_arr_peri_global_tech_flavor_in,
    int interconnect_projection_type_in,
    int wire_inside_mat_type_in,//para35
    int wire_outside_mat_type_in,
    int REPEATERS_IN_HTREE_SEGMENTS_in,
    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
    int PAGE_SIZE_BITS_in,//para40
    int BURST_LENGTH_in,
    int INTERNAL_PREFETCH_WIDTH_in,
    int force_wiretype,
    int wiretype,
    int force_config,//para45
    int ndwl,
    int ndbl,
    int nspd,
    int ndcm,
    int ndsam1,//para50
    int ndsam2,
    int ecc)
{
  g_ip = new InputParameter();

  uca_org_t fin_res;
  fin_res.valid = false;

  g_ip->data_arr_ram_cell_tech_type = data_arr_ram_cell_tech_flavor_in;
  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
  g_ip->tag_arr_ram_cell_tech_type = tag_arr_ram_cell_tech_flavor_in;
  g_ip->tag_arr_peri_global_tech_type = tag_arr_peri_global_tech_flavor_in;

  g_ip->ic_proj_type = interconnect_projection_type_in;
  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
  g_ip->burst_len = BURST_LENGTH_in;
  g_ip->int_prefetch_w = INTERNAL_PREFETCH_WIDTH_in;
  g_ip->page_sz_bits = PAGE_SIZE_BITS_in;

  g_ip->cache_sz = cache_size;
  g_ip->line_sz = line_size;
  g_ip->assoc = associativity;
  g_ip->nbanks = banks;
  g_ip->out_w = output_width;
  g_ip->specific_tag = specific_tag;
  // without a caller-specified tag, store the 42-bit placeholder
  if (specific_tag == 0) {
    g_ip->tag_w = 42;
  }
  else {
    g_ip->tag_w = tag_width;
  }

  g_ip->access_mode = access_mode;
  g_ip->delay_wt = obj_func_delay;
  g_ip->dynamic_power_wt = obj_func_dynamic_power;
  g_ip->leakage_power_wt = obj_func_leakage_power;
  g_ip->area_wt = obj_func_area;
  g_ip->cycle_time_wt = obj_func_cycle_time;
  g_ip->delay_dev = dev_func_delay;
  g_ip->dynamic_power_dev = dev_func_dynamic_power;
  g_ip->leakage_power_dev = dev_func_leakage_power;
  g_ip->area_dev = dev_func_area;
  g_ip->cycle_time_dev = dev_func_cycle_time;
  g_ip->temp = temp;
  g_ip->ed = ed_ed2_none;

  g_ip->F_sz_nm = tech_node;
  g_ip->F_sz_um = tech_node / 1000;
  g_ip->is_main_mem = (main_mem != 0) ? true : false;
  // cache: 1 -> cache, 0 -> plain RAM, 2 -> CAM
  g_ip->is_cache = (cache ==1) ? true : false;
  g_ip->pure_ram = (cache ==0) ? true : false;
  g_ip->pure_cam = (cache ==2) ? true : false;
  g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false;
  g_ip->ver_htree_wires_over_array = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in;
  g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in;

  g_ip->num_rw_ports = rw_ports;
  g_ip->num_rd_ports = excl_read_ports;
  g_ip->num_wr_ports = excl_write_ports;
  g_ip->num_se_rd_ports = single_ended_read_ports;
  g_ip->num_search_ports = search_ports;

  g_ip->print_detail = 1;
  g_ip->nuca = 0;

  if (force_wiretype == 0)
  {
    g_ip->wt = Global;
    g_ip->force_wiretype = false;
  }
  else
  { g_ip->force_wiretype = true;
    // wiretype: 10/20/30/5 select the Global_NN wires, 0 selects low swing.
    // NOTE(review): any other value leaves g_ip->wt unassigned by this
    // function while force_wiretype is on -- confirm callers never do that.
    if (wiretype==10) {
      g_ip->wt = Global_10;
    }
    if (wiretype==20) {
      g_ip->wt = Global_20;
    }
    if (wiretype==30) {
      g_ip->wt = Global_30;
    }
    if (wiretype==5) {
      g_ip->wt = Global_5;
    }
    if (wiretype==0) {
      g_ip->wt = Low_swing;
    }
  }
  //g_ip->wt = Global_5;
  if (force_config == 0)
  {
    g_ip->force_cache_config = false;
  }
  else
  {
    // the partition arguments are copied only when a forced config is requested
    g_ip->force_cache_config = true;
    g_ip->ndbl=ndbl;
    g_ip->ndwl=ndwl;
    g_ip->nspd=nspd;
    g_ip->ndcm=ndcm;
    g_ip->ndsam1=ndsam1;
    g_ip->ndsam2=ndsam2;


  }

  if (ecc==0){
    g_ip->add_ecc_b_=false;
  }
  else
  {
    g_ip->add_ecc_b_=true;
  }


  if(!g_ip->error_checking())
    exit(0);

  init_tech_params(g_ip->F_sz_um, false);
  Wire winit; // Do not delete this line. It initializes wires.

  g_ip->display_ip();
  solve(&fin_res);
  output_UCA(&fin_res);
  output_data_csv(fin_res);
  delete (g_ip);

  return fin_res;
}



bool InputParameter::error_checking()
{
  int A;
  bool seq_access = false;
  fast_access = true;

  switch (access_mode)
  {
    case 0:
      seq_access = false;
      fast_access = false;
      break;
    case 1:
      seq_access = true;
      fast_access = false;
      break;
    case 2:
      seq_access = false;
      fast_access = true;
      break;
  }

  if(is_main_mem)
  {
    if(ic_proj_type == 0)
    {
      cerr << "DRAM model supports only conservative interconnect projection!\n\n";
      return false;
    }
  }


  uint32_t B = line_sz;

  if (B < 1)
  {
    cerr << "Block size must >= 1" << endl;
    return false;
  }
  else if (B*8 < out_w)
  {
    cerr << "Block size must be at least " << out_w/8 << endl;
    return false;
  }

  if (F_sz_um <= 0)
  {
    cerr << "Feature size must be > 0" << endl;
    return false;
  }
  else if (F_sz_um > 0.091)
  {
    cerr << "Feature size must be <= 90 nm" << endl;
    return false;
  }


  uint32_t RWP = num_rw_ports;
  uint32_t ERP = num_rd_ports;
  uint32_t EWP = num_wr_ports;
  uint32_t NSER = num_se_rd_ports;
  uint32_t SCHP = num_search_ports;

//TODO: revisit this. This is an important feature. Sheng thought this should be used
//  // If multiple banks and multiple ports are specified, then if number of ports is less than or equal to
//  // the number of banks, we assume that the multiple ports are implemented via the multiple banks.
//  // In such a case we assume that each bank has 1 RWP port.
+// if ((RWP + ERP + EWP) <= nbanks && nbanks>1) +// { +// RWP = 1; +// ERP = 0; +// EWP = 0; +// NSER = 0; +// } +// else if ((RWP < 0) || (EWP < 0) || (ERP < 0)) +// { +// cerr << "Ports must >=0" << endl; +// return false; +// } +// else if (RWP > 2) +// { +// cerr << "Maximum of 2 read/write ports" << endl; +// return false; +// } +// else if ((RWP+ERP+EWP) < 1) + // Changed to new implementation: + // The number of ports specified at input is per bank + if ((RWP+ERP+EWP) < 1) + { + cerr << "Must have at least one port" << endl; + return false; + } + + if (is_pow2(nbanks) == false) + { + cerr << "Number of subbanks should be greater than or equal to 1 and should be a power of 2" << endl; + return false; + } + + int C = cache_sz/nbanks; + if (C < 64) + { + cerr << "Cache size must >=64" << endl; + return false; + } + +//TODO: revisit this +// if (pure_ram==true && assoc!=1) +// { +// cerr << "Pure RAM must have assoc as 1" << endl; +// return false; +// } + + //fully assoc and cam check + if (is_cache && assoc==0) + fully_assoc =true; + else + fully_assoc = false; + + if (pure_cam==true && assoc!=0) + { + cerr << "Pure CAM must have associativity as 0" << endl; + return false; + } + + if (assoc==0 && (pure_cam==false && is_cache ==false)) + { + cerr << "Only CAM or Fully associative cache can have associativity as 0" << endl; + return false; + } + + if ((fully_assoc==true || pure_cam==true) + && (data_arr_ram_cell_tech_type!= tag_arr_ram_cell_tech_type + || data_arr_peri_global_tech_type != tag_arr_peri_global_tech_type )) + { + cerr << "CAM and fully associative cache must have same device type for both data and tag array" << endl; + return false; + } + + if ((fully_assoc==true || pure_cam==true) + && (data_arr_ram_cell_tech_type== lp_dram || data_arr_ram_cell_tech_type== comm_dram)) + { + cerr << "DRAM based CAM and fully associative cache are not supported" << endl; + return false; + } + + if ((fully_assoc==true || pure_cam==true) + && (is_main_mem==true)) + 
{ + cerr << "CAM and fully associative cache cannot be as main memory" << endl; + return false; + } + + if ((fully_assoc || pure_cam) && SCHP<1) + { + cerr << "CAM and fully associative must have at least 1 search port" << endl; + return false; + } + + if (RWP==0 && ERP==0 && SCHP>0 && ((fully_assoc || pure_cam))) + { + ERP=SCHP; + } + +// if ((!(fully_assoc || pure_cam)) && SCHP>=1) +// { +// cerr << "None CAM and fully associative cannot have search ports" << endl; +// return false; +// } + + if (assoc == 0) + { + A = C/B; + //fully_assoc = true; + } + else + { + if (assoc == 1) + { + A = 1; + //fully_assoc = false; + } + else + { + //fully_assoc = false; + A = assoc; + if (is_pow2(A) == false) + { + cerr << "Associativity must be a power of 2" << endl; + return false; + } + } + } + + if (C/(B*A) <= 1 && assoc!=0) + { + cerr << "Number of sets is too small: " << endl; + cerr << " Need to either increase cache size, or decrease associativity or block size" << endl; + cerr << " (or use fully associative cache)" << endl; + return false; + } + + block_sz = B; + + /*dt: testing sequential access mode*/ + if(seq_access) + { + tag_assoc = A; + data_assoc = 1; + is_seq_acc = true; + } + else + { + tag_assoc = A; + data_assoc = A; + is_seq_acc = false; + } + + if (assoc==0) + { + data_assoc = 1; + } + num_rw_ports = RWP; + num_rd_ports = ERP; + num_wr_ports = EWP; + num_se_rd_ports = NSER; + if (!(fully_assoc || pure_cam)) + num_search_ports = 0; + nsets = C/(B*A); + + if (temp < 300 || temp > 400 || temp%10 != 0) + { + cerr << temp << " Temperature must be between 300 and 400 Kelvin and multiple of 10." << endl; + return false; + } + + if (nsets < 1) + { + cerr << "Less than one set..." 
<< endl; + return false; + } + + return true; +} + + + +void output_data_csv(const uca_org_t & fin_res) +{ + //TODO: the csv output should remain + fstream file("out.csv", ios::in); + bool print_index = file.fail(); + file.close(); + + file.open("out.csv", ios::out|ios::app); + if (file.fail() == true) + { + cerr << "File out.csv could not be opened successfully" << endl; + } + else + { + if (print_index == true) + { + file << "Tech node (nm), "; + file << "Capacity (bytes), "; + file << "Number of banks, "; + file << "Associativity, "; + file << "Output width (bits), "; + file << "Access time (ns), "; + file << "Random cycle time (ns), "; +// file << "Multisubbank interleave cycle time (ns), "; + +// file << "Delay request network (ns), "; +// file << "Delay inside mat (ns), "; +// file << "Delay reply network (ns), "; +// file << "Tag array access time (ns), "; +// file << "Data array access time (ns), "; +// file << "Refresh period (microsec), "; +// file << "DRAM array availability (%), "; + file << "Dynamic search energy (nJ), "; + file << "Dynamic read energy (nJ), "; + file << "Dynamic write energy (nJ), "; +// file << "Tag Dynamic read energy (nJ), "; +// file << "Data Dynamic read energy (nJ), "; +// file << "Dynamic read power (mW), "; + file << "Standby leakage per bank(mW), "; +// file << "Leakage per bank with leak power management (mW), "; +// file << "Leakage per bank with leak power management (mW), "; +// file << "Refresh power as percentage of standby leakage, "; + file << "Area (mm2), "; + file << "Ndwl, "; + file << "Ndbl, "; + file << "Nspd, "; + file << "Ndcm, "; + file << "Ndsam_level_1, "; + file << "Ndsam_level_2, "; + file << "Data arrary area efficiency %, "; + file << "Ntwl, "; + file << "Ntbl, "; + file << "Ntspd, "; + file << "Ntcm, "; + file << "Ntsam_level_1, "; + file << "Ntsam_level_2, "; + file << "Tag arrary area efficiency %, "; + +// file << "Resistance per unit micron (ohm-micron), "; +// file << "Capacitance per unit micron 
(fF per micron), "; +// file << "Unit-length wire delay (ps), "; +// file << "FO4 delay (ps), "; +// file << "delay route to bank (including crossb delay) (ps), "; +// file << "Crossbar delay (ps), "; +// file << "Dyn read energy per access from closed page (nJ), "; +// file << "Dyn read energy per access from open page (nJ), "; +// file << "Leak power of an subbank with page closed (mW), "; +// file << "Leak power of a subbank with page open (mW), "; +// file << "Leak power of request and reply networks (mW), "; +// file << "Number of subbanks, "; +// file << "Page size in bits, "; +// file << "Activate power, "; +// file << "Read power, "; +// file << "Write power, "; +// file << "Precharge power, "; +// file << "tRCD, "; +// file << "CAS latency, "; +// file << "Precharge delay, "; +// file << "Perc dyn energy bitlines, "; +// file << "perc dyn energy wordlines, "; +// file << "perc dyn energy outside mat, "; +// file << "Area opt (perc), "; +// file << "Delay opt (perc), "; +// file << "Repeater opt (perc), "; +// file << "Aspect ratio"; + file << endl; + } + file << g_ip->F_sz_nm << ", "; + file << g_ip->cache_sz << ", "; + file << g_ip->nbanks << ", "; + file << g_ip->tag_assoc << ", "; + file << g_ip->out_w << ", "; + file << fin_res.access_time*1e+9 << ", "; + file << fin_res.cycle_time*1e+9 << ", "; +// file << fin_res.data_array2->multisubbank_interleave_cycle_time*1e+9 << ", "; +// file << fin_res.data_array2->delay_request_network*1e+9 << ", "; +// file << fin_res.data_array2->delay_inside_mat*1e+9 << ", "; +// file << fin_res.data_array2.delay_reply_network*1e+9 << ", "; + +// if (!(g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram)) +// { +// file << fin_res.tag_array2->access_time*1e+9 << ", "; +// } +// else +// { +// file << 0 << ", "; +// } +// file << fin_res.data_array2->access_time*1e+9 << ", "; +// file << fin_res.data_array2->dram_refresh_period*1e+6 << ", "; +// file << fin_res.data_array2->dram_array_availability << ", "; + if 
(g_ip->fully_assoc || g_ip->pure_cam) + { + file << fin_res.power.searchOp.dynamic*1e+9 << ", "; + } + else + { + file << "N/A" << ", "; + } + file << fin_res.power.readOp.dynamic*1e+9 << ", "; + file << fin_res.power.writeOp.dynamic*1e+9 << ", "; +// if (!(g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram)) +// { +// file << fin_res.tag_array2->power.readOp.dynamic*1e+9 << ", "; +// } +// else +// { +// file << "NA" << ", "; +// } +// file << fin_res.data_array2->power.readOp.dynamic*1e+9 << ", "; +// if (g_ip->fully_assoc || g_ip->pure_cam) +// { +// file << fin_res.power.searchOp.dynamic*1000/fin_res.cycle_time << ", "; +// } +// else +// { +// file << fin_res.power.readOp.dynamic*1000/fin_res.cycle_time << ", "; +// } + + file <<( fin_res.power.readOp.leakage + fin_res.power.readOp.gate_leakage )*1000 << ", "; +// file << fin_res.leak_power_with_sleep_transistors_in_mats*1000 << ", "; +// file << fin_res.data_array.refresh_power / fin_res.data_array.total_power.readOp.leakage << ", "; + file << fin_res.area*1e-6 << ", "; + + file << fin_res.data_array2->Ndwl << ", "; + file << fin_res.data_array2->Ndbl << ", "; + file << fin_res.data_array2->Nspd << ", "; + file << fin_res.data_array2->deg_bl_muxing << ", "; + file << fin_res.data_array2->Ndsam_lev_1 << ", "; + file << fin_res.data_array2->Ndsam_lev_2 << ", "; + file << fin_res.data_array2->area_efficiency << ", "; + if (!(g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram)) + { + file << fin_res.tag_array2->Ndwl << ", "; + file << fin_res.tag_array2->Ndbl << ", "; + file << fin_res.tag_array2->Nspd << ", "; + file << fin_res.tag_array2->deg_bl_muxing << ", "; + file << fin_res.tag_array2->Ndsam_lev_1 << ", "; + file << fin_res.tag_array2->Ndsam_lev_2 << ", "; + file << fin_res.tag_array2->area_efficiency << ", "; + } + else + { + file << "N/A" << ", "; + file << "N/A"<< ", "; + file << "N/A" << ", "; + file << "N/A" << ", "; + file << "N/A" << ", "; + file << "N/A" << ", "; + file << "N/A" << ", "; + } 
+ +// file << g_tp.wire_inside_mat.R_per_um << ", "; +// file << g_tp.wire_inside_mat.C_per_um / 1e-15 << ", "; +// file << g_tp.unit_len_wire_del / 1e-12 << ", "; +// file << g_tp.FO4 / 1e-12 << ", "; +// file << fin_res.data_array.delay_route_to_bank / 1e-9 << ", "; +// file << fin_res.data_array.delay_crossbar / 1e-9 << ", "; +// file << fin_res.data_array.dyn_read_energy_from_closed_page / 1e-9 << ", "; +// file << fin_res.data_array.dyn_read_energy_from_open_page / 1e-9 << ", "; +// file << fin_res.data_array.leak_power_subbank_closed_page / 1e-3 << ", "; +// file << fin_res.data_array.leak_power_subbank_open_page / 1e-3 << ", "; +// file << fin_res.data_array.leak_power_request_and_reply_networks / 1e-3 << ", "; +// file << fin_res.data_array.number_subbanks << ", " ; +// file << fin_res.data_array.page_size_in_bits << ", " ; +// file << fin_res.data_array.activate_energy * 1e9 << ", " ; +// file << fin_res.data_array.read_energy * 1e9 << ", " ; +// file << fin_res.data_array.write_energy * 1e9 << ", " ; +// file << fin_res.data_array.precharge_energy * 1e9 << ", " ; +// file << fin_res.data_array.trcd * 1e9 << ", " ; +// file << fin_res.data_array.cas_latency * 1e9 << ", " ; +// file << fin_res.data_array.precharge_delay * 1e9 << ", " ; +// file << fin_res.data_array.all_banks_height / fin_res.data_array.all_banks_width; + file<<endl; + } + file.close(); +} + + + +void output_UCA(uca_org_t *fr) +{ + // if (NUCA) + if (0) { + cout << "\n\n Detailed Bank Stats:\n"; + cout << " Bank Size (bytes): %d\n" << + (int) (g_ip->cache_sz); + } + else { + if (g_ip->data_arr_ram_cell_tech_type == 3) { + cout << "\n---------- CACTI version 6.5, Uniform Cache Access " << + "Logic Process Based DRAM Model ----------\n"; + } + else if (g_ip->data_arr_ram_cell_tech_type == 4) { + cout << "\n---------- CACTI version 6.5, Uniform" << + "Cache Access Commodity DRAM Model ----------\n"; + } + else { + cout << "\n---------- CACTI version 6.5, Uniform Cache Access " + "SRAM Model 
----------\n"; + } + cout << "\nCache Parameters:\n"; + cout << " Total cache size (bytes): " << + (int) (g_ip->cache_sz) << endl; + } + + cout << " Number of banks: " << (int) g_ip->nbanks << endl; + if (g_ip->fully_assoc|| g_ip->pure_cam) + cout << " Associativity: fully associative\n"; + else { + if (g_ip->tag_assoc == 1) + cout << " Associativity: direct mapped\n"; + else + cout << " Associativity: " << + g_ip->tag_assoc << endl; + } + + + cout << " Block size (bytes): " << g_ip->line_sz << endl; + cout << " Read/write Ports: " << + g_ip->num_rw_ports << endl; + cout << " Read ports: " << + g_ip->num_rd_ports << endl; + cout << " Write ports: " << + g_ip->num_wr_ports << endl; + if (g_ip->fully_assoc|| g_ip->pure_cam) + cout << " search ports: " << + g_ip->num_search_ports << endl; + cout << " Technology size (nm): " << + g_ip->F_sz_nm << endl << endl; + + cout << " Access time (ns): " << fr->access_time*1e9 << endl; + cout << " Cycle time (ns): " << fr->cycle_time*1e9 << endl; + if (g_ip->data_arr_ram_cell_tech_type >= 4) { + cout << " Precharge Delay (ns): " << fr->data_array2->precharge_delay*1e9 << endl; + cout << " Activate Energy (nJ): " << fr->data_array2->activate_energy*1e9 << endl; + cout << " Read Energy (nJ): " << fr->data_array2->read_energy*1e9 << endl; + cout << " Write Energy (nJ): " << fr->data_array2->write_energy*1e9 << endl; + cout << " Precharge Energy (nJ): " << fr->data_array2->precharge_energy*1e9 << endl; + cout << " Leakage Power Closed Page (mW): " << fr->data_array2->leak_power_subbank_closed_page*1e3 << endl; + cout << " Leakage Power Open Page (mW): " << fr->data_array2->leak_power_subbank_open_page*1e3 << endl; + cout << " Leakage Power I/O (mW): " << fr->data_array2->leak_power_request_and_reply_networks*1e3 << endl; + cout << " Refresh power (mW): " << + fr->data_array2->refresh_power*1e3 << endl; + } + else { + if ((g_ip->fully_assoc|| g_ip->pure_cam)) + { + cout << " Total dynamic associative search energy per access (nJ): " 
<< + fr->power.searchOp.dynamic*1e9 << endl; +// cout << " Total dynamic read energy per access (nJ): " << +// fr->power.readOp.dynamic*1e9 << endl; +// cout << " Total dynamic write energy per access (nJ): " << +// fr->power.writeOp.dynamic*1e9 << endl; + } +// else +// { + cout << " Total dynamic read energy per access (nJ): " << + fr->power.readOp.dynamic*1e9 << endl; + cout << " Total dynamic write energy per access (nJ): " << + fr->power.writeOp.dynamic*1e9 << endl; +// } + cout << " Total leakage power of a bank" + " (mW): " << fr->power.readOp.leakage*1e3 << endl; + cout << " Total gate leakage power of a bank" + " (mW): " << fr->power.readOp.gate_leakage*1e3 << endl; + } + + if (g_ip->data_arr_ram_cell_tech_type ==3 || g_ip->data_arr_ram_cell_tech_type ==4) + { + } + cout << " Cache height x width (mm): " << + fr->cache_ht*1e-3 << " x " << fr->cache_len*1e-3 << endl << endl; + + + cout << " Best Ndwl : " << fr->data_array2->Ndwl << endl; + cout << " Best Ndbl : " << fr->data_array2->Ndbl << endl; + cout << " Best Nspd : " << fr->data_array2->Nspd << endl; + cout << " Best Ndcm : " << fr->data_array2->deg_bl_muxing << endl; + cout << " Best Ndsam L1 : " << fr->data_array2->Ndsam_lev_1 << endl; + cout << " Best Ndsam L2 : " << fr->data_array2->Ndsam_lev_2 << endl << endl; + + if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem) + { + cout << " Best Ntwl : " << fr->tag_array2->Ndwl << endl; + cout << " Best Ntbl : " << fr->tag_array2->Ndbl << endl; + cout << " Best Ntspd : " << fr->tag_array2->Nspd << endl; + cout << " Best Ntcm : " << fr->tag_array2->deg_bl_muxing << endl; + cout << " Best Ntsam L1 : " << fr->tag_array2->Ndsam_lev_1 << endl; + cout << " Best Ntsam L2 : " << fr->tag_array2->Ndsam_lev_2 << endl; + } + + switch (fr->data_array2->wt) { + case (0): + cout << " Data array, H-tree wire type: Delay optimized global wires\n"; + break; + case (1): + cout << " Data array, H-tree wire type: Global wires with 5\% delay 
penalty\n"; + break; + case (2): + cout << " Data array, H-tree wire type: Global wires with 10\% delay penalty\n"; + break; + case (3): + cout << " Data array, H-tree wire type: Global wires with 20\% delay penalty\n"; + break; + case (4): + cout << " Data array, H-tree wire type: Global wires with 30\% delay penalty\n"; + break; + case (5): + cout << " Data array, wire type: Low swing wires\n"; + break; + default: + cout << "ERROR - Unknown wire type " << (int) fr->data_array2->wt <<endl; + exit(0); + } + + if (!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) { + switch (fr->tag_array2->wt) { + case (0): + cout << " Tag array, H-tree wire type: Delay optimized global wires\n"; + break; + case (1): + cout << " Tag array, H-tree wire type: Global wires with 5\% delay penalty\n"; + break; + case (2): + cout << " Tag array, H-tree wire type: Global wires with 10\% delay penalty\n"; + break; + case (3): + cout << " Tag array, H-tree wire type: Global wires with 20\% delay penalty\n"; + break; + case (4): + cout << " Tag array, H-tree wire type: Global wires with 30\% delay penalty\n"; + break; + case (5): + cout << " Tag array, wire type: Low swing wires\n"; + break; + default: + cout << "ERROR - Unknown wire type " << (int) fr->tag_array2->wt <<endl; + exit(-1); + } + } + + if (g_ip->print_detail) + { + //if(g_ip->fully_assoc) return; + + /* Delay stats */ + /* data array stats */ + cout << endl << "Time Components:" << endl << endl; + + cout << " Data side (with Output driver) (ns): " << + fr->data_array2->access_time/1e-9 << endl; + + cout << "\tH-tree input delay (ns): " << + fr->data_array2->delay_route_to_bank * 1e9 + + fr->data_array2->delay_input_htree * 1e9 << endl; + + if (!(g_ip->pure_cam || g_ip->fully_assoc)) + { + cout << "\tDecoder + wordline delay (ns): " << + fr->data_array2->delay_row_predecode_driver_and_block * 1e9 + + fr->data_array2->delay_row_decoder * 1e9 << endl; + } + else + { + cout << "\tCAM search delay (ns): " << + 
fr->data_array2->delay_matchlines * 1e9 << endl; + } + + cout << "\tBitline delay (ns): " << + fr->data_array2->delay_bitlines/1e-9 << endl; + + cout << "\tSense Amplifier delay (ns): " << + fr->data_array2->delay_sense_amp * 1e9 << endl; + + + cout << "\tH-tree output delay (ns): " << + fr->data_array2->delay_subarray_output_driver * 1e9 + + fr->data_array2->delay_dout_htree * 1e9 << endl; + + if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem) + { + /* tag array stats */ + cout << endl << " Tag side (with Output driver) (ns): " << + fr->tag_array2->access_time/1e-9 << endl; + + cout << "\tH-tree input delay (ns): " << + fr->tag_array2->delay_route_to_bank * 1e9 + + fr->tag_array2->delay_input_htree * 1e9 << endl; + + cout << "\tDecoder + wordline delay (ns): " << + fr->tag_array2->delay_row_predecode_driver_and_block * 1e9 + + fr->tag_array2->delay_row_decoder * 1e9 << endl; + + cout << "\tBitline delay (ns): " << + fr->tag_array2->delay_bitlines/1e-9 << endl; + + cout << "\tSense Amplifier delay (ns): " << + fr->tag_array2->delay_sense_amp * 1e9 << endl; + + cout << "\tComparator delay (ns): " << + fr->tag_array2->delay_comparator * 1e9 << endl; + + cout << "\tH-tree output delay (ns): " << + fr->tag_array2->delay_subarray_output_driver * 1e9 + + fr->tag_array2->delay_dout_htree * 1e9 << endl; + } + + + + /* Energy/Power stats */ + cout << endl << endl << "Power Components:" << endl << endl; + + if (!(g_ip->pure_cam || g_ip->fully_assoc)) + { + cout << " Data array: Total dynamic read energy/access (nJ): " << + fr->data_array2->power.readOp.dynamic * 1e9 << endl; + cout << "\tTotal leakage read/write power of a bank (mW): " << + fr->data_array2->power.readOp.leakage * 1e3 << endl; + + cout << "\tTotal energy in H-tree (that includes both " + "address and data transfer) (nJ): " << + (fr->data_array2->power_addr_input_htree.readOp.dynamic + + fr->data_array2->power_data_output_htree.readOp.dynamic + + 
fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl; + + cout << "\tTotal leakage power in H-tree (that includes both " + "address and data network) ((mW)): " << + (fr->data_array2->power_addr_input_htree.readOp.leakage + + fr->data_array2->power_data_output_htree.readOp.leakage + + fr->data_array2->power_routing_to_bank.readOp.leakage) * 1e3 << endl; + + cout << "\tTotal gate leakage power in H-tree (that includes both " + "address and data network) ((mW)): " << + (fr->data_array2->power_addr_input_htree.readOp.gate_leakage + + fr->data_array2->power_data_output_htree.readOp.gate_leakage + + fr->data_array2->power_routing_to_bank.readOp.gate_leakage) * 1e3 << endl; + + cout << "\tOutput Htree inside bank Energy (nJ): " << + fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl; + cout << "\tDecoder (nJ): " << + fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl; + cout << "\tWordline (nJ): " << + fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitline mux & associated drivers (nJ): " << + fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tSense amp mux & associated drivers (nJ): " << + fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl; + + cout << "\tBitlines precharge and equalization 
circuit (nJ): " << + fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9 << endl; + cout << "\tBitlines (nJ): " << + fr->data_array2->power_bitlines.readOp.dynamic * 1e9 << endl; + cout << "\tSense amplifier energy (nJ): " << + fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl; + cout << "\tSub-array output driver (nJ): " << + fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl; + } + + else if (g_ip->pure_cam) + { + + cout << " CAM array:"<<endl; + cout << " Total dynamic associative search energy/access (nJ): " << + fr->data_array2->power.searchOp.dynamic * 1e9 << endl; + cout << "\tTotal energy in H-tree (that includes both " + "match key and data transfer) (nJ): " << + (fr->data_array2->power_htree_in_search.searchOp.dynamic + + fr->data_array2->power_htree_out_search.searchOp.dynamic + + fr->data_array2->power_routing_to_bank.searchOp.dynamic) * 1e9 << endl; + cout << "\tKeyword input and result output Htrees inside bank Energy (nJ): " << + (fr->data_array2->power_htree_in_search.searchOp.dynamic + + fr->data_array2->power_htree_out_search.searchOp.dynamic) * 1e9 << endl; + cout << "\tSearchlines (nJ): " << + fr->data_array2->power_searchline.searchOp.dynamic * 1e9 + + fr->data_array2->power_searchline_precharge.searchOp.dynamic * 1e9 << endl; + cout << "\tMatchlines (nJ): " << + fr->data_array2->power_matchlines.searchOp.dynamic * 1e9 + + fr->data_array2->power_matchline_precharge.searchOp.dynamic * 1e9 << endl; + cout << "\tSub-array output driver (nJ): " << + fr->data_array2->power_output_drivers_at_subarray.searchOp.dynamic * 1e9 << endl; + + + cout <<endl<< " Total dynamic read energy/access (nJ): " << + fr->data_array2->power.readOp.dynamic * 1e9 << endl; + cout << "\tTotal energy in H-tree (that includes both " + "address and data transfer) (nJ): " << + (fr->data_array2->power_addr_input_htree.readOp.dynamic + + fr->data_array2->power_data_output_htree.readOp.dynamic + + 
fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl; + cout << "\tOutput Htree inside bank Energy (nJ): " << + fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl; + cout << "\tDecoder (nJ): " << + fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl; + cout << "\tWordline (nJ): " << + fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitline mux & associated drivers (nJ): " << + fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tSense amp mux & associated drivers (nJ): " << + fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitlines (nJ): " << + fr->data_array2->power_bitlines.readOp.dynamic * 1e9 + + fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9<< endl; + cout << "\tSense amplifier energy (nJ): " << + fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl; + cout << "\tSub-array output driver (nJ): " << + fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl; + + cout << endl <<" Total leakage power of a bank (mW): " << + fr->data_array2->power.readOp.leakage * 1e3 << endl; + } + else + { + cout << " Fully associative array:"<<endl; + cout << " Total dynamic associative search energy/access (nJ): " << + 
fr->data_array2->power.searchOp.dynamic * 1e9 << endl; + cout << "\tTotal energy in H-tree (that includes both " + "match key and data transfer) (nJ): " << + (fr->data_array2->power_htree_in_search.searchOp.dynamic + + fr->data_array2->power_htree_out_search.searchOp.dynamic + + fr->data_array2->power_routing_to_bank.searchOp.dynamic) * 1e9 << endl; + cout << "\tKeyword input and result output Htrees inside bank Energy (nJ): " << + (fr->data_array2->power_htree_in_search.searchOp.dynamic + + fr->data_array2->power_htree_out_search.searchOp.dynamic) * 1e9 << endl; + cout << "\tSearchlines (nJ): " << + fr->data_array2->power_searchline.searchOp.dynamic * 1e9 + + fr->data_array2->power_searchline_precharge.searchOp.dynamic * 1e9 << endl; + cout << "\tMatchlines (nJ): " << + fr->data_array2->power_matchlines.searchOp.dynamic * 1e9 + + fr->data_array2->power_matchline_precharge.searchOp.dynamic * 1e9 << endl; + cout << "\tData portion wordline (nJ): " << + fr->data_array2->power_matchline_to_wordline_drv.searchOp.dynamic * 1e9 << endl; + cout << "\tData Bitlines (nJ): " << + fr->data_array2->power_bitlines.searchOp.dynamic * 1e9 + + fr->data_array2->power_prechg_eq_drivers.searchOp.dynamic * 1e9 << endl; + cout << "\tSense amplifier energy (nJ): " << + fr->data_array2->power_sense_amps.searchOp.dynamic * 1e9 << endl; + cout << "\tSub-array output driver (nJ): " << + fr->data_array2->power_output_drivers_at_subarray.searchOp.dynamic * 1e9 << endl; + + + cout <<endl<< " Total dynamic read energy/access (nJ): " << + fr->data_array2->power.readOp.dynamic * 1e9 << endl; + cout << "\tTotal energy in H-tree (that includes both " + "address and data transfer) (nJ): " << + (fr->data_array2->power_addr_input_htree.readOp.dynamic + + fr->data_array2->power_data_output_htree.readOp.dynamic + + fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl; + cout << "\tOutput Htree inside bank Energy (nJ): " << + fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 
<< endl; + cout << "\tDecoder (nJ): " << + fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl; + cout << "\tWordline (nJ): " << + fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitline mux & associated drivers (nJ): " << + fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tSense amp mux & associated drivers (nJ): " << + fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 + + fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitlines (nJ): " << + fr->data_array2->power_bitlines.readOp.dynamic * 1e9 + + fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9<< endl; + cout << "\tSense amplifier energy (nJ): " << + fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl; + cout << "\tSub-array output driver (nJ): " << + fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl; + + cout << endl <<" Total leakage power of a bank (mW): " << + fr->data_array2->power.readOp.leakage * 1e3 << endl; + } + + + if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem) + { + cout << endl << " Tag array: Total dynamic read energy/access (nJ): " << + fr->tag_array2->power.readOp.dynamic * 1e9 << endl; + cout << "\tTotal leakage read/write power of a bank (mW): " << + fr->tag_array2->power.readOp.leakage * 1e3 << endl; + 
cout << "\tTotal energy in H-tree (that includes both " + "address and data transfer) (nJ): " << + (fr->tag_array2->power_addr_input_htree.readOp.dynamic + + fr->tag_array2->power_data_output_htree.readOp.dynamic + + fr->tag_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl; + + cout << "\tTotal leakage power in H-tree (that includes both " + "address and data network) ((mW)): " << + (fr->tag_array2->power_addr_input_htree.readOp.leakage + + fr->tag_array2->power_data_output_htree.readOp.leakage + + fr->tag_array2->power_routing_to_bank.readOp.leakage) * 1e3 << endl; + + cout << "\tTotal gate leakage power in H-tree (that includes both " + "address and data network) ((mW)): " << + (fr->tag_array2->power_addr_input_htree.readOp.gate_leakage + + fr->tag_array2->power_data_output_htree.readOp.gate_leakage + + fr->tag_array2->power_routing_to_bank.readOp.gate_leakage) * 1e3 << endl; + + cout << "\tOutput Htree inside a bank Energy (nJ): " << + fr->tag_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl; + cout << "\tDecoder (nJ): " << + fr->tag_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 + + fr->tag_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl; + cout << "\tWordline (nJ): " << + fr->tag_array2->power_row_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitline mux & associated drivers (nJ): " << + fr->tag_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 + + fr->tag_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 + + fr->tag_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tSense amp mux & associated drivers (nJ): " << + fr->tag_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 + + fr->tag_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 + + fr->tag_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9 + + fr->tag_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 + + 
fr->tag_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 + + fr->tag_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl; + cout << "\tBitlines precharge and equalization circuit (nJ): " << + fr->tag_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9 << endl; + cout << "\tBitlines (nJ): " << + fr->tag_array2->power_bitlines.readOp.dynamic * 1e9 << endl; + cout << "\tSense amplifier energy (nJ): " << + fr->tag_array2->power_sense_amps.readOp.dynamic * 1e9 << endl; + cout << "\tSub-array output driver (nJ): " << + fr->tag_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl; + } + + cout << endl << endl << "Area Components:" << endl << endl; + /* Data array area stats */ + if (!(g_ip->pure_cam || g_ip->fully_assoc)) + cout << " Data array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl; + else if (g_ip->pure_cam) + cout << " CAM array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl; + else + cout << " Fully associative cache array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl; + cout << "\tHeight (mm): " << + fr->data_array2->all_banks_height*1e-3 << endl; + cout << "\tWidth (mm): " << + fr->data_array2->all_banks_width*1e-3 << endl; + if (g_ip->print_detail) { + cout << "\tArea efficiency (Memory cell area/Total area) - " << + fr->data_array2->area_efficiency << " %" << endl; + cout << "\t\tMAT Height (mm): " << + fr->data_array2->mat_height*1e-3 << endl; + cout << "\t\tMAT Length (mm): " << + fr->data_array2->mat_length*1e-3 << endl; + cout << "\t\tSubarray Height (mm): " << + fr->data_array2->subarray_height*1e-3 << endl; + cout << "\t\tSubarray Length (mm): " << + fr->data_array2->subarray_length*1e-3 << endl; + } + + /* Tag array area stats */ + if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem) + { + cout << endl << " Tag array: Area (mm2): " << fr->tag_array2->area * 1e-6 << endl; + cout << "\tHeight (mm): " << + 
fr->tag_array2->all_banks_height*1e-3 << endl; + cout << "\tWidth (mm): " << + fr->tag_array2->all_banks_width*1e-3 << endl; + if (g_ip->print_detail) + { + cout << "\tArea efficiency (Memory cell area/Total area) - " << + fr->tag_array2->area_efficiency << " %" << endl; + cout << "\t\tMAT Height (mm): " << + fr->tag_array2->mat_height*1e-3 << endl; + cout << "\t\tMAT Length (mm): " << + fr->tag_array2->mat_length*1e-3 << endl; + cout << "\t\tSubarray Height (mm): " << + fr->tag_array2->subarray_height*1e-3 << endl; + cout << "\t\tSubarray Length (mm): " << + fr->tag_array2->subarray_length*1e-3 << endl; + } + } + Wire wpr; + wpr.print_wire(); + + //cout << "FO4 = " << g_tp.FO4 << endl; + } +} + +//McPAT's plain interface, please keep !!! +uca_org_t cacti_interface(InputParameter * const local_interface) +{ +// g_ip = new InputParameter(); + //g_ip->add_ecc_b_ = true; + + uca_org_t fin_res; + fin_res.valid = false; + + g_ip = local_interface; + + +// g_ip->data_arr_ram_cell_tech_type = data_arr_ram_cell_tech_flavor_in; +// g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in; +// g_ip->tag_arr_ram_cell_tech_type = tag_arr_ram_cell_tech_flavor_in; +// g_ip->tag_arr_peri_global_tech_type = tag_arr_peri_global_tech_flavor_in; +// +// g_ip->ic_proj_type = interconnect_projection_type_in; +// g_ip->wire_is_mat_type = wire_inside_mat_type_in; +// g_ip->wire_os_mat_type = wire_outside_mat_type_in; +// g_ip->burst_len = BURST_LENGTH_in; +// g_ip->int_prefetch_w = INTERNAL_PREFETCH_WIDTH_in; +// g_ip->page_sz_bits = PAGE_SIZE_BITS_in; +// +// g_ip->cache_sz = cache_size; +// g_ip->line_sz = line_size; +// g_ip->assoc = associativity; +// g_ip->nbanks = banks; +// g_ip->out_w = output_width; +// g_ip->specific_tag = specific_tag; +// if (tag_width == 0) { +// g_ip->tag_w = 42; +// } +// else { +// g_ip->tag_w = tag_width; +// } +// +// g_ip->access_mode = access_mode; +// g_ip->delay_wt = obj_func_delay; +// g_ip->dynamic_power_wt = 
obj_func_dynamic_power; +// g_ip->leakage_power_wt = obj_func_leakage_power; +// g_ip->area_wt = obj_func_area; +// g_ip->cycle_time_wt = obj_func_cycle_time; +// g_ip->delay_dev = dev_func_delay; +// g_ip->dynamic_power_dev = dev_func_dynamic_power; +// g_ip->leakage_power_dev = dev_func_leakage_power; +// g_ip->area_dev = dev_func_area; +// g_ip->cycle_time_dev = dev_func_cycle_time; +// g_ip->temp = temp; +// +// g_ip->F_sz_nm = tech_node; +// g_ip->F_sz_um = tech_node / 1000; +// g_ip->is_main_mem = (main_mem != 0) ? true : false; +// g_ip->is_cache = (cache ==1) ? true : false; +// g_ip->pure_ram = (cache ==0) ? true : false; +// g_ip->pure_cam = (cache ==2) ? true : false; +// g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false; +// g_ip->ver_htree_wires_over_array = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in; +// g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in; +// +// g_ip->num_rw_ports = rw_ports; +// g_ip->num_rd_ports = excl_read_ports; +// g_ip->num_wr_ports = excl_write_ports; +// g_ip->num_se_rd_ports = single_ended_read_ports; +// g_ip->num_search_ports = search_ports; +// +// g_ip->print_detail = 1; +// g_ip->nuca = 0; +// g_ip->is_cache=true; +// +// if (force_wiretype == 0) +// { +// g_ip->wt = Global; +// g_ip->force_wiretype = false; +// } +// else +// { g_ip->force_wiretype = true; +// if (wiretype==10) { +// g_ip->wt = Global_10; +// } +// if (wiretype==20) { +// g_ip->wt = Global_20; +// } +// if (wiretype==30) { +// g_ip->wt = Global_30; +// } +// if (wiretype==5) { +// g_ip->wt = Global_5; +// } +// if (wiretype==0) { +// g_ip->wt = Low_swing; +// } +// } +// //g_ip->wt = Global_5; +// if (force_config == 0) +// { +// g_ip->force_cache_config = false; +// } +// else +// { +// g_ip->force_cache_config = true; +// g_ip->ndbl=ndbl; +// g_ip->ndwl=ndwl; +// g_ip->nspd=nspd; +// g_ip->ndcm=ndcm; +// g_ip->ndsam1=ndsam1; +// g_ip->ndsam2=ndsam2; +// +// +// } +// +// if (ecc==0){ +// 
g_ip->add_ecc_b_=false; +// } +// else +// { +// g_ip->add_ecc_b_=true; +// } + + + g_ip->error_checking(); + + + init_tech_params(g_ip->F_sz_um, false); + Wire winit; // Do not delete this line. It initializes wires. + + solve(&fin_res); + +// g_ip->display_ip(); +// output_UCA(&fin_res); +// output_data_csv(fin_res); + + // delete (g_ip); + + return fin_res; +} + +//McPAT's plain interface, please keep !!! +uca_org_t init_interface(InputParameter* const local_interface) +{ + // g_ip = new InputParameter(); + //g_ip->add_ecc_b_ = true; + + uca_org_t fin_res; + fin_res.valid = false; + + g_ip = local_interface; + + +// g_ip->data_arr_ram_cell_tech_type = data_arr_ram_cell_tech_flavor_in; +// g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in; +// g_ip->tag_arr_ram_cell_tech_type = tag_arr_ram_cell_tech_flavor_in; +// g_ip->tag_arr_peri_global_tech_type = tag_arr_peri_global_tech_flavor_in; +// +// g_ip->ic_proj_type = interconnect_projection_type_in; +// g_ip->wire_is_mat_type = wire_inside_mat_type_in; +// g_ip->wire_os_mat_type = wire_outside_mat_type_in; +// g_ip->burst_len = BURST_LENGTH_in; +// g_ip->int_prefetch_w = INTERNAL_PREFETCH_WIDTH_in; +// g_ip->page_sz_bits = PAGE_SIZE_BITS_in; +// +// g_ip->cache_sz = cache_size; +// g_ip->line_sz = line_size; +// g_ip->assoc = associativity; +// g_ip->nbanks = banks; +// g_ip->out_w = output_width; +// g_ip->specific_tag = specific_tag; +// if (tag_width == 0) { +// g_ip->tag_w = 42; +// } +// else { +// g_ip->tag_w = tag_width; +// } +// +// g_ip->access_mode = access_mode; +// g_ip->delay_wt = obj_func_delay; +// g_ip->dynamic_power_wt = obj_func_dynamic_power; +// g_ip->leakage_power_wt = obj_func_leakage_power; +// g_ip->area_wt = obj_func_area; +// g_ip->cycle_time_wt = obj_func_cycle_time; +// g_ip->delay_dev = dev_func_delay; +// g_ip->dynamic_power_dev = dev_func_dynamic_power; +// g_ip->leakage_power_dev = dev_func_leakage_power; +// g_ip->area_dev = dev_func_area; +// 
g_ip->cycle_time_dev = dev_func_cycle_time; +// g_ip->temp = temp; +// +// g_ip->F_sz_nm = tech_node; +// g_ip->F_sz_um = tech_node / 1000; +// g_ip->is_main_mem = (main_mem != 0) ? true : false; +// g_ip->is_cache = (cache ==1) ? true : false; +// g_ip->pure_ram = (cache ==0) ? true : false; +// g_ip->pure_cam = (cache ==2) ? true : false; +// g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false; +// g_ip->ver_htree_wires_over_array = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in; +// g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in; +// +// g_ip->num_rw_ports = rw_ports; +// g_ip->num_rd_ports = excl_read_ports; +// g_ip->num_wr_ports = excl_write_ports; +// g_ip->num_se_rd_ports = single_ended_read_ports; +// g_ip->num_search_ports = search_ports; +// +// g_ip->print_detail = 1; +// g_ip->nuca = 0; +// +// if (force_wiretype == 0) +// { +// g_ip->wt = Global; +// g_ip->force_wiretype = false; +// } +// else +// { g_ip->force_wiretype = true; +// if (wiretype==10) { +// g_ip->wt = Global_10; +// } +// if (wiretype==20) { +// g_ip->wt = Global_20; +// } +// if (wiretype==30) { +// g_ip->wt = Global_30; +// } +// if (wiretype==5) { +// g_ip->wt = Global_5; +// } +// if (wiretype==0) { +// g_ip->wt = Low_swing; +// } +// } +// //g_ip->wt = Global_5; +// if (force_config == 0) +// { +// g_ip->force_cache_config = false; +// } +// else +// { +// g_ip->force_cache_config = true; +// g_ip->ndbl=ndbl; +// g_ip->ndwl=ndwl; +// g_ip->nspd=nspd; +// g_ip->ndcm=ndcm; +// g_ip->ndsam1=ndsam1; +// g_ip->ndsam2=ndsam2; +// +// +// } +// +// if (ecc==0){ +// g_ip->add_ecc_b_=false; +// } +// else +// { +// g_ip->add_ecc_b_=true; +// } + + + g_ip->error_checking(); + + init_tech_params(g_ip->F_sz_um, false); + Wire winit; // Do not delete this line. It initializes wires. 
+ //solve(&fin_res); + //g_ip->display_ip(); + + //solve(&fin_res); + //output_UCA(&fin_res); + //output_data_csv(fin_res); + // delete (g_ip); + + return fin_res; +} + +void reconfigure(InputParameter *local_interface, uca_org_t *fin_res) +{ + // Copy the InputParameter to global interface (g_ip) and do error checking. + g_ip = local_interface; + g_ip->error_checking(); + + // Initialize technology parameters + init_tech_params(g_ip->F_sz_um,false); + + Wire winit; // Do not delete this line. It initializes wires. + + // This corresponds to solve() in the initialization process. + update(fin_res); +} diff --git a/ext/mcpat/cacti/io.h b/ext/mcpat/cacti/io.h new file mode 100644 index 000000000..b1c2565e0 --- /dev/null +++ b/ext/mcpat/cacti/io.h @@ -0,0 +1,44 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __IO_H__
+#define __IO_H__
+
+
+#include "cacti_interface.h"
+#include "const.h"
+/*
+ * Reporting entry points for a finished CACTI result (uca_org_t).
+ *
+ * output_data_csv: takes the result by const reference, so it cannot modify
+ *   the result through this parameter.
+ *   NOTE(review): presumably writes the figures in CSV form; the definition
+ *   is not part of this header -- confirm the destination and column layout.
+ *
+ * output_UCA: takes a pointer to a non-const result.
+ *   NOTE(review): presumably the human-readable report printed to standard
+ *   output; confirm against the definition, and whether a null pointer is
+ *   tolerated.
+ */
+void output_data_csv(const uca_org_t & fin_res);
+void output_UCA(uca_org_t * fin_res);
+
+
+#endif // __IO_H__
diff --git a/ext/mcpat/cacti/main.cc b/ext/mcpat/cacti/main.cc
new file mode 100644
index 000000000..d6e12be62
--- /dev/null
+++ b/ext/mcpat/cacti/main.cc
@@ -0,0 +1,191 @@
+/*****************************************************************************
+ * McPAT/CACTI
+ * SOFTWARE LICENSE AGREEMENT
+ * Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include <iostream> + +#include "io.h" + +using namespace std; + + +int main(int argc,char *argv[]) +{ + + uca_org_t result; + if (argc != 53 && argc != 55) + { + bool infile_specified = false; + string infile_name(""); + + for (int32_t i = 0; i < argc; i++) + { + if (argv[i] == string("-infile")) + { + infile_specified = true; + i++; + infile_name = argv[i]; + } + } + + if (infile_specified == false) + { + cerr << " Invalid arguments -- how to use CACTI:" << endl; + cerr << " 1) cacti -infile <input file name>" << endl; + cerr << " 2) cacti arg1 ... arg52 -- please refer to the README file" << endl; + cerr << " No. 
of arguments input - " << argc << endl; + exit(1); + } + else + { + result = cacti_interface(infile_name); + } + } + else if (argc == 53) + { + result = cacti_interface(atoi(argv[ 1]), + atoi(argv[ 2]), + atoi(argv[ 3]), + atoi(argv[ 4]), + atoi(argv[ 5]), + atoi(argv[ 6]), + atoi(argv[ 7]), + atoi(argv[ 8]), + atoi(argv[ 9]), + atof(argv[10]), + atoi(argv[11]), + atoi(argv[12]), + atoi(argv[13]), + atoi(argv[14]), + atoi(argv[15]), + atoi(argv[16]), + atoi(argv[17]), + atoi(argv[18]), + atoi(argv[19]), + atoi(argv[20]), + atoi(argv[21]), + atoi(argv[22]), + atoi(argv[23]), + atoi(argv[24]), + atoi(argv[25]), + atoi(argv[26]), + atoi(argv[27]), + atoi(argv[28]), + atoi(argv[29]), + atoi(argv[30]), + atoi(argv[31]), + atoi(argv[32]), + atoi(argv[33]), + atoi(argv[34]), + atoi(argv[35]), + atoi(argv[36]), + atoi(argv[37]), + atoi(argv[38]), + atoi(argv[39]), + atoi(argv[40]), + atoi(argv[41]), + atoi(argv[42]), + atoi(argv[43]), + atoi(argv[44]), + atoi(argv[45]), + atoi(argv[46]), + atoi(argv[47]), + atoi(argv[48]), + atoi(argv[49]), + atoi(argv[50]), + atoi(argv[51]), + atoi(argv[52])); + } + else + { + result = cacti_interface(atoi(argv[ 1]), + atoi(argv[ 2]), + atoi(argv[ 3]), + atoi(argv[ 4]), + atoi(argv[ 5]), + atoi(argv[ 6]), + atoi(argv[ 7]), + atoi(argv[ 8]), + atof(argv[ 9]), + atoi(argv[10]), + atoi(argv[11]), + atoi(argv[12]), + atoi(argv[13]), + atoi(argv[14]), + atoi(argv[15]), + atoi(argv[16]), + atoi(argv[17]), + atoi(argv[18]), + atoi(argv[19]), + atoi(argv[20]), + atoi(argv[21]), + atoi(argv[22]), + atoi(argv[23]), + atoi(argv[24]), + atoi(argv[25]), + atoi(argv[26]), + atoi(argv[27]), + atoi(argv[28]), + atoi(argv[29]), + atoi(argv[30]), + atoi(argv[31]), + atoi(argv[32]), + atoi(argv[33]), + atoi(argv[34]), + atoi(argv[35]), + atoi(argv[36]), + atoi(argv[37]), + atoi(argv[38]), + atoi(argv[39]), + atoi(argv[40]), + atoi(argv[41]), + atoi(argv[42]), + atoi(argv[43]), + atoi(argv[44]), + atoi(argv[45]), + atoi(argv[46]), + atoi(argv[47]), + 
atoi(argv[48]), + atoi(argv[49]), + atoi(argv[50]), + atoi(argv[51]), + atoi(argv[52]), + atoi(argv[53]), + atoi(argv[54])); + } + + result.cleanup(); +// delete result.data_array2; +// if (result.tag_array2!=NULL) +// delete result.tag_array2; + + return 0; +} + diff --git a/ext/mcpat/cacti/makefile b/ext/mcpat/cacti/makefile new file mode 100644 index 000000000..27286916a --- /dev/null +++ b/ext/mcpat/cacti/makefile @@ -0,0 +1,28 @@ +TAR = cacti + +.PHONY: dbg opt depend clean clean_dbg clean_opt + +all: opt + +dbg: $(TAR).mk obj_dbg + @$(MAKE) TAG=dbg -C . -f $(TAR).mk + +opt: $(TAR).mk obj_opt + @$(MAKE) TAG=opt -C . -f $(TAR).mk + +obj_dbg: + mkdir $@ + +obj_opt: + mkdir $@ + +clean: clean_dbg clean_opt + +clean_dbg: obj_dbg + @$(MAKE) TAG=dbg -C . -f $(TAR).mk clean + rm -rf $< + +clean_opt: obj_opt + @$(MAKE) TAG=opt -C . -f $(TAR).mk clean + rm -rf $< + diff --git a/ext/mcpat/cacti/mat.cc b/ext/mcpat/cacti/mat.cc new file mode 100755 index 000000000..ef98107c7 --- /dev/null +++ b/ext/mcpat/cacti/mat.cc @@ -0,0 +1,1748 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <cassert> + +#include "mat.h" + +Mat::Mat(const DynamicParameter & dyn_p) + :dp(dyn_p), + power_subarray_out_drv(), + delay_fa_tag(0), delay_cam(0), + delay_before_decoder(0), delay_bitline(0), + delay_wl_reset(0), delay_bl_restore(0), + delay_searchline(0), delay_matchchline(0), + delay_cam_sl_restore(0), delay_cam_ml_reset(0), + delay_fa_ram_wl(0),delay_hit_miss_reset(0), + delay_hit_miss(0), + subarray(dp, dp.fully_assoc), + power_bitline(), per_bitline_read_energy(0), + deg_bl_muxing(dp.deg_bl_muxing), + num_act_mats_hor_dir(dyn_p.num_act_mats_hor_dir), + delay_writeback(0), + cell(subarray.cell), cam_cell(subarray.cam_cell), + is_dram(dyn_p.is_dram), + pure_cam(dyn_p.pure_cam), + num_mats(dp.num_mats), + power_sa(), delay_sa(0), + leak_power_sense_amps_closed_page_state(0), + leak_power_sense_amps_open_page_state(0), + delay_subarray_out_drv(0), + delay_comparator(0), power_comparator(), + num_do_b_mat(dyn_p.num_do_b_mat), num_so_b_mat(dyn_p.num_so_b_mat), + num_subarrays_per_mat(dp.num_subarrays/dp.num_mats), + num_subarrays_per_row(dp.Ndwl/dp.num_mats_h_dir) +{ + 
assert(num_subarrays_per_mat <= 4); + assert(num_subarrays_per_row <= 2); + is_fa = (dp.fully_assoc) ? true : false; + camFlag = (is_fa || pure_cam);//although cam_cell.w = cell.w for fa, we still differentiate them. + + if (is_fa || pure_cam) + num_subarrays_per_row = num_subarrays_per_mat>2?num_subarrays_per_mat/2:num_subarrays_per_mat; + + if (dp.use_inp_params == 1) { + RWP = dp.num_rw_ports; + ERP = dp.num_rd_ports; + EWP = dp.num_wr_ports; + SCHP = dp.num_search_ports; + } + else { + RWP = g_ip->num_rw_ports; + ERP = g_ip->num_rd_ports; + EWP = g_ip->num_wr_ports; + SCHP = g_ip->num_search_ports; + + } + + double number_sa_subarray; + + if (!is_fa && !pure_cam) + { + number_sa_subarray = subarray.num_cols / deg_bl_muxing; + } + else if (is_fa && !pure_cam) + { + number_sa_subarray = (subarray.num_cols_fa_cam + subarray.num_cols_fa_ram) / deg_bl_muxing; + } + + else + { + number_sa_subarray = (subarray.num_cols_fa_cam) / deg_bl_muxing; + } + + int num_dec_signals = subarray.num_rows; + double C_ld_bit_mux_dec_out = 0; + double C_ld_sa_mux_lev_1_dec_out = 0; + double C_ld_sa_mux_lev_2_dec_out = 0; + double R_wire_wl_drv_out; + + if (!is_fa && !pure_cam) + { + R_wire_wl_drv_out = subarray.num_cols * cell.w * g_tp.wire_local.R_per_um; + } + else if (is_fa && !pure_cam) + { + R_wire_wl_drv_out = (subarray.num_cols_fa_cam * cam_cell.w + subarray.num_cols_fa_ram * cell.w) * g_tp.wire_local.R_per_um ; + } + else + { + R_wire_wl_drv_out = (subarray.num_cols_fa_cam * cam_cell.w ) * g_tp.wire_local.R_per_um; + } + + double R_wire_bit_mux_dec_out = num_subarrays_per_row * subarray.num_cols * g_tp.wire_inside_mat.R_per_um * cell.w;//TODO:revisit for FA + double R_wire_sa_mux_dec_out = num_subarrays_per_row * subarray.num_cols * g_tp.wire_inside_mat.R_per_um * cell.w; + + if (deg_bl_muxing > 1) + { + C_ld_bit_mux_dec_out = + (2 * num_subarrays_per_mat * subarray.num_cols / deg_bl_muxing)*gate_C(g_tp.w_nmos_b_mux, 0, is_dram) + // 2 transistor per cell + 
num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w(); + } + + if (dp.Ndsam_lev_1 > 1) + { + C_ld_sa_mux_lev_1_dec_out = + (num_subarrays_per_mat * number_sa_subarray / dp.Ndsam_lev_1)*gate_C(g_tp.w_nmos_sa_mux, 0, is_dram) + + num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w(); + } + if (dp.Ndsam_lev_2 > 1) + { + C_ld_sa_mux_lev_2_dec_out = + (num_subarrays_per_mat * number_sa_subarray / (dp.Ndsam_lev_1*dp.Ndsam_lev_2))*gate_C(g_tp.w_nmos_sa_mux, 0, is_dram) + + num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w(); + } + + if (num_subarrays_per_row >= 2) + { + // wire heads for both right and left side of a mat, so half the resistance + R_wire_bit_mux_dec_out /= 2.0; + R_wire_sa_mux_dec_out /= 2.0; + } + + + row_dec = new Decoder( + num_dec_signals, + false, + subarray.C_wl, + R_wire_wl_drv_out, + false/*is_fa*/, + is_dram, + true, + camFlag? cam_cell:cell); +// if (is_fa && (!dp.is_tag)) +// { +// row_dec->exist = true; +// } + bit_mux_dec = new Decoder( + deg_bl_muxing,// This number is 1 for FA or CAM + false, + C_ld_bit_mux_dec_out, + R_wire_bit_mux_dec_out, + false/*is_fa*/, + is_dram, + false, + camFlag? cam_cell:cell); + sa_mux_lev_1_dec = new Decoder( + dp.deg_senseamp_muxing_non_associativity, // This number is 1 for FA or CAM + dp.number_way_select_signals_mat ? true : false,//only sa_mux_lev_1_dec needs way select signal + C_ld_sa_mux_lev_1_dec_out, + R_wire_sa_mux_dec_out, + false/*is_fa*/, + is_dram, + false, + camFlag? cam_cell:cell); + sa_mux_lev_2_dec = new Decoder( + dp.Ndsam_lev_2, // This number is 1 for FA or CAM + false, + C_ld_sa_mux_lev_2_dec_out, + R_wire_sa_mux_dec_out, + false/*is_fa*/, + is_dram, + false, + camFlag? 
cam_cell:cell); + + double C_wire_predec_blk_out; + double R_wire_predec_blk_out; + + if (!is_fa && !pure_cam) + { + + C_wire_predec_blk_out = num_subarrays_per_row * subarray.num_rows * g_tp.wire_inside_mat.C_per_um * cell.h; + R_wire_predec_blk_out = num_subarrays_per_row * subarray.num_rows * g_tp.wire_inside_mat.R_per_um * cell.h; + + } + else //for pre-decode block's load is same for both FA and CAM + { + C_wire_predec_blk_out = subarray.num_rows * g_tp.wire_inside_mat.C_per_um * cam_cell.h; + R_wire_predec_blk_out = subarray.num_rows * g_tp.wire_inside_mat.R_per_um * cam_cell.h; + } + + + if (is_fa||pure_cam) + num_dec_signals += _log2(num_subarrays_per_mat); + + PredecBlk * r_predec_blk1 = new PredecBlk( + num_dec_signals, + row_dec, + C_wire_predec_blk_out, + R_wire_predec_blk_out, + num_subarrays_per_mat, + is_dram, + true); + PredecBlk * r_predec_blk2 = new PredecBlk( + num_dec_signals, + row_dec, + C_wire_predec_blk_out, + R_wire_predec_blk_out, + num_subarrays_per_mat, + is_dram, + false); + PredecBlk * b_mux_predec_blk1 = new PredecBlk(deg_bl_muxing, bit_mux_dec, 0, 0, 1, is_dram, true); + PredecBlk * b_mux_predec_blk2 = new PredecBlk(deg_bl_muxing, bit_mux_dec, 0, 0, 1, is_dram, false); + PredecBlk * sa_mux_lev_1_predec_blk1 = new PredecBlk(dyn_p.deg_senseamp_muxing_non_associativity, sa_mux_lev_1_dec, 0, 0, 1, is_dram, true); + PredecBlk * sa_mux_lev_1_predec_blk2 = new PredecBlk(dyn_p.deg_senseamp_muxing_non_associativity, sa_mux_lev_1_dec, 0, 0, 1, is_dram, false); + PredecBlk * sa_mux_lev_2_predec_blk1 = new PredecBlk(dp.Ndsam_lev_2, sa_mux_lev_2_dec, 0, 0, 1, is_dram, true); + PredecBlk * sa_mux_lev_2_predec_blk2 = new PredecBlk(dp.Ndsam_lev_2, sa_mux_lev_2_dec, 0, 0, 1, is_dram, false); + dummy_way_sel_predec_blk1 = new PredecBlk(1, sa_mux_lev_1_dec, 0, 0, 0, is_dram, true); + dummy_way_sel_predec_blk2 = new PredecBlk(1, sa_mux_lev_1_dec, 0, 0, 0, is_dram, false); + + PredecBlkDrv * r_predec_blk_drv1 = new PredecBlkDrv(0, r_predec_blk1, 
is_dram); + PredecBlkDrv * r_predec_blk_drv2 = new PredecBlkDrv(0, r_predec_blk2, is_dram); + PredecBlkDrv * b_mux_predec_blk_drv1 = new PredecBlkDrv(0, b_mux_predec_blk1, is_dram); + PredecBlkDrv * b_mux_predec_blk_drv2 = new PredecBlkDrv(0, b_mux_predec_blk2, is_dram); + PredecBlkDrv * sa_mux_lev_1_predec_blk_drv1 = new PredecBlkDrv(0, sa_mux_lev_1_predec_blk1, is_dram); + PredecBlkDrv * sa_mux_lev_1_predec_blk_drv2 = new PredecBlkDrv(0, sa_mux_lev_1_predec_blk2, is_dram); + PredecBlkDrv * sa_mux_lev_2_predec_blk_drv1 = new PredecBlkDrv(0, sa_mux_lev_2_predec_blk1, is_dram); + PredecBlkDrv * sa_mux_lev_2_predec_blk_drv2 = new PredecBlkDrv(0, sa_mux_lev_2_predec_blk2, is_dram); + way_sel_drv1 = new PredecBlkDrv(dyn_p.number_way_select_signals_mat, dummy_way_sel_predec_blk1, is_dram); + dummy_way_sel_predec_blk_drv2 = new PredecBlkDrv(1, dummy_way_sel_predec_blk2, is_dram); + + r_predec = new Predec(r_predec_blk_drv1, r_predec_blk_drv2); + b_mux_predec = new Predec(b_mux_predec_blk_drv1, b_mux_predec_blk_drv2); + sa_mux_lev_1_predec = new Predec(sa_mux_lev_1_predec_blk_drv1, sa_mux_lev_1_predec_blk_drv2); + sa_mux_lev_2_predec = new Predec(sa_mux_lev_2_predec_blk_drv1, sa_mux_lev_2_predec_blk_drv2); + + subarray_out_wire = new Wire(g_ip->wt, subarray.area.h);//Bug should be subarray.area.w Owen and Sheng + + double driver_c_gate_load; + double driver_c_wire_load; + double driver_r_wire_load; + + if (is_fa || pure_cam) + + { //Although CAM and RAM use different bl pre-charge driver, assuming the precharge p size is the same + driver_c_gate_load = (subarray.num_cols_fa_cam )* gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false); + driver_c_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.C_per_um; + driver_r_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.R_per_um; + cam_bl_precharge_eq_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + + if 
(!pure_cam) + { + //This is only used for fully asso not pure CAM + driver_c_gate_load = (subarray.num_cols_fa_ram )* gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false); + driver_c_wire_load = subarray.num_cols_fa_ram * cell.w * g_tp.wire_outside_mat.C_per_um; + driver_r_wire_load = subarray.num_cols_fa_ram * cell.w * g_tp.wire_outside_mat.R_per_um; + bl_precharge_eq_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + } + } + + else + { + driver_c_gate_load = subarray.num_cols * gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false); + driver_c_wire_load = subarray.num_cols * cell.w * g_tp.wire_outside_mat.C_per_um; + driver_r_wire_load = subarray.num_cols * cell.w * g_tp.wire_outside_mat.R_per_um; + bl_precharge_eq_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + } + double area_row_decoder = row_dec->area.get_area() * subarray.num_rows * (RWP + ERP + EWP); + double w_row_decoder = area_row_decoder / subarray.area.get_h(); + + double h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux = + compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h(); + + double h_subarray_out_drv = subarray_out_wire->area.get_area() * + (subarray.num_cols / (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2)) / subarray.area.get_w(); + + + h_subarray_out_drv *= (RWP + ERP + SCHP); + + double h_comparators = 0.0; + double w_row_predecode_output_wires = 0.0; + double h_bit_mux_dec_out_wires = 0.0; + double h_senseamp_mux_dec_out_wires = 0.0; + + if ((!is_fa)&&(dp.is_tag)) + { + //tagbits = (4 * num_cols_subarray / (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2)) / num_do_b_mat; + h_comparators = compute_comparators_height(dp.tagbits, dyn_p.num_do_b_mat, subarray.area.get_w()); + h_comparators *= (RWP + ERP); + } + + + int branch_effort_predec_blk1_out = (1 << r_predec_blk2->number_input_addr_bits); + int 
branch_effort_predec_blk2_out = (1 << r_predec_blk1->number_input_addr_bits); + w_row_predecode_output_wires = (branch_effort_predec_blk1_out + branch_effort_predec_blk2_out) * + g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP); + + + double h_non_cell_area = (num_subarrays_per_mat / num_subarrays_per_row) * + (h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux + + h_subarray_out_drv + h_comparators); + + double w_non_cell_area = MAX(w_row_predecode_output_wires, num_subarrays_per_row * w_row_decoder); + + if (deg_bl_muxing > 1) + { + h_bit_mux_dec_out_wires = deg_bl_muxing * g_tp.wire_inside_mat.pitch * (RWP + ERP); + } + if (dp.Ndsam_lev_1 > 1) + { + h_senseamp_mux_dec_out_wires = dp.Ndsam_lev_1 * g_tp.wire_inside_mat.pitch * (RWP + ERP); + } + if (dp.Ndsam_lev_2 > 1) + { + h_senseamp_mux_dec_out_wires += dp.Ndsam_lev_2 * g_tp.wire_inside_mat.pitch * (RWP + ERP); + } + + double h_addr_datain_wires; + if (!g_ip->ver_htree_wires_over_array) + { + h_addr_datain_wires = (dp.number_addr_bits_mat + dp.number_way_select_signals_mat + + (dp.num_di_b_mat + dp.num_do_b_mat)/num_subarrays_per_row) * + g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP); + + if (is_fa || pure_cam) + { + h_addr_datain_wires = (dp.number_addr_bits_mat + dp.number_way_select_signals_mat + //TODO: revisit + (dp.num_di_b_mat+ dp.num_do_b_mat )/num_subarrays_per_row) * + g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP) + + (dp.num_si_b_mat + dp.num_so_b_mat )/num_subarrays_per_row * g_tp.wire_inside_mat.pitch * SCHP; + } + //h_non_cell_area = 2 * h_bit_mux_sense_amp_precharge_sa_mux + + //MAX(h_addr_datain_wires, 2 * h_subarray_out_drv); + h_non_cell_area = (h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux + h_comparators + + h_subarray_out_drv) * (num_subarrays_per_mat / num_subarrays_per_row) + + h_addr_datain_wires + + h_bit_mux_dec_out_wires + + h_senseamp_mux_dec_out_wires; + + } + + // double area_rectangle_center_mat = h_non_cell_area * w_non_cell_area; + double 
area_mat_center_circuitry = (r_predec_blk_drv1->area.get_area() + + b_mux_predec_blk_drv1->area.get_area() + + sa_mux_lev_1_predec_blk_drv1->area.get_area() + + sa_mux_lev_2_predec_blk_drv1->area.get_area() + + way_sel_drv1->area.get_area() + + r_predec_blk_drv2->area.get_area() + + b_mux_predec_blk_drv2->area.get_area() + + sa_mux_lev_1_predec_blk_drv2->area.get_area() + + sa_mux_lev_2_predec_blk_drv2->area.get_area() + + r_predec_blk1->area.get_area() + + b_mux_predec_blk1->area.get_area() + + sa_mux_lev_1_predec_blk1->area.get_area() + + sa_mux_lev_2_predec_blk1->area.get_area() + + r_predec_blk2->area.get_area() + + b_mux_predec_blk2->area.get_area() + + sa_mux_lev_1_predec_blk2->area.get_area() + + sa_mux_lev_2_predec_blk2->area.get_area() + + bit_mux_dec->area.get_area() + + sa_mux_lev_1_dec->area.get_area() + + sa_mux_lev_2_dec->area.get_area()) * (RWP + ERP + EWP); + + double area_efficiency_mat; + +// if (!is_fa) +// { + assert(num_subarrays_per_mat/num_subarrays_per_row>0); + area.h = (num_subarrays_per_mat/num_subarrays_per_row)* subarray.area.h + h_non_cell_area; + area.w = num_subarrays_per_row * subarray.area.get_w() + w_non_cell_area; + area.w = (area.h*area.w + area_mat_center_circuitry) / area.h; + area_efficiency_mat = subarray.area.get_area() * num_subarrays_per_mat * 100.0 / area.get_area(); + +// cout<<"h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux"<<h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux<<endl; +// cout<<"h_comparators"<<h_comparators<<endl; +// cout<<"h_subarray_out_drv"<<h_subarray_out_drv<<endl; +// cout<<"h_addr_datain_wires"<<h_addr_datain_wires<<endl; +// cout<<"h_bit_mux_dec_out_wires"<<h_bit_mux_dec_out_wires<<endl; +// cout<<"h_senseamp_mux_dec_out_wires"<<h_senseamp_mux_dec_out_wires<<endl; +// cout<<"h_non_cell_area"<<h_non_cell_area<<endl; +// cout<<"area.h =" << (num_subarrays_per_mat/num_subarrays_per_row)* subarray.area.h<<endl; +// cout<<"w_non_cell_area"<<w_non_cell_area<<endl; +// 
cout<<"area_mat_center_circuitry"<<area_mat_center_circuitry<<endl; + + assert(area.h>0); + assert(area.w>0); +// } +// else +// { +// area.h = (num_subarrays_per_mat / num_subarrays_per_row) * subarray.area.get_h() + h_non_cell_area; +// area.w = num_subarrays_per_row * subarray.area.get_w() + w_non_cell_area; +// area.w = (area.h*area.w + area_mat_center_circuitry) / area.h; +// area_efficiency_mat = subarray.area.get_area() * num_subarrays_per_row * 100.0 / area.get_area(); +// } + } + + + +Mat::~Mat() +{ + delete row_dec; + delete bit_mux_dec; + delete sa_mux_lev_1_dec; + delete sa_mux_lev_2_dec; + + delete r_predec->blk1; + delete r_predec->blk2; + delete b_mux_predec->blk1; + delete b_mux_predec->blk2; + delete sa_mux_lev_1_predec->blk1; + delete sa_mux_lev_1_predec->blk2; + delete sa_mux_lev_2_predec->blk1; + delete sa_mux_lev_2_predec->blk2; + delete dummy_way_sel_predec_blk1; + delete dummy_way_sel_predec_blk2; + + delete r_predec->drv1; + delete r_predec->drv2; + delete b_mux_predec->drv1; + delete b_mux_predec->drv2; + delete sa_mux_lev_1_predec->drv1; + delete sa_mux_lev_1_predec->drv2; + delete sa_mux_lev_2_predec->drv1; + delete sa_mux_lev_2_predec->drv2; + delete way_sel_drv1; + delete dummy_way_sel_predec_blk_drv2; + + delete r_predec; + delete b_mux_predec; + delete sa_mux_lev_1_predec; + delete sa_mux_lev_2_predec; + + delete subarray_out_wire; + if (!pure_cam) + delete bl_precharge_eq_drv; + + if (is_fa || pure_cam) + { + delete sl_precharge_eq_drv ; + delete sl_data_drv ; + delete cam_bl_precharge_eq_drv; + delete ml_precharge_drv; + delete ml_to_ram_wl_drv; + } +} + + + +double Mat::compute_delays(double inrisetime) +{ + int k; + double rd, C_intrinsic, C_ld, tf, R_bl_precharge,r_b_metal, R_bl, C_bl; + double outrisetime_search, outrisetime, row_dec_outrisetime; + // delay calculation for tags of fully associative cache + if (is_fa || pure_cam) + { + //Compute search access time + outrisetime_search = compute_cam_delay(inrisetime); + if (is_fa) 
+ { + bl_precharge_eq_drv->compute_delay(0); + k = ml_to_ram_wl_drv->number_gates - 1; + rd = tr_R_on(ml_to_ram_wl_drv->width_n[k], NCH, 1, is_dram, false, true); + C_intrinsic = drain_C_(ml_to_ram_wl_drv->width_n[k], PCH, 1, 1, 4*cell.h, is_dram, false, true) + + drain_C_(ml_to_ram_wl_drv->width_n[k], NCH, 1, 1, 4*cell.h, is_dram, false, true); + C_ld = ml_to_ram_wl_drv->c_gate_load+ ml_to_ram_wl_drv->c_wire_load; + tf = rd * (C_intrinsic + C_ld) + ml_to_ram_wl_drv->r_wire_load * C_ld / 2; + delay_wl_reset = horowitz(0, tf, 0.5, 0.5, RISE); + + R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false); + r_b_metal = cam_cell.h * g_tp.wire_local.R_per_um;//dummy rows in sram are filled in + R_bl = subarray.num_rows * r_b_metal; + C_bl = subarray.C_bl; + delay_bl_restore = bl_precharge_eq_drv->delay + + log((g_tp.sram.Vbitpre - 0.1 * dp.V_b_sense) / (g_tp.sram.Vbitpre - dp.V_b_sense))* + (R_bl_precharge * C_bl + R_bl * C_bl / 2); + + + outrisetime_search = compute_bitline_delay(outrisetime_search); + outrisetime_search = compute_sa_delay(outrisetime_search); + } + outrisetime_search = compute_subarray_out_drv(outrisetime_search); + subarray_out_wire->set_in_rise_time(outrisetime_search); + outrisetime_search = subarray_out_wire->signal_rise_time(); + delay_subarray_out_drv_htree = delay_subarray_out_drv + subarray_out_wire->delay; + + + //TODO: this is just for compute plain read/write energy for fa and cam, plain read/write access timing need to be revisited. 
+ outrisetime = r_predec->compute_delays(inrisetime); + row_dec_outrisetime = row_dec->compute_delays(outrisetime); + + outrisetime = b_mux_predec->compute_delays(inrisetime); + bit_mux_dec->compute_delays(outrisetime); + + outrisetime = sa_mux_lev_1_predec->compute_delays(inrisetime); + sa_mux_lev_1_dec->compute_delays(outrisetime); + + outrisetime = sa_mux_lev_2_predec->compute_delays(inrisetime); + sa_mux_lev_2_dec->compute_delays(outrisetime); + + if (pure_cam) + { + outrisetime = compute_bitline_delay(row_dec_outrisetime); + outrisetime = compute_sa_delay(outrisetime); + } + return outrisetime_search; + } + else + { + bl_precharge_eq_drv->compute_delay(0); + if (row_dec->exist == true) + { + int k = row_dec->num_gates - 1; + double rd = tr_R_on(row_dec->w_dec_n[k], NCH, 1, is_dram, false, true); + // TODO: this 4*cell.h number must be revisited + double C_intrinsic = drain_C_(row_dec->w_dec_p[k], PCH, 1, 1, 4*cell.h, is_dram, false, true) + + drain_C_(row_dec->w_dec_n[k], NCH, 1, 1, 4*cell.h, is_dram, false, true); + double C_ld = row_dec->C_ld_dec_out; + double tf = rd * (C_intrinsic + C_ld) + row_dec->R_wire_dec_out * C_ld / 2; + delay_wl_reset = horowitz(0, tf, 0.5, 0.5, RISE); + } + double R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false); + double r_b_metal = cell.h * g_tp.wire_local.R_per_um; + double R_bl = subarray.num_rows * r_b_metal; + double C_bl = subarray.C_bl; + + if (is_dram) + { + delay_bl_restore = bl_precharge_eq_drv->delay + 2.3 * (R_bl_precharge * C_bl + R_bl * C_bl / 2); + } + else + { + delay_bl_restore = bl_precharge_eq_drv->delay + + log((g_tp.sram.Vbitpre - 0.1 * dp.V_b_sense) / (g_tp.sram.Vbitpre - dp.V_b_sense))* + (R_bl_precharge * C_bl + R_bl * C_bl / 2); + } + } + + + + outrisetime = r_predec->compute_delays(inrisetime); + row_dec_outrisetime = row_dec->compute_delays(outrisetime); + + outrisetime = b_mux_predec->compute_delays(inrisetime); + bit_mux_dec->compute_delays(outrisetime); + + 
outrisetime = sa_mux_lev_1_predec->compute_delays(inrisetime); + sa_mux_lev_1_dec->compute_delays(outrisetime); + + outrisetime = sa_mux_lev_2_predec->compute_delays(inrisetime); + sa_mux_lev_2_dec->compute_delays(outrisetime); + + outrisetime = compute_bitline_delay(row_dec_outrisetime); + outrisetime = compute_sa_delay(outrisetime); + outrisetime = compute_subarray_out_drv(outrisetime); + subarray_out_wire->set_in_rise_time(outrisetime); + outrisetime = subarray_out_wire->signal_rise_time(); + + delay_subarray_out_drv_htree = delay_subarray_out_drv + subarray_out_wire->delay; + + if (dp.is_tag == true && dp.fully_assoc == false) + { + compute_comparator_delay(0); + } + + if (row_dec->exist == false) + { + delay_wl_reset = MAX(r_predec->blk1->delay, r_predec->blk2->delay); + } + return outrisetime; +} + + + +double Mat::compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h() +{ + + double height = compute_tr_width_after_folding(g_tp.w_pmos_bl_precharge, camFlag? cam_cell.w:cell.w / (2 *(RWP + ERP + SCHP))) + + compute_tr_width_after_folding(g_tp.w_pmos_bl_eq, camFlag? cam_cell.w:cell.w / (RWP + ERP + SCHP)); // precharge circuitry + + if (deg_bl_muxing > 1) + { + height += compute_tr_width_after_folding(g_tp.w_nmos_b_mux, cell.w / (2 *(RWP + ERP))); // col mux tr height + // height += deg_bl_muxing * g_tp.wire_inside_mat.pitch * (RWP + ERP); // bit mux dec out wires height + } + + height += height_sense_amplifier(/*camFlag? 
sram_cell.w:*/cell.w * deg_bl_muxing / (RWP + ERP)); // sense_amp_height + + if (dp.Ndsam_lev_1 > 1) + { + height += compute_tr_width_after_folding( + g_tp.w_nmos_sa_mux, cell.w * dp.Ndsam_lev_1 / (RWP + ERP)); // sense_amp_mux_height + //height_senseamp_mux_decode_output_wires = Ndsam * wire_inside_mat_pitch * (RWP + ERP); + } + + if (dp.Ndsam_lev_2 > 1) + { + height += compute_tr_width_after_folding( + g_tp.w_nmos_sa_mux, cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP)); // sense_amp_mux_height + //height_senseamp_mux_decode_output_wires = Ndsam * wire_inside_mat_pitch * (RWP + ERP); + + // add height of inverter-buffers between the two levels (pass-transistors) of sense-amp mux + height += 2 * compute_tr_width_after_folding( + pmos_to_nmos_sz_ratio(is_dram) * g_tp.min_w_nmos_, cell.w * dp.Ndsam_lev_2 / (RWP + ERP)); + height += 2 * compute_tr_width_after_folding(g_tp.min_w_nmos_, cell.w * dp.Ndsam_lev_2 / (RWP + ERP)); + } + + // TODO: this should be uncommented... + /*if (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2 > 1) + { + //height_write_mux_decode_output_wires = deg_bl_muxing * Ndsam * g_tp.wire_inside_mat.pitch * (RWP + EWP); + double width_write_driver_write_mux = width_write_driver_or_write_mux(); + double height_write_driver_write_mux = compute_tr_width_after_folding(2 * width_write_driver_write_mux, + cell.w * + // deg_bl_muxing * + dp.Ndsam_lev_1 * dp.Ndsam_lev_2 / (RWP + EWP)); + height += height_write_driver_write_mux; + }*/ + + return height; +} + + + +double Mat::compute_cam_delay(double inrisetime) +{ + + double out_time_ramp, this_delay; + double Rwire, tf, c_intrinsic, rd, Cwire, c_gate_load; + + + double Wdecdrivep, Wdecdriven, Wfadriven, Wfadrivep, Wfadrive2n, Wfadrive2p, Wfadecdrive1n, Wfadecdrive1p, + Wfadecdrive2n, Wfadecdrive2p, Wfadecdriven, Wfadecdrivep, Wfaprechn, Wfaprechp, + Wdummyn, Wdummyinvn, Wdummyinvp, Wfainvn, Wfainvp, Waddrnandn, Waddrnandp, + Wfanandn, Wfanandp, Wfanorn, Wfanorp, Wdecnandn, Wdecnandp, W_hit_miss_n, 
W_hit_miss_p; + + double c_matchline_metal, r_matchline_metal, c_searchline_metal, r_searchline_metal, dynSearchEng; + int Htagbits; + + double driver_c_gate_load; + double driver_c_wire_load; + double driver_r_wire_load; + //double searchline_precharge_time; + + double leak_power_cc_inverters_sram_cell = 0; + double leak_power_acc_tr_RW_or_WR_port_sram_cell = 0; + double leak_power_RD_port_sram_cell = 0; + double leak_power_SCHP_port_sram_cell = 0; + double leak_comparator_cam_cell =0; + + double gate_leak_comparator_cam_cell = 0; + double gate_leak_power_cc_inverters_sram_cell = 0; + double gate_leak_power_RD_port_sram_cell = 0; + double gate_leak_power_SCHP_port_sram_cell = 0; + + c_matchline_metal = cam_cell.get_w() * g_tp.wire_local.C_per_um; + c_searchline_metal = cam_cell.get_h() * g_tp.wire_local.C_per_um; + r_matchline_metal = cam_cell.get_w() * g_tp.wire_local.R_per_um; + r_searchline_metal = cam_cell.get_h() * g_tp.wire_local.R_per_um; + + dynSearchEng = 0.0; + delay_matchchline = 0.0; + double p_to_n_sizing_r = pmos_to_nmos_sz_ratio(is_dram); + bool linear_scaling = false; + + if (linear_scaling) + { + Wdecdrivep = 450 * g_ip->F_sz_um;//this was 360 micron for the 0.8 micron process + Wdecdriven = 300 * g_ip->F_sz_um;//this was 240 micron for the 0.8 micron process + Wfadriven = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Wfadrivep = 125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process + Wfadrive2n = 250 * g_ip->F_sz_um;//this was 200 micron for the 0.8 micron process + Wfadrive2p = 500 * g_ip->F_sz_um;//this was 400 micron for the 0.8 micron process + Wfadecdrive1n = 6.25 * g_ip->F_sz_um;//this was 5 micron for the 0.8 micron process + Wfadecdrive1p = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wfadecdrive2n = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + Wfadecdrive2p = 50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process + Wfadecdriven = 62.5 * 
g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Wfadecdrivep = 125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process + Wfaprechn = 7.5 * g_ip->F_sz_um;//this was 6 micron for the 0.8 micron process + Wfainvn = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wfainvp = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + Wfanandn = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + Wfanandp = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process + Wdecnandn = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wdecnandp = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process + + Wfaprechp = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wdummyn = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wdummyinvn = 75 * g_ip->F_sz_um;//this was 60 micron for the 0.8 micron process + Wdummyinvp = 100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process + Waddrnandn = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Waddrnandp = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Wfanorn = 6.25 * g_ip->F_sz_um;//this was 5 micron for the 0.8 micron process + Wfanorp = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + W_hit_miss_n = Wdummyn; + W_hit_miss_p = g_tp.min_w_nmos_*p_to_n_sizing_r; + //TODO: this number should updated using new layout; from the NAND to output NOR should be computed using logical effort + } + else + { + Wdecdrivep = 450 * g_ip->F_sz_um;//this was 360 micron for the 0.8 micron process + Wdecdriven = 300 * g_ip->F_sz_um;//this was 240 micron for the 0.8 micron process + Wfadriven = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Wfadrivep = 125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process + Wfadrive2n = 250 * g_ip->F_sz_um;//this was 200 micron for the 0.8 
micron process + Wfadrive2p = 500 * g_ip->F_sz_um;//this was 400 micron for the 0.8 micron process + Wfadecdrive1n = 6.25 * g_ip->F_sz_um;//this was 5 micron for the 0.8 micron process + Wfadecdrive1p = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wfadecdrive2n = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + Wfadecdrive2p = 50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process + Wfadecdriven = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Wfadecdrivep = 125 * g_ip->F_sz_um;//this was 100 micron for the 0.8 micron process + Wfaprechn = 7.5 * g_ip->F_sz_um;//this was 6 micron for the 0.8 micron process + Wfainvn = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wfainvp = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + Wfanandn = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + Wfanandp = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process + Wdecnandn = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + Wdecnandp = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process + + Wfaprechp = g_tp.w_pmos_bl_precharge;//this was 10 micron for the 0.8 micron process + Wdummyn = g_tp.cam.cell_nmos_w; + Wdummyinvn = 75 * g_ip->F_sz_um;//this was 60 micron for the 0.8 micron process + Wdummyinvp = 100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process + Waddrnandn = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Waddrnandp = 62.5 * g_ip->F_sz_um;//this was 50 micron for the 0.8 micron process + Wfanorn = 6.25 * g_ip->F_sz_um;//this was 5 micron for the 0.8 micron process + Wfanorp = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + W_hit_miss_n = Wdummyn; + W_hit_miss_p = g_tp.min_w_nmos_*p_to_n_sizing_r; + } + + Htagbits = (int)(ceil ((double) (subarray.num_cols_fa_cam) / 2.0)); + + /* First stage, searchline is precharged. 
searchline data driver drives the searchline to open (if miss) the comparators. + search_line_delay, search_line_power, search_line_restore_delay for cycle time computation. + From the driver(am and an) to the comparators in all the rows including the dummy row, + Assuming that comparators in both the normal matching line and the dummy matching line have the same sizing */ + + //Searchline precharge circuitry is same as that of bitline. However, no sharing between search ports and r/w ports + //Searchline precharge routes horizontally + driver_c_gate_load = subarray.num_cols_fa_cam * gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false); + driver_c_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.C_per_um; + driver_r_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.R_per_um; + + sl_precharge_eq_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + + //searchline data driver ; subarray.num_rows + 1 is because of the dummy row + //data drv should only have gate_C not 2*gate_C since the two searchlines are differential--same as bitlines + driver_c_gate_load = (subarray.num_rows + 1) * gate_C(Wdummyn, 0, is_dram, false, false); + driver_c_wire_load = (subarray.num_rows + 1) * c_searchline_metal; + driver_r_wire_load = (subarray.num_rows + 1) * r_searchline_metal; + sl_data_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + + sl_precharge_eq_drv->compute_delay(0); + double R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);//Assuming CAM and SRAM have same Pre_eq_dr + double r_b_metal = cam_cell.h * g_tp.wire_local.R_per_um; + double R_bl = (subarray.num_rows + 1) * r_b_metal; + double C_bl = subarray.C_bl_cam; + delay_cam_sl_restore = sl_precharge_eq_drv->delay + + log(g_tp.cam.Vbitpre)* (R_bl_precharge * C_bl + R_bl * C_bl / 2); + + out_time_ramp = 
sl_data_drv->compute_delay(inrisetime);//After entering one mat, start to consider the inrisetime from 0(0 is passed from outside) + + //matchline ops delay + delay_matchchline += sl_data_drv->delay; + + /* second stage, from the trasistors in the comparators(both normal row and dummy row) to the NAND gates that combins both half*/ + //matchline delay, matchline power, matchline_reset for cycle time computation, + + ////matchline precharge circuitry routes vertically + //There are two matchline precharge driver chains per subarray. + driver_c_gate_load = (subarray.num_rows + 1) * gate_C(Wfaprechp, 0, is_dram); + driver_c_wire_load = (subarray.num_rows + 1) * c_searchline_metal; + driver_r_wire_load = (subarray.num_rows + 1) * r_searchline_metal; + + ml_precharge_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + + ml_precharge_drv->compute_delay(0); + + + rd = tr_R_on(Wdummyn, NCH, 2, is_dram); + c_intrinsic = Htagbits*(2*drain_C_(Wdummyn, NCH, 2, 1, g_tp.cell_h_def, is_dram)//TODO: the cell_h_def should be revisit + + drain_C_(Wfaprechp, PCH, 1, 1, g_tp.cell_h_def, is_dram)/Htagbits);//since each halve only has one precharge tx per matchline + + Cwire = c_matchline_metal * Htagbits; + Rwire = r_matchline_metal * Htagbits; + c_gate_load = gate_C(Waddrnandn + Waddrnandp, 0, is_dram); + + double R_ml_precharge = tr_R_on(Wfaprechp, PCH, 1, is_dram); + //double r_ml_metal = cam_cell.w * g_tp.wire_local.R_per_um; + double R_ml = Rwire; + double C_ml = Cwire + c_intrinsic; + delay_cam_ml_reset = ml_precharge_drv->delay + + log(g_tp.cam.Vbitpre)* (R_ml_precharge * C_ml + R_ml * C_ml / 2);//TODO: latest CAM has sense amps on matchlines too + + //matchline ops delay + tf = rd * (c_intrinsic + Cwire / 2 + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load); + this_delay = horowitz(out_time_ramp, tf, VTHFA2, VTHFA3, FALL); + delay_matchchline += this_delay; + out_time_ramp = this_delay / VTHFA3; + + dynSearchEng += ((c_intrinsic + 
Cwire + c_gate_load)*(subarray.num_rows +1)) //+ 2*drain_C_(Wdummyn, NCH, 2, 1, g_tp.cell_h_def, is_dram))//TODO: need to be precise + * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd *2;//* Ntbl;//each subarry has two halves + + /* third stage, from the NAND2 gates to the drivers in the dummy row */ + rd = tr_R_on(Waddrnandn, NCH, 2, is_dram); + c_intrinsic = drain_C_(Waddrnandn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + + drain_C_(Waddrnandp, PCH, 1, 1, g_tp.cell_h_def, is_dram)*2; + c_gate_load = gate_C(Wdummyinvn + Wdummyinvp, 0, is_dram); + tf = rd * (c_intrinsic + c_gate_load); + this_delay = horowitz(out_time_ramp, tf, VTHFA3, VTHFA4, RISE); + out_time_ramp = this_delay / (1 - VTHFA4); + delay_matchchline += this_delay; + + //only the dummy row has the extra inverter between NAND and NOR gates + dynSearchEng += (c_intrinsic* (subarray.num_rows+1)+ c_gate_load*2) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;// * Ntbl; + + /* fourth stage, from the driver in dummy matchline to the NOR2 gate which drives the wordline of the data portion */ + rd = tr_R_on(Wdummyinvn, NCH, 1, is_dram); + c_intrinsic = drain_C_(Wdummyinvn, NCH, 1, 1, g_tp.cell_h_def, is_dram) + drain_C_(Wdummyinvp, NCH, 1, 1, g_tp.cell_h_def, is_dram); + Cwire = c_matchline_metal * Htagbits + c_searchline_metal * (subarray.num_rows+1)/2; + Rwire = r_matchline_metal * Htagbits + r_searchline_metal * (subarray.num_rows+1)/2; + c_gate_load = gate_C(Wfanorn + Wfanorp, 0, is_dram); + tf = rd * (c_intrinsic + Cwire + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load); + this_delay = horowitz (out_time_ramp, tf, VTHFA4, VTHFA5, FALL); + out_time_ramp = this_delay / VTHFA5; + delay_matchchline += this_delay; + + dynSearchEng += (c_intrinsic + Cwire + subarray.num_rows*c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//* Ntbl; + + /*final statge from the NOR gate to drive the wordline of the data portion */ + + //searchline data driver There are two matchline precharge driver chains per subarray. 
+ driver_c_gate_load = gate_C(W_hit_miss_n, 0, is_dram, false, false);//nmos of the pull down logic + driver_c_wire_load = subarray.C_wl_ram; + driver_r_wire_load = subarray.R_wl_ram; + + ml_to_ram_wl_drv = new Driver( + driver_c_gate_load, + driver_c_wire_load, + driver_r_wire_load, + is_dram); + + + + rd = tr_R_on(Wfanorn, NCH, 1, is_dram); + c_intrinsic = 2* drain_C_(Wfanorn, NCH, 1, 1, g_tp.cell_h_def, is_dram) + drain_C_(Wfanorp, NCH, 1, 1, g_tp.cell_h_def, is_dram); + c_gate_load = gate_C(ml_to_ram_wl_drv->width_n[0] + ml_to_ram_wl_drv->width_p[0], 0, is_dram); + tf = rd * (c_intrinsic + c_gate_load); + this_delay = horowitz (out_time_ramp, tf, 0.5, 0.5, RISE); + out_time_ramp = this_delay / (1-0.5); + delay_matchchline += this_delay; + + out_time_ramp = ml_to_ram_wl_drv->compute_delay(out_time_ramp); + + //c_gate_load energy is computed in ml_to_ram_wl_drv + dynSearchEng += (c_intrinsic) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//* Ntbl; + + + /* peripheral-- hitting logic "CMOS VLSI Design Fig11.51*/ + /*Precharge the hitting logic */ + c_intrinsic = 2*drain_C_(W_hit_miss_p, NCH, 2, 1, g_tp.cell_h_def, is_dram); + Cwire = c_searchline_metal * subarray.num_rows; + Rwire = r_searchline_metal * subarray.num_rows; + c_gate_load = drain_C_(W_hit_miss_n, NCH, 1, 1, g_tp.cell_h_def, is_dram)* subarray.num_rows; + + rd = tr_R_on(W_hit_miss_p, PCH, 1, is_dram, false, false); + //double r_ml_metal = cam_cell.w * g_tp.wire_local.R_per_um; + double R_hit_miss = Rwire; + double C_hit_miss = Cwire + c_intrinsic; + delay_hit_miss_reset = log(g_tp.cam.Vbitpre)* (rd * C_hit_miss + R_hit_miss * C_hit_miss / 2); + dynSearchEng += (c_intrinsic + Cwire + c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd; + + /*hitting logic evaluation */ + c_intrinsic = 2*drain_C_(W_hit_miss_n, NCH, 2, 1, g_tp.cell_h_def, is_dram); + Cwire = c_searchline_metal * subarray.num_rows; + Rwire = r_searchline_metal * subarray.num_rows; + c_gate_load = drain_C_(W_hit_miss_n, NCH, 1, 1, 
g_tp.cell_h_def, is_dram)* subarray.num_rows; + + rd = tr_R_on(W_hit_miss_n, PCH, 1, is_dram, false, false); + tf = rd * (c_intrinsic + Cwire / 2 + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load); + + delay_hit_miss = horowitz(0, tf, 0.5, 0.5, FALL); + + if (is_fa) + delay_matchchline += MAX(ml_to_ram_wl_drv->delay, delay_hit_miss); + + dynSearchEng += (c_intrinsic + Cwire + c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd; + + /* TODO: peripheral-- Priority Encoder, usually this is not necessary in processor components*/ + + power_matchline.searchOp.dynamic = dynSearchEng; + + //leakage in one subarray + double Iport = cmos_Isub_leakage(g_tp.cam.cell_a_w, 0, 1, nmos, false, true);//TODO: how much is the idle time? just by *2? + double Iport_erp = cmos_Isub_leakage(g_tp.cam.cell_a_w, 0, 2, nmos, false, true); + double Icell = cmos_Isub_leakage(g_tp.cam.cell_nmos_w, g_tp.cam.cell_pmos_w, 1, inv, false, true)*2; + double Icell_comparator = cmos_Isub_leakage(Wdummyn, Wdummyn, 1, inv, false, true)*2;//approx XOR with Inv + + leak_power_cc_inverters_sram_cell = Icell * g_tp.cam_cell.Vdd; + leak_comparator_cam_cell = Icell_comparator * g_tp.cam_cell.Vdd; + leak_power_acc_tr_RW_or_WR_port_sram_cell = Iport * g_tp.cam_cell.Vdd; + leak_power_RD_port_sram_cell = Iport_erp * g_tp.cam_cell.Vdd; + leak_power_SCHP_port_sram_cell = 0;//search port and r/w port are sperate, therefore no access txs in search ports + + power_matchline.searchOp.leakage += leak_power_cc_inverters_sram_cell + + leak_comparator_cam_cell + + leak_power_acc_tr_RW_or_WR_port_sram_cell + + leak_power_acc_tr_RW_or_WR_port_sram_cell * (RWP + EWP - 1) + + leak_power_RD_port_sram_cell * ERP + + leak_power_SCHP_port_sram_cell*SCHP; +// power_matchline.searchOp.leakage += leak_comparator_cam_cell; + power_matchline.searchOp.leakage *= (subarray.num_rows+1) * subarray.num_cols_fa_cam;//TODO:dumy line precise + power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(0, Wfaprechp, 1, 
pmos) * g_tp.cam_cell.Vdd; + power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(Waddrnandn, Waddrnandp, 2, nand) * g_tp.cam_cell.Vdd; + power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(Wfanorn, Wfanorp,2, nor) * g_tp.cam_cell.Vdd; + //In idle states, the hit/miss txs are closed (on) therefore no Isub + power_matchline.searchOp.leakage += 0;// subarray.num_rows * cmos_Isub_leakage(W_hit_miss_n, 0,1, nmos) * g_tp.cam_cell.Vdd+ + // + cmos_Isub_leakage(0, W_hit_miss_p,1, pmos) * g_tp.cam_cell.Vdd; + + //in idle state, Ig_on only possibly exist in access transistors of read only ports + double Ig_port_erp = cmos_Ig_leakage(g_tp.cam.cell_a_w, 0, 1, nmos, false, true); + double Ig_cell = cmos_Ig_leakage(g_tp.cam.cell_nmos_w, g_tp.cam.cell_pmos_w, 1, inv, false, true)*2; + double Ig_cell_comparator = cmos_Ig_leakage(Wdummyn, Wdummyn, 1, inv, false, true)*2;// cmos_Ig_leakage(Wdummyn, 0, 2, nmos)*2; + + gate_leak_comparator_cam_cell = Ig_cell_comparator* g_tp.cam_cell.Vdd; + gate_leak_power_cc_inverters_sram_cell = Ig_cell*g_tp.cam_cell.Vdd; + gate_leak_power_RD_port_sram_cell = Ig_port_erp*g_tp.sram_cell.Vdd; + gate_leak_power_SCHP_port_sram_cell = 0; + + //cout<<"power_matchline.searchOp.leakage"<<power_matchline.searchOp.leakage<<endl; + + power_matchline.searchOp.gate_leakage += gate_leak_power_cc_inverters_sram_cell; + power_matchline.searchOp.gate_leakage += gate_leak_comparator_cam_cell; + power_matchline.searchOp.gate_leakage += gate_leak_power_SCHP_port_sram_cell*SCHP + gate_leak_power_RD_port_sram_cell * ERP; + power_matchline.searchOp.gate_leakage *= (subarray.num_rows+1) * subarray.num_cols_fa_cam;//TODO:dumy line precise + power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(0, Wfaprechp,1, pmos) * g_tp.cam_cell.Vdd; + power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(Waddrnandn, Waddrnandp, 2, nand) * g_tp.cam_cell.Vdd; + 
power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(Wfanorn, Wfanorp, 2, nor) * g_tp.cam_cell.Vdd; + power_matchline.searchOp.gate_leakage += subarray.num_rows * cmos_Ig_leakage(W_hit_miss_n, 0,1, nmos) * g_tp.cam_cell.Vdd+ + + cmos_Ig_leakage(0, W_hit_miss_p,1, pmos) * g_tp.cam_cell.Vdd; + + + return out_time_ramp; +} + + +double Mat::width_write_driver_or_write_mux() +{ + // calculate resistance of SRAM cell pull-up PMOS transistor + // cam and sram have same cell trasistor properties + double R_sram_cell_pull_up_tr = tr_R_on(g_tp.sram.cell_pmos_w, NCH, 1, is_dram, true); + double R_access_tr = tr_R_on(g_tp.sram.cell_a_w, NCH, 1, is_dram, true); + double target_R_write_driver_and_mux = (2 * R_sram_cell_pull_up_tr - R_access_tr) / 2; + double width_write_driver_nmos = R_to_w(target_R_write_driver_and_mux, NCH, is_dram); + + return width_write_driver_nmos; +} + + + +double Mat::compute_comparators_height( + int tagbits, + int number_ways_in_mat, + double subarray_mem_cell_area_width) +{ + double nand2_area = compute_gate_area(NAND, 2, 0, g_tp.w_comp_n, g_tp.cell_h_def); + double cumulative_area = nand2_area * number_ways_in_mat * tagbits / 4; + return cumulative_area / subarray_mem_cell_area_width; +} + + + +double Mat::compute_bitline_delay(double inrisetime) +{ + double V_b_pre, v_th_mem_cell, V_wl; + double tstep; + double dynRdEnergy = 0.0, dynWriteEnergy = 0.0; + double R_cell_pull_down=0.0, R_cell_acc =0.0, r_dev=0.0; + int deg_senseamp_muxing = dp.Ndsam_lev_1 * dp.Ndsam_lev_2; + + double R_b_metal = camFlag? cam_cell.h:cell.h * g_tp.wire_local.R_per_um; + double R_bl = subarray.num_rows * R_b_metal; + double C_bl = subarray.C_bl; + + // TODO: no leakage for DRAMs? 
+ double leak_power_cc_inverters_sram_cell = 0; + double gate_leak_power_cc_inverters_sram_cell = 0; + double leak_power_acc_tr_RW_or_WR_port_sram_cell = 0; + double leak_power_RD_port_sram_cell = 0; + double gate_leak_power_RD_port_sram_cell = 0; + + if (is_dram == true) + { + V_b_pre = g_tp.dram.Vbitpre; + v_th_mem_cell = g_tp.dram_acc.Vth; + V_wl = g_tp.vpp; + //The access transistor is not folded. So we just need to specify a threshold value for the + //folding width that is equal to or greater than Wmemcella. + R_cell_acc = tr_R_on(g_tp.dram.cell_a_w, NCH, 1, true, true); + r_dev = g_tp.dram_cell_Vdd / g_tp.dram_cell_I_on + R_bl / 2; + } + else + { //SRAM + V_b_pre = g_tp.sram.Vbitpre; + v_th_mem_cell = g_tp.sram_cell.Vth; + V_wl = g_tp.sram_cell.Vdd; + R_cell_pull_down = tr_R_on(g_tp.sram.cell_nmos_w, NCH, 1, false, true); + R_cell_acc = tr_R_on(g_tp.sram.cell_a_w, NCH, 1, false, true); + + //Leakage current of an SRAM cell + double Iport = cmos_Isub_leakage(g_tp.sram.cell_a_w, 0, 1, nmos,false, true);//TODO: how much is the idle time? just by *2? + double Iport_erp = cmos_Isub_leakage(g_tp.sram.cell_a_w, 0, 2, nmos,false, true); + double Icell = cmos_Isub_leakage(g_tp.sram.cell_nmos_w, g_tp.sram.cell_pmos_w, 1, inv,false, true)*2;//two invs per cell + + leak_power_cc_inverters_sram_cell = Icell * g_tp.sram_cell.Vdd; + leak_power_acc_tr_RW_or_WR_port_sram_cell = Iport * g_tp.sram_cell.Vdd; + leak_power_RD_port_sram_cell = Iport_erp * g_tp.sram_cell.Vdd; + + + //in idle state, Ig_on only possibly exist in access transistors of read only ports + double Ig_port_erp = cmos_Ig_leakage(g_tp.sram.cell_a_w, 0, 1, nmos,false, true); + double Ig_cell = cmos_Ig_leakage(g_tp.sram.cell_nmos_w, g_tp.sram.cell_pmos_w, 1, inv,false, true); + + gate_leak_power_cc_inverters_sram_cell = Ig_cell*g_tp.sram_cell.Vdd; + gate_leak_power_RD_port_sram_cell = Ig_port_erp*g_tp.sram_cell.Vdd; + } + + + double C_drain_bit_mux = drain_C_(g_tp.w_nmos_b_mux, NCH, 1, 0, camFlag? 
cam_cell.w:cell.w / (2 *(RWP + ERP + SCHP)), is_dram); + double R_bit_mux = tr_R_on(g_tp.w_nmos_b_mux, NCH, 1, is_dram); + double C_drain_sense_amp_iso = drain_C_(g_tp.w_iso, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram); + double R_sense_amp_iso = tr_R_on(g_tp.w_iso, PCH, 1, is_dram); + double C_sense_amp_latch = gate_C(g_tp.w_sense_p + g_tp.w_sense_n, 0, is_dram) + + drain_C_(g_tp.w_sense_n, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) + + drain_C_(g_tp.w_sense_p, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram); + double C_drain_sense_amp_mux = drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram); + + if (is_dram) + { + double fraction = dp.V_b_sense / ((g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl)); + tstep = 2.3 * fraction * r_dev * + (g_tp.dram_cell_C * (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux)) / + (g_tp.dram_cell_C + (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux)); + delay_writeback = tstep; + dynRdEnergy += (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) * + (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd /* subarray.num_cols * num_subarrays_per_mat*/; + dynWriteEnergy += (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch) * + (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd /* subarray.num_cols * num_subarrays_per_mat*/ * num_act_mats_hor_dir*100; + per_bitline_read_energy = (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) * + (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd; + } + else + { + double tau; + + if (deg_bl_muxing > 1) + { + tau = (R_cell_pull_down + R_cell_acc) * + (C_bl + 2*C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) + + R_bl * (C_bl/2 + 2*C_drain_bit_mux + 2*C_drain_sense_amp_iso + 
C_sense_amp_latch + C_drain_sense_amp_mux) + + R_bit_mux * (C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) + + R_sense_amp_iso * (C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux); + dynRdEnergy += (C_bl + 2 * C_drain_bit_mux) * 2 * dp.V_b_sense * g_tp.sram_cell.Vdd /* + subarray.num_cols * num_subarrays_per_mat*/; + dynRdEnergy += (2 * C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) * + 2 * dp.V_b_sense * g_tp.sram_cell.Vdd * (1.0/*subarray.num_cols * num_subarrays_per_mat*/ / deg_bl_muxing); + dynWriteEnergy += ((1.0/*subarray.num_cols *num_subarrays_per_mat*/ / deg_bl_muxing) / deg_senseamp_muxing) * + num_act_mats_hor_dir * (C_bl + 2*C_drain_bit_mux) * g_tp.sram_cell.Vdd * g_tp.sram_cell.Vdd*2; + //Write Ops are differential for SRAM + } + else + { + tau = (R_cell_pull_down + R_cell_acc) * + (C_bl + C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) + R_bl * C_bl / 2 + + R_sense_amp_iso * (C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux); + dynRdEnergy += (C_bl + 2 * C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) * + 2 * dp.V_b_sense * g_tp.sram_cell.Vdd /* subarray.num_cols * num_subarrays_per_mat*/; + dynWriteEnergy += (((1.0/*subarray.num_cols * num_subarrays_per_mat*/ / deg_bl_muxing) / deg_senseamp_muxing) * + num_act_mats_hor_dir * C_bl) * g_tp.sram_cell.Vdd * g_tp.sram_cell.Vdd*2; + + } + tstep = tau * log(V_b_pre / (V_b_pre - dp.V_b_sense)); + power_bitline.readOp.leakage = + leak_power_cc_inverters_sram_cell + + leak_power_acc_tr_RW_or_WR_port_sram_cell + + leak_power_acc_tr_RW_or_WR_port_sram_cell * (RWP + EWP - 1) + + leak_power_RD_port_sram_cell * ERP; + power_bitline.readOp.gate_leakage = gate_leak_power_cc_inverters_sram_cell + + gate_leak_power_RD_port_sram_cell * ERP; + + } + +// cout<<"leak_power_cc_inverters_sram_cell"<<leak_power_cc_inverters_sram_cell<<endl; +// 
cout<<"leak_power_acc_tr_RW_or_WR_port_sram_cell"<<leak_power_acc_tr_RW_or_WR_port_sram_cell<<endl; +// cout<<"leak_power_acc_tr_RW_or_WR_port_sram_cell"<<leak_power_acc_tr_RW_or_WR_port_sram_cell<<endl; +// cout<<"leak_power_RD_port_sram_cell"<<leak_power_RD_port_sram_cell<<endl; + + + /* take input rise time into account */ + double m = V_wl / inrisetime; + if (tstep <= (0.5 * (V_wl - v_th_mem_cell) / m)) + { + delay_bitline = sqrt(2 * tstep * (V_wl - v_th_mem_cell)/ m); + } + else + { + delay_bitline = tstep + (V_wl - v_th_mem_cell) / (2 * m); + } + + bool is_fa = (dp.fully_assoc) ? true : false; + + if (dp.is_tag == false || is_fa == false) + { + power_bitline.readOp.dynamic = dynRdEnergy; + power_bitline.writeOp.dynamic = dynWriteEnergy; + } + + double outrisetime = 0; + return outrisetime; +} + + + +double Mat::compute_sa_delay(double inrisetime) +{ + //int num_sa_subarray = subarray.num_cols / deg_bl_muxing; //in a subarray + + //Bitline circuitry leakage. + double Iiso = simplified_pmos_leakage(g_tp.w_iso, is_dram); + double IsenseEn = simplified_nmos_leakage(g_tp.w_sense_en, is_dram); + double IsenseN = simplified_nmos_leakage(g_tp.w_sense_n, is_dram); + double IsenseP = simplified_pmos_leakage(g_tp.w_sense_p, is_dram); + + double lkgIdlePh = IsenseEn;//+ 2*IoBufP; + //double lkgWritePh = Iiso + IsenseEn;// + 2*IoBufP + 2*Ipch; + double lkgReadPh = Iiso + IsenseN + IsenseP;//+ IoBufN + IoBufP + 2*IsPch ; + //double lkgRead = lkgReadPh * num_sa_subarray * 4 * num_act_mats_hor_dir + + // lkgIdlePh * num_sa_subarray * 4 * (num_mats - num_act_mats_hor_dir); + double lkgIdle = lkgIdlePh /*num_sa_subarray * num_subarrays_per_mat*/; + leak_power_sense_amps_closed_page_state = lkgIdlePh * g_tp.peri_global.Vdd /* num_sa_subarray * num_subarrays_per_mat*/; + leak_power_sense_amps_open_page_state = lkgReadPh * g_tp.peri_global.Vdd /* num_sa_subarray * num_subarrays_per_mat*/; + + // sense amplifier has to drive logic in "data out driver" and sense precharge load. 
+ // load seen by sense amp. New delay model for sense amp that is sensitive to both the output time + //constant as well as the magnitude of input differential voltage. + double C_ld = gate_C(g_tp.w_sense_p + g_tp.w_sense_n, 0, is_dram) + + drain_C_(g_tp.w_sense_n, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) + + drain_C_(g_tp.w_sense_p, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) + + drain_C_(g_tp.w_iso,PCH,1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) + + drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram); + double tau = C_ld / g_tp.gm_sense_amp_latch; + delay_sa = tau * log(g_tp.peri_global.Vdd / dp.V_b_sense); + power_sa.readOp.dynamic = C_ld * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd /* num_sa_subarray + num_subarrays_per_mat * num_act_mats_hor_dir*/; + power_sa.readOp.leakage = lkgIdle * g_tp.peri_global.Vdd; + + double outrisetime = 0; + return outrisetime; +} + + + +double Mat::compute_subarray_out_drv(double inrisetime) +{ + double C_ld, rd, tf, this_delay; + double p_to_n_sz_r = pmos_to_nmos_sz_ratio(is_dram); + + // delay of signal through pass-transistor of first level of sense-amp mux to input of inverter-buffer. + rd = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram); + C_ld = dp.Ndsam_lev_1 * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? 
cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) + + gate_C(g_tp.min_w_nmos_ + p_to_n_sz_r * g_tp.min_w_nmos_, 0.0, is_dram); + tf = rd * C_ld; + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay_subarray_out_drv += this_delay; + inrisetime = this_delay/(1.0 - 0.5); + power_subarray_out_drv.readOp.dynamic += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd; + power_subarray_out_drv.readOp.leakage += 0; // for now, let leakage of the pass transistor be 0 + power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.w_nmos_sa_mux, 0, 1, nmos)* g_tp.peri_global.Vdd; + // delay of signal through inverter-buffer to second level of sense-amp mux. + // internal delay of buffer + rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1, is_dram); + C_ld = drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(p_to_n_sz_r * g_tp.min_w_nmos_, PCH, 1, 1, g_tp.cell_h_def, is_dram) + + gate_C(g_tp.min_w_nmos_ + p_to_n_sz_r * g_tp.min_w_nmos_, 0.0, is_dram); + tf = rd * C_ld; + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay_subarray_out_drv += this_delay; + inrisetime = this_delay/(1.0 - 0.5); + power_subarray_out_drv.readOp.dynamic += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd; + power_subarray_out_drv.readOp.leakage += cmos_Isub_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv, is_dram)* g_tp.peri_global.Vdd; + power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv)* g_tp.peri_global.Vdd; + + // inverter driving drain of pass transistor of second level of sense-amp mux. + rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1, is_dram); + C_ld = drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(p_to_n_sz_r * g_tp.min_w_nmos_, PCH, 1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? 
cam_cell.w:cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP + SCHP), is_dram); + tf = rd * C_ld; + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay_subarray_out_drv += this_delay; + inrisetime = this_delay/(1.0 - 0.5); + power_subarray_out_drv.readOp.dynamic += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd; + power_subarray_out_drv.readOp.leakage += cmos_Isub_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv)* g_tp.peri_global.Vdd; + power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.min_w_nmos_, p_to_n_sz_r * g_tp.min_w_nmos_, 1, inv)* g_tp.peri_global.Vdd; + + + // delay of signal through pass-transistor to input of subarray output driver. + rd = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram); + C_ld = dp.Ndsam_lev_2 * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP + SCHP), is_dram) + + //gate_C(subarray_out_wire->repeater_size * g_tp.min_w_nmos_ * (1 + p_to_n_sz_r), 0.0, is_dram); + gate_C(subarray_out_wire->repeater_size *(subarray_out_wire->wire_length/subarray_out_wire->repeater_spacing) * g_tp.min_w_nmos_ * (1 + p_to_n_sz_r), 0.0, is_dram); + tf = rd * C_ld; + this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE); + delay_subarray_out_drv += this_delay; + inrisetime = this_delay/(1.0 - 0.5); + power_subarray_out_drv.readOp.dynamic += C_ld * 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd; + power_subarray_out_drv.readOp.leakage += 0; // for now, let leakage of the pass transistor be 0 + power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.w_nmos_sa_mux, 0, 1, nmos)* g_tp.peri_global.Vdd; + + + return inrisetime; +} + + + +double Mat::compute_comparator_delay(double inrisetime) +{ + int A = g_ip->tag_assoc; + + int tagbits_ = dp.tagbits / 4; // Assuming there are 4 quarter comparators. input tagbits is already + // a multiple of 4. 
+ + /* First Inverter */ + double Ceq = gate_C(g_tp.w_comp_inv_n2+g_tp.w_comp_inv_p2, 0, is_dram) + + drain_C_(g_tp.w_comp_inv_p1, PCH, 1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_comp_inv_n1, NCH, 1, 1, g_tp.cell_h_def, is_dram); + double Req = tr_R_on(g_tp.w_comp_inv_p1, PCH, 1, is_dram); + double tf = Req*Ceq; + double st1del = horowitz(inrisetime,tf,VTHCOMPINV,VTHCOMPINV,FALL); + double nextinputtime = st1del/VTHCOMPINV; + power_comparator.readOp.dynamic += 0.5 * Ceq * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A; + + //For each degree of associativity + //there are 4 such quarter comparators + double lkgCurrent = cmos_Isub_leakage(g_tp.w_comp_inv_n1, g_tp.w_comp_inv_p1, 1, inv, is_dram)* 4 * A; + double gatelkgCurrent = cmos_Ig_leakage(g_tp.w_comp_inv_n1, g_tp.w_comp_inv_p1, 1, inv, is_dram)* 4 * A; + /* Second Inverter */ + Ceq = gate_C(g_tp.w_comp_inv_n3+g_tp.w_comp_inv_p3, 0, is_dram) + + drain_C_(g_tp.w_comp_inv_p2, PCH, 1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_comp_inv_n2, NCH, 1, 1, g_tp.cell_h_def, is_dram); + Req = tr_R_on(g_tp.w_comp_inv_n2, NCH, 1, is_dram); + tf = Req*Ceq; + double st2del = horowitz(nextinputtime,tf,VTHCOMPINV,VTHCOMPINV,RISE); + nextinputtime = st2del/(1.0-VTHCOMPINV); + power_comparator.readOp.dynamic += 0.5 * Ceq * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A; + lkgCurrent += cmos_Isub_leakage(g_tp.w_comp_inv_n2, g_tp.w_comp_inv_p2, 1, inv, is_dram)* 4 * A; + gatelkgCurrent += cmos_Ig_leakage(g_tp.w_comp_inv_n2, g_tp.w_comp_inv_p2, 1, inv, is_dram)* 4 * A; + + /* Third Inverter */ + Ceq = gate_C(g_tp.w_eval_inv_n+g_tp.w_eval_inv_p, 0, is_dram) + + drain_C_(g_tp.w_comp_inv_p3, PCH, 1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_comp_inv_n3, NCH, 1, 1, g_tp.cell_h_def, is_dram); + Req = tr_R_on(g_tp.w_comp_inv_p3, PCH, 1, is_dram); + tf = Req*Ceq; + double st3del = horowitz(nextinputtime,tf,VTHCOMPINV,VTHEVALINV,FALL); + nextinputtime = st3del/(VTHEVALINV); + power_comparator.readOp.dynamic += 
0.5 * Ceq * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A; + lkgCurrent += cmos_Isub_leakage(g_tp.w_comp_inv_n3, g_tp.w_comp_inv_p3, 1, inv, is_dram)* 4 * A; + gatelkgCurrent += cmos_Ig_leakage(g_tp.w_comp_inv_n3, g_tp.w_comp_inv_p3, 1, inv, is_dram)* 4 * A; + + /* Final Inverter (virtual ground driver) discharging compare part */ + double r1 = tr_R_on(g_tp.w_comp_n,NCH,2, is_dram); + double r2 = tr_R_on(g_tp.w_eval_inv_n,NCH,1, is_dram); /* was switch */ + double c2 = (tagbits_)*(drain_C_(g_tp.w_comp_n,NCH,1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_comp_n,NCH,2, 1, g_tp.cell_h_def, is_dram)) + + drain_C_(g_tp.w_eval_inv_p,PCH,1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_eval_inv_n,NCH,1, 1, g_tp.cell_h_def, is_dram); + double c1 = (tagbits_)*(drain_C_(g_tp.w_comp_n,NCH,1, 1, g_tp.cell_h_def, is_dram) + + drain_C_(g_tp.w_comp_n,NCH,2, 1, g_tp.cell_h_def, is_dram)) + + drain_C_(g_tp.w_comp_p,PCH,1, 1, g_tp.cell_h_def, is_dram) + + gate_C(WmuxdrvNANDn+WmuxdrvNANDp,0, is_dram); + power_comparator.readOp.dynamic += 0.5 * c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 4 * A; + power_comparator.readOp.dynamic += c1 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * (A - 1); + lkgCurrent += cmos_Isub_leakage(g_tp.w_eval_inv_n, g_tp.w_eval_inv_p, 1, inv, is_dram)* 4 * A; + lkgCurrent += cmos_Isub_leakage(g_tp.w_comp_n, g_tp.w_comp_n, 1, inv, is_dram)* 4 * A; // stack factor of 0.2 + + gatelkgCurrent += cmos_Ig_leakage(g_tp.w_eval_inv_n, g_tp.w_eval_inv_p, 1, inv, is_dram)* 4 * A; + gatelkgCurrent += cmos_Ig_leakage(g_tp.w_comp_n, g_tp.w_comp_n, 1, inv, is_dram)* 4 * A;//for gate leakage this equals to a inverter + + /* time to go to threshold of mux driver */ + double tstep = (r2*c2+(r1+r2)*c1)*log(1.0/VTHMUXNAND); + /* take into account non-zero input rise time */ + double m = g_tp.peri_global.Vdd/nextinputtime; + double Tcomparatorni; + + if((tstep) <= (0.5*(g_tp.peri_global.Vdd-g_tp.peri_global.Vth)/m)) + { + double a = m; + double b = 
2*((g_tp.peri_global.Vdd*VTHEVALINV)-g_tp.peri_global.Vth); + double c = -2*(tstep)*(g_tp.peri_global.Vdd-g_tp.peri_global.Vth)+1/m*((g_tp.peri_global.Vdd*VTHEVALINV)-g_tp.peri_global.Vth)*((g_tp.peri_global.Vdd*VTHEVALINV)-g_tp.peri_global.Vth); + Tcomparatorni = (-b+sqrt(b*b-4*a*c))/(2*a); + } + else + { + Tcomparatorni = (tstep) + (g_tp.peri_global.Vdd+g_tp.peri_global.Vth)/(2*m) - (g_tp.peri_global.Vdd*VTHEVALINV)/m; + } + delay_comparator = Tcomparatorni+st1del+st2del+st3del; + power_comparator.readOp.leakage = lkgCurrent * g_tp.peri_global.Vdd; + power_comparator.readOp.gate_leakage = gatelkgCurrent * g_tp.peri_global.Vdd; + + return Tcomparatorni / (1.0 - VTHMUXNAND);; +} + + + +void Mat::compute_power_energy() +{ + //for cam and FA, power.readOp is the plain read power, power.searchOp is the associative search related power + //when search all subarrays and all mats are fully active + //when plain read/write only one subarray in a single mat is active. + + // add energy consumed in predecoder drivers. This unit is shared by all subarrays in a mat. 
+ power.readOp.dynamic += r_predec->power.readOp.dynamic + + b_mux_predec->power.readOp.dynamic + + sa_mux_lev_1_predec->power.readOp.dynamic + + sa_mux_lev_2_predec->power.readOp.dynamic; + + // add energy consumed in decoders + power_row_decoders.readOp.dynamic = row_dec->power.readOp.dynamic; + if (!(is_fa||pure_cam)) + power_row_decoders.readOp.dynamic *= num_subarrays_per_mat; + + // add energy consumed in bitline prechagers, SAs, and bitlines + if (!(is_fa||pure_cam)) + { + // add energy consumed in bitline prechagers + power_bl_precharge_eq_drv.readOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic; + power_bl_precharge_eq_drv.readOp.dynamic *= num_subarrays_per_mat; + + //Add sense amps energy + num_sa_subarray = subarray.num_cols / deg_bl_muxing; + power_sa.readOp.dynamic *= num_sa_subarray*num_subarrays_per_mat ; + + // add energy consumed in bitlines + //cout<<"bitline power"<<power_bitline.readOp.dynamic<<endl; + power_bitline.readOp.dynamic *= num_subarrays_per_mat*subarray.num_cols; + power_bitline.writeOp.dynamic *= num_subarrays_per_mat*subarray.num_cols; + //cout<<"bitline power"<<power_bitline.readOp.dynamic<<"subarray"<<num_subarrays_per_mat<<"cols"<<subarray.num_cols<<endl; + //Add subarray output energy + power_subarray_out_drv.readOp.dynamic = + (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat; + + power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic + + power_sa.readOp.dynamic + + power_bitline.readOp.dynamic + + power_subarray_out_drv.readOp.dynamic; + + power.readOp.dynamic += power_row_decoders.readOp.dynamic + + bit_mux_dec->power.readOp.dynamic + + sa_mux_lev_1_dec->power.readOp.dynamic + + sa_mux_lev_2_dec->power.readOp.dynamic + + power_comparator.readOp.dynamic; + } + + else if (is_fa) + { + //for plain read/write only one subarray in a mat is active + // add energy consumed in bitline prechagers + power_bl_precharge_eq_drv.readOp.dynamic = 
bl_precharge_eq_drv->power.readOp.dynamic + + cam_bl_precharge_eq_drv->power.readOp.dynamic; + power_bl_precharge_eq_drv.searchOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic; + + //Add sense amps energy + num_sa_subarray = (subarray.num_cols_fa_cam + subarray.num_cols_fa_ram)/ deg_bl_muxing; + num_sa_subarray_search = subarray.num_cols_fa_ram/ deg_bl_muxing; + power_sa.searchOp.dynamic = power_sa.readOp.dynamic*num_sa_subarray_search; + power_sa.readOp.dynamic *= num_sa_subarray; + + + // add energy consumed in bitlines + power_bitline.searchOp.dynamic = power_bitline.readOp.dynamic; + power_bitline.readOp.dynamic *= (subarray.num_cols_fa_cam+subarray.num_cols_fa_ram); + power_bitline.writeOp.dynamic *= (subarray.num_cols_fa_cam+subarray.num_cols_fa_ram); + power_bitline.searchOp.dynamic *= subarray.num_cols_fa_ram; + + //Add subarray output energy + power_subarray_out_drv.searchOp.dynamic = + (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_so_b_mat; + power_subarray_out_drv.readOp.dynamic = + (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat; + + + power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic + + power_sa.readOp.dynamic + + power_bitline.readOp.dynamic + + power_subarray_out_drv.readOp.dynamic; + + power.readOp.dynamic += power_row_decoders.readOp.dynamic + + bit_mux_dec->power.readOp.dynamic + + sa_mux_lev_1_dec->power.readOp.dynamic + + sa_mux_lev_2_dec->power.readOp.dynamic + + power_comparator.readOp.dynamic; + + //add energy consumed inside cam + power_matchline.searchOp.dynamic *= num_subarrays_per_mat; + power_searchline_precharge = sl_precharge_eq_drv->power; + power_searchline_precharge.searchOp.dynamic = power_searchline_precharge.readOp.dynamic * num_subarrays_per_mat; + power_searchline = sl_data_drv->power; + power_searchline.searchOp.dynamic = power_searchline.readOp.dynamic*subarray.num_cols_fa_cam* num_subarrays_per_mat;; + 
power_matchline_precharge = ml_precharge_drv->power; + power_matchline_precharge.searchOp.dynamic = power_matchline_precharge.readOp.dynamic* num_subarrays_per_mat; + power_ml_to_ram_wl_drv= ml_to_ram_wl_drv->power; + power_ml_to_ram_wl_drv.searchOp.dynamic= ml_to_ram_wl_drv->power.readOp.dynamic; + + power_cam_all_active.searchOp.dynamic = power_matchline.searchOp.dynamic; + power_cam_all_active.searchOp.dynamic +=power_searchline_precharge.searchOp.dynamic; + power_cam_all_active.searchOp.dynamic +=power_searchline.searchOp.dynamic; + power_cam_all_active.searchOp.dynamic +=power_matchline_precharge.searchOp.dynamic; + + power.searchOp.dynamic += power_cam_all_active.searchOp.dynamic; + //power.searchOp.dynamic += ml_to_ram_wl_drv->power.readOp.dynamic; + + } + else + { + // add energy consumed in bitline prechagers + power_bl_precharge_eq_drv.readOp.dynamic = cam_bl_precharge_eq_drv->power.readOp.dynamic; + //power_bl_precharge_eq_drv.readOp.dynamic *= num_subarrays_per_mat; + //power_bl_precharge_eq_drv.searchOp.dynamic = cam_bl_precharge_eq_drv->power.readOp.dynamic; + //power_bl_precharge_eq_drv.searchOp.dynamic *= num_subarrays_per_mat; + + //Add sense amps energy + num_sa_subarray = subarray.num_cols_fa_cam/ deg_bl_muxing; + power_sa.readOp.dynamic *= num_sa_subarray;//*num_subarrays_per_mat; + power_sa.searchOp.dynamic = 0; + + power_bitline.readOp.dynamic *= subarray.num_cols_fa_cam; + power_bitline.searchOp.dynamic = 0; + power_bitline.writeOp.dynamic *= subarray.num_cols_fa_cam; + + power_subarray_out_drv.searchOp.dynamic = + (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_so_b_mat; + power_subarray_out_drv.readOp.dynamic = + (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat; + + power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic + + power_sa.readOp.dynamic + + power_bitline.readOp.dynamic + + power_subarray_out_drv.readOp.dynamic; + + 
power.readOp.dynamic += power_row_decoders.readOp.dynamic + + bit_mux_dec->power.readOp.dynamic + + sa_mux_lev_1_dec->power.readOp.dynamic + + sa_mux_lev_2_dec->power.readOp.dynamic + + power_comparator.readOp.dynamic; + + + ////add energy consumed inside cam + power_matchline.searchOp.dynamic *= num_subarrays_per_mat; + power_searchline_precharge = sl_precharge_eq_drv->power; + power_searchline_precharge.searchOp.dynamic = power_searchline_precharge.readOp.dynamic * num_subarrays_per_mat; + power_searchline = sl_data_drv->power; + power_searchline.searchOp.dynamic = power_searchline.readOp.dynamic*subarray.num_cols_fa_cam* num_subarrays_per_mat;; + power_matchline_precharge = ml_precharge_drv->power; + power_matchline_precharge.searchOp.dynamic = power_matchline_precharge.readOp.dynamic* num_subarrays_per_mat; + power_ml_to_ram_wl_drv= ml_to_ram_wl_drv->power; + power_ml_to_ram_wl_drv.searchOp.dynamic= ml_to_ram_wl_drv->power.readOp.dynamic; + + power_cam_all_active.searchOp.dynamic = power_matchline.searchOp.dynamic; + power_cam_all_active.searchOp.dynamic +=power_searchline_precharge.searchOp.dynamic; + power_cam_all_active.searchOp.dynamic +=power_searchline.searchOp.dynamic; + power_cam_all_active.searchOp.dynamic +=power_matchline_precharge.searchOp.dynamic; + + power.searchOp.dynamic += power_cam_all_active.searchOp.dynamic; + //power.searchOp.dynamic += ml_to_ram_wl_drv->power.readOp.dynamic; + + } + + + + // calculate leakage power + if (!(is_fa || pure_cam)) + { + int number_output_drivers_subarray = num_sa_subarray / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2); + + power_bitline.readOp.leakage *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat; + power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat; + power_sa.readOp.leakage *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP); + + //num_sa_subarray = subarray.num_cols / deg_bl_muxing; + power_subarray_out_drv.readOp.leakage = + 
(power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) * + number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP); + + power.readOp.leakage += power_bitline.readOp.leakage + + power_bl_precharge_eq_drv.readOp.leakage + + power_sa.readOp.leakage + + power_subarray_out_drv.readOp.leakage; + //cout<<"leakage"<<power.readOp.leakage<<endl; + + power_comparator.readOp.leakage *= num_do_b_mat * (RWP + ERP); + power.readOp.leakage += power_comparator.readOp.leakage; + + //cout<<"leakage1"<<power.readOp.leakage<<endl; + + // leakage power + power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat; + power_bit_mux_decoders.readOp.leakage = bit_mux_dec->power.readOp.leakage * deg_bl_muxing; + power_sa_mux_lev_1_decoders.readOp.leakage = sa_mux_lev_1_dec->power.readOp.leakage * dp.Ndsam_lev_1; + power_sa_mux_lev_2_decoders.readOp.leakage = sa_mux_lev_2_dec->power.readOp.leakage * dp.Ndsam_lev_2; + + power.readOp.leakage += r_predec->power.readOp.leakage + + b_mux_predec->power.readOp.leakage + + sa_mux_lev_1_predec->power.readOp.leakage + + sa_mux_lev_2_predec->power.readOp.leakage + + power_row_decoders.readOp.leakage + + power_bit_mux_decoders.readOp.leakage + + power_sa_mux_lev_1_decoders.readOp.leakage + + power_sa_mux_lev_2_decoders.readOp.leakage; + //cout<<"leakage2"<<power.readOp.leakage<<endl; + + //++++Below is gate leakage + power_bitline.readOp.gate_leakage *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat; + power_bl_precharge_eq_drv.readOp.gate_leakage = bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat; + power_sa.readOp.gate_leakage *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP); + + //num_sa_subarray = subarray.num_cols / deg_bl_muxing; + power_subarray_out_drv.readOp.gate_leakage = + (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) * + number_output_drivers_subarray * 
num_subarrays_per_mat * (RWP + ERP); + + power.readOp.gate_leakage += power_bitline.readOp.gate_leakage + + power_bl_precharge_eq_drv.readOp.gate_leakage + + power_sa.readOp.gate_leakage + + power_subarray_out_drv.readOp.gate_leakage; + //cout<<"leakage"<<power.readOp.leakage<<endl; + + power_comparator.readOp.gate_leakage *= num_do_b_mat * (RWP + ERP); + power.readOp.gate_leakage += power_comparator.readOp.gate_leakage; + + //cout<<"leakage1"<<power.readOp.gate_leakage<<endl; + + // gate_leakage power + power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat; + power_bit_mux_decoders.readOp.gate_leakage = bit_mux_dec->power.readOp.gate_leakage * deg_bl_muxing; + power_sa_mux_lev_1_decoders.readOp.gate_leakage = sa_mux_lev_1_dec->power.readOp.gate_leakage * dp.Ndsam_lev_1; + power_sa_mux_lev_2_decoders.readOp.gate_leakage = sa_mux_lev_2_dec->power.readOp.gate_leakage * dp.Ndsam_lev_2; + + power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage + + b_mux_predec->power.readOp.gate_leakage + + sa_mux_lev_1_predec->power.readOp.gate_leakage + + sa_mux_lev_2_predec->power.readOp.gate_leakage + + power_row_decoders.readOp.gate_leakage + + power_bit_mux_decoders.readOp.gate_leakage + + power_sa_mux_lev_1_decoders.readOp.gate_leakage + + power_sa_mux_lev_2_decoders.readOp.gate_leakage; + } + else if (is_fa) + { + int number_output_drivers_subarray = num_sa_subarray;// / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2); + + power_bitline.readOp.leakage *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat; + power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat; + power_bl_precharge_eq_drv.searchOp.leakage = cam_bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat; + power_sa.readOp.leakage *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP); + + //cout<<"leakage3"<<power.readOp.leakage<<endl; + + + 
power_subarray_out_drv.readOp.leakage = + (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) * + number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP); + + power.readOp.leakage += power_bitline.readOp.leakage + + power_bl_precharge_eq_drv.readOp.leakage + + power_bl_precharge_eq_drv.searchOp.leakage + + power_sa.readOp.leakage + + power_subarray_out_drv.readOp.leakage; + + //cout<<"leakage4"<<power.readOp.leakage<<endl; + + // leakage power + power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat; + power.readOp.leakage += r_predec->power.readOp.leakage + + power_row_decoders.readOp.leakage; + + //cout<<"leakage5"<<power.readOp.leakage<<endl; + + //inside cam + power_cam_all_active.searchOp.leakage = power_matchline.searchOp.leakage; + power_cam_all_active.searchOp.leakage +=sl_precharge_eq_drv->power.readOp.leakage; + power_cam_all_active.searchOp.leakage +=sl_data_drv->power.readOp.leakage*subarray.num_cols_fa_cam; + power_cam_all_active.searchOp.leakage +=ml_precharge_drv->power.readOp.dynamic; + power_cam_all_active.searchOp.leakage *= num_subarrays_per_mat; + + power.readOp.leakage += power_cam_all_active.searchOp.leakage; + +// cout<<"leakage6"<<power.readOp.leakage<<endl; + + //+++Below is gate leakage + power_bitline.readOp.gate_leakage *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat; + power_bl_precharge_eq_drv.readOp.gate_leakage = bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat; + power_bl_precharge_eq_drv.searchOp.gate_leakage = cam_bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat; + power_sa.readOp.gate_leakage *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP); + + //cout<<"leakage3"<<power.readOp.gate_leakage<<endl; + + + power_subarray_out_drv.readOp.gate_leakage = + (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) * + 
number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP); + + power.readOp.gate_leakage += power_bitline.readOp.gate_leakage + + power_bl_precharge_eq_drv.readOp.gate_leakage + + power_bl_precharge_eq_drv.searchOp.gate_leakage + + power_sa.readOp.gate_leakage + + power_subarray_out_drv.readOp.gate_leakage; + + //cout<<"leakage4"<<power.readOp.gate_leakage<<endl; + + // gate_leakage power + power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat; + power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage + + power_row_decoders.readOp.gate_leakage; + + //cout<<"leakage5"<<power.readOp.gate_leakage<<endl; + + //inside cam + power_cam_all_active.searchOp.gate_leakage = power_matchline.searchOp.gate_leakage; + power_cam_all_active.searchOp.gate_leakage +=sl_precharge_eq_drv->power.readOp.gate_leakage; + power_cam_all_active.searchOp.gate_leakage +=sl_data_drv->power.readOp.gate_leakage*subarray.num_cols_fa_cam; + power_cam_all_active.searchOp.gate_leakage +=ml_precharge_drv->power.readOp.dynamic; + power_cam_all_active.searchOp.gate_leakage *= num_subarrays_per_mat; + + power.readOp.gate_leakage += power_cam_all_active.searchOp.gate_leakage; + + } + else + { + int number_output_drivers_subarray = num_sa_subarray;// / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2); + + //power_bitline.readOp.leakage *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat; + //power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat; + power_bl_precharge_eq_drv.searchOp.leakage = cam_bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat; + power_sa.readOp.leakage *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP); + + + power_subarray_out_drv.readOp.leakage = + (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) * + number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP); + + 
power.readOp.leakage += //power_bitline.readOp.leakage + + //power_bl_precharge_eq_drv.readOp.leakage + + power_bl_precharge_eq_drv.searchOp.leakage + + power_sa.readOp.leakage + + power_subarray_out_drv.readOp.leakage; + + // leakage power + power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat*(RWP + ERP + EWP); + power.readOp.leakage += r_predec->power.readOp.leakage + + power_row_decoders.readOp.leakage; + + //inside cam + power_cam_all_active.searchOp.leakage = power_matchline.searchOp.leakage; + power_cam_all_active.searchOp.leakage +=sl_precharge_eq_drv->power.readOp.leakage; + power_cam_all_active.searchOp.leakage +=sl_data_drv->power.readOp.leakage*subarray.num_cols_fa_cam; + power_cam_all_active.searchOp.leakage +=ml_precharge_drv->power.readOp.dynamic; + power_cam_all_active.searchOp.leakage *= num_subarrays_per_mat; + + power.readOp.leakage += power_cam_all_active.searchOp.leakage; + + //+++Below is gate leakage + power_bl_precharge_eq_drv.searchOp.gate_leakage = cam_bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat; + power_sa.readOp.gate_leakage *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP); + + + power_subarray_out_drv.readOp.gate_leakage = + (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) * + number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP); + + power.readOp.gate_leakage += //power_bitline.readOp.gate_leakage + + //power_bl_precharge_eq_drv.readOp.gate_leakage + + power_bl_precharge_eq_drv.searchOp.gate_leakage + + power_sa.readOp.gate_leakage + + power_subarray_out_drv.readOp.gate_leakage; + + // gate_leakage power + power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat*(RWP + ERP + EWP); + power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage + + power_row_decoders.readOp.gate_leakage; + + //inside cam + 
power_cam_all_active.searchOp.gate_leakage = power_matchline.searchOp.gate_leakage; + power_cam_all_active.searchOp.gate_leakage +=sl_precharge_eq_drv->power.readOp.gate_leakage; + power_cam_all_active.searchOp.gate_leakage +=sl_data_drv->power.readOp.gate_leakage*subarray.num_cols_fa_cam; + power_cam_all_active.searchOp.gate_leakage +=ml_precharge_drv->power.readOp.dynamic; + power_cam_all_active.searchOp.gate_leakage *= num_subarrays_per_mat; + + power.readOp.gate_leakage += power_cam_all_active.searchOp.gate_leakage; + } +} + diff --git a/ext/mcpat/cacti/mat.h b/ext/mcpat/cacti/mat.h new file mode 100755 index 000000000..8d038be8b --- /dev/null +++ b/ext/mcpat/cacti/mat.h @@ -0,0 +1,148 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#ifndef __MAT_H__ +#define __MAT_H__ + +#include "component.h" +#include "decoder.h" +#include "subarray.h" +#include "wire.h" + +/* + * Mat: a Component that groups the decoders, predecoders, drivers, output + * wire and Subarray declared below, together with one powerDef / delay + * member per part. compute_delays() takes an input rise time and returns the + * output rise time; compute_power_energy() fills in the powerDef members. + * All data members except the four port counts are public. + */ +class Mat : public Component +{ + public: + Mat(const DynamicParameter & dyn_p); + ~Mat(); + double compute_delays(double inrisetime); // return outrisetime + void compute_power_energy(); + + const DynamicParameter & dp; + + // TODO: clean up pointers and powerDefs below + Decoder * row_dec; + Decoder * bit_mux_dec; + Decoder * sa_mux_lev_1_dec; + Decoder * sa_mux_lev_2_dec; + PredecBlk * dummy_way_sel_predec_blk1; + PredecBlk * dummy_way_sel_predec_blk2; + PredecBlkDrv * way_sel_drv1; + PredecBlkDrv * dummy_way_sel_predec_blk_drv2; + + Predec * r_predec; + Predec * b_mux_predec; + Predec * sa_mux_lev_1_predec; + Predec * sa_mux_lev_2_predec; + + Wire * subarray_out_wire; + Driver * bl_precharge_eq_drv; + Driver * cam_bl_precharge_eq_drv;//bitline pre-charge circuit is separated for CAM and RAM arrays.
+ Driver * ml_precharge_drv;//matchline precharge driver + Driver * sl_precharge_eq_drv;//searchline precharge driver + Driver * sl_data_drv;//search line data driver + Driver * ml_to_ram_wl_drv;//NOTE(review): name suggests the matchline-to-RAM-wordline driver; the previous comment duplicated the line above - confirm + + + powerDef power_row_decoders; + powerDef power_bit_mux_decoders; + powerDef power_sa_mux_lev_1_decoders; + powerDef power_sa_mux_lev_2_decoders; + powerDef power_fa_cam; // TODO: leakage power is not computed yet + powerDef power_bl_precharge_eq_drv; + powerDef power_subarray_out_drv; + powerDef power_cam_all_active; + powerDef power_searchline_precharge; + powerDef power_matchline_precharge; + powerDef power_ml_to_ram_wl_drv; + + double delay_fa_tag, delay_cam; + double delay_before_decoder; + double delay_bitline; + double delay_wl_reset; + double delay_bl_restore; + + double delay_searchline; + double delay_matchchline; + double delay_cam_sl_restore; + double delay_cam_ml_reset; + double delay_fa_ram_wl; + + double delay_hit_miss_reset; + double delay_hit_miss; + + Subarray subarray; + powerDef power_bitline, power_searchline, power_matchline; + double per_bitline_read_energy; + int deg_bl_muxing; + int num_act_mats_hor_dir; + double delay_writeback; + Area cell,cam_cell; + bool is_dram,is_fa, pure_cam, camFlag; + int num_mats; + powerDef power_sa; + double delay_sa; + double leak_power_sense_amps_closed_page_state; + double leak_power_sense_amps_open_page_state; + double delay_subarray_out_drv; + double delay_subarray_out_drv_htree; + double delay_comparator; + powerDef power_comparator; + int num_do_b_mat; + int num_so_b_mat; + int num_sa_subarray; + int num_sa_subarray_search; + double C_bl; + + uint32_t num_subarrays_per_mat; // the number of subarrays in a mat + uint32_t num_subarrays_per_row; // the number of subarrays in a row of a mat + + + private: + double compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h(); + double width_write_driver_or_write_mux(); + double compute_comparators_height(int tagbits, int number_ways_in_mat, double
subarray_mem_cell_area_w); + double compute_cam_delay(double inrisetime); + double compute_bitline_delay(double inrisetime); + double compute_sa_delay(double inrisetime); + double compute_subarray_out_drv(double inrisetime); + double compute_comparator_delay(double inrisetime); + + /* Port counts: compute_power_energy() scales leakage terms by sums such as (RWP + ERP), (RWP + ERP + SCHP) and (RWP + ERP + EWP). Presumably read/write, exclusive-read, exclusive-write and search ports - TODO confirm. */ + int RWP; + int ERP; + int EWP; + int SCHP; +}; + + + +#endif diff --git a/ext/mcpat/cacti/nuca.cc b/ext/mcpat/cacti/nuca.cc new file mode 100644 index 000000000..2aabe843f --- /dev/null +++ b/ext/mcpat/cacti/nuca.cc @@ -0,0 +1,612 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <cassert> + +#include "Ucache.h" +#include "nuca.h" + +unsigned int MIN_BANKSIZE=65536; +#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */ +#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */ +#define CONTR_2_BANK_LAT 0 + +int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */]; + + Nuca::Nuca( + TechnologyParameter::DeviceType *dt = &(g_tp.peri_global) + ):deviceType(dt) +{ + init_cont(); +} + +void +Nuca::init_cont() +{ + FILE *cont; + char line[5000]; + char jk[5000]; + cont = fopen("contention.dat", "r"); + if (!cont) { + cout << "contention.dat file is missing!\n"; + exit(0); + } + + for(int i=0; i<2; i++) { + for(int j=2; j<5; j++) { + for(int k=0; k<ROUTER_TYPES; k++) { + for(int l=0;l<7; l++) { + int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/]; + assert(fscanf(cont, "%[^\n]\n", line) != EOF); + sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3], + &temp[4], &temp[5], &temp[6], &temp[7]); + } + } + } + } + fclose(cont); +} + + void +Nuca::print_cont_stats() +{ + for(int i=0; i<2; i++) { + for(int j=2; j<5; j++) { + for(int k=0; k<ROUTER_TYPES; k++) { + for(int l=0;l<7; l++) { + for(int m=0;l<7; l++) { + cout << cont_stats[i][j][k][l][m] << 
" "; + } + cout << endl; + } + } + } + } + cout << endl; +} + +Nuca::~Nuca(){ + for (int i = wt_min; i <= wt_max; i++) { + delete wire_vertical[i]; + delete wire_horizontal[i]; + } +} + +/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */ + int +Nuca::calc_cycles(double lat, double oper_freq) +{ + //TODO: convert latch delay to FO4 */ + double cycle_time = (1.0/(oper_freq*1e9)); /*s*/ + cycle_time -= LATCH_DELAY; + cycle_time -= FIXED_OVERHEAD; + + return (int)ceil(lat/cycle_time); +} + + +nuca_org_t::~nuca_org_t() { + // if(h_wire) delete h_wire; + // if(v_wire) delete v_wire; + // if(router) delete router; +} + +/* + * Version - 6.0 + * + * Perform exhaustive search across different bank organizatons, + * router configurations, grid organizations, and wire models and + * find an optimal NUCA organization + * For different bank count values + * 1. Optimal bank organization is calculated + * 2. For each bank organization, find different NUCA organizations + * using various router configurations, grid organizations, + * and wire models. + * 3. NUCA model with the least cost is picked for + * this particular bank count + * Finally include contention statistics and find the optimal + * NUCA configuration + */ + void +Nuca::sim_nuca() +{ + /* temp variables */ + int it, ro, wr; + int num_cyc; + unsigned int i, j, k; + unsigned int r, c; + int l2_c; + int bank_count = 0; + uca_org_t ures; + nuca_org_t *opt_n; + mem_array tag, data; + list<nuca_org_t *> nuca_list; + Router *router_s[ROUTER_TYPES]; + router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global)); + router_s[0]->print_router(); + router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global)); + router_s[1]->print_router(); + router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global)); + router_s[2]->print_router(); + + int core_in; // to store no. 
of cores + + /* to search diff grid organizations */ + double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat, + curr_acclat; + double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power, + avg_leakage_power; + + double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF; + int opt_rows = 0; + int opt_columns = 0; + double opt_totno_hops = 0; + double opt_avg_hop = 0; + double opt_dyn_power = 0, opt_leakage_power = 0; + min_values_t minval; + + int bank_start = 0; + + int flit_width = 0; + + /* vertical and horizontal hop latency values */ + int ver_hop_lat, hor_hop_lat; /* in cycles */ + + + /* no. of different bank sizes to consider */ + int iterations; + + + g_ip->nuca_cache_sz = g_ip->cache_sz; + nuca_list.push_back(new nuca_org_t()); + + if (g_ip->cache_level == 0) l2_c = 1; + else l2_c = 0; + + if (g_ip->cores <= 4) core_in = 2; + else if (g_ip->cores <= 8) core_in = 3; + else if (g_ip->cores <= 16) core_in = 4; + else {cout << "Number of cores should be <= 16!\n"; exit(0);} + + + // set the lower bound to an appropriate value. this depends on cache associativity + if (g_ip->assoc > 2) { + i = 2; + while (i != g_ip->assoc) { + MIN_BANKSIZE *= 2; + i *= 2; + } + } + + iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE); + + if (g_ip->force_wiretype) + { + if (g_ip->wt == Low_swing) { + wt_min = Low_swing; + wt_max = Low_swing; + } + else { + wt_min = Global; + wt_max = Low_swing-1; + } + } + else { + wt_min = Global; + wt_max = Low_swing; + } + if (g_ip->nuca_bank_count != 0) { // simulate just one bank + if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 && + g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 && + g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) { + fprintf(stderr,"Incorrect bank count value! 
Please fix the value in cache.cfg\n"); + } + bank_start = (int)logtwo((double)g_ip->nuca_bank_count); + iterations = bank_start+1; + g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count; + } + cout << "Simulating various NUCA configurations\n"; + for (it=bank_start; it<iterations; it++) { /* different bank count values */ + ures.tag_array2 = &tag; + ures.data_array2 = &data; + /* + * find the optimal bank organization + */ + solve(&ures); +// output_UCA(&ures); + bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz; + cout << "====" << g_ip->cache_sz << "\n"; + + for (wr=wt_min; wr<=wt_max; wr++) { + + for (ro=0; ro<ROUTER_TYPES; ro++) + { + flit_width = (int) router_s[ro]->flit_size; //initialize router + nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time; + + /* calculate router and wire parameters */ + + double vlength = ures.cache_ht; /* length of the wire (u)*/ + double hlength = ures.cache_len; // u + + /* find delay, area, and power for wires */ + wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength); + wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength); + + + hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); + ver_hop_lat = calc_cycles(wire_vertical[wr]->delay, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); + + /* + * assume a grid like topology and explore for optimal network + * configuration using different row and column count values. + */ + for (c=1; c<=(unsigned int)bank_count; c++) { + while (bank_count%c != 0) c++; + r = bank_count/c; + + /* + * to find the avg access latency of a NUCA cache, uncontended + * access time to each bank from the + * cache controller is calculated. + * avg latency = + * sum of the access latencies to individual banks)/bank + * count value. 
+ */ + totno_hops = totno_hhops = totno_vhops = tot_lat = 0; + k = 1; + for (i=0; i<r; i++) { + for (j=0; j<c; j++) { + /* + * vertical hops including the + * first hop from the cache controller + */ + curr_hop = i + 1; + curr_hop += j; /* horizontal hops */ + totno_hhops += j; + totno_vhops += (i+1); + curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT + + j * hor_hop_lat); + + tot_lat += curr_acclat; + totno_hops += curr_hop; + } + } + avg_lat = tot_lat/bank_count; + avg_hop = totno_hops/bank_count; + avg_hhop = totno_hhops/bank_count; + avg_vhop = totno_vhops/bank_count; + + /* net access latency */ + curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) + + calc_cycles(ures.access_time, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); + + /* avg access lat of nuca */ + avg_dyn_power = + avg_hop * + (router_s[ro]->power.readOp.dynamic) + avg_hhop * + (wire_horizontal[wr]->power.readOp.dynamic) * + (g_ip->block_sz*8 + 64) + avg_vhop * + (wire_vertical[wr]->power.readOp.dynamic) * + (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic; + + avg_leakage_power = + bank_count * router_s[ro]->power.readOp.leakage + + avg_hhop * (wire_horizontal[wr]->power.readOp.leakage* + wire_horizontal[wr]->delay) * flit_width + + avg_vhop * (wire_vertical[wr]->power.readOp.leakage * + wire_horizontal[wr]->delay); + + if (curr_acclat < opt_acclat) { + opt_acclat = curr_acclat; + opt_tot_lat = tot_lat; + opt_avg_lat = avg_lat; + opt_totno_hops = totno_hops; + opt_avg_hop = avg_hop; + opt_rows = r; + opt_columns = c; + opt_dyn_power = avg_dyn_power; + opt_leakage_power = avg_leakage_power; + } + totno_hops = 0; + tot_lat = 0; + totno_hhops = 0; + totno_vhops = 0; + } + nuca_list.back()->wire_pda.power.readOp.dynamic = + opt_avg_hop * flit_width * + (wire_horizontal[wr]->power.readOp.dynamic + + wire_vertical[wr]->power.readOp.dynamic); + nuca_list.back()->avg_hops = opt_avg_hop; + /* network delay/power */ + nuca_list.back()->h_wire = wire_horizontal[wr]; + 
nuca_list.back()->v_wire = wire_vertical[wr]; + nuca_list.back()->router = router_s[ro]; + /* bank delay/power */ + + nuca_list.back()->bank_pda.delay = ures.access_time; + nuca_list.back()->bank_pda.power = ures.power; + nuca_list.back()->bank_pda.area.h = ures.cache_ht; + nuca_list.back()->bank_pda.area.w = ures.cache_len; + nuca_list.back()->bank_pda.cycle_time = ures.cycle_time; + + num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/)); + if(num_cyc%2 != 0) num_cyc++; + if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles + + if (it < 7) { + nuca_list.back()->nuca_pda.delay = opt_acclat + + cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; + nuca_list.back()->contention = + cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; + } + else { + nuca_list.back()->nuca_pda.delay = opt_acclat + + cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; + nuca_list.back()->contention = + cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; + } + nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power; + nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power; + + /* array organization */ + nuca_list.back()->bank_count = bank_count; + nuca_list.back()->rows = opt_rows; + nuca_list.back()->columns = opt_columns; + calculate_nuca_area (nuca_list.back()); + + minval.update_min_values(nuca_list.back()); + nuca_list.push_back(new nuca_org_t()); + opt_acclat = BIGNUM; + + } + } + g_ip->cache_sz /= 2; + } + + delete(nuca_list.back()); + nuca_list.pop_back(); + opt_n = find_optimal_nuca(&nuca_list, &minval); + print_nuca(opt_n); + g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count; + + list<nuca_org_t *>::iterator niter; + for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter) + { + delete *niter; + } + nuca_list.clear(); + + for(int i=0; i < ROUTER_TYPES; i++) + { + delete router_s[i]; + } + g_ip->display_ip(); + // g_ip->force_cache_config = true; + // g_ip->ndwl = 8; + // g_ip->ndbl 
= 16; + // g_ip->nspd = 4; + // g_ip->ndcm = 1; + // g_ip->ndsam1 = 8; + // g_ip->ndsam2 = 32; + +} + + + void +Nuca::print_nuca (nuca_org_t *fr) +{ + printf("\n---------- CACTI version 6.5, Non-uniform Cache Access " + "----------\n\n"); + printf("Optimal number of banks - %d\n", fr->bank_count); + printf("Grid organization rows x columns - %d x %d\n", + fr->rows, fr->columns); + printf("Network frequency - %g GHz\n", + (1/fr->nuca_pda.cycle_time)*1e3); + printf("Cache dimension (mm x mm) - %g x %g\n", + fr->nuca_pda.area.h, + fr->nuca_pda.area.w); + + fr->router->print_router(); + + printf("\n\nWire stats:\n"); + if (fr->h_wire->wt == Global) { + printf("\tWire type - Full swing global wires with least " + "possible delay\n"); + } + else if (fr->h_wire->wt == Global_5) { + printf("\tWire type - Full swing global wires with " + "5%% delay penalty\n"); + } + else if (fr->h_wire->wt == Global_10) { + printf("\tWire type - Full swing global wires with " + "10%% delay penalty\n"); + } + else if (fr->h_wire->wt == Global_20) { + printf("\tWire type - Full swing global wires with " + "20%% delay penalty\n"); + } + else if (fr->h_wire->wt == Global_30) { + printf("\tWire type - Full swing global wires with " + "30%% delay penalty\n"); + } + else if(fr->h_wire->wt == Low_swing) { + printf("\tWire type - Low swing wires\n"); + } + + printf("\tHorizontal link delay - %g (ns)\n", + fr->h_wire->delay*1e9); + printf("\tVertical link delay - %g (ns)\n", + fr->v_wire->delay*1e9); + printf("\tDelay/length - %g (ns/mm)\n", + fr->h_wire->delay*1e9/fr->bank_pda.area.w); + printf("\tHorizontal link energy -dynamic/access %g (nJ)\n" + "\t -leakage %g (nW)\n\n", + fr->h_wire->power.readOp.dynamic*1e9, + fr->h_wire->power.readOp.leakage*1e9); + printf("\tVertical link energy -dynamic/access %g (nJ)\n" + "\t -leakage %g (nW)\n\n", + fr->v_wire->power.readOp.dynamic*1e9, + fr->v_wire->power.readOp.leakage*1e9); + printf("\n\n"); + fr->v_wire->print_wire(); + printf("\n\nBank stats:\n"); 
+} + + + nuca_org_t * +Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval) +{ + double cost = 0; + double min_cost = BIGNUM; + nuca_org_t *res = NULL; + float d, a, dp, lp, c; + int v; + dp = g_ip->dynamic_power_wt_nuca; + lp = g_ip->leakage_power_wt_nuca; + a = g_ip->area_wt_nuca; + d = g_ip->delay_wt_nuca; + c = g_ip->cycle_time_wt_nuca; + + list<nuca_org_t *>::iterator niter; + + + for (niter = n->begin(); niter != n->end(); niter++) { + fprintf(stderr, "\n-----------------------------" + "---------------\n"); + + + printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t " + "bank_dpower = %g \tleak = %g \tcycle = %g\n", + (*niter)->bank_count, + (*niter)->nuca_pda.delay, + (*niter)->nuca_pda.power.readOp.dynamic, + (*niter)->h_wire->wt, + (*niter)->bank_pda.power.readOp.dynamic, + (*niter)->nuca_pda.power.readOp.leakage, + (*niter)->nuca_pda.cycle_time); + + + if (g_ip->ed == 1) { + cost = ((*niter)->nuca_pda.delay/minval->min_delay)* + ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn); + if (min_cost > cost) { + min_cost = cost; + res = ((*niter)); + } + } + else if (g_ip->ed == 2) { + cost = ((*niter)->nuca_pda.delay/minval->min_delay)* + ((*niter)->nuca_pda.delay/minval->min_delay)* + ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn); + if (min_cost > cost) { + min_cost = cost; + res = ((*niter)); + } + } + else { + /* + * check whether the current organization + * meets the input deviation constraints + */ + v = check_nuca_org((*niter), minval); + if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling + + if (v) { + cost = (d * ((*niter)->nuca_pda.delay/minval->min_delay) + + c * ((*niter)->nuca_pda.cycle_time/minval->min_cyc) + + dp * ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn) + + lp * ((*niter)->nuca_pda.power.readOp.leakage/minval->min_leakage) + + a * ((*niter)->nuca_pda.area.get_area()/minval->min_area)); + fprintf(stderr, "cost = %g\n", cost); + 
+ if (min_cost > cost) { + min_cost = cost; + res = ((*niter)); + } + } + else { + niter = n->erase(niter); + if (niter !=n->begin()) + niter --; + } + } + } + return res; +} + + int +Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval) +{ + if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev_nuca) { + return 0; + } + if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 > + g_ip->dynamic_power_dev_nuca) { + return 0; + } + if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 > + g_ip->leakage_power_dev_nuca) { + return 0; + } + if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 > + g_ip->cycle_time_dev_nuca) { + return 0; + } + if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 > + g_ip->area_dev_nuca) { + return 0; + } + return 1; +} + + void +Nuca::calculate_nuca_area (nuca_org_t *nuca) +{ + nuca->nuca_pda.area.h= + nuca->rows * ((nuca->h_wire->wire_width + + nuca->h_wire->wire_spacing) + * nuca->router->flit_size + + nuca->bank_pda.area.h); + + nuca->nuca_pda.area.w = + nuca->columns * ((nuca->v_wire->wire_width + + nuca->v_wire->wire_spacing) + * nuca->router->flit_size + + nuca->bank_pda.area.w); +} + diff --git a/ext/mcpat/cacti/nuca.h b/ext/mcpat/cacti/nuca.h new file mode 100644 index 000000000..adfe32564 --- /dev/null +++ b/ext/mcpat/cacti/nuca.h @@ -0,0 +1,100 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#ifndef __NUCA_H__ +#define __NUCA_H__ + +#include <iostream> + +#include "assert.h" +#include "basic_circuit.h" +#include "cacti_interface.h" +#include "component.h" +#include "io.h" +#include "mat.h" +#include "parameter.h" +#include "router.h" +#include "wire.h" + +/* + * nuca_org_t: one candidate NUCA organization. It holds Component records + * for the whole NUCA, a single bank and the wires, pointers to the wires and + * router it was evaluated with, a contention value and the grid statistics. + * The destructor defined in nuca.cc has its delete statements commented out, + * so h_wire, v_wire and router are not freed by this class. + */ +class nuca_org_t { + public: + ~nuca_org_t(); +// int size; + /* area, power, access time, and cycle time stats 
*/ + Component nuca_pda; + Component bank_pda; + Component wire_pda; + Wire *h_wire; + Wire *v_wire; + Router *router; + /* for particular network configuration + * calculated based on a cycle accurate + * simulation Ref: CACTI 6 - Tech report + */ + double contention; + + /* grid network stats */ + double avg_hops; + int rows; + int columns; + int bank_count; +}; + + + +/* + * Nuca: Component that searches for a NUCA organization. sim_nuca() runs + * the search, init_cont() loads contention.dat, calc_cycles() converts a + * latency in seconds to cycles at a frequency in GHz, and the remaining + * methods score, size and print nuca_org_t candidates. + * NOTE(review): print_router() is declared here but nuca.cc in this patch + * contains no definition of Nuca::print_router - confirm it is unused. + * NOTE(review): the constructor's default argument is written on the + * definition in nuca.cc rather than on this declaration. + */ +class Nuca : public Component +{ + public: + Nuca( + TechnologyParameter::DeviceType *dt); + void print_router(); + ~Nuca(); + void sim_nuca(); + void init_cont(); + int calc_cycles(double lat, double oper_freq); + void calculate_nuca_area (nuca_org_t *nuca); + int check_nuca_org (nuca_org_t *n, min_values_t *minval); + nuca_org_t * find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval); + void print_nuca(nuca_org_t *n); + void print_cont_stats(); + + private: + + TechnologyParameter::DeviceType *deviceType; + int wt_min, wt_max; + Wire *wire_vertical[WIRE_TYPES], + *wire_horizontal[WIRE_TYPES]; + +}; + + +#endif diff --git a/ext/mcpat/cacti/parameter.cc b/ext/mcpat/cacti/parameter.cc new file mode 100644 index 000000000..b71640c19 --- /dev/null +++ b/ext/mcpat/cacti/parameter.cc @@ -0,0 +1,713 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <iomanip> +#include <iostream> +#include <string> + +#include "area.h" +#include "parameter.h" + +using namespace std; + + +InputParameter * g_ip; +TechnologyParameter g_tp; + + + +/* + * Prints every DeviceType field to cout, one per line, each line prefixed by + * `indent` spaces and followed by its unit. Values use setw(12), except + * n_to_p_eff_curr_drv_ratio, which is printed without a field width. + */ +void TechnologyParameter::DeviceType::display(uint32_t indent) +{ + string indent_str(indent, ' '); + + cout << indent_str << "C_g_ideal = " << setw(12) << C_g_ideal << " F/um" << endl; + 
cout << indent_str << "C_fringe = " << setw(12) << C_fringe << " F/um" << endl; + cout << indent_str << "C_overlap = " << setw(12) << C_overlap << " F/um" << endl; + cout << indent_str << "C_junc = " << setw(12) << C_junc << " F/um^2" << endl; + cout << indent_str << "l_phy = " << setw(12) << l_phy << " um" << endl; + cout << indent_str << "l_elec = " << setw(12) << l_elec << " um" << endl; + cout << indent_str << "R_nch_on = " << setw(12) << R_nch_on << " ohm-um" << endl; + cout << indent_str << "R_pch_on = " << setw(12) << R_pch_on << " ohm-um" << endl; + cout << indent_str << "Vdd = " << setw(12) << Vdd << " V" << endl; + cout << indent_str << "Vth = " << setw(12) << Vth << " V" << endl; + cout << indent_str << "I_on_n = " << setw(12) << I_on_n << " A/um" << endl; + cout << indent_str << "I_on_p = " << setw(12) << I_on_p << " A/um" << endl; + cout << indent_str << "I_off_n = " << setw(12) << I_off_n << " A/um" << endl; + cout << indent_str << "I_off_p = " << setw(12) << I_off_p << " A/um" << endl; + cout << indent_str << "C_ox = " << setw(12) << C_ox << " F/um^2" << endl; + cout << indent_str << "t_ox = " << setw(12) << t_ox << " um" << endl; + cout << indent_str << "n_to_p_eff_curr_drv_ratio = " << n_to_p_eff_curr_drv_ratio << endl; +} + + + +/* + * Prints pitch, R_per_um and C_per_um to cout, one per line, each prefixed + * by `indent` spaces and printed with setw(12) and its unit. + */ +void TechnologyParameter::InterconnectType::display(uint32_t indent) +{ + string indent_str(indent, ' '); + + cout << indent_str << "pitch = " << setw(12) << pitch << " um" << endl; + cout << indent_str << "R_per_um = " << setw(12) << R_per_um << " ohm/um" << endl; + cout << indent_str << "C_per_um = " << setw(12) << C_per_um << " F/um" << endl; +} + +/* + * Prints logic_scaling_co_eff and core_tx_density to cout, each prefixed by + * `indent` spaces. The second value is labelled "curr_core_tx_density" in + * the output although the member is named core_tx_density. + */ +void TechnologyParameter::ScalingFactor::display(uint32_t indent) +{ + string indent_str(indent, ' '); + + cout << indent_str << "logic_scaling_co_eff = " << setw(12) << logic_scaling_co_eff << endl; + cout << indent_str << "curr_core_tx_density = " << setw(12) << core_tx_density << " # of tx/um^2" << endl; +} + +void TechnologyParameter::MemoryType::display(uint32_t indent)
+{ + string indent_str(indent, ' '); + + cout << indent_str << "b_w = " << setw(12) << b_w << " um" << endl; + cout << indent_str << "b_h = " << setw(12) << b_h << " um" << endl; + cout << indent_str << "cell_a_w = " << setw(12) << cell_a_w << " um" << endl; + cout << indent_str << "cell_pmos_w = " << setw(12) << cell_pmos_w << " um" << endl; + cout << indent_str << "cell_nmos_w = " << setw(12) << cell_nmos_w << " um" << endl; + cout << indent_str << "Vbitpre = " << setw(12) << Vbitpre << " V" << endl; +} + + + +void TechnologyParameter::display(uint32_t indent) +{ + string indent_str(indent, ' '); + + cout << indent_str << "ram_wl_stitching_overhead_ = " << setw(12) << ram_wl_stitching_overhead_ << " um" << endl; + cout << indent_str << "min_w_nmos_ = " << setw(12) << min_w_nmos_ << " um" << endl; + cout << indent_str << "max_w_nmos_ = " << setw(12) << max_w_nmos_ << " um" << endl; + cout << indent_str << "unit_len_wire_del = " << setw(12) << unit_len_wire_del << " s/um^2" << endl; + cout << indent_str << "FO4 = " << setw(12) << FO4 << " s" << endl; + cout << indent_str << "kinv = " << setw(12) << kinv << " s" << endl; + cout << indent_str << "vpp = " << setw(12) << vpp << " V" << endl; + cout << indent_str << "w_sense_en = " << setw(12) << w_sense_en << " um" << endl; + cout << indent_str << "w_sense_n = " << setw(12) << w_sense_n << " um" << endl; + cout << indent_str << "w_sense_p = " << setw(12) << w_sense_p << " um" << endl; + cout << indent_str << "w_iso = " << setw(12) << w_iso << " um" << endl; + cout << indent_str << "w_poly_contact = " << setw(12) << w_poly_contact << " um" << endl; + cout << indent_str << "spacing_poly_to_poly = " << setw(12) << spacing_poly_to_poly << " um" << endl; + cout << indent_str << "spacing_poly_to_contact = " << setw(12) << spacing_poly_to_contact << " um" << endl; + cout << endl; + cout << indent_str << "w_comp_inv_p1 = " << setw(12) << w_comp_inv_p1 << " um" << endl; + cout << indent_str << "w_comp_inv_p2 = " << setw(12) << 
w_comp_inv_p2 << " um" << endl; + cout << indent_str << "w_comp_inv_p3 = " << setw(12) << w_comp_inv_p3 << " um" << endl; + cout << indent_str << "w_comp_inv_n1 = " << setw(12) << w_comp_inv_n1 << " um" << endl; + cout << indent_str << "w_comp_inv_n2 = " << setw(12) << w_comp_inv_n2 << " um" << endl; + cout << indent_str << "w_comp_inv_n3 = " << setw(12) << w_comp_inv_n3 << " um" << endl; + cout << indent_str << "w_eval_inv_p = " << setw(12) << w_eval_inv_p << " um" << endl; + cout << indent_str << "w_eval_inv_n = " << setw(12) << w_eval_inv_n << " um" << endl; + cout << indent_str << "w_comp_n = " << setw(12) << w_comp_n << " um" << endl; + cout << indent_str << "w_comp_p = " << setw(12) << w_comp_p << " um" << endl; + cout << endl; + cout << indent_str << "dram_cell_I_on = " << setw(12) << dram_cell_I_on << " A/um" << endl; + cout << indent_str << "dram_cell_Vdd = " << setw(12) << dram_cell_Vdd << " V" << endl; + cout << indent_str << "dram_cell_I_off_worst_case_len_temp = " << setw(12) << dram_cell_I_off_worst_case_len_temp << " A/um" << endl; + cout << indent_str << "dram_cell_C = " << setw(12) << dram_cell_C << " F" << endl; + cout << indent_str << "gm_sense_amp_latch = " << setw(12) << gm_sense_amp_latch << " F/s" << endl; + cout << endl; + cout << indent_str << "w_nmos_b_mux = " << setw(12) << w_nmos_b_mux << " um" << endl; + cout << indent_str << "w_nmos_sa_mux = " << setw(12) << w_nmos_sa_mux << " um" << endl; + cout << indent_str << "w_pmos_bl_precharge = " << setw(12) << w_pmos_bl_precharge << " um" << endl; + cout << indent_str << "w_pmos_bl_eq = " << setw(12) << w_pmos_bl_eq << " um" << endl; + cout << indent_str << "MIN_GAP_BET_P_AND_N_DIFFS = " << setw(12) << MIN_GAP_BET_P_AND_N_DIFFS << " um" << endl; + cout << indent_str << "HPOWERRAIL = " << setw(12) << HPOWERRAIL << " um" << endl; + cout << indent_str << "cell_h_def = " << setw(12) << cell_h_def << " um" << endl; + + cout << endl; + cout << indent_str << "SRAM cell transistor: " << endl; + 
sram_cell.display(indent + 2); + + cout << endl; + cout << indent_str << "DRAM access transistor: " << endl; + dram_acc.display(indent + 2); + + cout << endl; + cout << indent_str << "DRAM wordline transistor: " << endl; + dram_wl.display(indent + 2); + + cout << endl; + cout << indent_str << "peripheral global transistor: " << endl; + peri_global.display(indent + 2); + + cout << endl; + cout << indent_str << "wire local" << endl; + wire_local.display(indent + 2); + + cout << endl; + cout << indent_str << "wire inside mat" << endl; + wire_inside_mat.display(indent + 2); + + cout << endl; + cout << indent_str << "wire outside mat" << endl; + wire_outside_mat.display(indent + 2); + + cout << endl; + cout << indent_str << "SRAM" << endl; + sram.display(indent + 2); + + cout << endl; + cout << indent_str << "DRAM" << endl; + dram.display(indent + 2); +} + + +DynamicParameter::DynamicParameter(): + use_inp_params(0), cell(), is_valid(true) +{ +} + + + +DynamicParameter::DynamicParameter( + bool is_tag_, + int pure_ram_, + int pure_cam_, + double Nspd_, + unsigned int Ndwl_, + unsigned int Ndbl_, + unsigned int Ndcm_, + unsigned int Ndsam_lev_1_, + unsigned int Ndsam_lev_2_, + bool is_main_mem_): + is_tag(is_tag_), pure_ram(pure_ram_), pure_cam(pure_cam_), tagbits(0), Nspd(Nspd_), Ndwl(Ndwl_), Ndbl(Ndbl_),Ndcm(Ndcm_), + Ndsam_lev_1(Ndsam_lev_1_), Ndsam_lev_2(Ndsam_lev_2_), + number_way_select_signals_mat(0), V_b_sense(0), use_inp_params(0), + is_main_mem(is_main_mem_), cell(), is_valid(false) +{ + ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type; + is_dram = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram)); + + unsigned int capacity_per_die = g_ip->cache_sz / NUMBER_STACKED_DIE_LAYERS; // capacity per stacked die layer + const TechnologyParameter::InterconnectType & wire_local = g_tp.wire_local; + fully_assoc = (g_ip->fully_assoc) ? 
true : false; + + if (fully_assoc || pure_cam) + { // fully-assocative cache -- ref: CACTi 2.0 report + if (Ndwl != 1 || //Ndwl is fixed to 1 for FA + Ndcm != 1 || //Ndcm is fixed to 1 for FA + Nspd < 1 || Nspd > 1 || //Nspd is fixed to 1 for FA + Ndsam_lev_1 != 1 || //Ndsam_lev_1 is fixed to one + Ndsam_lev_2 != 1 || //Ndsam_lev_2 is fixed to one + Ndbl < 2) + { + return; + } + } + + if ((is_dram) && (!is_tag) && (Ndcm > 1)) + { + return; // For a DRAM array, each bitline has its own sense-amp + } + + // If it's not an FA tag/data array, Ndwl should be at least two and Ndbl should be + // at least two because an array is assumed to have at least one mat. And a mat + // is formed out of two horizontal subarrays and two vertical subarrays + if (fully_assoc == false && (Ndwl < 1 || Ndbl < 1)) + { + return; + } + + //***********compute row, col of an subarray + if (!(fully_assoc || pure_cam))//Not fully_asso nor cam + { + // if data array, let tagbits = 0 + if (is_tag) + { + if (g_ip->specific_tag) + { + tagbits = g_ip->tag_w; + } + else + { + tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(capacity_per_die) + + _log2(g_ip->tag_assoc*2 - 1) - _log2(g_ip->nbanks); + + } + tagbits = (((tagbits + 3) >> 2) << 2); + + num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks * + g_ip->block_sz * g_ip->tag_assoc * Ndbl * Nspd));// + EPSILON); + num_c_subarray = (int)ceil((tagbits * g_ip->tag_assoc * Nspd / Ndwl));// + EPSILON); + //burst_length = 1; + } + else + { + num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks * + g_ip->block_sz * g_ip->data_assoc * Ndbl * Nspd));// + EPSILON); + num_c_subarray = (int)ceil((8 * g_ip->block_sz * g_ip->data_assoc * Nspd / Ndwl));// + EPSILON); + EPSILON); + // burst_length = g_ip->block_sz * 8 / g_ip->out_w; + } + + if (num_r_subarray < MINSUBARRAYROWS) return; + if (num_r_subarray == 0) return; + if (num_r_subarray > MAXSUBARRAYROWS) return; + if (num_c_subarray < MINSUBARRAYCOLS) return; + if (num_c_subarray > 
MAXSUBARRAYCOLS) return; + + } + + else + {//either fully-asso or cam + if (pure_cam) + { + if (g_ip->specific_tag) + { + tagbits = int(ceil(g_ip->tag_w/8.0)*8); + } + else + { + tagbits = int(ceil((ADDRESS_BITS + EXTRA_TAG_BITS)/8.0)*8); +// cout<<"Pure CAM needs tag width to be specified"<<endl; +// exit(0); + } + //tagbits = (((tagbits + 3) >> 2) << 2); + + tag_num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks*tagbits/8.0 * Ndbl));//TODO: error check input of tagbits and blocksize //TODO: for pure CAM, g_ip->block should be number of entries. + //tag_num_c_subarray = (int)(tagbits + EPSILON); + tag_num_c_subarray = tagbits; + if (tag_num_r_subarray == 0) return; + if (tag_num_r_subarray > MAXSUBARRAYROWS) return; + if (tag_num_c_subarray < MINSUBARRAYCOLS) return; + if (tag_num_c_subarray > MAXSUBARRAYCOLS) return; + num_r_subarray = tag_num_r_subarray; + } + else //fully associative + { + if (g_ip->specific_tag) + { + tagbits = g_ip->tag_w; + } + else + { + tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(g_ip->block_sz);//TODO: should be the page_offset=log2(page size), but this info is not avail with CACTI, for McPAT this is no problem. 
+ } + tagbits = (((tagbits + 3) >> 2) << 2); + + tag_num_r_subarray = (int)(capacity_per_die / (g_ip->nbanks*g_ip->block_sz * Ndbl)); + tag_num_c_subarray = (int)ceil((tagbits * Nspd / Ndwl));// + EPSILON); + if (tag_num_r_subarray == 0) return; + if (tag_num_r_subarray > MAXSUBARRAYROWS) return; + if (tag_num_c_subarray < MINSUBARRAYCOLS) return; + if (tag_num_c_subarray > MAXSUBARRAYCOLS) return; + + data_num_r_subarray = tag_num_r_subarray; + data_num_c_subarray = 8 * g_ip->block_sz; + if (data_num_r_subarray == 0) return; + if (data_num_r_subarray > MAXSUBARRAYROWS) return; + if (data_num_c_subarray < MINSUBARRAYCOLS) return; + if (data_num_c_subarray > MAXSUBARRAYCOLS) return; + num_r_subarray = tag_num_r_subarray; + } + } + + num_subarrays = Ndwl * Ndbl; + //****************end of computation of row, col of an subarray + + // calculate wire parameters + if (fully_assoc || pure_cam) + { + cam_cell.h = g_tp.cam.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports) + + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports; + cam_cell.w = g_tp.cam.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports) + + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports; + + cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +g_ip->num_rw_ports-1 + g_ip->num_rd_ports) + + 2 * wire_local.pitch*(g_ip->num_search_ports-1); + cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports -1 + (g_ip->num_rd_ports - g_ip->num_se_rd_ports) + + g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports + 2 * wire_local.pitch*(g_ip->num_search_ports-1); + } + else + { + if(is_tag) + { + cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_rd_ports + + g_ip->num_wr_ports); + cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_wr_ports + + 
(g_ip->num_rd_ports - g_ip->num_se_rd_ports)) + + wire_local.pitch * g_ip->num_se_rd_ports; + } + else + { + if (is_dram) + { + cell.h = g_tp.dram.b_h; + cell.w = g_tp.dram.b_w; + } + else + { + cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports + + g_ip->num_rw_ports - 1 + g_ip->num_rd_ports); + cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + + (g_ip->num_rd_ports - g_ip->num_se_rd_ports) + + g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports; + } + } + } + + double c_b_metal = cell.h * wire_local.C_per_um; + double C_bl; + + if (!(fully_assoc || pure_cam)) + { + if (is_dram) + { + deg_bl_muxing = 1; + if (ram_cell_tech_type == comm_dram) + { + C_bl = num_r_subarray * c_b_metal; + V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C / (g_tp.dram_cell_C + C_bl); + if (V_b_sense < VBITSENSEMIN) + { + return; + } + V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value + dram_refresh_period = 64e-3; + } + else + { + double Cbitrow_drain_cap = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0; + C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal); + V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl); + + if (V_b_sense < VBITSENSEMIN) + { + return; //Sense amp input signal is smaller that minimum allowable sense amp input signal + } + V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value + //v_storage_worst = g_tp.dram_cell_Vdd / 2 - VBITSENSEMIN * (g_tp.dram_cell_C + C_bl) / g_tp.dram_cell_C; + //dram_refresh_period = 1.1 * g_tp.dram_cell_C * v_storage_worst / g_tp.dram_cell_I_off_worst_case_len_temp; + dram_refresh_period = 0.9 * g_tp.dram_cell_C * VDD_STORAGE_LOSS_FRACTION_WORST * g_tp.dram_cell_Vdd / g_tp.dram_cell_I_off_worst_case_len_temp; + } + } + else + { //SRAM + V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 
0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN; + deg_bl_muxing = Ndcm; + // "/ 2.0" below is due to the fact that two adjacent access transistors share drain + // contacts in a physical layout + double Cbitrow_drain_cap = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0; + C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal); + dram_refresh_period = 0; + } + } + else + { + c_b_metal = cam_cell.h * wire_local.C_per_um;//IBM and SUN design, SRAM array uses dummy cells to fill the blank space due to mismatch on CAM-RAM + V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN; + deg_bl_muxing = 1;//FA fix as 1 + // "/ 2.0" below is due to the fact that two adjacent access transistors share drain + // contacts in a physical layout + double Cbitrow_drain_cap = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;//TODO: comment out these two lines + C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal); + dram_refresh_period = 0; + } + + + // do/di: data in/out, for fully associative they are the data width for normal read and write + // so/si: search data in/out, for fully associative they are the data width for the search ops + // for CAM, si=di, but so = matching address. 
do = data out = di (for normal read/write) + // so/si needs broadcase while do/di do not + + if (fully_assoc || pure_cam) + { + switch (Ndbl) { + case (0): + cout << " Invalid Ndbl \n"<<endl; + exit(0); + break; + case (1): + num_mats_h_dir = 1;//one subarray per mat + num_mats_v_dir = 1; + break; + case (2): + num_mats_h_dir = 1;//two subarrays per mat + num_mats_v_dir = 1; + break; + default: + num_mats_h_dir = int(floor(sqrt(Ndbl/4.0)));//4 subbarrys per mat + num_mats_v_dir = int(Ndbl/4.0 / num_mats_h_dir); + } + num_mats = num_mats_h_dir * num_mats_v_dir; + + if (fully_assoc) + { + num_so_b_mat = data_num_c_subarray; + num_do_b_mat = data_num_c_subarray + tagbits; + } + else + { + num_so_b_mat = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data + num_do_b_mat = tagbits; + } + } + else + { + num_mats_h_dir = MAX(Ndwl / 2, 1); + num_mats_v_dir = MAX(Ndbl / 2, 1); + num_mats = num_mats_h_dir * num_mats_v_dir; + num_do_b_mat = MAX((num_subarrays/num_mats) * num_c_subarray / (deg_bl_muxing * Ndsam_lev_1 * Ndsam_lev_2), 1); + } + + if (!(fully_assoc|| pure_cam) && (num_do_b_mat < (num_subarrays/num_mats))) + { + return; + } + + + int deg_sa_mux_l1_non_assoc; + //TODO:the i/o for subbank is not necessary and should be removed. 
+ if (!(fully_assoc || pure_cam)) + { + if (!is_tag) + { + if (is_main_mem == true) + { + num_do_b_subbank = g_ip->int_prefetch_w * g_ip->out_w; + deg_sa_mux_l1_non_assoc = Ndsam_lev_1; + } + else + { + if (g_ip->fast_access == true) + { + num_do_b_subbank = g_ip->out_w * g_ip->data_assoc; + deg_sa_mux_l1_non_assoc = Ndsam_lev_1; + } + else + { + + num_do_b_subbank = g_ip->out_w; + deg_sa_mux_l1_non_assoc = Ndsam_lev_1 / g_ip->data_assoc; + if (deg_sa_mux_l1_non_assoc < 1) + { + return; + } + + } + } + } + else + { + num_do_b_subbank = tagbits * g_ip->tag_assoc; + if (num_do_b_mat < tagbits) + { + return; + } + deg_sa_mux_l1_non_assoc = Ndsam_lev_1; + //num_do_b_mat = g_ip->tag_assoc / num_mats_h_dir; + } + } + else + { + if (fully_assoc) + { + num_so_b_subbank = 8 * g_ip->block_sz;//TODO:internal perfetch should be considered also for fa + num_do_b_subbank = num_so_b_subbank + tag_num_c_subarray; + } + else + { + num_so_b_subbank = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data + num_do_b_subbank = tag_num_c_subarray; + } + + deg_sa_mux_l1_non_assoc = 1; + } + + deg_senseamp_muxing_non_associativity = deg_sa_mux_l1_non_assoc; + + if (fully_assoc || pure_cam) + { + num_act_mats_hor_dir = 1; + num_act_mats_hor_dir_sl = num_mats_h_dir;//TODO: this is unnecessary, since search op, num_mats is used + } + else + { + num_act_mats_hor_dir = num_do_b_subbank / num_do_b_mat; + if (num_act_mats_hor_dir == 0) + { + return; + } + } + + //compute num_do_mat for tag + if (is_tag) + { + if (!(fully_assoc || pure_cam)) + { + num_do_b_mat = g_ip->tag_assoc / num_act_mats_hor_dir; + num_do_b_subbank = num_act_mats_hor_dir * num_do_b_mat; + } + } + + if ((g_ip->is_cache == false && is_main_mem == true) || (PAGE_MODE == 1 && is_dram)) + { + if (num_act_mats_hor_dir * num_do_b_mat * Ndsam_lev_1 * Ndsam_lev_2 != (int)g_ip->page_sz_bits) + { + return; + } + } + +// if (is_tag == false && g_ip->is_cache == true && !fully_assoc && 
!pure_cam && //TODO: TODO burst transfer should also apply to RAM arrays + if (is_tag == false && g_ip->is_main_mem == true && + num_act_mats_hor_dir*num_do_b_mat*Ndsam_lev_1*Ndsam_lev_2 < ((int) g_ip->out_w * (int) g_ip->burst_len * (int) g_ip->data_assoc)) + { + return; + } + + if (num_act_mats_hor_dir > num_mats_h_dir) + { + return; + } + + + //compute di for mat subbank and bank + if (!(fully_assoc ||pure_cam)) + { + if(!is_tag) + { + if(g_ip->fast_access == true) + { + num_di_b_mat = num_do_b_mat / g_ip->data_assoc; + } + else + { + num_di_b_mat = num_do_b_mat; + } + } + else + { + num_di_b_mat = tagbits; + } + } + else + { + if (fully_assoc) + { + num_di_b_mat = num_do_b_mat; + //*num_subarrays/num_mats; bits per mat of CAM/FA is as same as cache, + //but inside the mat wire tracks need to be reserved for search data bus + num_si_b_mat = tagbits; + } + else + { + num_di_b_mat = tagbits; + num_si_b_mat = tagbits;//*num_subarrays/num_mats; + } + + } + + num_di_b_subbank = num_di_b_mat * num_act_mats_hor_dir;//normal cache or normal r/w for FA + num_si_b_subbank = num_si_b_mat; //* num_act_mats_hor_dir_sl; inside the data is broadcast + + int num_addr_b_row_dec = _log2(num_r_subarray); + if ((fully_assoc ||pure_cam)) + num_addr_b_row_dec +=_log2(num_subarrays/num_mats); + int number_subbanks = num_mats / num_act_mats_hor_dir; + number_subbanks_decode = _log2(number_subbanks);//TODO: add log2(num_subarray_per_bank) to FA/CAM + + num_rw_ports = g_ip->num_rw_ports; + num_rd_ports = g_ip->num_rd_ports; + num_wr_ports = g_ip->num_wr_ports; + num_se_rd_ports = g_ip->num_se_rd_ports; + num_search_ports = g_ip->num_search_ports; + + if (is_dram && is_main_mem) + { + number_addr_bits_mat = MAX((unsigned int) num_addr_b_row_dec, + _log2(deg_bl_muxing) + _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2)); + } + else + { + number_addr_bits_mat = num_addr_b_row_dec + _log2(deg_bl_muxing) + + _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2); + } + + if (!(fully_assoc 
||pure_cam)) + { + if (is_tag) + { + num_di_b_bank_per_port = tagbits; + num_do_b_bank_per_port = g_ip->data_assoc; + } + else + { + num_di_b_bank_per_port = g_ip->out_w + g_ip->data_assoc; + num_do_b_bank_per_port = g_ip->out_w; + } + } + else + { + if (fully_assoc) + { + num_di_b_bank_per_port = g_ip->out_w + tagbits;//TODO: out_w or block_sz? + num_si_b_bank_per_port = tagbits; + num_do_b_bank_per_port = g_ip->out_w + tagbits; + num_so_b_bank_per_port = g_ip->out_w; + } + else + { + num_di_b_bank_per_port = tagbits; + num_si_b_bank_per_port = tagbits; + num_do_b_bank_per_port = tagbits; + num_so_b_bank_per_port = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays))); + } + } + + if ((!is_tag) && (g_ip->data_assoc > 1) && (!g_ip->fast_access)) + { + number_way_select_signals_mat = g_ip->data_assoc; + } + + // add ECC adjustment to all data signals that traverse on H-trees. + if (g_ip->add_ecc_b_ == true) + { + num_do_b_mat += (int) (ceil(num_do_b_mat / num_bits_per_ecc_b_)); + num_di_b_mat += (int) (ceil(num_di_b_mat / num_bits_per_ecc_b_)); + num_di_b_subbank += (int) (ceil(num_di_b_subbank / num_bits_per_ecc_b_)); + num_do_b_subbank += (int) (ceil(num_do_b_subbank / num_bits_per_ecc_b_)); + num_di_b_bank_per_port += (int) (ceil(num_di_b_bank_per_port / num_bits_per_ecc_b_)); + num_do_b_bank_per_port += (int) (ceil(num_do_b_bank_per_port / num_bits_per_ecc_b_)); + + num_so_b_mat += (int) (ceil(num_so_b_mat / num_bits_per_ecc_b_)); + num_si_b_mat += (int) (ceil(num_si_b_mat / num_bits_per_ecc_b_)); + num_si_b_subbank += (int) (ceil(num_si_b_subbank / num_bits_per_ecc_b_)); + num_so_b_subbank += (int) (ceil(num_so_b_subbank / num_bits_per_ecc_b_)); + num_si_b_bank_per_port += (int) (ceil(num_si_b_bank_per_port / num_bits_per_ecc_b_)); + num_so_b_bank_per_port += (int) (ceil(num_so_b_bank_per_port / num_bits_per_ecc_b_)); + } + + is_valid = true; +} + diff --git a/ext/mcpat/cacti/parameter.h b/ext/mcpat/cacti/parameter.h new file mode 100644 index 
000000000..9c827bbc8 --- /dev/null +++ b/ext/mcpat/cacti/parameter.h @@ -0,0 +1,367 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#ifndef __PARAMETER_H__ +#define __PARAMETER_H__ + +#include "area.h" +#include "cacti_interface.h" +#include "const.h" +#include "io.h" + +// parameters which are functions of certain device technology +class TechnologyParameter +{ + public: + class DeviceType + { + public: + double C_g_ideal; + double C_fringe; + double C_overlap; + double C_junc; // C_junc_area + double C_junc_sidewall; + double l_phy; + double l_elec; + double R_nch_on; + double R_pch_on; + double Vdd; + double Vth; + double I_on_n; + double I_on_p; + double I_off_n; + double I_off_p; + double I_g_on_n; + double I_g_on_p; + double C_ox; + double t_ox; + double n_to_p_eff_curr_drv_ratio; + double long_channel_leakage_reduction; + + DeviceType(): C_g_ideal(0), C_fringe(0), C_overlap(0), C_junc(0), + C_junc_sidewall(0), l_phy(0), l_elec(0), R_nch_on(0), R_pch_on(0), + Vdd(0), Vth(0), + I_on_n(0), I_on_p(0), I_off_n(0), I_off_p(0),I_g_on_n(0),I_g_on_p(0), + C_ox(0), t_ox(0), n_to_p_eff_curr_drv_ratio(0), long_channel_leakage_reduction(0) { }; + void reset() + { + C_g_ideal = 0; + C_fringe = 0; + C_overlap = 0; + C_junc = 0; + l_phy = 0; + l_elec = 0; + R_nch_on = 0; + R_pch_on = 0; + Vdd = 0; + Vth = 0; + I_on_n = 0; + I_on_p = 0; + I_off_n = 0; + I_off_p = 0; + I_g_on_n = 0; + I_g_on_p = 0; + C_ox = 0; + t_ox = 0; + n_to_p_eff_curr_drv_ratio = 0; + 
long_channel_leakage_reduction = 0; + } + + void display(uint32_t indent = 0); + }; + class InterconnectType + { + public: + double pitch; + double R_per_um; + double C_per_um; + double horiz_dielectric_constant; + double vert_dielectric_constant; + double aspect_ratio; + double miller_value; + double ild_thickness; + + InterconnectType(): pitch(0), R_per_um(0), C_per_um(0) { }; + + void reset() + { + pitch = 0; + R_per_um = 0; + C_per_um = 0; + horiz_dielectric_constant = 0; + vert_dielectric_constant = 0; + aspect_ratio = 0; + miller_value = 0; + ild_thickness = 0; + } + + void display(uint32_t indent = 0); + }; + class MemoryType + { + public: + double b_w; + double b_h; + double cell_a_w; + double cell_pmos_w; + double cell_nmos_w; + double Vbitpre; + + void reset() + { + b_w = 0; + b_h = 0; + cell_a_w = 0; + cell_pmos_w = 0; + cell_nmos_w = 0; + Vbitpre = 0; + } + + void display(uint32_t indent = 0); + }; + + class ScalingFactor + { + public: + double logic_scaling_co_eff; + double core_tx_density; + double long_channel_leakage_reduction; + + ScalingFactor(): logic_scaling_co_eff(0), core_tx_density(0), + long_channel_leakage_reduction(0) { }; + + void reset() + { + logic_scaling_co_eff= 0; + core_tx_density = 0; + long_channel_leakage_reduction= 0; + } + + void display(uint32_t indent = 0); + }; + + double ram_wl_stitching_overhead_; + double min_w_nmos_; + double max_w_nmos_; + double max_w_nmos_dec; + double unit_len_wire_del; + double FO4; + double kinv; + double vpp; + double w_sense_en; + double w_sense_n; + double w_sense_p; + double sense_delay; + double sense_dy_power; + double w_iso; + double w_poly_contact; + double spacing_poly_to_poly; + double spacing_poly_to_contact; + + double w_comp_inv_p1; + double w_comp_inv_p2; + double w_comp_inv_p3; + double w_comp_inv_n1; + double w_comp_inv_n2; + double w_comp_inv_n3; + double w_eval_inv_p; + double w_eval_inv_n; + double w_comp_n; + double w_comp_p; + + double dram_cell_I_on; + double dram_cell_Vdd; + 
double dram_cell_I_off_worst_case_len_temp; + double dram_cell_C; + double gm_sense_amp_latch; + + double w_nmos_b_mux; + double w_nmos_sa_mux; + double w_pmos_bl_precharge; + double w_pmos_bl_eq; + double MIN_GAP_BET_P_AND_N_DIFFS; + double MIN_GAP_BET_SAME_TYPE_DIFFS; + double HPOWERRAIL; + double cell_h_def; + + double chip_layout_overhead; + double macro_layout_overhead; + double sckt_co_eff; + + double fringe_cap; + + uint64_t h_dec; + + DeviceType sram_cell; // SRAM cell transistor + DeviceType dram_acc; // DRAM access transistor + DeviceType dram_wl; // DRAM wordline transistor + DeviceType peri_global; // peripheral global + DeviceType cam_cell; // SRAM cell transistor + + InterconnectType wire_local; + InterconnectType wire_inside_mat; + InterconnectType wire_outside_mat; + + ScalingFactor scaling_factor; + + MemoryType sram; + MemoryType dram; + MemoryType cam; + + void display(uint32_t indent = 0); + + void reset() + { + dram_cell_Vdd = 0; + dram_cell_I_on = 0; + dram_cell_C = 0; + vpp = 0; + + sense_delay = 0; + sense_dy_power = 0; + fringe_cap = 0; +// horiz_dielectric_constant = 0; +// vert_dielectric_constant = 0; +// aspect_ratio = 0; +// miller_value = 0; +// ild_thickness = 0; + + dram_cell_I_off_worst_case_len_temp = 0; + + sram_cell.reset(); + dram_acc.reset(); + dram_wl.reset(); + peri_global.reset(); + cam_cell.reset(); + + scaling_factor.reset(); + + wire_local.reset(); + wire_inside_mat.reset(); + wire_outside_mat.reset(); + + sram.reset(); + dram.reset(); + cam.reset(); + + chip_layout_overhead = 0; + macro_layout_overhead = 0; + sckt_co_eff = 0; + } +}; + + + +class DynamicParameter +{ + public: + bool is_tag; + bool pure_ram; + bool pure_cam; + bool fully_assoc; + int tagbits; + int num_subarrays; // only for leakage computation -- the number of subarrays per bank + int num_mats; // only for leakage computation -- the number of mats per bank + double Nspd; + int Ndwl; + int Ndbl; + int Ndcm; + int deg_bl_muxing; + int 
deg_senseamp_muxing_non_associativity; + int Ndsam_lev_1; + int Ndsam_lev_2; + int number_addr_bits_mat; // per port + int number_subbanks_decode; // per_port + int num_di_b_bank_per_port; + int num_do_b_bank_per_port; + int num_di_b_mat; + int num_do_b_mat; + int num_di_b_subbank; + int num_do_b_subbank; + + int num_si_b_mat; + int num_so_b_mat; + int num_si_b_subbank; + int num_so_b_subbank; + int num_si_b_bank_per_port; + int num_so_b_bank_per_port; + + int number_way_select_signals_mat; + int num_act_mats_hor_dir; + + int num_act_mats_hor_dir_sl; + bool is_dram; + double V_b_sense; + unsigned int num_r_subarray; + unsigned int num_c_subarray; + int tag_num_r_subarray;//sheng: fully associative cache tag and data must be computed together, data and tag must be separate + int tag_num_c_subarray; + int data_num_r_subarray; + int data_num_c_subarray; + int num_mats_h_dir; + int num_mats_v_dir; + uint32_t ram_cell_tech_type; + double dram_refresh_period; + + DynamicParameter(); + DynamicParameter( + bool is_tag_, + int pure_ram_, + int pure_cam_, + double Nspd_, + unsigned int Ndwl_, + unsigned int Ndbl_, + unsigned int Ndcm_, + unsigned int Ndsam_lev_1_, + unsigned int Ndsam_lev_2_, + bool is_main_mem_); + + int use_inp_params; + unsigned int num_rw_ports; + unsigned int num_rd_ports; + unsigned int num_wr_ports; + unsigned int num_se_rd_ports; // number of single ended read ports + unsigned int num_search_ports; + unsigned int out_w;// == nr_bits_out + bool is_main_mem; + Area cell, cam_cell;//cell is the sram_cell in both nomal cache/ram and FA. 
+ bool is_valid; +}; + + + +extern InputParameter * g_ip; +extern TechnologyParameter g_tp; + +#endif + diff --git a/ext/mcpat/cacti/router.cc b/ext/mcpat/cacti/router.cc new file mode 100644 index 000000000..06f170691 --- /dev/null +++ b/ext/mcpat/cacti/router.cc @@ -0,0 +1,311 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include "router.h" + +Router::Router( + double flit_size_, + double vc_buf, /* vc size = vc_buffer_size * flit_size */ + double vc_c, + TechnologyParameter::DeviceType *dt, + double I_, + double O_, + double M_ + ):flit_size(flit_size_), + deviceType(dt), + I(I_), + O(O_), + M(M_) +{ + vc_buffer_size = vc_buf; + vc_count = vc_c; + min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_; + double technology = g_ip->F_sz_um; + + Vdd = dt->Vdd; + + /*Crossbar parameters. Transmisson gate is employed for connector*/ + NTtr = 10*technology*1e-6/2; /*Transmission gate's nmos tr. length*/ + PTtr = 20*technology*1e-6/2; /* pmos tr. 
length*/ + wt = 15*technology*1e-6/2; /*track width*/ + ht = 15*technology*1e-6/2; /*track height*/ +// I = 5; /*Number of crossbar input ports*/ +// O = 5; /*Number of crossbar output ports*/ + NTi = 12.5*technology*1e-6/2; + PTi = 25*technology*1e-6/2; + + NTid = 60*technology*1e-6/2; //m + PTid = 120*technology*1e-6/2; // m + NTod = 60*technology*1e-6/2; // m + PTod = 120*technology*1e-6/2; // m + + calc_router_parameters(); +} + +Router::~Router(){} + + +double //wire cap with triple spacing +Router::Cw3(double length) { + Wire wc(g_ip->wt, length, 1, 3, 3); + return (wc.wire_cap(length)); +} + +/*Function to calculate the gate capacitance*/ +double +Router::gate_cap(double w) { + return (double) gate_C (w*1e6 /*u*/, 0); +} + +/*Function to calculate the diffusion capacitance*/ +double +Router::diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, + double s /*number of stacking transistors*/) { + return (double) drain_C_(w*1e6 /*u*/, type, (int) s, 1, g_tp.cell_h_def); +} + + +/*crossbar related functions */ + +// Model for simple transmission gate +double +Router::transmission_buf_inpcap() { + return diff_cap(NTtr, 0, 1)+diff_cap(PTtr, 1, 1); +} + +double +Router::transmission_buf_outcap() { + return diff_cap(NTtr, 0, 1)+diff_cap(PTtr, 1, 1); +} + +double +Router::transmission_buf_ctrcap() { + return gate_cap(NTtr)+gate_cap(PTtr); +} + +double +Router::crossbar_inpline() { + return (Cw3(O*flit_size*wt) + O*transmission_buf_inpcap() + gate_cap(NTid) + + gate_cap(PTid) + diff_cap(NTid, 0, 1) + diff_cap(PTid, 1, 1)); +} + +double +Router::crossbar_outline() { + return (Cw3(I*flit_size*ht) + I*transmission_buf_outcap() + gate_cap(NTod) + + gate_cap(PTod) + diff_cap(NTod, 0, 1) + diff_cap(PTod, 1, 1)); +} + +double +Router::crossbar_ctrline() { + return (Cw3(0.5*O*flit_size*wt) + flit_size*transmission_buf_ctrcap() + + diff_cap(NTi, 0, 1) + diff_cap(PTi, 1, 1) + + gate_cap(NTi) + gate_cap(PTi)); +} + +double +Router::tr_crossbar_power() { + return 
(crossbar_inpline()*Vdd*Vdd*flit_size/2 + + crossbar_outline()*Vdd*Vdd*flit_size/2)*2; +} + +void Router::buffer_stats() +{ + DynamicParameter dyn_p; + dyn_p.is_tag = false; + dyn_p.pure_cam = false; + dyn_p.fully_assoc = false; + dyn_p.pure_ram = true; + dyn_p.is_dram = false; + dyn_p.is_main_mem = false; + dyn_p.num_subarrays = 1; + dyn_p.num_mats = 1; + dyn_p.Ndbl = 1; + dyn_p.Ndwl = 1; + dyn_p.Nspd = 1; + dyn_p.deg_bl_muxing = 1; + dyn_p.deg_senseamp_muxing_non_associativity = 1; + dyn_p.Ndsam_lev_1 = 1; + dyn_p.Ndsam_lev_2 = 1; + dyn_p.Ndcm = 1; + dyn_p.number_addr_bits_mat = 8; + dyn_p.number_way_select_signals_mat = 1; + dyn_p.number_subbanks_decode = 0; + dyn_p.num_act_mats_hor_dir = 1; + dyn_p.V_b_sense = Vdd; // FIXME check power calc. + dyn_p.ram_cell_tech_type = 0; + dyn_p.num_r_subarray = (int) vc_buffer_size; + dyn_p.num_c_subarray = (int) flit_size * (int) vc_count; + dyn_p.num_mats_h_dir = 1; + dyn_p.num_mats_v_dir = 1; + dyn_p.num_do_b_subbank = (int)flit_size; + dyn_p.num_di_b_subbank = (int)flit_size; + dyn_p.num_do_b_mat = (int) flit_size; + dyn_p.num_di_b_mat = (int) flit_size; + dyn_p.num_do_b_mat = (int) flit_size; + dyn_p.num_di_b_mat = (int) flit_size; + dyn_p.num_do_b_bank_per_port = (int) flit_size; + dyn_p.num_di_b_bank_per_port = (int) flit_size; + dyn_p.out_w = (int) flit_size; + + dyn_p.use_inp_params = 1; + dyn_p.num_wr_ports = (unsigned int) vc_count; + dyn_p.num_rd_ports = 1;//(unsigned int) vc_count;//based on Bill Dally's book + dyn_p.num_rw_ports = 0; + dyn_p.num_se_rd_ports =0; + dyn_p.num_search_ports =0; + + + + dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports + + dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports); + dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 + + (dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) + + dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports; + + Mat buff(dyn_p); + buff.compute_delays(0); + 
buff.compute_power_energy(); + buffer.power.readOp = buff.power.readOp; + buffer.power.writeOp = buffer.power.readOp; //FIXME + buffer.area = buff.area; +} + + + + void +Router::cb_stats () +{ + if (1) { + Crossbar c_b(I, O, flit_size); + c_b.compute_power(); + crossbar.delay = c_b.delay; + crossbar.power.readOp.dynamic = c_b.power.readOp.dynamic; + crossbar.power.readOp.leakage = c_b.power.readOp.leakage; + crossbar.power.readOp.gate_leakage = c_b.power.readOp.gate_leakage; + crossbar.area = c_b.area; +// c_b.print_crossbar(); + } + else { + crossbar.power.readOp.dynamic = tr_crossbar_power(); + crossbar.power.readOp.leakage = flit_size * I * O * + cmos_Isub_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg); + crossbar.power.readOp.gate_leakage = flit_size * I * O * + cmos_Ig_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg); + } +} + +void +Router::get_router_power() +{ + /* calculate buffer stats */ + buffer_stats(); + + /* calculate cross-bar stats */ + cb_stats(); + + /* calculate arbiter stats */ + Arbiter vcarb(vc_count, flit_size, buffer.area.w); + Arbiter cbarb(I, flit_size, crossbar.area.w); + vcarb.compute_power(); + cbarb.compute_power(); + arbiter.power.readOp.dynamic = vcarb.power.readOp.dynamic * I + + cbarb.power.readOp.dynamic * O; + arbiter.power.readOp.leakage = vcarb.power.readOp.leakage * I + + cbarb.power.readOp.leakage * O; + arbiter.power.readOp.gate_leakage = vcarb.power.readOp.gate_leakage * I + + cbarb.power.readOp.gate_leakage * O; + +// arb_stats(); + power.readOp.dynamic = ((buffer.power.readOp.dynamic+buffer.power.writeOp.dynamic) + + crossbar.power.readOp.dynamic + + arbiter.power.readOp.dynamic)*MIN(I, O)*M; + double pppm_t[4] = {1,I,I,1}; + power = power + (buffer.power*pppm_t + crossbar.power + arbiter.power)*pppm_lkg; + +} + + void +Router::get_router_delay () +{ + FREQUENCY=5; // move this to config file --TODO + cycle_time = (1/(double)FREQUENCY)*1e3; //ps + delay = 4; + max_cyc = 17 * g_tp.FO4; //s + max_cyc *= 1e12; 
//ps + if (cycle_time < max_cyc) { + FREQUENCY = (1/max_cyc)*1e3; //GHz + } +} + + void +Router::get_router_area() +{ + area.h = I*buffer.area.h; + area.w = buffer.area.w+crossbar.area.w; +} + + void +Router::calc_router_parameters() +{ + /* calculate router frequency and pipeline cycles */ + get_router_delay(); + + /* router power stats */ + get_router_power(); + + /* area stats */ + get_router_area(); +} + + void +Router::print_router() +{ + cout << "\n\nRouter stats:\n"; + cout << "\tRouter Area - "<< area.get_area()*1e-6<<"(mm^2)\n"; + cout << "\tMaximum possible network frequency - " << (1/max_cyc)*1e3 << "GHz\n"; + cout << "\tNetwork frequency - " << FREQUENCY <<" GHz\n"; + cout << "\tNo. of Virtual channels - " << vc_count << "\n"; + cout << "\tNo. of pipeline stages - " << delay << endl; + cout << "\tLink bandwidth - " << flit_size << " (bits)\n"; + cout << "\tNo. of buffer entries per virtual channel - "<< vc_buffer_size << "\n"; + cout << "\tSimple buffer Area - "<< buffer.area.get_area()*1e-6<<"(mm^2)\n"; + cout << "\tSimple buffer access (Read) - " << buffer.power.readOp.dynamic * 1e9 <<" (nJ)\n"; + cout << "\tSimple buffer leakage - " << buffer.power.readOp.leakage * 1e3 <<" (mW)\n"; + cout << "\tCrossbar Area - "<< crossbar.area.get_area()*1e-6<<"(mm^2)\n"; + cout << "\tCross bar access energy - " << crossbar.power.readOp.dynamic * 1e9<<" (nJ)\n"; + cout << "\tCross bar leakage power - " << crossbar.power.readOp.leakage * 1e3<<" (mW)\n"; + cout << "\tArbiter access energy (VC arb + Crossbar arb) - "<<arbiter.power.readOp.dynamic * 1e9 <<" (nJ)\n"; + cout << "\tArbiter leakage (VC arb + Crossbar arb) - "<<arbiter.power.readOp.leakage * 1e3 <<" (mW)\n"; + +} + diff --git a/ext/mcpat/cacti/router.h b/ext/mcpat/cacti/router.h new file mode 100644 index 000000000..72ef44939 --- /dev/null +++ b/ext/mcpat/cacti/router.h @@ -0,0 +1,115 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE 
AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#ifndef __ROUTER_H__ +#define __ROUTER_H__ + +#include <assert.h> + +#include <iostream> + +#include "arbiter.h" +#include "basic_circuit.h" +#include "cacti_interface.h" +#include "component.h" +#include "crossbar.h" +#include "mat.h" +#include "parameter.h" +#include "wire.h" + +class Router : public Component +{ + public: + Router( + double flit_size_, + double vc_buf, /* vc size = vc_buffer_size * flit_size */ + double vc_count, + TechnologyParameter::DeviceType *dt = &(g_tp.peri_global), + double I_ = 5, + double O_ = 5, + double M_ = 0.6); + ~Router(); + + + void print_router(); + + Component arbiter, crossbar, buffer; + + double cycle_time, max_cyc; + double flit_size; + double vc_count; + double vc_buffer_size; /* vc size = vc_buffer_size * flit_size */ + + private: + TechnologyParameter::DeviceType *deviceType; + double FREQUENCY; // move this to config file --TODO + double Cw3(double len); + double gate_cap(double w); + double diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, double stack); + enum Wire_type wtype; + enum Wire_placement wire_placement; + //corssbar + double NTtr, PTtr, wt, ht, I, O, NTi, PTi, NTid, PTid, NTod, PTod, TriS1, TriS2; + double M; //network load + double transmission_buf_inpcap(); + double transmission_buf_outcap(); + double transmission_buf_ctrcap(); + double crossbar_inpline(); + double 
crossbar_outline(); + double crossbar_ctrline(); + double tr_crossbar_power(); + void cb_stats (); + double arb_power(); + void arb_stats (); + double buffer_params(); + void buffer_stats(); + + + //arbiter + + //buffer + + //router params + double Vdd; + + void calc_router_parameters(); + void get_router_area(); + void get_router_power(); + void get_router_delay(); + + double min_w_pmos; + + +}; + +#endif diff --git a/ext/mcpat/cacti/subarray.cc b/ext/mcpat/cacti/subarray.cc new file mode 100755 index 000000000..7cbf7d990 --- /dev/null +++ b/ext/mcpat/cacti/subarray.cc @@ -0,0 +1,196 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + + +#include <cassert> +#include <cmath> +#include <iostream> + +#include "subarray.h" + +Subarray::Subarray(const DynamicParameter & dp_, bool is_fa_): + dp(dp_), num_rows(dp.num_r_subarray), num_cols(dp.num_c_subarray), + num_cols_fa_cam(dp.tag_num_c_subarray), num_cols_fa_ram(dp.data_num_c_subarray), + cell(dp.cell), cam_cell(dp.cam_cell), is_fa(is_fa_) +{ + //num_cols=7; + //cout<<"num_cols ="<< num_cols <<endl; + if (!(is_fa || dp.pure_cam)) + { + num_cols +=(g_ip->add_ecc_b_ ? (int)ceil(num_cols / num_bits_per_ecc_b_) : 0); // ECC overhead + uint32_t ram_num_cells_wl_stitching = + (dp.ram_cell_tech_type == lp_dram) ? dram_num_cells_wl_stitching_ : + (dp.ram_cell_tech_type == comm_dram) ? comm_dram_num_cells_wl_stitching_ : sram_num_cells_wl_stitching_; + + area.h = cell.h * num_rows; + + area.w = cell.w * num_cols + + ceil(num_cols / ram_num_cells_wl_stitching) * g_tp.ram_wl_stitching_overhead_; // stitching overhead + } + else //cam fa + { + + //should not add dummy row here since the dummy row do not need decoder + if (is_fa)// fully associative cache + { + num_cols_fa_cam += g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_) : 0; + num_cols_fa_ram += (g_ip->add_ecc_b_ ? 
(int)ceil(num_cols_fa_ram / num_bits_per_ecc_b_) : 0); + num_cols = num_cols_fa_cam + num_cols_fa_ram; + } + else + { + num_cols_fa_cam += g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_) : 0; + num_cols_fa_ram = 0; + num_cols = num_cols_fa_cam; + } + + area.h = cam_cell.h * (num_rows + 1);//height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells + area.w = cam_cell.w * num_cols_fa_cam + cell.w * num_cols_fa_ram + + ceil((num_cols_fa_cam + num_cols_fa_ram) / sram_num_cells_wl_stitching_)*g_tp.ram_wl_stitching_overhead_ + + 16*g_tp.wire_local.pitch //the overhead for the NAND gate to connect the two halves + + 128*g_tp.wire_local.pitch;//the overhead for the drivers from matchline to wordline of RAM + } + + assert(area.h>0); + assert(area.w>0); + compute_C(); +} + + + +Subarray::~Subarray() +{ +} + + + +double Subarray::get_total_cell_area() +{ +// return (is_fa==false? cell.get_area() * num_rows * num_cols +// //: cam_cell.h*(num_rows+1)*(num_cols_fa_cam + sram_cell.get_area()*num_cols_fa_ram)); +// : cam_cell.get_area()*(num_rows+1)*(num_cols_fa_cam + num_cols_fa_ram)); +// //: cam_cell.get_area()*(num_rows+1)*num_cols_fa_cam + sram_cell.get_area()*(num_rows+1)*num_cols_fa_ram);//for FA, this area does not include the dummy cells in SRAM arrays. + + if (!(is_fa || dp.pure_cam)) + return (cell.get_area() * num_rows * num_cols); + else if (is_fa) + { //for FA, this area includes the dummy cells in SRAM arrays. 
+ //return (cam_cell.get_area()*(num_rows+1)*(num_cols_fa_cam + num_cols_fa_ram)); + //cout<<"diff" <<cam_cell.get_area()*(num_rows+1)*(num_cols_fa_cam + num_cols_fa_ram)- cam_cell.h*(num_rows+1)*(cam_cell.w*num_cols_fa_cam + cell.w*num_cols_fa_ram)<<endl; + return (cam_cell.h*(num_rows+1)*(cam_cell.w*num_cols_fa_cam + cell.w*num_cols_fa_ram)); + } + else + return (cam_cell.get_area()*(num_rows+1)*num_cols_fa_cam ); + + +} + + + +void Subarray::compute_C() +{ + double c_w_metal = cell.w * g_tp.wire_local.C_per_um; + double r_w_metal = cell.w * g_tp.wire_local.R_per_um; + double C_b_metal = cell.h * g_tp.wire_local.C_per_um; + double C_b_row_drain_C; + + if (dp.is_dram) + { + C_wl = (gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true) + c_w_metal) * num_cols; + + if (dp.ram_cell_tech_type == comm_dram) + { + C_bl = num_rows * C_b_metal; + } + else + { + C_b_row_drain_C = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0; // due to shared contact + C_bl = num_rows * (C_b_row_drain_C + C_b_metal); + } + } + else + { + if (!(is_fa ||dp.pure_cam)) + { + C_wl = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 + + c_w_metal) * num_cols; + C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0; // due to shared contact + C_bl = num_rows * (C_b_row_drain_C + C_b_metal); + } + else + { + //Following is wordline not matchline + //CAM portion + c_w_metal = cam_cell.w * g_tp.wire_local.C_per_um; + r_w_metal = cam_cell.w * g_tp.wire_local.R_per_um; + C_wl_cam = (gate_C_pass(g_tp.cam.cell_a_w, (g_tp.cam.b_w-2*g_tp.cam.cell_a_w)/2.0, false, true)*2 + + c_w_metal) * num_cols_fa_cam; + R_wl_cam = (r_w_metal) * num_cols_fa_cam; + + if (!dp.pure_cam) + { + //RAM portion + c_w_metal = cell.w * g_tp.wire_local.C_per_um; + r_w_metal = cell.w * g_tp.wire_local.R_per_um; + C_wl_ram = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 + + c_w_metal) * 
num_cols_fa_ram; + R_wl_ram = (r_w_metal) * num_cols_fa_ram; + } + else + { + C_wl_ram = R_wl_ram =0; + } + C_wl = C_wl_cam + C_wl_ram; + C_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.C_per_um; + + R_wl = R_wl_cam + R_wl_ram; + R_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.R_per_um; + + //there are two ways to write to a FA, + //1) Write to CAM array then force a match on match line to active the corresponding wordline in RAM; + //2) using separate wordline for read/write and search in RAM. + //We are using the second approach. + + //Bitline CAM portion This is bitline not searchline. We assume no sharing between bitline and searchline according to SUN's implementations. + C_b_metal = cam_cell.h * g_tp.wire_local.C_per_um; + C_b_row_drain_C = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0; // due to shared contact + C_bl_cam = (num_rows+1) * (C_b_row_drain_C + C_b_metal); + //height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells + C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0; // due to shared contact + C_bl = (num_rows +1) * (C_b_row_drain_C + C_b_metal); + + } + } +} + + diff --git a/ext/mcpat/cacti/subarray.h b/ext/mcpat/cacti/subarray.h new file mode 100755 index 000000000..5fb062420 --- /dev/null +++ b/ext/mcpat/cacti/subarray.h @@ -0,0 +1,70 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#ifndef __SUBARRAY_H__ +#define __SUBARRAY_H__ + +#include "area.h" +#include "component.h" +#include "parameter.h" + +using namespace std; + + +class Subarray : public Component +{ + public: + Subarray(const DynamicParameter & dp, bool is_fa_); + ~Subarray(); + + const DynamicParameter & dp; + double get_total_cell_area(); + unsigned int num_rows; + unsigned int num_cols; + 
int32_t num_cols_fa_cam; + int32_t num_cols_fa_ram; + Area cell, cam_cell; + + bool is_fa; + double C_wl, C_wl_cam, C_wl_ram; + double R_wl, R_wl_cam, R_wl_ram; + double C_bl, C_bl_cam; + private: + + void compute_C(); // compute bitline and wordline capacitance +}; + + + +#endif + diff --git a/ext/mcpat/cacti/technology.cc b/ext/mcpat/cacti/technology.cc new file mode 100644 index 000000000..a40c6eb44 --- /dev/null +++ b/ext/mcpat/cacti/technology.cc @@ -0,0 +1,2921 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#include "basic_circuit.h" + +#include "parameter.h" + +double wire_resistance(double resistivity, double wire_width, double wire_thickness, + double barrier_thickness, double dishing_thickness, double alpha_scatter) +{ + double resistance; + resistance = alpha_scatter * resistivity /((wire_thickness - barrier_thickness - dishing_thickness)*(wire_width - 2 * barrier_thickness)); + return(resistance); +} + +double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing, + double ild_thickness, double miller_value, double horiz_dielectric_constant, + double vert_dielectric_constant, double fringe_cap) +{ + double vertical_cap, sidewall_cap, total_cap; + vertical_cap = 2 * PERMITTIVITY_FREE_SPACE * vert_dielectric_constant * wire_width / ild_thickness; + sidewall_cap = 2 * PERMITTIVITY_FREE_SPACE * miller_value * horiz_dielectric_constant * wire_thickness / wire_spacing; + total_cap = vertical_cap + sidewall_cap + fringe_cap; + return(total_cap); +} + + +void init_tech_params(double technology, bool is_tag) +{ + int iter, tech, tech_lo, tech_hi; + double curr_alpha, curr_vpp; + double wire_width, wire_thickness, wire_spacing, + fringe_cap, pmos_to_nmos_sizing_r; +// double aspect_ratio,ild_thickness, miller_value = 1.5, horiz_dielectric_constant, vert_dielectric_constant; + double barrier_thickness, dishing_thickness, 
alpha_scatter; + double curr_vdd_dram_cell, curr_v_th_dram_access_transistor, curr_I_on_dram_cell, curr_c_dram_cell; + + uint32_t ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type; + uint32_t peri_global_tech_type = (is_tag) ? g_ip->tag_arr_peri_global_tech_type : g_ip->data_arr_peri_global_tech_type; + + technology = technology * 1000.0; // in the unit of nm + + // initialize parameters + g_tp.reset(); + double gmp_to_gmn_multiplier_periph_global = 0; + + double curr_Wmemcella_dram, curr_Wmemcellpmos_dram, curr_Wmemcellnmos_dram, + curr_area_cell_dram, curr_asp_ratio_cell_dram, curr_Wmemcella_sram, + curr_Wmemcellpmos_sram, curr_Wmemcellnmos_sram, curr_area_cell_sram, + curr_asp_ratio_cell_sram, curr_I_off_dram_cell_worst_case_length_temp; + double curr_Wmemcella_cam, curr_Wmemcellpmos_cam, curr_Wmemcellnmos_cam, curr_area_cell_cam,//Sheng: CAM data + curr_asp_ratio_cell_cam; + double SENSE_AMP_D, SENSE_AMP_P; // J + double area_cell_dram = 0; + double asp_ratio_cell_dram = 0; + double area_cell_sram = 0; + double asp_ratio_cell_sram = 0; + double area_cell_cam = 0; + double asp_ratio_cell_cam = 0; + double mobility_eff_periph_global = 0; + double Vdsat_periph_global = 0; + double nmos_effective_resistance_multiplier; + double width_dram_access_transistor; + + double curr_logic_scaling_co_eff = 0;//This is based on the reported numbers of Intel Merom 65nm, Penryn45nm and IBM cell 90/65/45 date + double curr_core_tx_density = 0;//this is density per um^2; 90, ...22nm based on Intel Penryn + double curr_chip_layout_overhead = 0; + double curr_macro_layout_overhead = 0; + double curr_sckt_co_eff = 0; + + if (technology < 181 && technology > 179) + { + tech_lo = 180; + tech_hi = 180; + } + else if (technology < 91 && technology > 89) + { + tech_lo = 90; + tech_hi = 90; + } + else if (technology < 66 && technology > 64) + { + tech_lo = 65; + tech_hi = 65; + } + else if (technology < 46 && technology > 44) + { + tech_lo 
= 45; + tech_hi = 45; + } + else if (technology < 33 && technology > 31) + { + tech_lo = 32; + tech_hi = 32; + } + else if (technology < 23 && technology > 21) + { + tech_lo = 22; + tech_hi = 22; + if (ram_cell_tech_type == 3 ) + { + cout<<"current version does not support eDRAM technologies at 22nm"<<endl; + exit(0); + } + } +// else if (technology < 17 && technology > 15) +// { +// tech_lo = 16; +// tech_hi = 16; +// } + else if (technology < 180 && technology > 90) + { + tech_lo = 180; + tech_hi = 90; + } + else if (technology < 90 && technology > 65) + { + tech_lo = 90; + tech_hi = 65; + } + else if (technology < 65 && technology > 45) + { + tech_lo = 65; + tech_hi = 45; + } + else if (technology < 45 && technology > 32) + { + tech_lo = 45; + tech_hi = 32; + } + else if (technology < 32 && technology > 22) + { + tech_lo = 32; + tech_hi = 22; + } +// else if (technology < 22 && technology > 16) +// { +// tech_lo = 22; +// tech_hi = 16; +// } + else + { + cout<<"Invalid technology nodes"<<endl; + exit(0); + } + + double vdd[NUMBER_TECH_FLAVORS]; + double Lphy[NUMBER_TECH_FLAVORS]; + double Lelec[NUMBER_TECH_FLAVORS]; + double t_ox[NUMBER_TECH_FLAVORS]; + double v_th[NUMBER_TECH_FLAVORS]; + double c_ox[NUMBER_TECH_FLAVORS]; + double mobility_eff[NUMBER_TECH_FLAVORS]; + double Vdsat[NUMBER_TECH_FLAVORS]; + double c_g_ideal[NUMBER_TECH_FLAVORS]; + double c_fringe[NUMBER_TECH_FLAVORS]; + double c_junc[NUMBER_TECH_FLAVORS]; + double I_on_n[NUMBER_TECH_FLAVORS]; + double I_on_p[NUMBER_TECH_FLAVORS]; + double Rnchannelon[NUMBER_TECH_FLAVORS]; + double Rpchannelon[NUMBER_TECH_FLAVORS]; + double n_to_p_eff_curr_drv_ratio[NUMBER_TECH_FLAVORS]; + double I_off_n[NUMBER_TECH_FLAVORS][101]; + double I_g_on_n[NUMBER_TECH_FLAVORS][101]; + //double I_off_p[NUMBER_TECH_FLAVORS][101]; + double gmp_to_gmn_multiplier[NUMBER_TECH_FLAVORS]; + //double curr_sckt_co_eff[NUMBER_TECH_FLAVORS]; + double long_channel_leakage_reduction[NUMBER_TECH_FLAVORS]; + + for (iter = 0; iter <= 1; 
++iter) + { + // linear interpolation + if (iter == 0) + { + tech = tech_lo; + if (tech_lo == tech_hi) + { + curr_alpha = 1; + } + else + { + curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi); + } + } + else + { + tech = tech_hi; + if (tech_lo == tech_hi) + { + break; + } + else + { + curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi); + } + } + + if (tech == 180) + { + //180nm technology-node. Corresponds to year 1999 in ITRS + //Only HP transistor was of interest that 180nm since leakage power was not a big issue. Performance was the king + //MASTAR does not contain data for 0.18um process. The following parameters are projected based on ITRS 2000 update and IBM 0.18 Cu Spice input + bool Aggre_proj = false; + SENSE_AMP_D = .28e-9; // s + SENSE_AMP_P = 14.7e-15; // J + vdd[0] = 1.5; + Lphy[0] = 0.12;//Lphy is the physical gate-length. micron + Lelec[0] = 0.10;//Lelec is the electrical gate-length. micron + t_ox[0] = 1.2e-3*(Aggre_proj? 1.9/1.2:2);//micron + v_th[0] = Aggre_proj? 0.36 : 0.4407;//V + c_ox[0] = 1.79e-14*(Aggre_proj? 1.9/1.2:2);//F/micron2 + mobility_eff[0] = 302.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 0.128*2; //V + c_g_ideal[0] = (Aggre_proj? 1.9/1.2:2)*6.64e-16;//F/micron + c_fringe[0] = (Aggre_proj? 1.9/1.2:2)*0.08e-15;//F/micron + c_junc[0] = (Aggre_proj? 
1.9/1.2:2)*1e-15;//F/micron2 + I_on_n[0] = 750e-6;//A/micron + I_on_p[0] = 350e-6;//A/micron + //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline + nmos_effective_resistance_multiplier = 1.54; + n_to_p_eff_curr_drv_ratio[0] = 2.45; + gmp_to_gmn_multiplier[0] = 1.22; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1; + I_off_n[0][0] = 7e-10;//A/micron + I_off_n[0][10] = 8.26e-10; + I_off_n[0][20] = 9.74e-10; + I_off_n[0][30] = 1.15e-9; + I_off_n[0][40] = 1.35e-9; + I_off_n[0][50] = 1.60e-9; + I_off_n[0][60] = 1.88e-9; + I_off_n[0][70] = 2.29e-9; + I_off_n[0][80] = 2.70e-9; + I_off_n[0][90] = 3.19e-9; + I_off_n[0][100] = 3.76e-9; + + I_g_on_n[0][0] = 1.65e-10;//A/micron + I_g_on_n[0][10] = 1.65e-10; + I_g_on_n[0][20] = 1.65e-10; + I_g_on_n[0][30] = 1.65e-10; + I_g_on_n[0][40] = 1.65e-10; + I_g_on_n[0][50] = 1.65e-10; + I_g_on_n[0][60] = 1.65e-10; + I_g_on_n[0][70] = 1.65e-10; + I_g_on_n[0][80] = 1.65e-10; + I_g_on_n[0][90] = 1.65e-10; + I_g_on_n[0][100] = 1.65e-10; + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360 + curr_asp_ratio_cell_cam = 2.92;//2.5 + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 1.5;//linear scaling from 90nm + curr_core_tx_density = 1.25*0.7*0.7*0.4; + curr_sckt_co_eff = 1.11; + curr_chip_layout_overhead = 
1.0;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.0;//EDA placement and routing tool rule of thumb + + } + + if (tech == 90) + { + SENSE_AMP_D = .28e-9; // s + SENSE_AMP_P = 14.7e-15; // J + //90nm technology-node. Corresponds to year 2004 in ITRS + //ITRS HP device type + vdd[0] = 1.2; + Lphy[0] = 0.037;//Lphy is the physical gate-length. micron + Lelec[0] = 0.0266;//Lelec is the electrical gate-length. micron + t_ox[0] = 1.2e-3;//micron + v_th[0] = 0.23707;//V + c_ox[0] = 1.79e-14;//F/micron2 + mobility_eff[0] = 342.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 0.128; //V + c_g_ideal[0] = 6.64e-16;//F/micron + c_fringe[0] = 0.08e-15;//F/micron + c_junc[0] = 1e-15;//F/micron2 + I_on_n[0] = 1076.9e-6;//A/micron + I_on_p[0] = 712.6e-6;//A/micron + //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline + nmos_effective_resistance_multiplier = 1.54; + n_to_p_eff_curr_drv_ratio[0] = 2.45; + gmp_to_gmn_multiplier[0] = 1.22; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1; + I_off_n[0][0] = 3.24e-8;//A/micron + I_off_n[0][10] = 4.01e-8; + I_off_n[0][20] = 4.90e-8; + I_off_n[0][30] = 5.92e-8; + I_off_n[0][40] = 7.08e-8; + I_off_n[0][50] = 8.38e-8; + I_off_n[0][60] = 9.82e-8; + I_off_n[0][70] = 1.14e-7; + I_off_n[0][80] = 1.29e-7; + I_off_n[0][90] = 1.43e-7; + I_off_n[0][100] = 1.54e-7; + + I_g_on_n[0][0] = 1.65e-8;//A/micron + I_g_on_n[0][10] = 1.65e-8; + I_g_on_n[0][20] = 1.65e-8; + I_g_on_n[0][30] = 1.65e-8; + I_g_on_n[0][40] = 1.65e-8; + I_g_on_n[0][50] = 1.65e-8; + I_g_on_n[0][60] = 1.65e-8; + I_g_on_n[0][70] = 1.65e-8; + I_g_on_n[0][80] = 1.65e-8; + I_g_on_n[0][90] = 1.65e-8; + I_g_on_n[0][100] = 1.65e-8; + + //ITRS LSTP device type + vdd[1] = 1.3; + Lphy[1] = 0.075; + Lelec[1] = 
0.0486; + t_ox[1] = 2.2e-3; + v_th[1] = 0.48203; + c_ox[1] = 1.22e-14; + mobility_eff[1] = 356.76 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 0.373; + c_g_ideal[1] = 9.15e-16; + c_fringe[1] = 0.08e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 503.6e-6; + I_on_p[1] = 235.1e-6; + nmos_effective_resistance_multiplier = 1.92; + n_to_p_eff_curr_drv_ratio[1] = 2.44; + gmp_to_gmn_multiplier[1] =0.88; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1; + I_off_n[1][0] = 2.81e-12; + I_off_n[1][10] = 4.76e-12; + I_off_n[1][20] = 7.82e-12; + I_off_n[1][30] = 1.25e-11; + I_off_n[1][40] = 1.94e-11; + I_off_n[1][50] = 2.94e-11; + I_off_n[1][60] = 4.36e-11; + I_off_n[1][70] = 6.32e-11; + I_off_n[1][80] = 8.95e-11; + I_off_n[1][90] = 1.25e-10; + I_off_n[1][100] = 1.7e-10; + + I_g_on_n[1][0] = 3.87e-11;//A/micron + I_g_on_n[1][10] = 3.87e-11; + I_g_on_n[1][20] = 3.87e-11; + I_g_on_n[1][30] = 3.87e-11; + I_g_on_n[1][40] = 3.87e-11; + I_g_on_n[1][50] = 3.87e-11; + I_g_on_n[1][60] = 3.87e-11; + I_g_on_n[1][70] = 3.87e-11; + I_g_on_n[1][80] = 3.87e-11; + I_g_on_n[1][90] = 3.87e-11; + I_g_on_n[1][100] = 3.87e-11; + + //ITRS LOP device type + vdd[2] = 0.9; + Lphy[2] = 0.053; + Lelec[2] = 0.0354; + t_ox[2] = 1.5e-3; + v_th[2] = 0.30764; + c_ox[2] = 1.59e-14; + mobility_eff[2] = 460.39 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 0.113; + c_g_ideal[2] = 8.45e-16; + c_fringe[2] = 0.08e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 386.6e-6; + I_on_p[2] = 209.7e-6; + nmos_effective_resistance_multiplier = 1.77; + n_to_p_eff_curr_drv_ratio[2] = 2.54; + gmp_to_gmn_multiplier[2] = 0.98; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + long_channel_leakage_reduction[2] = 1; + I_off_n[2][0] = 2.14e-9; + I_off_n[2][10] = 2.9e-9; + I_off_n[2][20] = 3.87e-9; + I_off_n[2][30] = 5.07e-9; + 
I_off_n[2][40] = 6.54e-9; + I_off_n[2][50] = 8.27e-8; + I_off_n[2][60] = 1.02e-7; + I_off_n[2][70] = 1.20e-7; + I_off_n[2][80] = 1.36e-8; + I_off_n[2][90] = 1.52e-8; + I_off_n[2][100] = 1.73e-8; + + I_g_on_n[2][0] = 4.31e-8;//A/micron + I_g_on_n[2][10] = 4.31e-8; + I_g_on_n[2][20] = 4.31e-8; + I_g_on_n[2][30] = 4.31e-8; + I_g_on_n[2][40] = 4.31e-8; + I_g_on_n[2][50] = 4.31e-8; + I_g_on_n[2][60] = 4.31e-8; + I_g_on_n[2][70] = 4.31e-8; + I_g_on_n[2][80] = 4.31e-8; + I_g_on_n[2][90] = 4.31e-8; + I_g_on_n[2][100] = 4.31e-8; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.2; + Lphy[3] = 0.12; + Lelec[3] = 0.0756; + curr_v_th_dram_access_transistor = 0.4545; + width_dram_access_transistor = 0.14; + curr_I_on_dram_cell = 45e-6; + curr_I_off_dram_cell_worst_case_length_temp = 21.1e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 0.168; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.6; + t_ox[3] = 2.2e-3; + v_th[3] = 0.4545; + c_ox[3] = 1.22e-14; + mobility_eff[3] = 323.95 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.3; + c_g_ideal[3] = 1.47e-15; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 321.6e-6; + I_on_p[3] = 203.3e-6; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 1.95; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.42e-11; + I_off_n[3][10] = 2.25e-11; + I_off_n[3][20] = 3.46e-11; + I_off_n[3][30] = 5.18e-11; + I_off_n[3][40] = 7.58e-11; + I_off_n[3][50] = 1.08e-10; + I_off_n[3][60] = 1.51e-10; + I_off_n[3][70] = 2.02e-10; + I_off_n[3][80] = 2.57e-10; + I_off_n[3][90] = 3.14e-10; + 
I_off_n[3][100] = 3.85e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.6; + Lphy[3] = 0.09; + Lelec[3] = 0.0576; + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.09; + curr_I_on_dram_cell = 20e-6; + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.09*0.09; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 3.7; + t_ox[3] = 5.5e-3; + v_th[3] = 1.0; + c_ox[3] = 5.65e-15; + mobility_eff[3] = 302.2 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.32; + c_g_ideal[3] = 5.08e-16; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 1094.3e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.62; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 5.80e-15; + I_off_n[3][10] = 1.21e-14; + I_off_n[3][20] = 2.42e-14; + I_off_n[3][30] = 4.65e-14; + I_off_n[3][40] = 8.60e-14; + I_off_n[3][50] = 1.54e-13; + I_off_n[3][60] = 2.66e-13; + I_off_n[3][70] = 4.45e-13; + I_off_n[3][80] = 7.17e-13; + I_off_n[3][90] = 1.11e-12; + I_off_n[3][100] = 1.67e-12; + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * 
g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360 + curr_asp_ratio_cell_cam = 2.92;//2.5 + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 1; + curr_core_tx_density = 1.25*0.7*0.7; + curr_sckt_co_eff = 1.1539; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + + + } + + if (tech == 65) + { //65nm technology-node. Corresponds to year 2007 in ITRS + //ITRS HP device type + SENSE_AMP_D = .2e-9; // s + SENSE_AMP_P = 5.7e-15; // J + vdd[0] = 1.1; + Lphy[0] = 0.025; + Lelec[0] = 0.019; + t_ox[0] = 1.1e-3; + v_th[0] = .19491; + c_ox[0] = 1.88e-14; + mobility_eff[0] = 436.24 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[0] = 7.71e-2; + c_g_ideal[0] = 4.69e-16; + c_fringe[0] = 0.077e-15; + c_junc[0] = 1e-15; + I_on_n[0] = 1197.2e-6; + I_on_p[0] = 870.8e-6; + nmos_effective_resistance_multiplier = 1.50; + n_to_p_eff_curr_drv_ratio[0] = 2.41; + gmp_to_gmn_multiplier[0] = 1.38; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0]; + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; + long_channel_leakage_reduction[0] = 1/3.74; + //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first + //Ioff(Lgate normal)/Ioff(Lgate long)= 3.74. 
+ I_off_n[0][0] = 1.96e-7; + I_off_n[0][10] = 2.29e-7; + I_off_n[0][20] = 2.66e-7; + I_off_n[0][30] = 3.05e-7; + I_off_n[0][40] = 3.49e-7; + I_off_n[0][50] = 3.95e-7; + I_off_n[0][60] = 4.45e-7; + I_off_n[0][70] = 4.97e-7; + I_off_n[0][80] = 5.48e-7; + I_off_n[0][90] = 5.94e-7; + I_off_n[0][100] = 6.3e-7; + I_g_on_n[0][0] = 4.09e-8;//A/micron + I_g_on_n[0][10] = 4.09e-8; + I_g_on_n[0][20] = 4.09e-8; + I_g_on_n[0][30] = 4.09e-8; + I_g_on_n[0][40] = 4.09e-8; + I_g_on_n[0][50] = 4.09e-8; + I_g_on_n[0][60] = 4.09e-8; + I_g_on_n[0][70] = 4.09e-8; + I_g_on_n[0][80] = 4.09e-8; + I_g_on_n[0][90] = 4.09e-8; + I_g_on_n[0][100] = 4.09e-8; + + //ITRS LSTP device type + vdd[1] = 1.2; + Lphy[1] = 0.045; + Lelec[1] = 0.0298; + t_ox[1] = 1.9e-3; + v_th[1] = 0.52354; + c_ox[1] = 1.36e-14; + mobility_eff[1] = 341.21 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 0.128; + c_g_ideal[1] = 6.14e-16; + c_fringe[1] = 0.08e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 519.2e-6; + I_on_p[1] = 266e-6; + nmos_effective_resistance_multiplier = 1.96; + n_to_p_eff_curr_drv_ratio[1] = 2.23; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1/2.82; + I_off_n[1][0] = 9.12e-12; + I_off_n[1][10] = 1.49e-11; + I_off_n[1][20] = 2.36e-11; + I_off_n[1][30] = 3.64e-11; + I_off_n[1][40] = 5.48e-11; + I_off_n[1][50] = 8.05e-11; + I_off_n[1][60] = 1.15e-10; + I_off_n[1][70] = 1.59e-10; + I_off_n[1][80] = 2.1e-10; + I_off_n[1][90] = 2.62e-10; + I_off_n[1][100] = 3.21e-10; + + I_g_on_n[1][0] = 1.09e-10;//A/micron + I_g_on_n[1][10] = 1.09e-10; + I_g_on_n[1][20] = 1.09e-10; + I_g_on_n[1][30] = 1.09e-10; + I_g_on_n[1][40] = 1.09e-10; + I_g_on_n[1][50] = 1.09e-10; + I_g_on_n[1][60] = 1.09e-10; + I_g_on_n[1][70] = 1.09e-10; + I_g_on_n[1][80] = 1.09e-10; + I_g_on_n[1][90] = 1.09e-10; + I_g_on_n[1][100] = 1.09e-10; + + //ITRS LOP device type + vdd[2] = 0.8; + 
Lphy[2] = 0.032; + Lelec[2] = 0.0216; + t_ox[2] = 1.2e-3; + v_th[2] = 0.28512; + c_ox[2] = 1.87e-14; + mobility_eff[2] = 495.19 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 0.292; + c_g_ideal[2] = 6e-16; + c_fringe[2] = 0.08e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 573.1e-6; + I_on_p[2] = 340.6e-6; + nmos_effective_resistance_multiplier = 1.82; + n_to_p_eff_curr_drv_ratio[2] = 2.28; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + long_channel_leakage_reduction[2] = 1/2.05; + I_off_n[2][0] = 4.9e-9; + I_off_n[2][10] = 6.49e-9; + I_off_n[2][20] = 8.45e-9; + I_off_n[2][30] = 1.08e-8; + I_off_n[2][40] = 1.37e-8; + I_off_n[2][50] = 1.71e-8; + I_off_n[2][60] = 2.09e-8; + I_off_n[2][70] = 2.48e-8; + I_off_n[2][80] = 2.84e-8; + I_off_n[2][90] = 3.13e-8; + I_off_n[2][100] = 3.42e-8; + + I_g_on_n[2][0] = 9.61e-9;//A/micron + I_g_on_n[2][10] = 9.61e-9; + I_g_on_n[2][20] = 9.61e-9; + I_g_on_n[2][30] = 9.61e-9; + I_g_on_n[2][40] = 9.61e-9; + I_g_on_n[2][50] = 9.61e-9; + I_g_on_n[2][60] = 9.61e-9; + I_g_on_n[2][70] = 9.61e-9; + I_g_on_n[2][80] = 9.61e-9; + I_g_on_n[2][90] = 9.61e-9; + I_g_on_n[2][100] = 9.61e-9; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.2; + Lphy[3] = 0.12; + Lelec[3] = 0.0756; + curr_v_th_dram_access_transistor = 0.43806; + width_dram_access_transistor = 0.09; + curr_I_on_dram_cell = 36e-6; + curr_I_off_dram_cell_worst_case_length_temp = 19.6e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 0.11; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.6; + t_ox[3] = 2.2e-3; + v_th[3] = 0.43806; + c_ox[3] = 1.22e-14; + mobility_eff[3] = 328.32 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.43806; + 
c_g_ideal[3] = 1.46e-15; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15 ; + I_on_n[3] = 399.8e-6; + I_on_p[3] = 243.4e-6; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 2.23e-11; + I_off_n[3][10] = 3.46e-11; + I_off_n[3][20] = 5.24e-11; + I_off_n[3][30] = 7.75e-11; + I_off_n[3][40] = 1.12e-10; + I_off_n[3][50] = 1.58e-10; + I_off_n[3][60] = 2.18e-10; + I_off_n[3][70] = 2.88e-10; + I_off_n[3][80] = 3.63e-10; + I_off_n[3][90] = 4.41e-10; + I_off_n[3][100] = 5.36e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.3; + Lphy[3] = 0.065; + Lelec[3] = 0.0426; + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.065; + curr_I_on_dram_cell = 20e-6; + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.065*0.065; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 3.3; + t_ox[3] = 5e-3; + v_th[3] = 1.0; + c_ox[3] = 6.16e-15; + mobility_eff[3] = 303.44 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.385; + c_g_ideal[3] = 4e-16; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15 ; + I_on_n[3] = 1031e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.69; + n_to_p_eff_curr_drv_ratio[3] = 2.39; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.80e-14; + I_off_n[3][10] = 3.64e-14; + 
I_off_n[3][20] = 7.03e-14; + I_off_n[3][30] = 1.31e-13; + I_off_n[3][40] = 2.35e-13; + I_off_n[3][50] = 4.09e-13; + I_off_n[3][60] = 6.89e-13; + I_off_n[3][70] = 1.13e-12; + I_off_n[3][80] = 1.78e-12; + I_off_n[3][90] = 2.71e-12; + I_off_n[3][100] = 3.99e-12; + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7; //Rather than scale proportionally to square of feature size, only scale linearly according to IBM cell processor + curr_core_tx_density = 1.25*0.7; + curr_sckt_co_eff = 1.1359; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if (tech == 45) + { //45nm technology-node. Corresponds to year 2010 in ITRS + //ITRS HP device type + SENSE_AMP_D = .04e-9; // s + SENSE_AMP_P = 2.7e-15; // J + vdd[0] = 1.0; + Lphy[0] = 0.018; + Lelec[0] = 0.01345; + t_ox[0] = 0.65e-3; + v_th[0] = .18035; + c_ox[0] = 3.77e-14; + mobility_eff[0] = 266.68 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[0] = 9.38E-2; + c_g_ideal[0] = 6.78e-16; + c_fringe[0] = 0.05e-15; + c_junc[0] = 1e-15; + I_on_n[0] = 2046.6e-6; + //There are certain problems with the ITRS PMOS numbers in MASTAR for 45nm. 
So we are using 65nm values of + //n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier for 45nm + I_on_p[0] = I_on_n[0] / 2;//This value is fixed arbitrarily but I_on_p is not being used in CACTI + nmos_effective_resistance_multiplier = 1.51; + n_to_p_eff_curr_drv_ratio[0] = 2.41; + gmp_to_gmn_multiplier[0] = 1.38; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0]; + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; + long_channel_leakage_reduction[0] = 1/3.546;//Using MASTAR, @380K, increase Lgate until Ion reduces to 90%, Ioff(Lgate normal)/Ioff(Lgate long)= 3.74 + I_off_n[0][0] = 2.8e-7; + I_off_n[0][10] = 3.28e-7; + I_off_n[0][20] = 3.81e-7; + I_off_n[0][30] = 4.39e-7; + I_off_n[0][40] = 5.02e-7; + I_off_n[0][50] = 5.69e-7; + I_off_n[0][60] = 6.42e-7; + I_off_n[0][70] = 7.2e-7; + I_off_n[0][80] = 8.03e-7; + I_off_n[0][90] = 8.91e-7; + I_off_n[0][100] = 9.84e-7; + + I_g_on_n[0][0] = 3.59e-8;//A/micron + I_g_on_n[0][10] = 3.59e-8; + I_g_on_n[0][20] = 3.59e-8; + I_g_on_n[0][30] = 3.59e-8; + I_g_on_n[0][40] = 3.59e-8; + I_g_on_n[0][50] = 3.59e-8; + I_g_on_n[0][60] = 3.59e-8; + I_g_on_n[0][70] = 3.59e-8; + I_g_on_n[0][80] = 3.59e-8; + I_g_on_n[0][90] = 3.59e-8; + I_g_on_n[0][100] = 3.59e-8; + + //ITRS LSTP device type + vdd[1] = 1.1; + Lphy[1] = 0.028; + Lelec[1] = 0.0212; + t_ox[1] = 1.4e-3; + v_th[1] = 0.50245; + c_ox[1] = 2.01e-14; + mobility_eff[1] = 363.96 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 9.12e-2; + c_g_ideal[1] = 5.18e-16; + c_fringe[1] = 0.08e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 666.2e-6; + I_on_p[1] = I_on_n[1] / 2; + nmos_effective_resistance_multiplier = 1.99; + n_to_p_eff_curr_drv_ratio[1] = 2.23; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1/2.08; + I_off_n[1][0] = 1.01e-11; + I_off_n[1][10] = 1.65e-11; + I_off_n[1][20] = 2.62e-11; 
+ I_off_n[1][30] = 4.06e-11; + I_off_n[1][40] = 6.12e-11; + I_off_n[1][50] = 9.02e-11; + I_off_n[1][60] = 1.3e-10; + I_off_n[1][70] = 1.83e-10; + I_off_n[1][80] = 2.51e-10; + I_off_n[1][90] = 3.29e-10; + I_off_n[1][100] = 4.1e-10; + + I_g_on_n[1][0] = 9.47e-12;//A/micron + I_g_on_n[1][10] = 9.47e-12; + I_g_on_n[1][20] = 9.47e-12; + I_g_on_n[1][30] = 9.47e-12; + I_g_on_n[1][40] = 9.47e-12; + I_g_on_n[1][50] = 9.47e-12; + I_g_on_n[1][60] = 9.47e-12; + I_g_on_n[1][70] = 9.47e-12; + I_g_on_n[1][80] = 9.47e-12; + I_g_on_n[1][90] = 9.47e-12; + I_g_on_n[1][100] = 9.47e-12; + + //ITRS LOP device type + vdd[2] = 0.7; + Lphy[2] = 0.022; + Lelec[2] = 0.016; + t_ox[2] = 0.9e-3; + v_th[2] = 0.22599; + c_ox[2] = 2.82e-14;//F/micron2 + mobility_eff[2] = 508.9 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 5.71e-2; + c_g_ideal[2] = 6.2e-16; + c_fringe[2] = 0.073e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 748.9e-6; + I_on_p[2] = I_on_n[2] / 2; + nmos_effective_resistance_multiplier = 1.76; + n_to_p_eff_curr_drv_ratio[2] = 2.28; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + long_channel_leakage_reduction[2] = 1/1.92; + I_off_n[2][0] = 4.03e-9; + I_off_n[2][10] = 5.02e-9; + I_off_n[2][20] = 6.18e-9; + I_off_n[2][30] = 7.51e-9; + I_off_n[2][40] = 9.04e-9; + I_off_n[2][50] = 1.08e-8; + I_off_n[2][60] = 1.27e-8; + I_off_n[2][70] = 1.47e-8; + I_off_n[2][80] = 1.66e-8; + I_off_n[2][90] = 1.84e-8; + I_off_n[2][100] = 2.03e-8; + + I_g_on_n[2][0] = 3.24e-8;//A/micron + I_g_on_n[2][10] = 4.01e-8; + I_g_on_n[2][20] = 4.90e-8; + I_g_on_n[2][30] = 5.92e-8; + I_g_on_n[2][40] = 7.08e-8; + I_g_on_n[2][50] = 8.38e-8; + I_g_on_n[2][60] = 9.82e-8; + I_g_on_n[2][70] = 1.14e-7; + I_g_on_n[2][80] = 1.29e-7; + I_g_on_n[2][90] = 1.43e-7; + I_g_on_n[2][100] = 1.54e-7; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + 
curr_vdd_dram_cell = 1.1; + Lphy[3] = 0.078; + Lelec[3] = 0.0504;// Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors. + curr_v_th_dram_access_transistor = 0.44559; + width_dram_access_transistor = 0.079; + curr_I_on_dram_cell = 36e-6;//A + curr_I_off_dram_cell_worst_case_length_temp = 19.5e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.5; + t_ox[3] = 2.1e-3; + v_th[3] = 0.44559; + c_ox[3] = 1.41e-14; + mobility_eff[3] = 426.30 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.181; + c_g_ideal[3] = 1.10e-15; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 456e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 2.54e-11; + I_off_n[3][10] = 3.94e-11; + I_off_n[3][20] = 5.95e-11; + I_off_n[3][30] = 8.79e-11; + I_off_n[3][40] = 1.27e-10; + I_off_n[3][50] = 1.79e-10; + I_off_n[3][60] = 2.47e-10; + I_off_n[3][70] = 3.31e-10; + I_off_n[3][80] = 4.26e-10; + I_off_n[3][90] = 5.27e-10; + I_off_n[3][100] = 6.46e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.1; + Lphy[3] = 0.045; + Lelec[3] = 0.0298; + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.045; + curr_I_on_dram_cell = 20e-6;//A + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + 
curr_area_cell_dram = 6*0.045*0.045; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 2.7; + t_ox[3] = 4e-3; + v_th[3] = 1.0; + c_ox[3] = 7.98e-15; + mobility_eff[3] = 368.58 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.147; + c_g_ideal[3] = 3.59e-16; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 999.4e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.69; + n_to_p_eff_curr_drv_ratio[3] = 1.95; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.31e-14; + I_off_n[3][10] = 2.68e-14; + I_off_n[3][20] = 5.25e-14; + I_off_n[3][30] = 9.88e-14; + I_off_n[3][40] = 1.79e-13; + I_off_n[3][50] = 3.15e-13; + I_off_n[3][60] = 5.36e-13; + I_off_n[3][70] = 8.86e-13; + I_off_n[3][80] = 1.42e-12; + I_off_n[3][90] = 2.20e-12; + I_off_n[3][100] = 3.29e-12; + } + + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7; + curr_core_tx_density = 1.25; + curr_sckt_co_eff = 1.1387; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if (tech == 32) + { + SENSE_AMP_D = .03e-9; // s + 
SENSE_AMP_P = 2.16e-15; // J + //For 2013, MPU/ASIC stagger-contacted M1 half-pitch is 32 nm (so this is 32 nm + //technology i.e. FEATURESIZE = 0.032). Using the SOI process numbers for + //HP and LSTP. + vdd[0] = 0.9; + Lphy[0] = 0.013; + Lelec[0] = 0.01013; + t_ox[0] = 0.5e-3; + v_th[0] = 0.21835; + c_ox[0] = 4.11e-14; + mobility_eff[0] = 361.84 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[0] = 5.09E-2; + c_g_ideal[0] = 5.34e-16; + c_fringe[0] = 0.04e-15; + c_junc[0] = 1e-15; + I_on_n[0] = 2211.7e-6; + I_on_p[0] = I_on_n[0] / 2; + nmos_effective_resistance_multiplier = 1.49; + n_to_p_eff_curr_drv_ratio[0] = 2.41; + gmp_to_gmn_multiplier[0] = 1.38; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1/3.706; + //Using MASTAR, @300K (380K does not work in MASTAR), increase Lgate until Ion reduces to 95% or Lgate increase by 5% (DG device can only increase by 5%), + //whichever comes first + I_off_n[0][0] = 1.52e-7; + I_off_n[0][10] = 1.55e-7; + I_off_n[0][20] = 1.59e-7; + I_off_n[0][30] = 1.68e-7; + I_off_n[0][40] = 1.90e-7; + I_off_n[0][50] = 2.69e-7; + I_off_n[0][60] = 5.32e-7; + I_off_n[0][70] = 1.02e-6; + I_off_n[0][80] = 1.62e-6; + I_off_n[0][90] = 2.73e-6; + I_off_n[0][100] = 6.1e-6; + + I_g_on_n[0][0] = 6.55e-8;//A/micron + I_g_on_n[0][10] = 6.55e-8; + I_g_on_n[0][20] = 6.55e-8; + I_g_on_n[0][30] = 6.55e-8; + I_g_on_n[0][40] = 6.55e-8; + I_g_on_n[0][50] = 6.55e-8; + I_g_on_n[0][60] = 6.55e-8; + I_g_on_n[0][70] = 6.55e-8; + I_g_on_n[0][80] = 6.55e-8; + I_g_on_n[0][90] = 6.55e-8; + I_g_on_n[0][100] = 6.55e-8; + +// 32 DG +// I_g_on_n[0][0] = 2.71e-9;//A/micron +// I_g_on_n[0][10] = 2.71e-9; +// I_g_on_n[0][20] = 2.71e-9; +// I_g_on_n[0][30] = 2.71e-9; +// I_g_on_n[0][40] = 2.71e-9; +// I_g_on_n[0][50] = 2.71e-9; +// I_g_on_n[0][60] = 2.71e-9; +// I_g_on_n[0][70] = 2.71e-9; +// I_g_on_n[0][80] = 2.71e-9; +// 
I_g_on_n[0][90] = 2.71e-9; +// I_g_on_n[0][100] = 2.71e-9; + + //LSTP device type + vdd[1] = 1; + Lphy[1] = 0.020; + Lelec[1] = 0.0173; + t_ox[1] = 1.2e-3; + v_th[1] = 0.513; + c_ox[1] = 2.29e-14; + mobility_eff[1] = 347.46 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 8.64e-2; + c_g_ideal[1] = 4.58e-16; + c_fringe[1] = 0.053e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 683.6e-6; + I_on_p[1] = I_on_n[1] / 2; + nmos_effective_resistance_multiplier = 1.99; + n_to_p_eff_curr_drv_ratio[1] = 2.23; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1/1.93; + I_off_n[1][0] = 2.06e-11; + I_off_n[1][10] = 3.30e-11; + I_off_n[1][20] = 5.15e-11; + I_off_n[1][30] = 7.83e-11; + I_off_n[1][40] = 1.16e-10; + I_off_n[1][50] = 1.69e-10; + I_off_n[1][60] = 2.40e-10; + I_off_n[1][70] = 3.34e-10; + I_off_n[1][80] = 4.54e-10; + I_off_n[1][90] = 5.96e-10; + I_off_n[1][100] = 7.44e-10; + + I_g_on_n[1][0] = 3.73e-11;//A/micron + I_g_on_n[1][10] = 3.73e-11; + I_g_on_n[1][20] = 3.73e-11; + I_g_on_n[1][30] = 3.73e-11; + I_g_on_n[1][40] = 3.73e-11; + I_g_on_n[1][50] = 3.73e-11; + I_g_on_n[1][60] = 3.73e-11; + I_g_on_n[1][70] = 3.73e-11; + I_g_on_n[1][80] = 3.73e-11; + I_g_on_n[1][90] = 3.73e-11; + I_g_on_n[1][100] = 3.73e-11; + + + //LOP device type + vdd[2] = 0.6; + Lphy[2] = 0.016; + Lelec[2] = 0.01232; + t_ox[2] = 0.9e-3; + v_th[2] = 0.24227; + c_ox[2] = 2.84e-14; + mobility_eff[2] = 513.52 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 4.64e-2; + c_g_ideal[2] = 4.54e-16; + c_fringe[2] = 0.057e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 827.8e-6; + I_on_p[2] = I_on_n[2] / 2; + nmos_effective_resistance_multiplier = 1.73; + n_to_p_eff_curr_drv_ratio[2] = 2.28; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + 
long_channel_leakage_reduction[2] = 1/1.89; + I_off_n[2][0] = 5.94e-8; + I_off_n[2][10] = 7.23e-8; + I_off_n[2][20] = 8.7e-8; + I_off_n[2][30] = 1.04e-7; + I_off_n[2][40] = 1.22e-7; + I_off_n[2][50] = 1.43e-7; + I_off_n[2][60] = 1.65e-7; + I_off_n[2][70] = 1.90e-7; + I_off_n[2][80] = 2.15e-7; + I_off_n[2][90] = 2.39e-7; + I_off_n[2][100] = 2.63e-7; + + I_g_on_n[2][0] = 2.93e-9;//A/micron + I_g_on_n[2][10] = 2.93e-9; + I_g_on_n[2][20] = 2.93e-9; + I_g_on_n[2][30] = 2.93e-9; + I_g_on_n[2][40] = 2.93e-9; + I_g_on_n[2][50] = 2.93e-9; + I_g_on_n[2][60] = 2.93e-9; + I_g_on_n[2][70] = 2.93e-9; + I_g_on_n[2][80] = 2.93e-9; + I_g_on_n[2][90] = 2.93e-9; + I_g_on_n[2][100] = 2.93e-9; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.0; + Lphy[3] = 0.056; + Lelec[3] = 0.0419;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors. + curr_v_th_dram_access_transistor = 0.44129; + width_dram_access_transistor = 0.056; + curr_I_on_dram_cell = 36e-6; + curr_I_off_dram_cell_worst_case_length_temp = 18.9e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.5; + t_ox[3] = 2e-3; + v_th[3] = 0.44467; + c_ox[3] = 1.48e-14; + mobility_eff[3] = 408.12 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.174; + c_g_ideal[3] = 7.45e-16; + c_fringe[3] = 0.053e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 1055.4e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + 
I_off_n[3][0] = 3.57e-11; + I_off_n[3][10] = 5.51e-11; + I_off_n[3][20] = 8.27e-11; + I_off_n[3][30] = 1.21e-10; + I_off_n[3][40] = 1.74e-10; + I_off_n[3][50] = 2.45e-10; + I_off_n[3][60] = 3.38e-10; + I_off_n[3][70] = 4.53e-10; + I_off_n[3][80] = 5.87e-10; + I_off_n[3][90] = 7.29e-10; + I_off_n[3][100] = 8.87e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.0; + Lphy[3] = 0.032; + Lelec[3] = 0.0205;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors. + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.032; + curr_I_on_dram_cell = 20e-6; + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.032*0.032; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 2.6; + t_ox[3] = 4e-3; + v_th[3] = 1.0; + c_ox[3] = 7.99e-15; + mobility_eff[3] = 380.76 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.129; + c_g_ideal[3] = 2.56e-16; + c_fringe[3] = 0.053e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 1024.5e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.69; + n_to_p_eff_curr_drv_ratio[3] = 1.95; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 3.63e-14; + I_off_n[3][10] = 7.18e-14; + I_off_n[3][20] = 1.36e-13; + I_off_n[3][30] = 2.49e-13; + I_off_n[3][40] = 4.41e-13; + I_off_n[3][50] = 7.55e-13; + I_off_n[3][60] = 1.26e-12; + I_off_n[3][70] = 2.03e-12; + I_off_n[3][80] = 3.19e-12; + I_off_n[3][90] = 4.87e-12; + I_off_n[3][100] = 7.16e-12; + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + 
curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7*0.7; + curr_core_tx_density = 1.25/0.7; + curr_sckt_co_eff = 1.1111; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if(tech == 22){ + SENSE_AMP_D = .03e-9; // s + SENSE_AMP_P = 2.16e-15; // J + //For 2016, MPU/ASIC stagger-contacted M1 half-pitch is 22 nm (so this is 22 nm + //technology i.e. FEATURESIZE = 0.022). Using the DG process numbers for HP. + //22 nm HP + vdd[0] = 0.8; + Lphy[0] = 0.009;//Lphy is the physical gate-length. + Lelec[0] = 0.00468;//Lelec is the electrical gate-length. + t_ox[0] = 0.55e-3;//micron + v_th[0] = 0.1395;//V + c_ox[0] = 3.63e-14;//F/micron2 + mobility_eff[0] = 426.07 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 2.33e-2; //V/micron + c_g_ideal[0] = 3.27e-16;//F/micron + c_fringe[0] = 0.06e-15;//F/micron + c_junc[0] = 0;//F/micron2 + I_on_n[0] = 2626.4e-6;//A/micron + I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.45; + n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in + //"Dynamic" tab of Device workspace. + gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value. 
+ Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1/3.274; + I_off_n[0][0] = 1.52e-7/1.5*1.2;//From 22nm, leakage current are directly from ITRS report rather than MASTAR, since MASTAR has serious bugs there. + I_off_n[0][10] = 1.55e-7/1.5*1.2; + I_off_n[0][20] = 1.59e-7/1.5*1.2; + I_off_n[0][30] = 1.68e-7/1.5*1.2; + I_off_n[0][40] = 1.90e-7/1.5*1.2; + I_off_n[0][50] = 2.69e-7/1.5*1.2; + I_off_n[0][60] = 5.32e-7/1.5*1.2; + I_off_n[0][70] = 1.02e-6/1.5*1.2; + I_off_n[0][80] = 1.62e-6/1.5*1.2; + I_off_n[0][90] = 2.73e-6/1.5*1.2; + I_off_n[0][100] = 6.1e-6/1.5*1.2; + //for 22nm DG HP + I_g_on_n[0][0] = 1.81e-9;//A/micron + I_g_on_n[0][10] = 1.81e-9; + I_g_on_n[0][20] = 1.81e-9; + I_g_on_n[0][30] = 1.81e-9; + I_g_on_n[0][40] = 1.81e-9; + I_g_on_n[0][50] = 1.81e-9; + I_g_on_n[0][60] = 1.81e-9; + I_g_on_n[0][70] = 1.81e-9; + I_g_on_n[0][80] = 1.81e-9; + I_g_on_n[0][90] = 1.81e-9; + I_g_on_n[0][100] = 1.81e-9; + + //22 nm LSTP DG + vdd[1] = 0.8; + Lphy[1] = 0.014; + Lelec[1] = 0.008;//Lelec is the electrical gate-length. 
+ t_ox[1] = 1.1e-3;//micron + v_th[1] = 0.40126;//V + c_ox[1] = 2.30e-14;//F/micron2 + mobility_eff[1] = 738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[1] = 6.64e-2; //V/micron + c_g_ideal[1] = 3.22e-16;//F/micron + c_fringe[1] = 0.08e-15; + c_junc[1] = 0;//F/micron2 + I_on_n[1] = 727.6e-6;//A/micron + I_on_p[1] = I_on_n[1] / 2; + nmos_effective_resistance_multiplier = 1.99; + n_to_p_eff_curr_drv_ratio[1] = 2; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron + long_channel_leakage_reduction[1] = 1/1.89; + I_off_n[1][0] = 2.43e-11; + I_off_n[1][10] = 4.85e-11; + I_off_n[1][20] = 9.68e-11; + I_off_n[1][30] = 1.94e-10; + I_off_n[1][40] = 3.87e-10; + I_off_n[1][50] = 7.73e-10; + I_off_n[1][60] = 3.55e-10; + I_off_n[1][70] = 3.09e-9; + I_off_n[1][80] = 6.19e-9; + I_off_n[1][90] = 1.24e-8; + I_off_n[1][100]= 2.48e-8; + + I_g_on_n[1][0] = 4.51e-10;//A/micron + I_g_on_n[1][10] = 4.51e-10; + I_g_on_n[1][20] = 4.51e-10; + I_g_on_n[1][30] = 4.51e-10; + I_g_on_n[1][40] = 4.51e-10; + I_g_on_n[1][50] = 4.51e-10; + I_g_on_n[1][60] = 4.51e-10; + I_g_on_n[1][70] = 4.51e-10; + I_g_on_n[1][80] = 4.51e-10; + I_g_on_n[1][90] = 4.51e-10; + I_g_on_n[1][100] = 4.51e-10; + + //22 nm LOP + vdd[2] = 0.6; + Lphy[2] = 0.011; + Lelec[2] = 0.00604;//Lelec is the electrical gate-length. 
+ t_ox[2] = 0.8e-3;//micron + v_th[2] = 0.2315;//V + c_ox[2] = 2.87e-14;//F/micron2 + mobility_eff[2] = 698.37 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[2] = 1.81e-2; //V/micron + c_g_ideal[2] = 3.16e-16;//F/micron + c_fringe[2] = 0.08e-15; + c_junc[2] = 0;//F/micron2 This is Cj0 not Cjunc in MASTAR results->Dynamic Tab + I_on_n[2] = 916.1e-6;//A/micron + I_on_p[2] = I_on_n[2] / 2; + nmos_effective_resistance_multiplier = 1.73; + n_to_p_eff_curr_drv_ratio[2] = 2; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];//ohm-micron + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];//ohm-micron + long_channel_leakage_reduction[2] = 1/2.38; + + I_off_n[2][0] = 1.31e-8; + I_off_n[2][10] = 2.60e-8; + I_off_n[2][20] = 5.14e-8; + I_off_n[2][30] = 1.02e-7; + I_off_n[2][40] = 2.02e-7; + I_off_n[2][50] = 3.99e-7; + I_off_n[2][60] = 7.91e-7; + I_off_n[2][70] = 1.09e-6; + I_off_n[2][80] = 2.09e-6; + I_off_n[2][90] = 4.04e-6; + I_off_n[2][100]= 4.48e-6; + + I_g_on_n[2][0] = 2.74e-9;//A/micron + I_g_on_n[2][10] = 2.74e-9; + I_g_on_n[2][20] = 2.74e-9; + I_g_on_n[2][30] = 2.74e-9; + I_g_on_n[2][40] = 2.74e-9; + I_g_on_n[2][50] = 2.74e-9; + I_g_on_n[2][60] = 2.74e-9; + I_g_on_n[2][70] = 2.74e-9; + I_g_on_n[2][80] = 2.74e-9; + I_g_on_n[2][90] = 2.74e-9; + I_g_on_n[2][100] = 2.74e-9; + + + + if (ram_cell_tech_type == 3) + {} + else if (ram_cell_tech_type == 4) + { + //22 nm commodity DRAM cell access transistor technology parameters. + //parameters + curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In + //2005 ITRS, the value was about twice the value in 2007 ITRS + Lphy[3] = 0.022;//micron + Lelec[3] = 0.0181;//micron. + curr_v_th_dram_access_transistor = 1;//V + width_dram_access_transistor = 0.022;//micron + curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always + //kept constant. 
In reality this could perhaps be lower + curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.022*0.022;//micron2. + curr_asp_ratio_cell_dram = 0.667; + curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus + //kept constant. + + //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR. + curr_vpp = 2.3;//vpp. V + t_ox[3] = 3.5e-3;//micron + v_th[3] = 1.0;//V + c_ox[3] = 9.06e-15;//F/micron2 + mobility_eff[3] = 367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs + Vdsat[3] = 0.0972; //V/micron + c_g_ideal[3] = 1.99e-16;//F/micron + c_fringe[3] = 0.053e-15;//F/micron + c_junc[3] = 1e-15;//F/micron2 + I_on_n[3] = 910.5e-6;//A/micron + I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm. + // + n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];//ohm-micron + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.1e-13; //A/micron + I_off_n[3][10] = 2.11e-13; + I_off_n[3][20] = 3.88e-13; + I_off_n[3][30] = 6.9e-13; + I_off_n[3][40] = 1.19e-12; + I_off_n[3][50] = 1.98e-12; + I_off_n[3][60] = 3.22e-12; + I_off_n[3][70] = 5.09e-12; + I_off_n[3][80] = 7.85e-12; + I_off_n[3][90] = 1.18e-11; + I_off_n[3][100] = 1.72e-11; + + } + else + { + //some error handler + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam 
= 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7; + curr_core_tx_density = 1.25/0.7/0.7; + curr_sckt_co_eff = 1.1296; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if(tech == 16){ + //For 2019, MPU/ASIC stagger-contacted M1 half-pitch is 16 nm (so this is 16 nm + //technology i.e. FEATURESIZE = 0.016). Using the DG process numbers for HP. + //16 nm HP + vdd[0] = 0.7; + Lphy[0] = 0.006;//Lphy is the physical gate-length. + Lelec[0] = 0.00315;//Lelec is the electrical gate-length. + t_ox[0] = 0.5e-3;//micron + v_th[0] = 0.1489;//V + c_ox[0] = 3.83e-14;//F/micron2 Cox_elec in MASTAR + mobility_eff[0] = 476.15 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 1.42e-2; //V/micron calculated in spreadsheet + c_g_ideal[0] = 2.30e-16;//F/micron + c_fringe[0] = 0.06e-15;//F/micron MASTAR inputdynamic/3 + c_junc[0] = 0;//F/micron2 MASTAR result dynamic + I_on_n[0] = 2768.4e-6;//A/micron + I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.48;//nmos_effective_resistance_multiplier is the ratio of Ieff to Idsat where Ieff is the effective NMOS current and Idsat is the saturation current. + n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in + //"Dynamic" tab of Device workspace. + gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value. 
+ Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1/2.655; + I_off_n[0][0] = 1.52e-7/1.5*1.2*1.07; + I_off_n[0][10] = 1.55e-7/1.5*1.2*1.07; + I_off_n[0][20] = 1.59e-7/1.5*1.2*1.07; + I_off_n[0][30] = 1.68e-7/1.5*1.2*1.07; + I_off_n[0][40] = 1.90e-7/1.5*1.2*1.07; + I_off_n[0][50] = 2.69e-7/1.5*1.2*1.07; + I_off_n[0][60] = 5.32e-7/1.5*1.2*1.07; + I_off_n[0][70] = 1.02e-6/1.5*1.2*1.07; + I_off_n[0][80] = 1.62e-6/1.5*1.2*1.07; + I_off_n[0][90] = 2.73e-6/1.5*1.2*1.07; + I_off_n[0][100] = 6.1e-6/1.5*1.2*1.07; + //for 16nm DG HP + I_g_on_n[0][0] = 1.07e-9;//A/micron + I_g_on_n[0][10] = 1.07e-9; + I_g_on_n[0][20] = 1.07e-9; + I_g_on_n[0][30] = 1.07e-9; + I_g_on_n[0][40] = 1.07e-9; + I_g_on_n[0][50] = 1.07e-9; + I_g_on_n[0][60] = 1.07e-9; + I_g_on_n[0][70] = 1.07e-9; + I_g_on_n[0][80] = 1.07e-9; + I_g_on_n[0][90] = 1.07e-9; + I_g_on_n[0][100] = 1.07e-9; + +// //16 nm LSTP DG +// vdd[1] = 0.8; +// Lphy[1] = 0.014; +// Lelec[1] = 0.008;//Lelec is the electrical gate-length. 
+// t_ox[1] = 1.1e-3;//micron +// v_th[1] = 0.40126;//V +// c_ox[1] = 2.30e-14;//F/micron2 +// mobility_eff[1] = 738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs +// Vdsat[1] = 6.64e-2; //V/micron +// c_g_ideal[1] = 3.22e-16;//F/micron +// c_fringe[1] = 0.008e-15; +// c_junc[1] = 0;//F/micron2 +// I_on_n[1] = 727.6e-6;//A/micron +// I_on_p[1] = I_on_n[1] / 2; +// nmos_effective_resistance_multiplier = 1.99; +// n_to_p_eff_curr_drv_ratio[1] = 2; +// gmp_to_gmn_multiplier[1] = 0.99; +// Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron +// Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron +// I_off_n[1][0] = 2.43e-11; +// I_off_n[1][10] = 4.85e-11; +// I_off_n[1][20] = 9.68e-11; +// I_off_n[1][30] = 1.94e-10; +// I_off_n[1][40] = 3.87e-10; +// I_off_n[1][50] = 7.73e-10; +// I_off_n[1][60] = 3.55e-10; +// I_off_n[1][70] = 3.09e-9; +// I_off_n[1][80] = 6.19e-9; +// I_off_n[1][90] = 1.24e-8; +// I_off_n[1][100]= 2.48e-8; +// +// // for 22nm LSTP HP +// I_g_on_n[1][0] = 4.51e-10;//A/micron +// I_g_on_n[1][10] = 4.51e-10; +// I_g_on_n[1][20] = 4.51e-10; +// I_g_on_n[1][30] = 4.51e-10; +// I_g_on_n[1][40] = 4.51e-10; +// I_g_on_n[1][50] = 4.51e-10; +// I_g_on_n[1][60] = 4.51e-10; +// I_g_on_n[1][70] = 4.51e-10; +// I_g_on_n[1][80] = 4.51e-10; +// I_g_on_n[1][90] = 4.51e-10; +// I_g_on_n[1][100] = 4.51e-10; + + + if (ram_cell_tech_type == 3) + {} + else if (ram_cell_tech_type == 4) + { + //22 nm commodity DRAM cell access transistor technology parameters. + //parameters + curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In + //2005 ITRS, the value was about twice the value in 2007 ITRS + Lphy[3] = 0.022;//micron + Lelec[3] = 0.0181;//micron. + curr_v_th_dram_access_transistor = 1;//V + width_dram_access_transistor = 0.022;//micron + curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always + //kept constant. 
In reality this could perhaps be lower + curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.022*0.022;//micron2. + curr_asp_ratio_cell_dram = 0.667; + curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus + //kept constant. + + //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR. + curr_vpp = 2.3;//vpp. V + t_ox[3] = 3.5e-3;//micron + v_th[3] = 1.0;//V + c_ox[3] = 9.06e-15;//F/micron2 + mobility_eff[3] = 367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs + Vdsat[3] = 0.0972; //V/micron + c_g_ideal[3] = 1.99e-16;//F/micron + c_fringe[3] = 0.053e-15;//F/micron + c_junc[3] = 1e-15;//F/micron2 + I_on_n[3] = 910.5e-6;//A/micron + I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm. + // + n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];//ohm-micron + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.1e-13; //A/micron + I_off_n[3][10] = 2.11e-13; + I_off_n[3][20] = 3.88e-13; + I_off_n[3][30] = 6.9e-13; + I_off_n[3][40] = 1.19e-12; + I_off_n[3][50] = 1.98e-12; + I_off_n[3][60] = 3.22e-12; + I_off_n[3][70] = 5.09e-12; + I_off_n[3][80] = 7.85e-12; + I_off_n[3][90] = 1.18e-11; + I_off_n[3][100] = 1.72e-11; + + } + else + { + //some error handler + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam 
= 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7*0.7; + curr_core_tx_density = 1.25/0.7/0.7/0.7; + curr_sckt_co_eff = 1.1296; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + + g_tp.peri_global.Vdd += curr_alpha * vdd[peri_global_tech_type]; + g_tp.peri_global.t_ox += curr_alpha * t_ox[peri_global_tech_type]; + g_tp.peri_global.Vth += curr_alpha * v_th[peri_global_tech_type]; + g_tp.peri_global.C_ox += curr_alpha * c_ox[peri_global_tech_type]; + g_tp.peri_global.C_g_ideal += curr_alpha * c_g_ideal[peri_global_tech_type]; + g_tp.peri_global.C_fringe += curr_alpha * c_fringe[peri_global_tech_type]; + g_tp.peri_global.C_junc += curr_alpha * c_junc[peri_global_tech_type]; + g_tp.peri_global.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.peri_global.l_phy += curr_alpha * Lphy[peri_global_tech_type]; + g_tp.peri_global.l_elec += curr_alpha * Lelec[peri_global_tech_type]; + g_tp.peri_global.I_on_n += curr_alpha * I_on_n[peri_global_tech_type]; + g_tp.peri_global.R_nch_on += curr_alpha * Rnchannelon[peri_global_tech_type]; + g_tp.peri_global.R_pch_on += curr_alpha * Rpchannelon[peri_global_tech_type]; + g_tp.peri_global.n_to_p_eff_curr_drv_ratio + += curr_alpha * n_to_p_eff_curr_drv_ratio[peri_global_tech_type]; + g_tp.peri_global.long_channel_leakage_reduction + += curr_alpha * long_channel_leakage_reduction[peri_global_tech_type]; + g_tp.peri_global.I_off_n += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300]; + g_tp.peri_global.I_off_p += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300]; + g_tp.peri_global.I_g_on_n += curr_alpha * 
I_g_on_n[peri_global_tech_type][g_ip->temp - 300]; + g_tp.peri_global.I_g_on_p += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300]; + gmp_to_gmn_multiplier_periph_global += curr_alpha * gmp_to_gmn_multiplier[peri_global_tech_type]; + + g_tp.sram_cell.Vdd += curr_alpha * vdd[ram_cell_tech_type]; + g_tp.sram_cell.l_phy += curr_alpha * Lphy[ram_cell_tech_type]; + g_tp.sram_cell.l_elec += curr_alpha * Lelec[ram_cell_tech_type]; + g_tp.sram_cell.t_ox += curr_alpha * t_ox[ram_cell_tech_type]; + g_tp.sram_cell.Vth += curr_alpha * v_th[ram_cell_tech_type]; + g_tp.sram_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type]; + g_tp.sram_cell.C_fringe += curr_alpha * c_fringe[ram_cell_tech_type]; + g_tp.sram_cell.C_junc += curr_alpha * c_junc[ram_cell_tech_type]; + g_tp.sram_cell.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.sram_cell.I_on_n += curr_alpha * I_on_n[ram_cell_tech_type]; + g_tp.sram_cell.R_nch_on += curr_alpha * Rnchannelon[ram_cell_tech_type]; + g_tp.sram_cell.R_pch_on += curr_alpha * Rpchannelon[ram_cell_tech_type]; + g_tp.sram_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type]; + g_tp.sram_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type]; + g_tp.sram_cell.I_off_n += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.sram_cell.I_off_p += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.sram_cell.I_g_on_n += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.sram_cell.I_g_on_p += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + + g_tp.dram_cell_Vdd += curr_alpha * curr_vdd_dram_cell; + g_tp.dram_acc.Vth += curr_alpha * curr_v_th_dram_access_transistor; + g_tp.dram_acc.l_phy += curr_alpha * Lphy[dram_cell_tech_flavor]; + g_tp.dram_acc.l_elec += curr_alpha * Lelec[dram_cell_tech_flavor]; + g_tp.dram_acc.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor]; + 
g_tp.dram_acc.C_fringe += curr_alpha * c_fringe[dram_cell_tech_flavor]; + g_tp.dram_acc.C_junc += curr_alpha * c_junc[dram_cell_tech_flavor]; + g_tp.dram_acc.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.dram_cell_I_on += curr_alpha * curr_I_on_dram_cell; + g_tp.dram_cell_I_off_worst_case_len_temp += curr_alpha * curr_I_off_dram_cell_worst_case_length_temp; + g_tp.dram_acc.I_on_n += curr_alpha * I_on_n[dram_cell_tech_flavor]; + g_tp.dram_cell_C += curr_alpha * curr_c_dram_cell; + g_tp.vpp += curr_alpha * curr_vpp; + g_tp.dram_wl.l_phy += curr_alpha * Lphy[dram_cell_tech_flavor]; + g_tp.dram_wl.l_elec += curr_alpha * Lelec[dram_cell_tech_flavor]; + g_tp.dram_wl.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor]; + g_tp.dram_wl.C_fringe += curr_alpha * c_fringe[dram_cell_tech_flavor]; + g_tp.dram_wl.C_junc += curr_alpha * c_junc[dram_cell_tech_flavor]; + g_tp.dram_wl.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.dram_wl.I_on_n += curr_alpha * I_on_n[dram_cell_tech_flavor]; + g_tp.dram_wl.R_nch_on += curr_alpha * Rnchannelon[dram_cell_tech_flavor]; + g_tp.dram_wl.R_pch_on += curr_alpha * Rpchannelon[dram_cell_tech_flavor]; + g_tp.dram_wl.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[dram_cell_tech_flavor]; + g_tp.dram_wl.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[dram_cell_tech_flavor]; + g_tp.dram_wl.I_off_n += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300]; + g_tp.dram_wl.I_off_p += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300]; + + g_tp.cam_cell.Vdd += curr_alpha * vdd[ram_cell_tech_type]; + g_tp.cam_cell.l_phy += curr_alpha * Lphy[ram_cell_tech_type]; + g_tp.cam_cell.l_elec += curr_alpha * Lelec[ram_cell_tech_type]; + g_tp.cam_cell.t_ox += curr_alpha * t_ox[ram_cell_tech_type]; + g_tp.cam_cell.Vth += curr_alpha * v_th[ram_cell_tech_type]; + g_tp.cam_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type]; + g_tp.cam_cell.C_fringe += curr_alpha * 
c_fringe[ram_cell_tech_type]; + g_tp.cam_cell.C_junc += curr_alpha * c_junc[ram_cell_tech_type]; + g_tp.cam_cell.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.cam_cell.I_on_n += curr_alpha * I_on_n[ram_cell_tech_type]; + g_tp.cam_cell.R_nch_on += curr_alpha * Rnchannelon[ram_cell_tech_type]; + g_tp.cam_cell.R_pch_on += curr_alpha * Rpchannelon[ram_cell_tech_type]; + g_tp.cam_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type]; + g_tp.cam_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type]; + g_tp.cam_cell.I_off_n += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.cam_cell.I_off_p += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.cam_cell.I_g_on_n += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.cam_cell.I_g_on_p += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + + g_tp.dram.cell_a_w += curr_alpha * curr_Wmemcella_dram; + g_tp.dram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_dram; + g_tp.dram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_dram; + area_cell_dram += curr_alpha * curr_area_cell_dram; + asp_ratio_cell_dram += curr_alpha * curr_asp_ratio_cell_dram; + + g_tp.sram.cell_a_w += curr_alpha * curr_Wmemcella_sram; + g_tp.sram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_sram; + g_tp.sram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_sram; + area_cell_sram += curr_alpha * curr_area_cell_sram; + asp_ratio_cell_sram += curr_alpha * curr_asp_ratio_cell_sram; + + g_tp.cam.cell_a_w += curr_alpha * curr_Wmemcella_cam;//sheng + g_tp.cam.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_cam; + g_tp.cam.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_cam; + area_cell_cam += curr_alpha * curr_area_cell_cam; + asp_ratio_cell_cam += curr_alpha * curr_asp_ratio_cell_cam; + + //Sense amplifier latch Gm calculation + mobility_eff_periph_global += curr_alpha * mobility_eff[peri_global_tech_type]; + 
Vdsat_periph_global += curr_alpha * Vdsat[peri_global_tech_type]; + + //Empirical undifferetiated core/FU coefficient + g_tp.scaling_factor.logic_scaling_co_eff += curr_alpha * curr_logic_scaling_co_eff; + g_tp.scaling_factor.core_tx_density += curr_alpha * curr_core_tx_density; + g_tp.chip_layout_overhead += curr_alpha * curr_chip_layout_overhead; + g_tp.macro_layout_overhead += curr_alpha * curr_macro_layout_overhead; + g_tp.sckt_co_eff += curr_alpha * curr_sckt_co_eff; + } + + + //Currently we are not modeling the resistance/capacitance of poly anywhere. + //Continuous function (or date have been processed) does not need linear interpolation + g_tp.w_comp_inv_p1 = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + g_tp.w_comp_inv_n1 = 7.5 * g_ip->F_sz_um;//this was 6 micron for the 0.8 micron process + g_tp.w_comp_inv_p2 = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + g_tp.w_comp_inv_n2 = 15 * g_ip->F_sz_um;//this was 12 micron for the 0.8 micron process + g_tp.w_comp_inv_p3 = 50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process + g_tp.w_comp_inv_n3 = 30 * g_ip->F_sz_um;//this was 24 micron for the 0.8 micron process + g_tp.w_eval_inv_p = 100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process + g_tp.w_eval_inv_n = 50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process + g_tp.w_comp_n = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + g_tp.w_comp_p = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process + + g_tp.MIN_GAP_BET_P_AND_N_DIFFS = 5 * g_ip->F_sz_um; + g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS = 1.5 * g_ip->F_sz_um; + g_tp.HPOWERRAIL = 2 * g_ip->F_sz_um; + g_tp.cell_h_def = 50 * g_ip->F_sz_um; + g_tp.w_poly_contact = g_ip->F_sz_um; + g_tp.spacing_poly_to_contact = g_ip->F_sz_um; + g_tp.spacing_poly_to_poly = 1.5 * g_ip->F_sz_um; + g_tp.ram_wl_stitching_overhead_ = 7.5 * g_ip->F_sz_um; + + g_tp.min_w_nmos_ = 3 * g_ip->F_sz_um / 2; + 
g_tp.max_w_nmos_ = 100 * g_ip->F_sz_um; + g_tp.w_iso = 12.5*g_ip->F_sz_um;//was 10 micron for the 0.8 micron process + g_tp.w_sense_n = 3.75*g_ip->F_sz_um; // sense amplifier N-trans; was 3 micron for the 0.8 micron process + g_tp.w_sense_p = 7.5*g_ip->F_sz_um; // sense amplifier P-trans; was 6 micron for the 0.8 micron process + g_tp.w_sense_en = 5*g_ip->F_sz_um; // Sense enable transistor of the sense amplifier; was 4 micron for the 0.8 micron process + g_tp.w_nmos_b_mux = 6 * g_tp.min_w_nmos_; + g_tp.w_nmos_sa_mux = 6 * g_tp.min_w_nmos_; + + if (ram_cell_tech_type == comm_dram) + { + g_tp.max_w_nmos_dec = 8 * g_ip->F_sz_um; + g_tp.h_dec = 8; // in the unit of memory cell height + } + else + { + g_tp.max_w_nmos_dec = g_tp.max_w_nmos_; + g_tp.h_dec = 4; // in the unit of memory cell height + } + + g_tp.peri_global.C_overlap = 0.2 * g_tp.peri_global.C_g_ideal; + g_tp.sram_cell.C_overlap = 0.2 * g_tp.sram_cell.C_g_ideal; + g_tp.cam_cell.C_overlap = 0.2 * g_tp.cam_cell.C_g_ideal; + + g_tp.dram_acc.C_overlap = 0.2 * g_tp.dram_acc.C_g_ideal; + g_tp.dram_acc.R_nch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_n; + //g_tp.dram_acc.R_pch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_p; + + g_tp.dram_wl.C_overlap = 0.2 * g_tp.dram_wl.C_g_ideal; + + double gmn_sense_amp_latch = (mobility_eff_periph_global / 2) * g_tp.peri_global.C_ox * (g_tp.w_sense_n / g_tp.peri_global.l_elec) * Vdsat_periph_global; + double gmp_sense_amp_latch = gmp_to_gmn_multiplier_periph_global * gmn_sense_amp_latch; + g_tp.gm_sense_amp_latch = gmn_sense_amp_latch + gmp_sense_amp_latch; + + g_tp.dram.b_w = sqrt(area_cell_dram / (asp_ratio_cell_dram)); + g_tp.dram.b_h = asp_ratio_cell_dram * g_tp.dram.b_w; + g_tp.sram.b_w = sqrt(area_cell_sram / (asp_ratio_cell_sram)); + g_tp.sram.b_h = asp_ratio_cell_sram * g_tp.sram.b_w; + g_tp.cam.b_w = sqrt(area_cell_cam / (asp_ratio_cell_cam));//Sheng + g_tp.cam.b_h = asp_ratio_cell_cam * g_tp.cam.b_w; + + g_tp.dram.Vbitpre = g_tp.dram_cell_Vdd; + g_tp.sram.Vbitpre = 
vdd[ram_cell_tech_type]; + g_tp.cam.Vbitpre = vdd[ram_cell_tech_type];//Sheng + pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + g_tp.w_pmos_bl_precharge = 6 * pmos_to_nmos_sizing_r * g_tp.min_w_nmos_; + g_tp.w_pmos_bl_eq = pmos_to_nmos_sizing_r * g_tp.min_w_nmos_; + + + double wire_pitch [NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + wire_r_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + wire_c_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + horiz_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + vert_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + aspect_ratio[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + miller_value[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + ild_thickness[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES]; + + for (iter=0; iter<=1; ++iter) + { + // linear interpolation + if (iter == 0) + { + tech = tech_lo; + if (tech_lo == tech_hi) + { + curr_alpha = 1; + } + else + { + curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi); + } + } + else + { + tech = tech_hi; + if (tech_lo == tech_hi) + { + break; + } + else + { + curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi); + } + } + + if (tech == 180) + { + //Aggressive projections + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron + aspect_ratio[0][0] = 2.0; + wire_width = wire_pitch[0][0] / 2; //micron + wire_thickness = aspect_ratio[0][0] * wire_width;//micron + wire_spacing = wire_pitch[0][0] - wire_width;//micron + barrier_thickness = 0.017;//micron + dishing_thickness = 0;//micron + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron + ild_thickness[0][0] = 0.75;//micron + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 2.709; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; 
//F/micron + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], + vert_dielectric_constant[0][0], + fringe_cap);//F/micron. + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 2.4; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.75;//micron + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 2.709; + vert_dielectric_constant[0][1] = 3.9; + fringe_cap = 0.115e-15; //F/micron + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], + vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 2.2; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 1.5; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 2.709; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0]= 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.017; + dishing_thickness = 0; + alpha_scatter = 1; + 
wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.75; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 3.038; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], + vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.75; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 3.038; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], + vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 1.98; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 3.038; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for 
commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.18; + wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.18); + wire_r_per_micron[1][3] = 12 / 0.18; + } + else if (tech == 90) + { + //Aggressive projections + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron + aspect_ratio[0][0] = 2.4; + wire_width = wire_pitch[0][0] / 2; //micron + wire_thickness = aspect_ratio[0][0] * wire_width;//micron + wire_spacing = wire_pitch[0][0] - wire_width;//micron + barrier_thickness = 0.01;//micron + dishing_thickness = 0;//micron + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron + ild_thickness[0][0] = 0.48;//micron + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 2.709; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; //F/micron + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], + vert_dielectric_constant[0][0], + fringe_cap);//F/micron. 
+ + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 2.4; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.48;//micron + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 2.709; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], + vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 2.7; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.96; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 2.709; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.008; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.48; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 3.038; + 
vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], + vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.48; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 3.038; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], + vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 1.1; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 3.038; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.09; + wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.09); + wire_r_per_micron[1][3] = 12 / 0.09; + } + else if (tech == 65) + { + //Aggressive projections + wire_pitch[0][0] = 2.5 * 
g_ip->F_sz_um; + aspect_ratio[0][0] = 2.7; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.405; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 2.303; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] , + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 2.7; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.405; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 2.303; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], + vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 2.8; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.81; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 2.303; + vert_dielectric_constant[0][2] = 3.9; + 
wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.006; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.405; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.734; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.405; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.734; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + 
wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.77; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.734; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.065; + wire_c_per_micron[1][3] = 52.5e-15 / (256 * 2 * 0.065); + wire_r_per_micron[1][3] = 12 / 0.065; + } + else if (tech == 45) + { + //Aggressive projections. + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.315; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.958; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] , + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 3.0; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.315; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] 
= 1.958; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.63; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.958; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.004; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.315; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.46; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - 
wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.315; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.46; + vert_dielectric_constant[1][1] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.55; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.46; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.045; + wire_c_per_micron[1][3] = 37.5e-15 / (256 * 2 * 0.045); + wire_r_per_micron[1][3] = 12 / 0.045; + } + else if (tech == 32) + { + //Aggressive projections. 
+ wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.21; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.664; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0], + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 3.0; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.21; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.664; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.42; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.664; + 
vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.003; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.21; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.214; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + aspect_ratio[1][1] = 2.0; + wire_width = wire_pitch[1][1] / 2; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.21; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.214; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + 
dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.385; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.214; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.032;//micron + wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.032);//F/micron + wire_r_per_micron[1][3] = 12 / 0.032;//ohm/micron + } + else if (tech == 22) + { + //Aggressive projections. + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.15; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.414; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0], + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 3.0; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + 
ild_thickness[0][1] = 0.15; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.414; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.3; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.414; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + +// //************************* +// wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][4] - wire_width; +// wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][5] - wire_width; +// wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, 
barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][6] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][6] - wire_width; +// wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); + //************************* + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.003; + dishing_thickness = 0; + alpha_scatter = 1.05; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.15; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.104; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, 
wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.15; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.104; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.275; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.104; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.022;//micron + wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.022);//F/micron + wire_r_per_micron[1][3] = 12 / 0.022;//ohm/micron + + //****************** +// wire_pitch[1][4] = 16 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][4] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, 
vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][5] = 24 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][5] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][6] = 32 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][6] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][6] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); + } + + else if (tech == 16) + { + //Aggressive projections. 
+ wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.108; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.202; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0], + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global + aspect_ratio[0][1] = 3.0; + wire_width = wire_pitch[0][1] / 2; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.108; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.202; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.216; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.202; 
+ vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + +// //************************* +// wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][4] - wire_width; +// wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][5] - wire_width; +// wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][6] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][6] - wire_width; +// wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, 
vert_dielectric_constant, +// fringe_cap); + //************************* + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.002; + dishing_thickness = 0; + alpha_scatter = 1.05; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.108; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 1.998; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.108; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 1.998; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, 
alpha_scatter); + ild_thickness[1][2] = 0.198; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 1.998; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.016;//micron + wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.016);//F/micron + wire_r_per_micron[1][3] = 12 / 0.016;//ohm/micron + + //****************** +// wire_pitch[1][4] = 16 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][4] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][5] = 24 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][5] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][6] = 32 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][6] / 2; +// wire_thickness = aspect_ratio * 
wire_width; +// wire_spacing = wire_pitch[1][6] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); + } + g_tp.wire_local.pitch += curr_alpha * wire_pitch[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.R_per_um += curr_alpha * wire_r_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.C_per_um += curr_alpha * wire_c_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.aspect_ratio += curr_alpha * aspect_ratio[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.miller_value += curr_alpha * miller_value[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.vert_dielectric_constant += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + + g_tp.wire_inside_mat.pitch += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.R_per_um += curr_alpha* wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.C_per_um += curr_alpha* wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.aspect_ratio += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.ild_thickness += curr_alpha * 
ild_thickness[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.miller_value += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.vert_dielectric_constant += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + + g_tp.wire_outside_mat.pitch += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.R_per_um += curr_alpha*wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.C_per_um += curr_alpha*wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.aspect_ratio += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.miller_value += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.vert_dielectric_constant += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + + g_tp.unit_len_wire_del = g_tp.wire_inside_mat.R_per_um * g_tp.wire_inside_mat.C_per_um / 2; + + g_tp.sense_delay += curr_alpha *SENSE_AMP_D; + g_tp.sense_dy_power += curr_alpha *SENSE_AMP_P; +// g_tp.horiz_dielectric_constant += horiz_dielectric_constant; +// g_tp.vert_dielectric_constant += vert_dielectric_constant; +// g_tp.aspect_ratio += aspect_ratio; +// g_tp.miller_value += miller_value; +// g_tp.ild_thickness += ild_thickness; + + } + g_tp.fringe_cap = fringe_cap; + + double rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1); + double p_to_n_sizing_r = pmos_to_nmos_sz_ratio(); + double c_load = gate_C(g_tp.min_w_nmos_ * (1 + 
p_to_n_sizing_r), 0.0); + double tf = rd * c_load; + g_tp.kinv = horowitz(0, tf, 0.5, 0.5, RISE); + double KLOAD = 1; + c_load = KLOAD * (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(g_tp.min_w_nmos_ * p_to_n_sizing_r, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(g_tp.min_w_nmos_ * 4 * (1 + p_to_n_sizing_r), 0.0)); + tf = rd * c_load; + g_tp.FO4 = horowitz(0, tf, 0.5, 0.5, RISE); +} + diff --git a/ext/mcpat/cacti/uca.cc b/ext/mcpat/cacti/uca.cc new file mode 100755 index 000000000..568cd9e44 --- /dev/null +++ b/ext/mcpat/cacti/uca.cc @@ -0,0 +1,426 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <cmath> +#include <iostream> + +#include "uca.h" + +UCA::UCA(const DynamicParameter & dyn_p) + :dp(dyn_p), bank(dp), nbanks(g_ip->nbanks), refresh_power(0) +{ + int num_banks_ver_dir = 1 << ((bank.area.h > bank.area.w) ? _log2(nbanks)/2 : (_log2(nbanks) - _log2(nbanks)/2)); + int num_banks_hor_dir = nbanks/num_banks_ver_dir; + + if (dp.use_inp_params) + { + RWP = dp.num_rw_ports; + ERP = dp.num_rd_ports; + EWP = dp.num_wr_ports; + SCHP = dp.num_search_ports; + } + else + { + RWP = g_ip->num_rw_ports; + ERP = g_ip->num_rd_ports; + EWP = g_ip->num_wr_ports; + SCHP = g_ip->num_search_ports; + } + + num_addr_b_bank = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP); + num_di_b_bank = dp.num_di_b_bank_per_port * (RWP + EWP); + num_do_b_bank = dp.num_do_b_bank_per_port * (RWP + ERP); + num_si_b_bank = dp.num_si_b_bank_per_port * SCHP; + num_so_b_bank = dp.num_so_b_bank_per_port * SCHP; + + if (!dp.fully_assoc && !dp.pure_cam) + { + + if (g_ip->fast_access && dp.is_tag == false) + { + num_do_b_bank *= g_ip->data_assoc; + } + + htree_in_add = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank,0, num_do_b_bank,0,num_banks_ver_dir*2, num_banks_hor_dir*2, Add_htree, true); + htree_in_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0, 
num_banks_ver_dir*2, num_banks_hor_dir*2, Data_in_htree, true); + htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0, num_banks_ver_dir*2, num_banks_hor_dir*2, Data_out_htree, true); + } + + else + { + + htree_in_add = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank, num_banks_ver_dir*2, num_banks_hor_dir*2, Add_htree, true); + htree_in_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank,num_si_b_bank, num_do_b_bank, num_so_b_bank, num_banks_ver_dir*2, num_banks_hor_dir*2, Data_in_htree, true); + htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank,num_si_b_bank, num_do_b_bank, num_so_b_bank, num_banks_ver_dir*2, num_banks_hor_dir*2, Data_out_htree, true); + htree_in_search = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank,num_si_b_bank, num_do_b_bank, num_so_b_bank, num_banks_ver_dir*2, num_banks_hor_dir*2, Data_in_htree, true); + htree_out_search = new Htree2(g_ip->wt, bank.area.w, bank.area.h, + num_addr_b_bank, num_di_b_bank,num_si_b_bank, num_do_b_bank, num_so_b_bank, num_banks_ver_dir*2, num_banks_hor_dir*2, Data_out_htree, true); + } + + area.w = htree_in_data->area.w; + area.h = htree_in_data->area.h; + + area_all_dataramcells = bank.mat.subarray.get_total_cell_area() * dp.num_subarrays * g_ip->nbanks; +// cout<<"area cell"<<area_all_dataramcells<<endl; +// cout<<area.get_area()<<endl; + // delay calculation + double inrisetime = 0.0; + compute_delays(inrisetime); + compute_power_energy(); +} + + + +UCA::~UCA() +{ + delete htree_in_add; + delete htree_in_data; + delete htree_out_data; +} + + + +double UCA::compute_delays(double inrisetime) +{ + double outrisetime = bank.compute_delays(inrisetime); + + double delay_array_to_mat = htree_in_add->delay + bank.htree_in_add->delay; + double 
max_delay_before_row_decoder = delay_array_to_mat + bank.mat.r_predec->delay; + delay_array_to_sa_mux_lev_1_decoder = delay_array_to_mat + + bank.mat.sa_mux_lev_1_predec->delay + + bank.mat.sa_mux_lev_1_dec->delay; + delay_array_to_sa_mux_lev_2_decoder = delay_array_to_mat + + bank.mat.sa_mux_lev_2_predec->delay + + bank.mat.sa_mux_lev_2_dec->delay; + double delay_inside_mat = bank.mat.row_dec->delay + bank.mat.delay_bitline + bank.mat.delay_sa; + + delay_before_subarray_output_driver = + MAX(MAX(max_delay_before_row_decoder + delay_inside_mat, // row_path + delay_array_to_mat + bank.mat.b_mux_predec->delay + bank.mat.bit_mux_dec->delay + bank.mat.delay_sa), // col_path + MAX(delay_array_to_sa_mux_lev_1_decoder, // sa_mux_lev_1_path + delay_array_to_sa_mux_lev_2_decoder)); // sa_mux_lev_2_path + delay_from_subarray_out_drv_to_out = bank.mat.delay_subarray_out_drv_htree + + bank.htree_out_data->delay + htree_out_data->delay; + access_time = bank.mat.delay_comparator; + + double ram_delay_inside_mat; + if (dp.fully_assoc) + { + //delay of FA contains both CAM tag and RAM data + { //delay of CAM + ram_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline; + access_time = htree_in_add->delay + bank.htree_in_add->delay; + //delay of fully-associative data array + access_time += ram_delay_inside_mat + delay_from_subarray_out_drv_to_out; + } + } + else + { + access_time = delay_before_subarray_output_driver + delay_from_subarray_out_drv_to_out; //data_acc_path + } + + if (dp.is_main_mem) + { + double t_rcd = max_delay_before_row_decoder + delay_inside_mat; + double cas_latency = MAX(delay_array_to_sa_mux_lev_1_decoder, delay_array_to_sa_mux_lev_2_decoder) + + delay_from_subarray_out_drv_to_out; + access_time = t_rcd + cas_latency; + } + + double temp; + + if (!dp.fully_assoc) + { + temp = delay_inside_mat + bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;//TODO: Sheng: revisit + if (dp.is_dram) + { + temp += bank.mat.delay_writeback; // temp stores 
random cycle time + } + + + temp = MAX(temp, bank.mat.r_predec->delay); + temp = MAX(temp, bank.mat.b_mux_predec->delay); + temp = MAX(temp, bank.mat.sa_mux_lev_1_predec->delay); + temp = MAX(temp, bank.mat.sa_mux_lev_2_predec->delay); + } + else + { + ram_delay_inside_mat = bank.mat.delay_bitline + bank.mat.delay_matchchline; + temp = ram_delay_inside_mat + bank.mat.delay_cam_sl_restore + bank.mat.delay_cam_ml_reset + bank.mat.delay_bl_restore + + bank.mat.delay_hit_miss_reset + bank.mat.delay_wl_reset; + + temp = MAX(temp, bank.mat.b_mux_predec->delay);//TODO: Sheng revisit whether distinguish cam and ram bitline etc. + temp = MAX(temp, bank.mat.sa_mux_lev_1_predec->delay); + temp = MAX(temp, bank.mat.sa_mux_lev_2_predec->delay); + } + + // The following is true only if the input parameter "repeaters_in_htree" is set to false --Nav + if (g_ip->rpters_in_htree == false) + { + temp = MAX(temp, bank.htree_in_add->max_unpipelined_link_delay); + } + cycle_time = temp; + + double delay_req_network = max_delay_before_row_decoder; + double delay_rep_network = delay_from_subarray_out_drv_to_out; + multisubbank_interleave_cycle_time = MAX(delay_req_network, delay_rep_network); + + if (dp.is_main_mem) + { + multisubbank_interleave_cycle_time = htree_in_add->delay; + precharge_delay = htree_in_add->delay + + bank.htree_in_add->delay + bank.mat.delay_writeback + + bank.mat.delay_wl_reset + bank.mat.delay_bl_restore; + cycle_time = access_time + precharge_delay; + } + else + { + precharge_delay = 0; + } + + double dram_array_availability = 0; + if (dp.is_dram) + { + dram_array_availability = (1 - dp.num_r_subarray * cycle_time / dp.dram_refresh_period) * 100; + } + + return outrisetime; +} + + + +// note: currently, power numbers are for a bank of an array +void UCA::compute_power_energy() +{ + bank.compute_power_energy(); + power = bank.power; + + power_routing_to_bank.readOp.dynamic = htree_in_add->power.readOp.dynamic + htree_out_data->power.readOp.dynamic; + 
power_routing_to_bank.writeOp.dynamic = htree_in_add->power.readOp.dynamic + htree_in_data->power.readOp.dynamic; + if (dp.fully_assoc || dp.pure_cam) + power_routing_to_bank.searchOp.dynamic= htree_in_search->power.searchOp.dynamic + htree_out_search->power.searchOp.dynamic; + + power_routing_to_bank.readOp.leakage += htree_in_add->power.readOp.leakage + + htree_in_data->power.readOp.leakage + + htree_out_data->power.readOp.leakage; + + power_routing_to_bank.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage + + htree_in_data->power.readOp.gate_leakage + + htree_out_data->power.readOp.gate_leakage; + if (dp.fully_assoc || dp.pure_cam) + { + power_routing_to_bank.readOp.leakage += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage; + power_routing_to_bank.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage; + } + + power.searchOp.dynamic += power_routing_to_bank.searchOp.dynamic; + power.readOp.dynamic += power_routing_to_bank.readOp.dynamic; + power.readOp.leakage += power_routing_to_bank.readOp.leakage; + power.readOp.gate_leakage += power_routing_to_bank.readOp.gate_leakage; + + // calculate total write energy per access + power.writeOp.dynamic = power.readOp.dynamic + - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir + + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir + - power_routing_to_bank.readOp.dynamic + + power_routing_to_bank.writeOp.dynamic + + bank.htree_in_data->power.readOp.dynamic + - bank.htree_out_data->power.readOp.dynamic; + + if (dp.is_dram == false) + { + power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir; + } + + dyn_read_energy_from_closed_page = power.readOp.dynamic; + dyn_read_energy_from_open_page = power.readOp.dynamic - + (bank.mat.r_predec->power.readOp.dynamic + + bank.mat.power_row_decoders.readOp.dynamic + + bank.mat.power_bl_precharge_eq_drv.readOp.dynamic + + 
bank.mat.power_sa.readOp.dynamic + + bank.mat.power_bitline.readOp.dynamic) * dp.num_act_mats_hor_dir; + + dyn_read_energy_remaining_words_in_burst = + (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1) * + ((bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic + + bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic + + bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic + + bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic + + bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir + + bank.htree_out_data->power.readOp.dynamic + + power_routing_to_bank.readOp.dynamic); + dyn_read_energy_from_closed_page += dyn_read_energy_remaining_words_in_burst; + dyn_read_energy_from_open_page += dyn_read_energy_remaining_words_in_burst; + + activate_energy = htree_in_add->power.readOp.dynamic + + bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_act + + (bank.mat.r_predec->power.readOp.dynamic + + bank.mat.power_row_decoders.readOp.dynamic + + bank.mat.power_sa.readOp.dynamic) * dp.num_act_mats_hor_dir; + read_energy = (htree_in_add->power.readOp.dynamic + + bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr + + (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic + + bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic + + bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic + + bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic + + bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir + + bank.htree_out_data->power.readOp.dynamic + + htree_in_data->power.readOp.dynamic) * g_ip->burst_len; + write_energy = (htree_in_add->power.readOp.dynamic + + bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr + + htree_in_data->power.readOp.dynamic + + bank.htree_in_data->power.readOp.dynamic + + (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic + + bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic + + 
bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic + + bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic) * dp.num_act_mats_hor_dir) * g_ip->burst_len; + precharge_energy = (bank.mat.power_bitline.readOp.dynamic + + bank.mat.power_bl_precharge_eq_drv.readOp.dynamic) * dp.num_act_mats_hor_dir; + + leak_power_subbank_closed_page = + (bank.mat.r_predec->power.readOp.leakage + + bank.mat.b_mux_predec->power.readOp.leakage + + bank.mat.sa_mux_lev_1_predec->power.readOp.leakage + + bank.mat.sa_mux_lev_2_predec->power.readOp.leakage + + bank.mat.power_row_decoders.readOp.leakage + + bank.mat.power_bit_mux_decoders.readOp.leakage + + bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage + + bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage + + bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir; + + leak_power_subbank_closed_page += + (bank.mat.r_predec->power.readOp.gate_leakage + + bank.mat.b_mux_predec->power.readOp.gate_leakage + + bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage + + bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage + + bank.mat.power_row_decoders.readOp.gate_leakage + + bank.mat.power_bit_mux_decoders.readOp.gate_leakage + + bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage + + bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage) * dp.num_act_mats_hor_dir; //+ + //bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir; + + leak_power_subbank_open_page = + (bank.mat.r_predec->power.readOp.leakage + + bank.mat.b_mux_predec->power.readOp.leakage + + bank.mat.sa_mux_lev_1_predec->power.readOp.leakage + + bank.mat.sa_mux_lev_2_predec->power.readOp.leakage + + bank.mat.power_row_decoders.readOp.leakage + + bank.mat.power_bit_mux_decoders.readOp.leakage + + bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage + + bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage + + bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir; + + leak_power_subbank_open_page += + 
(bank.mat.r_predec->power.readOp.gate_leakage + + bank.mat.b_mux_predec->power.readOp.gate_leakage + + bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage + + bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage + + bank.mat.power_row_decoders.readOp.gate_leakage + + bank.mat.power_bit_mux_decoders.readOp.gate_leakage + + bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage + + bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage ) * dp.num_act_mats_hor_dir; + //bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir; + + leak_power_request_and_reply_networks = + power_routing_to_bank.readOp.leakage + + bank.htree_in_add->power.readOp.leakage + + bank.htree_in_data->power.readOp.leakage + + bank.htree_out_data->power.readOp.leakage; + + leak_power_request_and_reply_networks += + power_routing_to_bank.readOp.gate_leakage + + bank.htree_in_add->power.readOp.gate_leakage + + bank.htree_in_data->power.readOp.gate_leakage + + bank.htree_out_data->power.readOp.gate_leakage; + + if (dp.fully_assoc || dp.pure_cam) + { + leak_power_request_and_reply_networks += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage; + leak_power_request_and_reply_networks += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage; + } + + + if (dp.is_dram) + { // if DRAM, add contribution of power spent in row predecoder drivers, blocks and decoders to refresh power + refresh_power = (bank.mat.r_predec->power.readOp.dynamic * dp.num_act_mats_hor_dir + + bank.mat.row_dec->power.readOp.dynamic) * dp.num_r_subarray * dp.num_subarrays; + refresh_power += bank.mat.per_bitline_read_energy * dp.num_c_subarray * dp.num_r_subarray * dp.num_subarrays; + refresh_power += bank.mat.power_bl_precharge_eq_drv.readOp.dynamic * dp.num_act_mats_hor_dir; + refresh_power += bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir; + refresh_power /= dp.dram_refresh_period; + } + + + if (dp.is_tag == false) + { + 
power.readOp.dynamic = dyn_read_energy_from_closed_page; + power.writeOp.dynamic = dyn_read_energy_from_closed_page + - dyn_read_energy_remaining_words_in_burst + - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir + + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir + + (power_routing_to_bank.writeOp.dynamic - + power_routing_to_bank.readOp.dynamic - + bank.htree_out_data->power.readOp.dynamic + + bank.htree_in_data->power.readOp.dynamic) * + (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1); //FIXME + + if (dp.is_dram == false) + { + power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir; + } + } + + // if DRAM, add refresh power to total leakage + if (dp.is_dram) + { + power.readOp.leakage += refresh_power; + } + + // TODO: below should be avoided. + /*if (dp.is_main_mem) + { + power.readOp.leakage += MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA * 1e-3 * g_tp.peri_global.Vdd / g_ip->nbanks; + }*/ + + assert(power.readOp.dynamic > 0); + assert(power.writeOp.dynamic > 0); + assert(power.readOp.leakage > 0); +} + diff --git a/ext/mcpat/cacti/uca.h b/ext/mcpat/cacti/uca.h new file mode 100755 index 000000000..fdab14fc7 --- /dev/null +++ b/ext/mcpat/cacti/uca.h @@ -0,0 +1,95 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#ifndef __UCA_H__ +#define __UCA_H__ + +#include "area.h" +#include "bank.h" +#include "component.h" +#include "htree2.h" +#include "parameter.h" + +class UCA : public Component +{ + public: + UCA(const DynamicParameter & dyn_p); + ~UCA(); + double compute_delays(double inrisetime); // returns outrisetime + void compute_power_energy(); + + DynamicParameter dp; + Bank bank; + + 
Htree2 * htree_in_add; + Htree2 * htree_in_data; + Htree2 * htree_out_data; + Htree2 * htree_in_search; + Htree2 * htree_out_search; + + powerDef power_routing_to_bank; + + uint32_t nbanks; + + int num_addr_b_bank; + int num_di_b_bank; + int num_do_b_bank; + int num_si_b_bank; + int num_so_b_bank; + int RWP, ERP, EWP,SCHP; + double area_all_dataramcells; + + double dyn_read_energy_from_closed_page; + double dyn_read_energy_from_open_page; + double dyn_read_energy_remaining_words_in_burst; + + double refresh_power; // only for DRAM + double activate_energy; + double read_energy; + double write_energy; + double precharge_energy; + double leak_power_subbank_closed_page; + double leak_power_subbank_open_page; + double leak_power_request_and_reply_networks; + + double delay_array_to_sa_mux_lev_1_decoder; + double delay_array_to_sa_mux_lev_2_decoder; + double delay_before_subarray_output_driver; + double delay_from_subarray_out_drv_to_out; + double access_time; + double precharge_delay; + double multisubbank_interleave_cycle_time; +}; + +#endif + diff --git a/ext/mcpat/cacti/wire.cc b/ext/mcpat/cacti/wire.cc new file mode 100644 index 000000000..742000c85 --- /dev/null +++ b/ext/mcpat/cacti/wire.cc @@ -0,0 +1,832 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include "wire.h" +#include "cmath" +// use this constructor to calculate wire stats +Wire::Wire( + enum Wire_type wire_model, + double wl, + int n, + double w_s, + double s_s, + enum Wire_placement wp, + double resistivity, + TechnologyParameter::DeviceType *dt + ):wt(wire_model), wire_length(wl*1e-6), nsense(n), w_scale(w_s), s_scale(s_s), + resistivity(resistivity), 
deviceType(dt) +{ + wire_placement = wp; + min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_; + in_rise_time = 0; + out_rise_time = 0; + if (initialized != 1) { + cout << "Wire not initialized. Initializing it with default values\n"; + Wire winit; + } + calculate_wire_stats(); + // change everything back to seconds, microns, and Joules + repeater_spacing *= 1e6; + wire_length *= 1e6; + wire_width *= 1e6; + wire_spacing *= 1e6; + assert(wire_length > 0); + assert(power.readOp.dynamic > 0); + assert(power.readOp.leakage > 0); + assert(power.readOp.gate_leakage > 0); +} + + // the following values are for peripheral global technology + // specified in the input config file + Component Wire::global; + Component Wire::global_5; + Component Wire::global_10; + Component Wire::global_20; + Component Wire::global_30; + Component Wire::low_swing; + + int Wire::initialized; + double Wire::wire_width_init; + double Wire::wire_spacing_init; + + +Wire::Wire(double w_s, double s_s, enum Wire_placement wp, double resis, TechnologyParameter::DeviceType *dt) +{ + w_scale = w_s; + s_scale = s_s; + deviceType = dt; + wire_placement = wp; + resistivity = resis; + min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_; + in_rise_time = 0; + out_rise_time = 0; + + switch (wire_placement) + { + case outside_mat: wire_width = g_tp.wire_outside_mat.pitch; break; + case inside_mat : wire_width = g_tp.wire_inside_mat.pitch; break; + default: wire_width = g_tp.wire_local.pitch; break; + } + + wire_spacing = wire_width; + + wire_width *= (w_scale * 1e-6/2) /* (m) */; + wire_spacing *= (s_scale * 1e-6/2) /* (m) */; + + initialized = 1; + init_wire(); + wire_width_init = wire_width; + wire_spacing_init = wire_spacing; + + assert(power.readOp.dynamic > 0); + assert(power.readOp.leakage > 0); + assert(power.readOp.gate_leakage > 0); +} + + + +Wire::~Wire() +{ +} + + + +void +Wire::calculate_wire_stats() +{ + + if (wire_placement == outside_mat) { + wire_width = 
g_tp.wire_outside_mat.pitch; + } + else if (wire_placement == inside_mat) { + wire_width = g_tp.wire_inside_mat.pitch; + } + else { + wire_width = g_tp.wire_local.pitch; + } + + wire_spacing = wire_width; + + wire_width *= (w_scale * 1e-6/2) /* (m) */; + wire_spacing *= (s_scale * 1e-6/2) /* (m) */; + + + if (wt != Low_swing) { + + // delay_optimal_wire(); + + if (wt == Global) { + delay = global.delay * wire_length; + power.readOp.dynamic = global.power.readOp.dynamic * wire_length; + power.readOp.leakage = global.power.readOp.leakage * wire_length; + power.readOp.gate_leakage = global.power.readOp.gate_leakage * wire_length; + repeater_spacing = global.area.w; + repeater_size = global.area.h; + area.set_area((wire_length/repeater_spacing) * + compute_gate_area(INV, 1, min_w_pmos * repeater_size, + g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def)); + } + else if (wt == Global_5) { + delay = global_5.delay * wire_length; + power.readOp.dynamic = global_5.power.readOp.dynamic * wire_length; + power.readOp.leakage = global_5.power.readOp.leakage * wire_length; + power.readOp.gate_leakage = global_5.power.readOp.gate_leakage * wire_length; + repeater_spacing = global_5.area.w; + repeater_size = global_5.area.h; + area.set_area((wire_length/repeater_spacing) * + compute_gate_area(INV, 1, min_w_pmos * repeater_size, + g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def)); + } + else if (wt == Global_10) { + delay = global_10.delay * wire_length; + power.readOp.dynamic = global_10.power.readOp.dynamic * wire_length; + power.readOp.leakage = global_10.power.readOp.leakage * wire_length; + power.readOp.gate_leakage = global_10.power.readOp.gate_leakage * wire_length; + repeater_spacing = global_10.area.w; + repeater_size = global_10.area.h; + area.set_area((wire_length/repeater_spacing) * + compute_gate_area(INV, 1, min_w_pmos * repeater_size, + g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def)); + } + else if (wt == Global_20) { + delay = global_20.delay * wire_length; + 
power.readOp.dynamic = global_20.power.readOp.dynamic * wire_length; + power.readOp.leakage = global_20.power.readOp.leakage * wire_length; + power.readOp.gate_leakage = global_20.power.readOp.gate_leakage * wire_length; + repeater_spacing = global_20.area.w; + repeater_size = global_20.area.h; + area.set_area((wire_length/repeater_spacing) * + compute_gate_area(INV, 1, min_w_pmos * repeater_size, + g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def)); + } + else if (wt == Global_30) { + delay = global_30.delay * wire_length; + power.readOp.dynamic = global_30.power.readOp.dynamic * wire_length; + power.readOp.leakage = global_30.power.readOp.leakage * wire_length; + power.readOp.gate_leakage = global_30.power.readOp.gate_leakage * wire_length; + repeater_spacing = global_30.area.w; + repeater_size = global_30.area.h; + area.set_area((wire_length/repeater_spacing) * + compute_gate_area(INV, 1, min_w_pmos * repeater_size, + g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def)); + } + out_rise_time = delay*repeater_spacing/deviceType->Vth; + } + else if (wt == Low_swing) { + low_swing_model (); + repeater_spacing = wire_length; + repeater_size = 1; + } + else { + assert(0); + } +} + + + +/* + * The fall time of an input signal to the first stage of a circuit is + * assumed to be same as the fall time of the output signal of two + * inverters connected in series (refer: CACTI 1 Technical report, + * section 6.1.3) + */ + double +Wire::signal_fall_time () +{ + + /* rise time of inverter 1's output */ + double rt; + /* fall time of inverter 2's output */ + double ft; + double timeconst; + + timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) * + tr_R_on(min_w_pmos, PCH, 1); + rt = horowitz (0, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, FALL) / (deviceType->Vdd - deviceType->Vth); + timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 
1, 1, g_tp.cell_h_def) + + drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) * + tr_R_on(g_tp.min_w_nmos_, NCH, 1); + ft = horowitz (rt, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE) / deviceType->Vth; + return ft; +} + + + +double Wire::signal_rise_time () +{ + + /* rise time of inverter 1's output */ + double ft; + /* fall time of inverter 2's output */ + double rt; + double timeconst; + + timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) * + tr_R_on(g_tp.min_w_nmos_, NCH, 1); + rt = horowitz (0, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE) / deviceType->Vth; + timeconst = (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(min_w_pmos + g_tp.min_w_nmos_, 0)) * + tr_R_on(min_w_pmos, PCH, 1); + ft = horowitz (rt, timeconst, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, FALL) / (deviceType->Vdd - deviceType->Vth); + return ft; //sec +} + + + +/* Wire resistance and capacitance calculations + * wire width + * + * /__/ + * | | + * | | height = ASPECT_RATIO*wire width (ASPECT_RATIO = 2.2, ref: ITRS) + * |__|/ + * + * spacing between wires in same level = wire width + * spacing between wires in adjacent levels = wire width---this is incorrect, + * according to R.Ho's paper and thesis. 
ILD != wire width + * + */ + +double Wire::wire_cap (double len /* in m */, bool call_from_outside) +{ + //TODO: this should be consistent with the wire_res in technology file + double sidewall, adj, tot_cap; + double wire_height; + double epsilon0 = 8.8542e-12; + double aspect_ratio, horiz_dielectric_constant, vert_dielectric_constant, miller_value,ild_thickness; + + switch (wire_placement) + { + case outside_mat: + { + aspect_ratio = g_tp.wire_outside_mat.aspect_ratio; + horiz_dielectric_constant = g_tp.wire_outside_mat.horiz_dielectric_constant; + vert_dielectric_constant = g_tp.wire_outside_mat.vert_dielectric_constant; + miller_value = g_tp.wire_outside_mat.miller_value; + ild_thickness = g_tp.wire_outside_mat.ild_thickness; + break; + } + case inside_mat : + { + aspect_ratio = g_tp.wire_inside_mat.aspect_ratio; + horiz_dielectric_constant = g_tp.wire_inside_mat.horiz_dielectric_constant; + vert_dielectric_constant = g_tp.wire_inside_mat.vert_dielectric_constant; + miller_value = g_tp.wire_inside_mat.miller_value; + ild_thickness = g_tp.wire_inside_mat.ild_thickness; + break; + } + default: + { + aspect_ratio = g_tp.wire_local.aspect_ratio; + horiz_dielectric_constant = g_tp.wire_local.horiz_dielectric_constant; + vert_dielectric_constant = g_tp.wire_local.vert_dielectric_constant; + miller_value = g_tp.wire_local.miller_value; + ild_thickness = g_tp.wire_local.ild_thickness; + break; + } + } + + if (call_from_outside) + { + wire_width *= 1e-6; + wire_spacing *= 1e-6; + } + wire_height = wire_width/w_scale*aspect_ratio; + /* + * assuming height does not change. 
wire_width = width_original*w_scale + * So wire_height does not change as wire width increases + */ + +// capacitance between wires in the same level +// sidewall = 2*miller_value * horiz_dielectric_constant * (wire_height/wire_spacing) +// * epsilon0; + + sidewall = miller_value * horiz_dielectric_constant * (wire_height/wire_spacing) + * epsilon0; + + + // capacitance between wires in adjacent levels + //adj = miller_value * vert_dielectric_constant *w_scale * epsilon0; + //adj = 2*vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0; + + adj = miller_value *vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0; + //Change ild_thickness from micron to M + + //tot_cap = (sidewall + adj + (deviceType->C_fringe * 1e6)); //F/m + tot_cap = (sidewall + adj + (g_tp.fringe_cap * 1e6)); //F/m + + if (call_from_outside) + { + wire_width *= 1e6; + wire_spacing *= 1e6; + } + return (tot_cap*len); // (F) +} + + + double +Wire::wire_res (double len /*(in m)*/) +{ + + double aspect_ratio,alpha_scatter =1.05, dishing_thickness=0, barrier_thickness=0; + //TODO: this should be consistent with the wire_res in technology file + //The whole computation should be consistent with the wire_res in technology.cc too! + + switch (wire_placement) + { + case outside_mat: + { + aspect_ratio = g_tp.wire_outside_mat.aspect_ratio; + break; + } + case inside_mat : + { + aspect_ratio = g_tp.wire_inside_mat.aspect_ratio; + break; + } + default: + { + aspect_ratio = g_tp.wire_local.aspect_ratio; + break; + } + } + return (alpha_scatter * resistivity * 1e-6 * len/((aspect_ratio*wire_width/w_scale-dishing_thickness - barrier_thickness)* + (wire_width-2*barrier_thickness))); +} + +/* + * Calculates the delay, power and area of the transmitter circuit. 
+ * + * The transmitter delay is the sum of nand gate delay, inverter delay + * low swing nmos delay, and the wire delay + * (ref: Technical report 6) + */ + void +Wire::low_swing_model() +{ + double len = wire_length; + double beta = pmos_to_nmos_sz_ratio(); + + + double inputrise = (in_rise_time == 0) ? signal_rise_time() : in_rise_time; + + /* Final nmos low swing driver size calculation: + * Try to size the driver such that the delay + * is less than 8FO4. + * If the driver size is greater than + * the max allowable size, assume max size for the driver. + * In either case, recalculate the delay using + * the final driver size assuming slow input with + * finite rise time instead of ideal step input + * + * (ref: Technical report 6) + */ + double cwire = wire_cap(len); /* load capacitance */ + double rwire = wire_res(len); + +#define RES_ADJ (8.6) // Increase in resistance due to low driving vol. + + double driver_res = (-8*g_tp.FO4/(log(0.5) * cwire))/RES_ADJ; + double nsize = R_to_w(driver_res, NCH); + + nsize = MIN(nsize, g_tp.max_w_nmos_); + nsize = MAX(nsize, g_tp.min_w_nmos_); + + if(rwire*cwire > 8*g_tp.FO4) + { + nsize = g_tp.max_w_nmos_; + } + + // size the inverter appropriately to minimize the transmitter delay + // Note - In order to minimize leakage, we are not adding a set of inverters to + // bring down delay. Instead, we are sizing the single gate + // based on the logical effort. 
+ double st_eff = sqrt((2+beta/1+beta)*gate_C(nsize, 0)/(gate_C(2*g_tp.min_w_nmos_, 0) + + gate_C(2*min_w_pmos, 0))); + double req_cin = ((2+beta/1+beta)*gate_C(nsize, 0))/st_eff; + double inv_size = req_cin/(gate_C(min_w_pmos, 0) + gate_C(g_tp.min_w_nmos_, 0)); + inv_size = MAX(inv_size, 1); + + /* nand gate delay */ + double res_eq = (2 * tr_R_on(g_tp.min_w_nmos_, NCH, 1)); + double cap_eq = 2 * drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + drain_C_(2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + gate_C(inv_size*g_tp.min_w_nmos_, 0) + + gate_C(inv_size*min_w_pmos, 0); + + double timeconst = res_eq * cap_eq; + + delay = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, + deviceType->Vth/deviceType->Vdd, RISE); + double temp_power = cap_eq*deviceType->Vdd*deviceType->Vdd; + + inputrise = delay / (deviceType->Vdd - deviceType->Vth); /* for the next stage */ + + /* Inverter delay: + * The load capacitance of this inv depends on + * the gate capacitance of the final stage nmos + * transistor which in turn depends on nsize + */ + res_eq = tr_R_on(inv_size*min_w_pmos, PCH, 1); + cap_eq = drain_C_(inv_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + drain_C_(inv_size*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + gate_C(nsize, 0); + timeconst = res_eq * cap_eq; + + delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, + deviceType->Vth/deviceType->Vdd, FALL); + temp_power += cap_eq*deviceType->Vdd*deviceType->Vdd; + + + transmitter.delay = delay; + transmitter.power.readOp.dynamic = temp_power*2; /* since it is a diff. 
model*/ + transmitter.power.readOp.leakage = deviceType->Vdd * + (4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) + + 4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv)); + + transmitter.power.readOp.gate_leakage = deviceType->Vdd * + (4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) + + 4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv)); + + inputrise = delay / deviceType->Vth; + + /* nmos delay + wire delay */ + cap_eq = cwire + drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2 + + nsense * sense_amp_input_cap(); //+receiver cap + /* + * NOTE: nmos is used as both pull up and pull down transistor + * in the transmitter. This is because for low voltage swing, drive + * resistance of nmos is less than pmos + * (for a detailed graph ref: On-Chip Wires: Scaling and Efficiency) + */ + timeconst = (tr_R_on(nsize, NCH, 1)*RES_ADJ) * (cwire + + drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2) + + rwire*cwire/2 + + (tr_R_on(nsize, NCH, 1)*RES_ADJ + rwire) * + nsense * sense_amp_input_cap(); + + /* + * since we are pre-equalizing and overdriving the low + * swing wires, the net time constant is less + * than the actual value + */ + delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, .25, 0); +#define VOL_SWING .1 + temp_power += cap_eq*VOL_SWING*.400; /* .4v is the over drive voltage */ + temp_power *= 2; /* differential wire */ + + l_wire.delay = delay - transmitter.delay; + l_wire.power.readOp.dynamic = temp_power - transmitter.power.readOp.dynamic; + l_wire.power.readOp.leakage = deviceType->Vdd* + (4* cmos_Isub_leakage(nsize, 0, 1, nmos)); + + l_wire.power.readOp.gate_leakage = deviceType->Vdd* + (4* cmos_Ig_leakage(nsize, 0, 1, nmos)); + + //double rt = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, + // deviceType->Vth/deviceType->Vdd, RISE)/deviceType->Vth; + + delay += g_tp.sense_delay; + + sense_amp.delay = g_tp.sense_delay; + out_rise_time = g_tp.sense_delay/(deviceType->Vth); + 
sense_amp.power.readOp.dynamic = g_tp.sense_dy_power; + sense_amp.power.readOp.leakage = 0; //FIXME + sense_amp.power.readOp.gate_leakage = 0; + + power.readOp.dynamic = temp_power + sense_amp.power.readOp.dynamic; + power.readOp.leakage = transmitter.power.readOp.leakage + + l_wire.power.readOp.leakage + + sense_amp.power.readOp.leakage; + power.readOp.gate_leakage = transmitter.power.readOp.gate_leakage + + l_wire.power.readOp.gate_leakage + + sense_amp.power.readOp.gate_leakage; +} + + double +Wire::sense_amp_input_cap() +{ + return drain_C_(g_tp.w_iso, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(g_tp.w_sense_en + g_tp.w_sense_n, 0) + + drain_C_(g_tp.w_sense_n, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(g_tp.w_sense_p, PCH, 1, 1, g_tp.cell_h_def); +} + + +void Wire::delay_optimal_wire () +{ + double len = wire_length; + //double min_wire_width = wire_width; //m + double beta = pmos_to_nmos_sz_ratio(); + double switching = 0; // switching energy + double short_ckt = 0; // short-circuit energy + double tc = 0; // time constant + // input cap of min sized driver + double input_cap = gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0); + + // output parasitic capacitance of + // the min. 
sized driver + double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def); + // drive resistance + double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) + + tr_R_on(min_w_pmos, PCH, 1))/2; + double wr = wire_res(len); //ohm + + // wire cap /m + double wc = wire_cap(len); + + // size the repeater such that the delay of the wire is minimum + double repeater_scaling = sqrt(out_res*wc/(wr*input_cap)); // len will cancel + + // calc the optimum spacing between the repeaters (m) + + repeater_spacing = sqrt(2 * out_res * (out_cap + input_cap)/ + ((wr/len)*(wc/len))); + repeater_size = repeater_scaling; + + switching = (repeater_scaling * (input_cap + out_cap) + + repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd; + + tc = out_res * (input_cap + out_cap) + + out_res * wc/len * repeater_spacing/repeater_scaling + + wr/len * repeater_spacing * input_cap * repeater_scaling + + 0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing; + + delay = 0.693 * tc * len/repeater_spacing; + +#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. 
{IEEE TED} */ + short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 * + repeater_scaling * tc; + + area.set_area((len/repeater_spacing) * + compute_gate_area(INV, 1, min_w_pmos * repeater_scaling, + g_tp.min_w_nmos_ * repeater_scaling, g_tp.cell_h_def)); + power.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt)); + power.readOp.leakage = ((len/repeater_spacing)* + deviceType->Vdd* + cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv)); + power.readOp.gate_leakage = ((len/repeater_spacing)* + deviceType->Vdd* + cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv)); +} + + + +// calculate power/delay values for wires with suboptimal repeater sizing/spacing +void +Wire::init_wire(){ + wire_length = 1; + delay_optimal_wire(); + double sp, si; + powerDef pow; + si = repeater_size; + sp = repeater_spacing; + sp *= 1e6; // in microns + + double i, j, del; + repeated_wire.push_back(Component()); + for (j=sp; j < 4*sp; j+=100) { + for (i = si; i > 1; i--) { + pow = wire_model(j*1e-6, i, &del); + if (j == sp && i == si) { + global.delay = del; + global.power = pow; + global.area.h = si; + global.area.w = sp*1e-6; // m + } +// cout << "Repeater size - "<< i << +// " Repeater spacing - " << j << +// " Delay - " << del << +// " PowerD - " << pow.readOp.dynamic << +// " PowerL - " << pow.readOp.leakage <<endl; + repeated_wire.back().delay = del; + repeated_wire.back().power.readOp = pow.readOp; + repeated_wire.back().area.w = j*1e-6; //m + repeated_wire.back().area.h = i; + repeated_wire.push_back(Component()); + + } + } + repeated_wire.pop_back(); + update_fullswing(); + Wire *l_wire = new Wire(Low_swing, 0.001/* 1 mm*/, 1); + low_swing.delay = l_wire->delay; + low_swing.power = l_wire->power; + delete l_wire; +} + + + +void Wire::update_fullswing() +{ + + list<Component>::iterator citer; + double del[4]; + del[3] = this->global.delay + 
this->global.delay*.3; + del[2] = global.delay + global.delay*.2; + del[1] = global.delay + global.delay*.1; + del[0] = global.delay + global.delay*.05; + double threshold; + double ncost; + double cost; + int i = 4; + while (i>0) { + threshold = del[i-1]; + cost = BIGNUM; + for (citer = repeated_wire.begin(); citer != repeated_wire.end(); citer++) + { + if (citer->delay > threshold) { + citer = repeated_wire.erase(citer); + citer --; + } + else { + ncost = citer->power.readOp.dynamic/global.power.readOp.dynamic + + citer->power.readOp.leakage/global.power.readOp.leakage; + if(ncost < cost) + { + cost = ncost; + if (i == 4) { + global_30.delay = citer->delay; + global_30.power = citer->power; + global_30.area = citer->area; + } + else if (i==3) { + global_20.delay = citer->delay; + global_20.power = citer->power; + global_20.area = citer->area; + } + else if(i==2) { + global_10.delay = citer->delay; + global_10.power = citer->power; + global_10.area = citer->area; + } + else if(i==1) { + global_5.delay = citer->delay; + global_5.power = citer->power; + global_5.area = citer->area; + } + } + } + } + i--; + } +} + + + +powerDef Wire::wire_model (double space, double size, double *delay) +{ + powerDef ptemp; + double len = 1; + //double min_wire_width = wire_width; //m + double beta = pmos_to_nmos_sz_ratio(); + // switching energy + double switching = 0; + // short-circuit energy + double short_ckt = 0; + // time constant + double tc = 0; + // input cap of min sized driver + double input_cap = gate_C (g_tp.min_w_nmos_ + + min_w_pmos, 0); + + // output parasitic capacitance of + // the min. 
sized driver + double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) + + drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def); + // drive resistance + double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) + + tr_R_on(min_w_pmos, PCH, 1))/2; + double wr = wire_res(len); //ohm + + // wire cap /m + double wc = wire_cap(len); + + repeater_spacing = space; + repeater_size = size; + + switching = (repeater_size * (input_cap + out_cap) + + repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd; + + tc = out_res * (input_cap + out_cap) + + out_res * wc/len * repeater_spacing/repeater_size + + wr/len * repeater_spacing * out_cap * repeater_size + + 0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing; + + *delay = 0.693 * tc * len/repeater_spacing; + +#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */ + short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 * + repeater_size * tc; + + ptemp.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt)); + ptemp.readOp.leakage = ((len/repeater_spacing)* + deviceType->Vdd* + cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv)); + + ptemp.readOp.gate_leakage = ((len/repeater_spacing)* + deviceType->Vdd* + cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv)); + + return ptemp; +} + +void +Wire::print_wire() +{ + + cout << "\nWire Properties:\n\n"; + cout << " Delay Optimal\n\tRepeater size - "<< global.area.h << + " \n\tRepeater spacing - " << global.area.w*1e3 << " (mm)" + " \n\tDelay - " << global.delay*1e6 << " (ns/mm)" + " \n\tPowerD - " << global.power.readOp.dynamic *1e6<< " (nJ/mm)" + " \n\tPowerL - " << global.power.readOp.leakage << " (mW/mm)" + " \n\tPowerLgate - " << global.power.readOp.gate_leakage << " (mW/mm)\n"; + cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n"; + cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n"; + cout 
<<endl; + + cout << " 5% Overhead\n\tRepeater size - "<< global_5.area.h << + " \n\tRepeater spacing - " << global_5.area.w*1e3 << " (mm)" + " \n\tDelay - " << global_5.delay *1e6<< " (ns/mm)" + " \n\tPowerD - " << global_5.power.readOp.dynamic *1e6<< " (nJ/mm)" + " \n\tPowerL - " << global_5.power.readOp.leakage << " (mW/mm)" + " \n\tPowerLgate - " << global_5.power.readOp.gate_leakage << " (mW/mm)\n"; + cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n"; + cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n"; + cout <<endl; + cout << " 10% Overhead\n\tRepeater size - "<< global_10.area.h << + " \n\tRepeater spacing - " << global_10.area.w*1e3 << " (mm)" + " \n\tDelay - " << global_10.delay *1e6<< " (ns/mm)" + " \n\tPowerD - " << global_10.power.readOp.dynamic *1e6<< " (nJ/mm)" + " \n\tPowerL - " << global_10.power.readOp.leakage << " (mW/mm)" + " \n\tPowerLgate - " << global_10.power.readOp.gate_leakage << " (mW/mm)\n"; + cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n"; + cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n"; + cout <<endl; + cout << " 20% Overhead\n\tRepeater size - "<< global_20.area.h << + " \n\tRepeater spacing - " << global_20.area.w*1e3 << " (mm)" + " \n\tDelay - " << global_20.delay *1e6<< " (ns/mm)" + " \n\tPowerD - " << global_20.power.readOp.dynamic *1e6<< " (nJ/mm)" + " \n\tPowerL - " << global_20.power.readOp.leakage << " (mW/mm)" + " \n\tPowerLgate - " << global_20.power.readOp.gate_leakage << " (mW/mm)\n"; + cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n"; + cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n"; + cout <<endl; + cout << " 30% Overhead\n\tRepeater size - "<< global_30.area.h << + " \n\tRepeater spacing - " << global_30.area.w*1e3 << " (mm)" + " \n\tDelay - " << global_30.delay *1e6<< " (ns/mm)" + " \n\tPowerD - " << global_30.power.readOp.dynamic *1e6<< " (nJ/mm)" + " \n\tPowerL - " << global_30.power.readOp.leakage << " (mW/mm)" 
+ " \n\tPowerLgate - " << global_30.power.readOp.gate_leakage << " (mW/mm)\n"; + cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n"; + cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n"; + cout <<endl; + cout << " Low-swing wire (1 mm) - Note: Unlike repeated wires, \n\tdelay and power " + "values of low-swing wires do not\n\thave a linear relationship with length." << + " \n\tdelay - " << low_swing.delay *1e9<< " (ns)" + " \n\tpowerD - " << low_swing.power.readOp.dynamic *1e9<< " (nJ)" + " \n\tPowerL - " << low_swing.power.readOp.leakage << " (mW)" + " \n\tPowerLgate - " << low_swing.power.readOp.gate_leakage << " (mW)\n"; + cout << "\tWire width - " <<wire_width_init * 2 /* differential */<< " microns\n"; + cout << "\tWire spacing - " <<wire_spacing_init * 2 /* differential */<< " microns\n"; + cout <<endl; + cout <<endl; + +} + diff --git a/ext/mcpat/cacti/wire.h b/ext/mcpat/cacti/wire.h new file mode 100644 index 000000000..51d55afff --- /dev/null +++ b/ext/mcpat/cacti/wire.h @@ -0,0 +1,124 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/



#ifndef __WIRE_H__
#define __WIRE_H__

#include <iostream>
#include <list>

#include "assert.h"
#include "basic_circuit.h"
#include "cacti_interface.h"
#include "component.h"
#include "parameter.h"

/*
 * Wire models an on-chip interconnect as a Component: either a repeated
 * full-swing wire (Global, Global_5 ... Global_30) or a differential
 * low-swing wire (Low_swing).
 *
 * The class keeps static tables of per-metre figures that are filled once
 * by the initializing constructor; individual wires are then obtained by
 * scaling those tables by the wire length.
 */
class Wire : public Component
{
  public:
    // Stats constructor: computes delay/power/area for one wire.  len is in
    // microns; after construction wire_length, wire_width, wire_spacing and
    // repeater_spacing are expressed in microns as well.  If the static
    // tables are not initialized yet, it initializes them with defaults.
    Wire(enum Wire_type wire_model, double len /* in u*/,
         int nsense = 1/* no. of sense amps connected to the low-swing wire */,
         double width_scaling = 1,
         double spacing_scaling = 1,
         enum Wire_placement wire_placement = outside_mat,
         double resistivity = CU_RESISTIVITY,
         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
    ~Wire();

    Wire( double width_scaling = 1,
         double spacing_scaling = 1,
         enum Wire_placement wire_placement = outside_mat,
         double resistivity = CU_RESISTIVITY,
         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
    ); // should be used only once for initializing static members

    // Fills the static tables below (called by the initializing constructor).
    void init_wire();

    // Computes this wire's figures from wt, wire_placement and wire_length (m).
    void calculate_wire_stats();
    // Delay-optimal repeater size and spacing for the current wire_length.
    void delay_optimal_wire();
    // Capacitance (F) of len metres of wire.  With call_from_outside set,
    // wire_width / wire_spacing are treated as microns for the duration of
    // the call.
    double wire_cap(double len, bool call_from_outside=false);
    // Resistance of len metres of wire.
    double wire_res(double len);
    // Transmitter + wire + sense-amp model for a low-swing wire.
    void low_swing_model();
    double signal_fall_time();
    double signal_rise_time();
    double sense_amp_input_cap();

    enum Wire_type wt;
    double wire_spacing;
    double wire_width;
    enum Wire_placement wire_placement;
    double repeater_size;
    double repeater_spacing;
    double wire_length;
    // in_rise_time == 0 makes low_swing_model() fall back to signal_rise_time().
    double in_rise_time, out_rise_time;

    void set_in_rise_time(double rt)
    {
      in_rise_time = rt;
    }

    // Static tables.  For the repeated-wire entries delay and power are per
    // metre, area.w holds the repeater spacing (m) and area.h the repeater
    // size.  global is the delay-optimal point; global_N is the cheapest
    // candidate whose delay is within N% of global.delay.  low_swing holds
    // the delay and power of the single low-swing wire built by init_wire().
    static Component global;
    static Component global_5;
    static Component global_10;
    static Component global_20;
    static Component global_30;
    static Component low_swing;
    // Width and spacing (m) computed by the initializing constructor.
    static double wire_width_init;
    static double wire_spacing_init;
    void print_wire();

  private:

    int nsense; // no. of sense amps connected to a low-swing wire if it
                // is broadcasting data to multiple destinations
    // width and spacing scaling factor can be used
    // to model low level wires or special
    // fat wires
    double w_scale, s_scale;
    double resistivity;
    // Evaluates a unit-length repeated wire with the given repeater spacing
    // (m) and size; also overwrites repeater_spacing and repeater_size.
    powerDef wire_model (double space, double size, double *delay);
    // Candidate design points collected by init_wire() and pruned by
    // update_fullswing().
    // NOTE(review): `list` is unqualified, so this header relies on an
    // included header providing a using-declaration for std -- verify.
    list <Component> repeated_wire;
    void update_fullswing();
    // Set to 1 by the initializing constructor.
    static int initialized;


    //low-swing
    Component transmitter;
    Component l_wire;
    Component sense_amp;

    double min_w_pmos;

    TechnologyParameter::DeviceType *deviceType;

};

#endif
diff --git a/ext/mcpat/core.cc b/ext/mcpat/core.cc new file mode 100644 index 000000000..ba9106061 --- /dev/null +++ b/ext/mcpat/core.cc @@ -0,0 +1,4135 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <string>

#include "XML_Parse.h"
#include "basic_circuit.h"
#include "const.h"
#include "core.h"
#include "io.h"
#include "parameter.h"
//#include "globalvar.h"

/*
 * Builds the instruction-fetch unit of core `ithCore_`.
 *
 * If exist_ is false the constructor returns right after the initializer
 * list, leaving IB, BTB and the three decoders at 0. Otherwise it constructs,
 * in this order: the icache array, its miss / fill / prefetch buffers, the
 * instruction buffer, (only when coredynp.predictionW > 0) the BTB and the
 * branch predictor, and finally three inst_decoder objects. The area of every
 * piece is added to this unit's `area`.
 *
 * All arrays are configured through the single member copy `interface_ip`,
 * which is mutated in place between constructions: any field that a section
 * does not reassign keeps the value left by the previous section, so the
 * order of the sections matters.
 *
 * NOTE(review): BPT is not in the initializer list; it is only assigned
 * inside the predictionW > 0 branch - confirm users guard on that.
 */
InstFetchU::InstFetchU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_)
:XML(XML_interface),
 ithCore(ithCore_),
 interface_ip(*interface_ip_),
 coredynp(dyn_p_),
 IB (0),
 BTB (0),
 ID_inst (0),
 ID_operand (0),
 ID_misc (0),
 exist(exist_)
{
  if (!exist) return;
  int idx, tag, data, size, line, assoc, banks;
  // debug is hard-wired to false, so the `debug?...` alternatives below
  // (fixed 32768/64/8/... values) are never selected.
  bool debug= false, is_default = true;

  clockRate = coredynp.clockRate;
  executionTime = coredynp.executionTime;
  cache_p = (Cache_policy)XML->sys.core[ithCore].icache.icache_config[7];
  //Assuming all L1 caches are virtually idxed physically tagged.
  //cache

  size = (int)XML->sys.core[ithCore].icache.icache_config[0];
  line = (int)XML->sys.core[ithCore].icache.icache_config[1];
  assoc = (int)XML->sys.core[ithCore].icache.icache_config[2];
  banks = (int)XML->sys.core[ithCore].icache.icache_config[3];
  // NOTE(review): size/line/assoc is integer division; an associativity or
  // line size of 0 in the XML would divide by zero here - confirm inputs.
  idx = debug?9:int(ceil(log2(size/line/assoc)));
  // tag width = physical address width - index bits - line-offset bits + EXTRA_TAG_BITS
  tag = debug?51:(int)XML->sys.physical_address_width-idx-int(ceil(log2(line))) + EXTRA_TAG_BITS;
  interface_ip.specific_tag = 1;
  interface_ip.tag_w = tag;
  interface_ip.cache_sz = debug?32768:(int)XML->sys.core[ithCore].icache.icache_config[0];
  interface_ip.line_sz = debug?64:(int)XML->sys.core[ithCore].icache.icache_config[1];
  interface_ip.assoc = debug?8:(int)XML->sys.core[ithCore].icache.icache_config[2];
  interface_ip.nbanks = debug?1:(int)XML->sys.core[ithCore].icache.icache_config[3];
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5];
  // icache_config[4] and [5] are divided by clockRate to give throughput and latency.
  interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;
  interface_ip.latency = debug?3.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
  interface_ip.is_cache = true;
  interface_ip.pure_cam = false;
  interface_ip.pure_ram = false;
  // The obj_func_* fields are NOT set for the icache itself; whatever the
  // caller passed in via interface_ip_ is used.
  // interface_ip.obj_func_dyn_energy = 0;
  // interface_ip.obj_func_dyn_power = 0;
  // interface_ip.obj_func_leak_power = 0;
  // interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
  interface_ip.num_rd_ports = 0;
  interface_ip.num_wr_ports = 0;
  interface_ip.num_se_rd_ports = 0;
  icache.caches = new ArrayST(&interface_ip, "icache", Core_device, coredynp.opt_local, coredynp.core_ty);
  scktRatio = g_tp.sckt_co_eff;
  chip_PR_overhead = g_tp.chip_layout_overhead;
  macro_PR_overhead = g_tp.macro_layout_overhead;
  icache.area.set_area(icache.area.get_area()+ icache.caches->local_result.area);
  area.set_area(area.get_area()+ icache.caches->local_result.area);
  //output_data_csv(icache.caches.local_result);


  /*
   *iCache controllers
   *miss buffer Each MSHR contains enough state
   *to handle one or more accesses of any type to a single memory line.
   *Due to the generality of the MSHR mechanism,
   *the amount of state involved is non-trivial:
   *including the address, pointers to the cache entry and destination register,
   *written data, and various other pieces of state.
   */
  interface_ip.num_search_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
  // entry width in bits: physical address + log2(size/line) + one icache line
  data = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + icache.caches->l_ip.line_sz*8;
  interface_ip.specific_tag = 1;
  interface_ip.tag_w = tag;
  interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
  interface_ip.cache_sz = XML->sys.core[ithCore].icache.buffer_sizes[0]*interface_ip.line_sz;
  interface_ip.assoc = 0;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 0;
  interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;//means cycle time
  interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;//means access time
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
  interface_ip.num_rd_ports = 0;
  interface_ip.num_wr_ports = 0;
  interface_ip.num_se_rd_ports = 0;
  interface_ip.num_search_ports = XML->sys.core[ithCore].number_instruction_fetch_ports;
  icache.missb = new ArrayST(&interface_ip, "icacheMissBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
  icache.area.set_area(icache.area.get_area()+ icache.missb->local_result.area);
  area.set_area(area.get_area()+ icache.missb->local_result.area);
  //output_data_csv(icache.missb.local_result);

  //fill buffer
  // One entry is one icache line; buffer_sizes[1] gives the entry count.
  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
  data = icache.caches->l_ip.line_sz;
  interface_ip.specific_tag = 1;
  interface_ip.tag_w = tag;
  interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data))));
  interface_ip.cache_sz = data*XML->sys.core[ithCore].icache.buffer_sizes[1];
  interface_ip.assoc = 0;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 0;
  interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;
  interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
  interface_ip.num_rd_ports = 0;
  interface_ip.num_wr_ports = 0;
  interface_ip.num_se_rd_ports = 0;
  interface_ip.num_search_ports = XML->sys.core[ithCore].number_instruction_fetch_ports;
  icache.ifb = new ArrayST(&interface_ip, "icacheFillBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
  icache.area.set_area(icache.area.get_area()+ icache.ifb->local_result.area);
  area.set_area(area.get_area()+ icache.ifb->local_result.area);
  //output_data_csv(icache.ifb.local_result);

  //prefetch buffer
  // One entry is one icache line; buffer_sizes[2] gives the entry count.
  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide whether to merge.
  data = icache.caches->l_ip.line_sz;//separate queue to prevent cache pollution.
  interface_ip.specific_tag = 1;
  interface_ip.tag_w = tag;
  interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data))));
  interface_ip.cache_sz = XML->sys.core[ithCore].icache.buffer_sizes[2]*interface_ip.line_sz;
  interface_ip.assoc = 0;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 0;
  interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate;
  interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;
  interface_ip.num_rd_ports = 0;
  interface_ip.num_wr_ports = 0;
  interface_ip.num_se_rd_ports = 0;
  interface_ip.num_search_ports = XML->sys.core[ithCore].number_instruction_fetch_ports;
  icache.prefetchb = new ArrayST(&interface_ip, "icacheprefetchBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
  icache.area.set_area(icache.area.get_area()+ icache.prefetchb->local_result.area);
  area.set_area(area.get_area()+ icache.prefetchb->local_result.area);
  //output_data_csv(icache.prefetchb.local_result);

  //Instruction buffer
  // Entry width in bits = instruction_length * peak_issue_width; modelled as a pure RAM.
  data = XML->sys.core[ithCore].instruction_length*XML->sys.core[ithCore].peak_issue_width;//icache.caches.l_ip.line_sz; //multiple threads timing sharing the instruction buffer.
  interface_ip.is_cache = false;
  interface_ip.pure_ram = true;
  interface_ip.pure_cam = false;
  interface_ip.line_sz = int(ceil(data/8.0));
  // threads * instruction_buffer_size * line_sz, with a floor of 64
  interface_ip.cache_sz = XML->sys.core[ithCore].number_hardware_threads*XML->sys.core[ithCore].instruction_buffer_size*interface_ip.line_sz>64?
      XML->sys.core[ithCore].number_hardware_threads*XML->sys.core[ithCore].instruction_buffer_size*interface_ip.line_sz:64;
  interface_ip.assoc = 1;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 0;
  interface_ip.throughput = 1.0/clockRate;
  interface_ip.latency = 1.0/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  //NOTE: Assuming IB is time slice shared among threads, every fetch op will at least fetch "fetch width" instructions.
  interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports;//XML->sys.core[ithCore].fetch_width;
  interface_ip.num_rd_ports = 0;
  interface_ip.num_wr_ports = 0;
  interface_ip.num_se_rd_ports = 0;
  IB = new ArrayST(&interface_ip, "InstBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
  IB->area.set_area(IB->area.get_area()+ IB->local_result.area);
  area.set_area(area.get_area()+ IB->local_result.area);
  //output_data_csv(IB.IB.local_result);

  // inst_decoder.opcode_length = XML->sys.core[ithCore].opcode_width;
  // inst_decoder.init_decoder(is_default, &interface_ip);
  // inst_decoder.full_decoder_power();

  // BTB and branch predictor are only built when the core predicts branches.
  if (coredynp.predictionW>0)
  {
    /*
     * BTB branch target buffer, accessed during IF stage. Virtually indexed and virtually tagged
     * It is only a cache without all the buffers in the cache controller since it is more like a
     * look up table than a cache with cache controller. When access miss, no load from other places
     * such as main memory (not actively fill the misses), it is passively updated under the following circumstances:
     * 1) when BPT@ID stage finds out current is a taken branch while BTB missed
     * 2) When BPT@ID stage predicts differently than BTB
     * 3) When ID stage finds out current instruction is not a branch while BTB had a hit.(mark as invalid)
     * 4) when EXEU find out wrong target has been provided from BTB.
     *
     */
    size = XML->sys.core[ithCore].BTB.BTB_config[0];
    line = XML->sys.core[ithCore].BTB.BTB_config[1];
    assoc = XML->sys.core[ithCore].BTB.BTB_config[2];
    banks = XML->sys.core[ithCore].BTB.BTB_config[3];
    // idx is computed but the active tag formula below no longer subtracts it
    // (only the commented-out variant did).
    idx = debug?9:int(ceil(log2(size/line/assoc)));
//    tag = debug?51:XML->sys.virtual_address_width-idx-int(ceil(log2(line))) + int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +EXTRA_TAG_BITS;
    tag = debug?51:XML->sys.virtual_address_width + int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +EXTRA_TAG_BITS;
    interface_ip.is_cache = true;
    interface_ip.pure_ram = false;
    interface_ip.pure_cam = false;
    interface_ip.specific_tag = 1;
    interface_ip.tag_w = tag;
    interface_ip.cache_sz = debug?32768:size;
    interface_ip.line_sz = debug?64:line;
    interface_ip.assoc = debug?8:assoc;
    interface_ip.nbanks = debug?1:banks;
    interface_ip.out_w = interface_ip.line_sz*8;
    interface_ip.access_mode = 0;//debug?0:XML->sys.core[ithCore].dcache.dcache_config[5];
    interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].BTB.BTB_config[4]/clockRate;
    interface_ip.latency = debug?3.0/clockRate:XML->sys.core[ithCore].BTB.BTB_config[5]/clockRate;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 1;
    interface_ip.num_rd_ports = coredynp.predictionW;
    interface_ip.num_wr_ports = coredynp.predictionW;
    interface_ip.num_se_rd_ports = 0;
    BTB = new ArrayST(&interface_ip, "Branch Target Buffer", Core_device, coredynp.opt_local, coredynp.core_ty);
    BTB->area.set_area(BTB->area.get_area()+ BTB->local_result.area);
    area.set_area(area.get_area()+ BTB->local_result.area);
    ///cout<<"area="<<area<<endl;

    // The predictor receives the interface_ip as left configured for the BTB.
    BPT = new BranchPredictor(XML, ithCore, &interface_ip,coredynp);
    area.set_area(area.get_area()+ BPT->area.get_area());
  }

  // Three decoders: opcode field, operand (architectural register) field,
  // and an 8-bit "misc" field.
  ID_inst = new inst_decoder(is_default, &interface_ip,
      coredynp.opcode_length, 1/*Decoder should not know how many by itself*/,
      coredynp.x86,
      Core_device, coredynp.core_ty);

  ID_operand = new inst_decoder(is_default, &interface_ip,
      coredynp.arch_ireg_width, 1,
      coredynp.x86,
      Core_device, coredynp.core_ty);

  ID_misc = new inst_decoder(is_default, &interface_ip,
      8/* Prefix field etc upto 14B*/, 1,
      coredynp.x86,
      Core_device, coredynp.core_ty);
  //TODO: X86 decoder should decode the inst in cyclic mode under the control of sequencer.
  //So the dynamic power should be multiplied by a few times.
  // Decoder area is counted once per decode slot (decodeW).
  area.set_area(area.get_area()+ (ID_inst->area.get_area()
      +ID_operand->area.get_area()
      +ID_misc->area.get_area())*coredynp.decodeW);

}


BranchPredictor::BranchPredictor(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_)
:XML(XML_interface),
 ithCore(ithCore_),
 interface_ip(*interface_ip_),
 coredynp(dyn_p_),
 globalBPT(0),
 localBPT(0),
 L1_localBPT(0),
 L2_localBPT(0),
 chooser(0),
 RAS(0),
 exist(exist_)
{
  /*
   * Branch Predictor, accessed during ID stage.
   * McPAT's branch predictor model is the tournament branch predictor used in Alpha 21264,
   * including global predictor, local two level predictor, and Chooser.
   * The Branch predictor also includes a RAS (return address stack) for function calls
   * Branch predictors are tagged by thread ID and modeled as 1-way associative $
   * However RAS return address stacks are duplicated for each thread.
   * TODO:Data Width need to be computed more precisely *
   */
  if (!exist) return;
  int tag, data;

  clockRate = coredynp.clockRate;
  executionTime = coredynp.executionTime;
  // assoc / pure_cam and the is_cache / pure_ram choice made here are not
  // reassigned by the four predictor tables below, so they apply to all of them.
  interface_ip.assoc = 1;
  interface_ip.pure_cam = false;
  if (coredynp.multithreaded)
  {

    // Multithreaded: tables are modelled as tagged caches, tag width
    // int(log2(num_hthreads) + EXTRA_TAG_BITS).
    tag = int(log2(coredynp.num_hthreads)+ EXTRA_TAG_BITS);
    interface_ip.specific_tag = 1;
    interface_ip.tag_w = tag;

    interface_ip.is_cache = true;
    interface_ip.pure_ram = false;
  }
  else
  {
    // Single-threaded: tables are plain RAMs; specific_tag / tag_w keep
    // whatever the caller's interface_ip held.
    interface_ip.is_cache = false;
    interface_ip.pure_ram = true;

  }
  //Global predictor
  // Entry size in bytes = ceil(global_predictor_bits / 8).
  data = int(ceil(XML->sys.core[ithCore].predictor.global_predictor_bits/8.0));
  interface_ip.line_sz = data;
  interface_ip.cache_sz = data*XML->sys.core[ithCore].predictor.global_predictor_entries;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 2;
  interface_ip.throughput = 1.0/clockRate;
  interface_ip.latency = 1.0/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = 0;
  interface_ip.num_rd_ports = coredynp.predictionW;
  interface_ip.num_wr_ports = coredynp.predictionW;
  interface_ip.num_se_rd_ports = 0;
  globalBPT = new ArrayST(&interface_ip, "Global Predictor", Core_device, coredynp.opt_local, coredynp.core_ty);
  globalBPT->area.set_area(globalBPT->area.get_area()+ globalBPT->local_result.area);
  area.set_area(area.get_area()+ globalBPT->local_result.area);

  //Local BPT (Level 1)
  // Entry size from local_predictor_size[0]; entry count from local_predictor_entries.
  data = int(ceil(XML->sys.core[ithCore].predictor.local_predictor_size[0]/8.0));
  interface_ip.line_sz = data;
  interface_ip.cache_sz = data*XML->sys.core[ithCore].predictor.local_predictor_entries;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 2;
  interface_ip.throughput = 1.0/clockRate;
  interface_ip.latency = 1.0/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = 0;
  interface_ip.num_rd_ports = coredynp.predictionW;
  interface_ip.num_wr_ports = coredynp.predictionW;
  interface_ip.num_se_rd_ports = 0;
  L1_localBPT = new ArrayST(&interface_ip, "L1 local Predictor", Core_device, coredynp.opt_local, coredynp.core_ty);
  L1_localBPT->area.set_area(L1_localBPT->area.get_area()+ L1_localBPT->local_result.area);
  area.set_area(area.get_area()+ L1_localBPT->local_result.area);

  //Local BPT (Level 2)
  // Entry size from local_predictor_size[1]; same entry count as level 1.
  data = int(ceil(XML->sys.core[ithCore].predictor.local_predictor_size[1]/8.0));
  interface_ip.line_sz = data;
  interface_ip.cache_sz = data*XML->sys.core[ithCore].predictor.local_predictor_entries;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 2;
  interface_ip.throughput = 1.0/clockRate;
  interface_ip.latency = 1.0/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = 0;
  interface_ip.num_rd_ports = coredynp.predictionW;
  interface_ip.num_wr_ports = coredynp.predictionW;
  interface_ip.num_se_rd_ports = 0;
  L2_localBPT = new ArrayST(&interface_ip, "L2 local Predictor", Core_device, coredynp.opt_local, coredynp.core_ty);
  L2_localBPT->area.set_area(L2_localBPT->area.get_area()+ L2_localBPT->local_result.area);
  area.set_area(area.get_area()+ L2_localBPT->local_result.area);

  //Chooser
  data = int(ceil(XML->sys.core[ithCore].predictor.chooser_predictor_bits/8.0));
  interface_ip.line_sz = data;
  interface_ip.cache_sz = data*XML->sys.core[ithCore].predictor.chooser_predictor_entries;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 2;
  interface_ip.throughput = 1.0/clockRate;
  interface_ip.latency = 1.0/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = 0;
  interface_ip.num_rd_ports = coredynp.predictionW;
  interface_ip.num_wr_ports = coredynp.predictionW;
  interface_ip.num_se_rd_ports = 0;
  chooser = new ArrayST(&interface_ip, "Predictor Chooser", Core_device, coredynp.opt_local, coredynp.core_ty);
  chooser->area.set_area(chooser->area.get_area()+ chooser->local_result.area);
  area.set_area(area.get_area()+ chooser->local_result.area);

  //RAS return address stacks are Duplicated for each thread.
  // Always a pure RAM regardless of the multithreaded setting above; one array
  // is built and its area is multiplied by num_hthreads.
  interface_ip.is_cache = false;
  interface_ip.pure_ram = true;
  data = int(ceil(coredynp.pc_width/8.0));
  interface_ip.line_sz = data;
  interface_ip.cache_sz = data*XML->sys.core[ithCore].RAS_size;
  interface_ip.assoc = 1;
  interface_ip.nbanks = 1;
  interface_ip.out_w = interface_ip.line_sz*8;
  interface_ip.access_mode = 2;
  interface_ip.throughput = 1.0/clockRate;
  interface_ip.latency = 1.0/clockRate;
  interface_ip.obj_func_dyn_energy = 0;
  interface_ip.obj_func_dyn_power = 0;
  interface_ip.obj_func_leak_power = 0;
  interface_ip.obj_func_cycle_t = 1;
  interface_ip.num_rw_ports = 0;
  interface_ip.num_rd_ports = coredynp.predictionW;
  interface_ip.num_wr_ports = coredynp.predictionW;
  interface_ip.num_se_rd_ports = 0;
  RAS = new ArrayST(&interface_ip, "RAS", Core_device, coredynp.opt_local, coredynp.core_ty);
  RAS->area.set_area(RAS->area.get_area()+ RAS->local_result.area*coredynp.num_hthreads);
  area.set_area(area.get_area()+ RAS->local_result.area*coredynp.num_hthreads);

}

SchedulerU::SchedulerU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_)
:XML(XML_interface),
 ithCore(ithCore_),
 interface_ip(*interface_ip_),
 coredynp(dyn_p_),
 int_inst_window(0),
 fp_inst_window(0),
 ROB(0),
 instruction_selection(0),
 exist(exist_)
 {
  // Scheduler construction. Nothing is built when exist is false. Two
  // independent cases follow:
  //   - in-order AND multithreaded core: one unified instruction window plus
  //     selection logic;
  //   - OOO core: integer window, FP window, optional ROB (ROB_size > 0) and
  //     selection logic.
  // An in-order, non-multithreaded core matches neither case and leaves all
  // four pointers at 0.
  if (!exist) return;
  int tag, data;
  bool is_default=true;
  string tmp_name;

  clockRate = coredynp.clockRate;
  executionTime = coredynp.executionTime;
  if ((coredynp.core_ty==Inorder && coredynp.multithreaded))
  {
    //Instruction issue queue, in-order multi-issue or multithreaded processor also has this structure. Unified window for Inorder processors
    tag = int(log2(XML->sys.core[ithCore].number_hardware_threads)*coredynp.perThreadState);//This is the normal thread state bits based on Niagara Design
    data = XML->sys.core[ithCore].instruction_length;
    //NOTE: x86 inst can be very lengthy, up to 15B. Source: Intel® 64 and IA-32 Architectures
    //Software Developer’s Manual
    interface_ip.is_cache = true;
    interface_ip.pure_cam = false;
    interface_ip.pure_ram = false;
    interface_ip.line_sz = int(ceil(data/8.0));
    interface_ip.specific_tag = 1;
    interface_ip.tag_w = tag;
    // instruction_window_size * line_sz, with a floor of 64
    interface_ip.cache_sz = XML->sys.core[ithCore].instruction_window_size*interface_ip.line_sz>64?XML->sys.core[ithCore].instruction_window_size*interface_ip.line_sz:64;
    interface_ip.assoc = 0;
    interface_ip.nbanks = 1;
    interface_ip.out_w = interface_ip.line_sz*8;
    interface_ip.access_mode = 1;
    interface_ip.throughput = 1.0/clockRate;
    interface_ip.latency = 1.0/clockRate;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 0;
    interface_ip.num_rd_ports = coredynp.peak_issueW;
    interface_ip.num_wr_ports = coredynp.peak_issueW;
    interface_ip.num_se_rd_ports = 0;
    interface_ip.num_search_ports = coredynp.peak_issueW;
    int_inst_window = new ArrayST(&interface_ip, "InstFetchQueue", Core_device, coredynp.opt_local, coredynp.core_ty);
    // Window area is counted once per pipeline (num_pipelines).
    int_inst_window->area.set_area(int_inst_window->area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
    area.set_area(area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
    //output_data_csv(iRS.RS.local_result);
    Iw_height =int_inst_window->local_result.cache_ht;

    /*
     * selection logic
     * In a single-issue Inorder multithreaded processor like Niagara, issue width=1*number_of_threads since the processor does need to pick up
     * instructions from multiple ready ones(although these ready ones are from different threads).While SMT processors do not distinguish which thread belongs to who
     * at the issue stage.
     */

    instruction_selection = new selection_logic(is_default, XML->sys.core[ithCore].instruction_window_size,
        coredynp.peak_issueW*XML->sys.core[ithCore].number_hardware_threads,
        &interface_ip, Core_device, coredynp.core_ty);
  }

  if (coredynp.core_ty==OOO)
  {
    /*
     * CAM based instruction window
     * For physicalRegFilebased OOO it is the instruction issue queue, where only tags of phy regs are stored
     * For RS based OOO it is the Reservation station, where both tags and values of phy regs are stored
     * It is written once and read twice(two operands) before an instruction can be issued.
     * X86 instruction can be very long up to 15B. add instruction length in XML
     */
    if(coredynp.scheu_ty==PhysicalRegFile)
    {
      tag = coredynp.phy_ireg_width;
      // Each time only half of the tag is compared, but two tag should be stored.
      // This underestimate the search power
      data = int((ceil((coredynp.instruction_length+2*(coredynp.phy_ireg_width - coredynp.arch_ireg_width))/2.0)/8.0));
      //Data width being divided by 2 means only after both operands available the whole data will be read out.
      //This is modeled using two equivalent readouts with half of the data width
      tmp_name = "InstIssueQueue";
    }
    else
    {
      tag = coredynp.phy_ireg_width;
      // Each time only half of the tag is compared, but two tag should be stored.
      // This underestimate the search power
      data = int(ceil(((coredynp.instruction_length+2*(coredynp.phy_ireg_width - coredynp.arch_ireg_width)+
          2*coredynp.int_data_width)/2.0)/8.0));
      //Data width being divided by 2 means only after both operands available the whole data will be read out.
      //This is modeled using two equivalent readouts with half of the data width

      tmp_name = "IntReservationStation";
    }
    interface_ip.is_cache = true;
    interface_ip.pure_cam = false;
    interface_ip.pure_ram = false;
    interface_ip.line_sz = data;
    interface_ip.cache_sz = data*XML->sys.core[ithCore].instruction_window_size;
    interface_ip.assoc = 0;
    interface_ip.nbanks = 1;
    interface_ip.out_w = interface_ip.line_sz*8;
    interface_ip.specific_tag = 1;
    interface_ip.tag_w = tag;
    interface_ip.access_mode = 0;
    // Integer window uses twice the clock period for throughput and latency
    // (the FP window below uses one period).
    interface_ip.throughput = 2*1.0/clockRate;
    interface_ip.latency = 2*1.0/clockRate;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 0;
    interface_ip.num_rd_ports = coredynp.peak_issueW;
    interface_ip.num_wr_ports = coredynp.peak_issueW;
    interface_ip.num_se_rd_ports = 0;
    interface_ip.num_search_ports = coredynp.peak_issueW;
    int_inst_window = new ArrayST(&interface_ip, tmp_name, Core_device, coredynp.opt_local, coredynp.core_ty);
    int_inst_window->area.set_area(int_inst_window->area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
    area.set_area(area.get_area()+ int_inst_window->local_result.area*coredynp.num_pipelines);
    Iw_height =int_inst_window->local_result.cache_ht;
    //FU inst window
    if(coredynp.scheu_ty==PhysicalRegFile)
    {
      tag = 2*coredynp.phy_freg_width;// TODO: each time only half of the tag is compared
      data = int(ceil((coredynp.instruction_length+2*(coredynp.phy_freg_width - coredynp.arch_freg_width))/8.0));
      tmp_name = "FPIssueQueue";
    }
    else
    {
      // NOTE(review): the PhysicalRegFile branch above uses phy_freg_width for
      // the FP tag, while this branch uses phy_ireg_width - confirm intended.
      tag = 2*coredynp.phy_ireg_width;
      data = int(ceil((coredynp.instruction_length+2*(coredynp.phy_freg_width - coredynp.arch_freg_width)+
          2*coredynp.fp_data_width)/8.0));
      tmp_name = "FPReservationStation";
    }
    interface_ip.is_cache = true;
    interface_ip.pure_cam = false;
    interface_ip.pure_ram = false;
    interface_ip.line_sz = data;
    interface_ip.cache_sz = data*XML->sys.core[ithCore].fp_instruction_window_size;
    interface_ip.assoc = 0;
    interface_ip.nbanks = 1;
    interface_ip.out_w = interface_ip.line_sz*8;
    interface_ip.specific_tag = 1;
    interface_ip.tag_w = tag;
    interface_ip.access_mode = 0;
    interface_ip.throughput = 1.0/clockRate;
    interface_ip.latency = 1.0/clockRate;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 0;
    interface_ip.num_rd_ports = coredynp.fp_issueW;
    interface_ip.num_wr_ports = coredynp.fp_issueW;
    interface_ip.num_se_rd_ports = 0;
    interface_ip.num_search_ports = coredynp.fp_issueW;
    fp_inst_window = new ArrayST(&interface_ip, tmp_name, Core_device, coredynp.opt_local, coredynp.core_ty);
    // FP window area is counted once per FP pipeline (num_fp_pipelines).
    fp_inst_window->area.set_area(fp_inst_window->area.get_area()+ fp_inst_window->local_result.area*coredynp.num_fp_pipelines);
    area.set_area(area.get_area()+ fp_inst_window->local_result.area*coredynp.num_fp_pipelines);
    fp_Iw_height =fp_inst_window->local_result.cache_ht;

    if (XML->sys.core[ithCore].ROB_size >0)
    {
      /*
       * if ROB_size = 0, then the target processor does not support hardware-based
       * speculation, i.e. , the processor allow OOO issue as well as OOO completion, which
       * means branch must be resolved before instruction issued into instruction window, since
       * there is no change to flush miss-predict branch path after instructions are issued in this situation.
       *
       * ROB.ROB size = inflight inst. ROB is unified for int and fp inst.
       * One old approach is to combine the RAT and ROB as a huge CAM structure as in AMD K7.
       * However, this approach is abandoned due to its high power and poor scalability.
       * McPAT uses current implementation of ROB as circular buffer.
       * ROB is written once when instruction is issued and read once when the instruction is committed.         *
       */
      int robExtra = int(ceil(5 + log2(coredynp.num_hthreads)));
      //5 bits are: busy, Issued, Finished, speculative, valid
      if(coredynp.scheu_ty==PhysicalRegFile)
      {
        //PC is to id the instruction for recover exception.
        //inst is used to map the renamed dest. registers.so that commit stage can know which reg/RRAT to update
//        data = int(ceil((robExtra+coredynp.pc_width +
//            coredynp.instruction_length + 2*coredynp.phy_ireg_width)/8.0));
        data = int(ceil((robExtra+coredynp.pc_width +
            coredynp.phy_ireg_width)/8.0));
      }
      else
      {
        //in RS based OOO, ROB also contains value of destination reg
//        data = int(ceil((robExtra+coredynp.pc_width +
//            coredynp.instruction_length + 2*coredynp.phy_ireg_width + coredynp.fp_data_width)/8.0));
        data = int(ceil((robExtra + coredynp.pc_width +
            coredynp.phy_ireg_width + coredynp.fp_data_width)/8.0));
      }
      // ROB is a pure RAM: read ports follow commit width, write ports follow
      // issue width, no search ports.
      interface_ip.is_cache = false;
      interface_ip.pure_cam = false;
      interface_ip.pure_ram = true;
      interface_ip.line_sz = data;
      interface_ip.cache_sz = data*XML->sys.core[ithCore].ROB_size;//The XML ROB size is for all threads
      interface_ip.assoc = 1;
      interface_ip.nbanks = 1;
      interface_ip.out_w = interface_ip.line_sz*8;
      interface_ip.access_mode = 1;
      interface_ip.throughput = 1.0/clockRate;
      interface_ip.latency = 1.0/clockRate;
      interface_ip.obj_func_dyn_energy = 0;
      interface_ip.obj_func_dyn_power = 0;
      interface_ip.obj_func_leak_power = 0;
      interface_ip.obj_func_cycle_t = 1;
      interface_ip.num_rw_ports = 0;
      interface_ip.num_rd_ports = coredynp.peak_commitW;
      interface_ip.num_wr_ports = coredynp.peak_issueW;
      interface_ip.num_se_rd_ports = 0;
      interface_ip.num_search_ports = 0;
      ROB = new ArrayST(&interface_ip, "ReorderBuffer", Core_device, coredynp.opt_local, coredynp.core_ty);
      ROB->area.set_area(ROB->area.get_area()+ ROB->local_result.area*coredynp.num_pipelines);
      area.set_area(area.get_area()+ ROB->local_result.area*coredynp.num_pipelines);
      ROB_height =ROB->local_result.cache_ht;
    }

    // Selection logic is sized by the integer instruction_window_size and
    // peak_issueW (no per-thread multiplier in the OOO case).
    instruction_selection = new selection_logic(is_default, XML->sys.core[ithCore].instruction_window_size,
        coredynp.peak_issueW, &interface_ip, Core_device, coredynp.core_ty);
  }
}

LoadStoreU::LoadStoreU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_)
:XML(XML_interface),
 ithCore(ithCore_),
 interface_ip(*interface_ip_),
 coredynp(dyn_p_),
 LSQ(0),
 exist(exist_)
{
  if (!exist) return;
  int idx, tag, data, size, line, assoc, banks;
  bool debug= false;
  int ldst_opcode = XML->sys.core[ithCore].opcode_width;//16;

  clockRate = coredynp.clockRate;
  executionTime = coredynp.executionTime;
  cache_p = (Cache_policy)XML->sys.core[ithCore].dcache.dcache_config[7];

  interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports;
  interface_ip.is_cache = true;
  interface_ip.pure_cam = false;
  interface_ip.pure_ram = false;
  //Dcache
  size = (int)XML->sys.core[ithCore].dcache.dcache_config[0];
  line = (int)XML->sys.core[ithCore].dcache.dcache_config[1];
  assoc = (int)XML->sys.core[ithCore].dcache.dcache_config[2];
  banks = (int)XML->sys.core[ithCore].dcache.dcache_config[3];
  idx = debug?9:int(ceil(log2(size/line/assoc)));
  tag = debug?51:XML->sys.physical_address_width-idx-int(ceil(log2(line))) + EXTRA_TAG_BITS;
  interface_ip.specific_tag = 1;
  interface_ip.tag_w = tag;
  interface_ip.cache_sz = debug?32768:(int)XML->sys.core[ithCore].dcache.dcache_config[0];
  interface_ip.line_sz = debug?64:(int)XML->sys.core[ithCore].dcache.dcache_config[1];
  interface_ip.assoc =
debug?8:(int)XML->sys.core[ithCore].dcache.dcache_config[2]; + interface_ip.nbanks = debug?1:(int)XML->sys.core[ithCore].dcache.dcache_config[3]; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0;//debug?0:XML->sys.core[ithCore].dcache.dcache_config[5]; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; + interface_ip.latency = debug?3.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; + interface_ip.is_cache = true; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].memory_ports;//usually In-order has 1 and OOO has 2 at least. + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + dcache.caches = new ArrayST(&interface_ip, "dcache", Core_device, coredynp.opt_local, coredynp.core_ty); + dcache.area.set_area(dcache.area.get_area()+ dcache.caches->local_result.area); + area.set_area(area.get_area()+ dcache.caches->local_result.area); + //output_data_csv(dcache.caches.local_result); + + //dCache controllers + //miss buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + dcache.caches->l_ip.line_sz*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0)); + interface_ip.cache_sz = XML->sys.core[ithCore].dcache.buffer_sizes[0]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 2; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; + interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; 
+ interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].memory_ports;; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + dcache.missb = new ArrayST(&interface_ip, "dcacheMissBuffer", Core_device, coredynp.opt_local, coredynp.core_ty); + dcache.area.set_area(dcache.area.get_area()+ dcache.missb->local_result.area); + area.set_area(area.get_area()+ dcache.missb->local_result.area); + //output_data_csv(dcache.missb.local_result); + + //fill buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = dcache.caches->l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = data*XML->sys.core[ithCore].dcache.buffer_sizes[1]; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 2; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; + interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].memory_ports;; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + dcache.ifb = new ArrayST(&interface_ip, "dcacheFillBuffer", Core_device, coredynp.opt_local, coredynp.core_ty); + dcache.area.set_area(dcache.area.get_area()+ dcache.ifb->local_result.area); + area.set_area(area.get_area()+ dcache.ifb->local_result.area); + //output_data_csv(dcache.ifb.local_result); + + //prefetch buffer + tag = 
XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge. + data = dcache.caches->l_ip.line_sz;//separate queue to prevent from cache polution. + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = XML->sys.core[ithCore].dcache.buffer_sizes[2]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 2; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; + interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = debug?1:XML->sys.core[ithCore].memory_ports;; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + dcache.prefetchb = new ArrayST(&interface_ip, "dcacheprefetchBuffer", Core_device, coredynp.opt_local, coredynp.core_ty); + dcache.area.set_area(dcache.area.get_area()+ dcache.prefetchb->local_result.area); + area.set_area(area.get_area()+ dcache.prefetchb->local_result.area); + //output_data_csv(dcache.prefetchb.local_result); + + //WBB + + if (cache_p==Write_back) + { + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = dcache.caches->l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data; + interface_ip.cache_sz = XML->sys.core[ithCore].dcache.buffer_sizes[3]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 2; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; + 
interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = XML->sys.core[ithCore].memory_ports; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + dcache.wbb = new ArrayST(&interface_ip, "dcacheWBB", Core_device, coredynp.opt_local, coredynp.core_ty); + dcache.area.set_area(dcache.area.get_area()+ dcache.wbb->local_result.area); + area.set_area(area.get_area()+ dcache.wbb->local_result.area); + //output_data_csv(dcache.wbb.local_result); + } + + /* + * LSU--in-order processors do not have separate load queue: unified lsq + * partitioned among threads + * it is actually the store queue but for inorder processors it serves as both loadQ and StoreQ + */ + tag = ldst_opcode+XML->sys.virtual_address_width +int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) + EXTRA_TAG_BITS; + data = XML->sys.machine_bits; + interface_ip.is_cache = true; + interface_ip.line_sz = int(ceil(data/32.0))*4; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.cache_sz = XML->sys.core[ithCore].store_buffer_size*interface_ip.line_sz*XML->sys.core[ithCore].number_hardware_threads; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = XML->sys.core[ithCore].memory_ports; + interface_ip.num_wr_ports = XML->sys.core[ithCore].memory_ports; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports 
=XML->sys.core[ithCore].memory_ports; + LSQ = new ArrayST(&interface_ip, "Load(Store)Queue", Core_device, coredynp.opt_local, coredynp.core_ty); + LSQ->area.set_area(LSQ->area.get_area()+ LSQ->local_result.area); + area.set_area(area.get_area()+ LSQ->local_result.area); + area.set_area(area.get_area()*cdb_overhead); + //output_data_csv(LSQ.LSQ.local_result); + lsq_height=LSQ->local_result.cache_ht*sqrt(cdb_overhead);/*XML->sys.core[ithCore].number_hardware_threads*/ + + if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) + { + interface_ip.line_sz = int(ceil(data/32.0))*4; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.cache_sz = XML->sys.core[ithCore].load_buffer_size*interface_ip.line_sz*XML->sys.core[ithCore].number_hardware_threads; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = XML->sys.core[ithCore].memory_ports; + interface_ip.num_wr_ports = XML->sys.core[ithCore].memory_ports; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports =XML->sys.core[ithCore].memory_ports; + LoadQ = new ArrayST(&interface_ip, "LoadQueue", Core_device, coredynp.opt_local, coredynp.core_ty); + LoadQ->area.set_area(LoadQ->area.get_area()+ LoadQ->local_result.area); + area.set_area(area.get_area()+ LoadQ->local_result.area); + area.set_area(area.get_area()*cdb_overhead); + //output_data_csv(LoadQ.LoadQ.local_result); + lsq_height=(LSQ->local_result.cache_ht + LoadQ->local_result.cache_ht)*sqrt(cdb_overhead);/*XML->sys.core[ithCore].number_hardware_threads*/ + } + +} + +MemManU::MemManU(ParseXML* XML_interface, int ithCore_, 
InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + coredynp(dyn_p_), + itlb(0), + dtlb(0), + exist(exist_) +{ + if (!exist) return; + int tag, data; + bool debug= false; + + clockRate = coredynp.clockRate; + executionTime = coredynp.executionTime; + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.specific_tag = 1; + //Itlb TLBs are partioned among threads according to Nigara and Nehalem + tag = XML->sys.virtual_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))) + int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads)))+ EXTRA_TAG_BITS; + data = XML->sys.physical_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))); + interface_ip.tag_w = tag; + interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0)); + interface_ip.cache_sz = XML->sys.core[ithCore].itlb.number_entries*interface_ip.line_sz;//*XML->sys.core[ithCore].number_hardware_threads; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[4]/clockRate; + interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].icache.icache_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports = debug?1:XML->sys.core[ithCore].number_instruction_fetch_ports; + itlb = new ArrayST(&interface_ip, "ITLB", Core_device, coredynp.opt_local, coredynp.core_ty); + 
itlb->area.set_area(itlb->area.get_area()+ itlb->local_result.area); + area.set_area(area.get_area()+ itlb->local_result.area); + //output_data_csv(itlb.tlb.local_result); + + //dtlb + tag = XML->sys.virtual_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))) +int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads)))+ EXTRA_TAG_BITS; + data = XML->sys.physical_address_width- int(floor(log2(XML->sys.virtual_memory_page_size))); + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0)); + interface_ip.cache_sz = XML->sys.core[ithCore].dtlb.number_entries*interface_ip.line_sz;//*XML->sys.core[ithCore].number_hardware_threads; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[4]/clockRate; + interface_ip.latency = debug?1.0/clockRate:XML->sys.core[ithCore].dcache.dcache_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = XML->sys.core[ithCore].memory_ports; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports; + dtlb = new ArrayST(&interface_ip, "DTLB", Core_device, coredynp.opt_local, coredynp.core_ty); + dtlb->area.set_area(dtlb->area.get_area()+ dtlb->local_result.area); + area.set_area(area.get_area()+ dtlb->local_result.area); + //output_data_csv(dtlb.tlb.local_result); + +} + +RegFU::RegFU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + coredynp(dyn_p_), + IRF (0), + FRF (0), 
+ RFWIN (0), + exist(exist_) + { + /* + * processors have separate architectural register files for each thread. + * therefore, the bypass buses need to travel across all the register files. + */ + if (!exist) return; + int data; + + clockRate = coredynp.clockRate; + executionTime = coredynp.executionTime; + //**********************************IRF*************************************** + data = coredynp.int_data_width; + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = int(ceil(data/32.0))*4; + interface_ip.cache_sz = coredynp.num_IRF_entry*interface_ip.line_sz; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//this is the transfer port for saving/restoring states when exceptions happen. 
+ interface_ip.num_rd_ports = 2*coredynp.peak_issueW; + interface_ip.num_wr_ports = coredynp.peak_issueW; + interface_ip.num_se_rd_ports = 0; + IRF = new ArrayST(&interface_ip, "Integer Register File", Core_device, coredynp.opt_local, coredynp.core_ty); + IRF->area.set_area(IRF->area.get_area()+ IRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_pipelines*cdb_overhead); + area.set_area(area.get_area()+ IRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_pipelines*cdb_overhead); + //area.set_area(area.get_area()*cdb_overhead); + //output_data_csv(IRF.RF.local_result); + + //**********************************FRF*************************************** + data = coredynp.fp_data_width; + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = int(ceil(data/32.0))*4; + interface_ip.cache_sz = coredynp.num_FRF_entry*interface_ip.line_sz; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//this is the transfer port for saving/restoring states when exceptions happen. 
+ interface_ip.num_rd_ports = 2*XML->sys.core[ithCore].issue_width; + interface_ip.num_wr_ports = XML->sys.core[ithCore].issue_width; + interface_ip.num_se_rd_ports = 0; + FRF = new ArrayST(&interface_ip, "Floating point Register File", Core_device, coredynp.opt_local, coredynp.core_ty); + FRF->area.set_area(FRF->area.get_area()+ FRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_fp_pipelines*cdb_overhead); + area.set_area(area.get_area()+ FRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_fp_pipelines*cdb_overhead); + //area.set_area(area.get_area()*cdb_overhead); + //output_data_csv(FRF.RF.local_result); + int_regfile_height= IRF->local_result.cache_ht*XML->sys.core[ithCore].number_hardware_threads*sqrt(cdb_overhead); + fp_regfile_height = FRF->local_result.cache_ht*XML->sys.core[ithCore].number_hardware_threads*sqrt(cdb_overhead); + //since a EXU is associated with each pipeline, the cdb should not have longer length. + if (coredynp.regWindowing) + { + //*********************************REG_WIN************************************ + data = coredynp.int_data_width; //ECC, and usually 2 regs are transfered together during window shifting.Niagara Mega cell + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = int(ceil(data/8.0)); + interface_ip.cache_sz = XML->sys.core[ithCore].register_windows_size*IRF->l_ip.cache_sz*XML->sys.core[ithCore].number_hardware_threads; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 4.0/clockRate; + interface_ip.latency = 4.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//this is the transfer port for saving/restoring states when exceptions 
happen. + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + RFWIN = new ArrayST(&interface_ip, "RegWindow", Core_device, coredynp.opt_local, coredynp.core_ty); + RFWIN->area.set_area(RFWIN->area.get_area()+ RFWIN->local_result.area*coredynp.num_pipelines); + area.set_area(area.get_area()+ RFWIN->local_result.area*coredynp.num_pipelines); + //output_data_csv(RFWIN.RF.local_result); + } + + + } + +EXECU::EXECU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, double lsq_height_, const CoreDynParam & dyn_p_, bool exist_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + lsq_height(lsq_height_), + coredynp(dyn_p_), + rfu(0), + scheu(0), + fp_u(0), + exeu(0), + mul(0), + int_bypass(0), + intTagBypass(0), + int_mul_bypass(0), + intTag_mul_Bypass(0), + fp_bypass(0), + fpTagBypass(0), + exist(exist_) +{ + if (!exist) return; + double fu_height = 0.0; + clockRate = coredynp.clockRate; + executionTime = coredynp.executionTime; + rfu = new RegFU(XML, ithCore, &interface_ip,coredynp); + scheu = new SchedulerU(XML, ithCore, &interface_ip,coredynp); + exeu = new FunctionalUnit(XML, ithCore,&interface_ip, coredynp, ALU); + area.set_area(area.get_area()+ exeu->area.get_area() + rfu->area.get_area() +scheu->area.get_area() ); + fu_height = exeu->FU_height; + if (coredynp.num_fpus >0) + { + fp_u = new FunctionalUnit(XML, ithCore,&interface_ip, coredynp, FPU); + area.set_area(area.get_area()+ fp_u->area.get_area()); + } + if (coredynp.num_muls >0) + { + mul = new FunctionalUnit(XML, ithCore,&interface_ip, coredynp, MUL); + area.set_area(area.get_area()+ mul->area.get_area()); + fu_height += mul->FU_height; + } + /* + * broadcast logic, including int-broadcast; int_tag-broadcast; fp-broadcast; fp_tag-broadcast + * integer by pass has two paths and fp has 3 paths. 
+ * on the same bus there are multiple tri-state drivers and muxes that go to different components on the same bus + */ + if (XML->sys.Embedded) + { + interface_ip.wt =Global_30; + interface_ip.wire_is_mat_type = 0; + interface_ip.wire_os_mat_type = 0; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + } + else + { + interface_ip.wt =Global; + interface_ip.wire_is_mat_type = 2;//start from semi-global since local wires are already used + interface_ip.wire_os_mat_type = 2; + interface_ip.throughput = 10.0/clockRate; //Do not care + interface_ip.latency = 10.0/clockRate; + } + + if (coredynp.core_ty==Inorder) + { + int_bypass = new interconnect("Int Bypass Data", Core_device, 1, 1, int(ceil(XML->sys.machine_bits/32.0)*32), + rfu->int_regfile_height + exeu->FU_height + lsq_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() + int_bypass->area.get_area()); + intTagBypass = new interconnect("Int Bypass tag" , Core_device, 1, 1, coredynp.perThreadState, + rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +intTagBypass->area.get_area()); + + if (coredynp.num_muls>0) + { + int_mul_bypass = new interconnect("Mul Bypass Data" , Core_device, 1, 1, int(ceil(XML->sys.machine_bits/32.0)*32*1.5), + rfu->fp_regfile_height + exeu->FU_height + mul->FU_height + lsq_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +int_mul_bypass->area.get_area()); + intTag_mul_Bypass = new interconnect("Mul Bypass tag" , Core_device, 1, 1, coredynp.perThreadState, + rfu->fp_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() 
+intTag_mul_Bypass->area.get_area()); + } + + if (coredynp.num_fpus>0) + { + fp_bypass = new interconnect("FP Bypass Data" , Core_device, 1, 1, int(ceil(XML->sys.machine_bits/32.0)*32*1.5), + rfu->fp_regfile_height + fp_u->FU_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +fp_bypass->area.get_area()); + fpTagBypass = new interconnect("FP Bypass tag" , Core_device, 1, 1, coredynp.perThreadState, + rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +fpTagBypass->area.get_area()); + } + } + else + {//OOO + if (coredynp.scheu_ty==PhysicalRegFile) + { + /* For physical register based OOO, + * data broadcast interconnects cover across functional units, lsq, inst windows and register files, + * while tag broadcast interconnects also cover across ROB + */ + int_bypass = new interconnect("Int Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)), + rfu->int_regfile_height + exeu->FU_height + lsq_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +int_bypass->area.get_area()); + intTagBypass = new interconnect("Int Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width, + rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + + if (coredynp.num_muls>0) + { + int_mul_bypass = new interconnect("Mul Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)), + rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + intTag_mul_Bypass = new interconnect("Mul Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width, + rfu->int_regfile_height + exeu->FU_height + 
mul->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +int_mul_bypass->area.get_area()); + bypass.area.set_area(bypass.area.get_area() +intTag_mul_Bypass->area.get_area()); + } + + if (coredynp.num_fpus>0) + { + fp_bypass = new interconnect("FP Bypass Data" , Core_device, 1, 1, int(ceil(coredynp.fp_data_width)), + rfu->fp_regfile_height + fp_u->FU_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + fpTagBypass = new interconnect("FP Bypass tag" , Core_device, 1, 1, coredynp.phy_freg_width, + rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->fp_Iw_height + scheu->ROB_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +fp_bypass->area.get_area()); + bypass.area.set_area(bypass.area.get_area() +fpTagBypass->area.get_area()); + } + } + else + { + /* + * In RS based processor both data and tag are broadcast together, + * covering functional units, lsq, nst windows, register files, and ROBs + */ + int_bypass = new interconnect("Int Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)), + rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + intTagBypass = new interconnect("Int Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width, + rfu->int_regfile_height + exeu->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +int_bypass->area.get_area()); + bypass.area.set_area(bypass.area.get_area() +intTagBypass->area.get_area()); + if (coredynp.num_muls>0) + { + int_mul_bypass = new interconnect("Mul Bypass Data", Core_device, 1, 1, int(ceil(coredynp.int_data_width)), + 
rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + intTag_mul_Bypass = new interconnect("Mul Bypass tag" , Core_device, 1, 1, coredynp.phy_ireg_width, + rfu->int_regfile_height + exeu->FU_height + mul->FU_height + lsq_height + scheu->Iw_height + scheu->ROB_height , &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +int_mul_bypass->area.get_area()); + bypass.area.set_area(bypass.area.get_area() +intTag_mul_Bypass->area.get_area()); + } + + if (coredynp.num_fpus>0) + { + fp_bypass = new interconnect("FP Bypass Data" , Core_device, 1, 1, int(ceil(coredynp.fp_data_width)), + rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->fp_Iw_height + scheu->ROB_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + fpTagBypass = new interconnect("FP Bypass tag" , Core_device, 1, 1, coredynp.phy_freg_width, + rfu->fp_regfile_height + fp_u->FU_height + lsq_height + scheu->fp_Iw_height + scheu->ROB_height, &interface_ip, 3, + false, 1.0, coredynp.opt_local, coredynp.core_ty); + bypass.area.set_area(bypass.area.get_area() +fp_bypass->area.get_area()); + bypass.area.set_area(bypass.area.get_area() +fpTagBypass->area.get_area()); + } + } + + + } + area.set_area(area.get_area()+ bypass.area.get_area()); +} + +RENAMINGU::RENAMINGU(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_,bool exist_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + coredynp(dyn_p_), + iFRAT(0), + fFRAT(0), + iRRAT(0), + fRRAT(0), + ifreeL(0), + ffreeL(0), + idcl(0), + fdcl(0), + RAHT(0), + exist(exist_) + { + /* + * Although renaming logic maybe be used in in-order processors, + * McPAT assumes no renaming logic is used since the performance gain is very limited and + * the only major inorder processor 
with renaming logic is Itainium + * that is a VLIW processor and different from current McPAT's model. + * physical register base OOO must have Dual-RAT architecture or equivalent structure.FRAT:FrontRAT, RRAT:RetireRAT; + * i,f prefix mean int and fp + * RAT for all Renaming logic, random accessible checkpointing is used, but only update when instruction retires. + * FRAT will be read twice and written once per instruction; + * RRAT will be write once per instruction when committing and reads out all when context switch + * checkpointing is implicit + * Renaming logic is duplicated for each different hardware threads + * + * No Dual-RAT is needed in RS-based OOO processors, + * however, RAT needs to do associative search in RAT, when instruction commits and ROB release the entry, + * to make sure all the renamings associated with the ROB to be released are updated at the same time. + * RAM scheme has # ARchi Reg entry with each entry hold phy reg tag, + * CAM scheme has # Phy Reg entry with each entry hold ARchi reg tag, + * + * Both RAM and CAM have same DCL + */ + if (!exist) return; + int tag, data, out_w; +// interface_ip.wire_is_mat_type = 0; +// interface_ip.wire_os_mat_type = 0; +// interface_ip.wt = Global_30; + clockRate = coredynp.clockRate; + executionTime = coredynp.executionTime; + if (coredynp.core_ty==OOO) + { + //integer pipeline + if (coredynp.scheu_ty==PhysicalRegFile) + { + if (coredynp.rm_ty ==RAMbased) + { //FRAT with global checkpointing (GCs) please see paper tech report for detailed explaintions + data = 33;//int(ceil(coredynp.phy_ireg_width*(1+coredynp.globalCheckpoint)/8.0)); +// data = int(ceil(coredynp.phy_ireg_width/8.0)); + out_w = 1;//int(ceil(coredynp.phy_ireg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].archi_Regs_IRF_size; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + 
interface_ip.out_w = out_w*8; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//the extra one port is for GCs + interface_ip.num_rd_ports = 2*coredynp.decodeW; + interface_ip.num_wr_ports = coredynp.decodeW; + interface_ip.num_se_rd_ports = 0; + iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ iFRAT->area.get_area()); + +// //RAHT According to Intel, combine GC with FRAT is very costly. +// data = int(ceil(coredynp.phy_ireg_width/8.0)*coredynp.num_IRF_entry); +// out_w = data; +// interface_ip.is_cache = false; +// interface_ip.pure_cam = false; +// interface_ip.pure_ram = true; +// interface_ip.line_sz = data; +// interface_ip.cache_sz = data*coredynp.globalCheckpoint; +// interface_ip.assoc = 1; +// interface_ip.nbanks = 1; +// interface_ip.out_w = out_w*8; +// interface_ip.access_mode = 0; +// interface_ip.throughput = 1.0/clockRate; +// interface_ip.latency = 1.0/clockRate; +// interface_ip.obj_func_dyn_energy = 0; +// interface_ip.obj_func_dyn_power = 0; +// interface_ip.obj_func_leak_power = 0; +// interface_ip.obj_func_cycle_t = 1; +// interface_ip.num_rw_ports = 1;//the extra one port is for GCs +// interface_ip.num_rd_ports = 2*coredynp.decodeW; +// interface_ip.num_wr_ports = coredynp.decodeW; +// interface_ip.num_se_rd_ports = 0; +// iFRAT = new ArrayST(&interface_ip, "Int FrontRAT"); +// iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); +// area.set_area(area.get_area()+ iFRAT->area.get_area()); + + //FRAT floating point + data = 
int(ceil(coredynp.phy_freg_width*(1+coredynp.globalCheckpoint)/8.0)); + out_w = int(ceil(coredynp.phy_freg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].archi_Regs_FRF_size; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//the extra one port is for GCs + interface_ip.num_rd_ports = 2*coredynp.fp_decodeW; + interface_ip.num_wr_ports = coredynp.fp_decodeW; + interface_ip.num_se_rd_ports = 0; + fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ fFRAT->area.get_area()); + + } + else if ((coredynp.rm_ty ==CAMbased)) + { + //FRAT + tag = coredynp.arch_ireg_width; + data = int(ceil ((coredynp.arch_ireg_width+1*coredynp.globalCheckpoint)/8.0));//the address of CAM needed to be sent out + out_w = int(ceil (coredynp.arch_ireg_width/8.0)); + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].phy_Regs_IRF_size; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + 
interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//for GCs + interface_ip.num_rd_ports = coredynp.decodeW; + interface_ip.num_wr_ports = coredynp.decodeW; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports= 2*coredynp.decodeW; + iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ iFRAT->area.get_area()); + + //FRAT for FP + tag = coredynp.arch_freg_width; + data = int(ceil ((coredynp.arch_freg_width+1*coredynp.globalCheckpoint)/8.0));//the address of CAM needed to be sent out + out_w = int(ceil (coredynp.arch_freg_width/8.0)); + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].phy_Regs_FRF_size; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//for GCs + interface_ip.num_rd_ports = coredynp.fp_decodeW; + interface_ip.num_wr_ports = coredynp.fp_decodeW; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports= 2*coredynp.fp_decodeW; + fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ fFRAT->area.get_area()); + + } + + //RRAT is always RAM based, does not 
have GCs, and is used only for record latest non-speculative mapping + data = int(ceil(coredynp.phy_ireg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].archi_Regs_IRF_size*2;//HACK to make it as least 64B + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = XML->sys.core[ithCore].commit_width; + interface_ip.num_wr_ports = XML->sys.core[ithCore].commit_width; + interface_ip.num_se_rd_ports = 0; + iRRAT = new ArrayST(&interface_ip, "Int RetireRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + iRRAT->area.set_area(iRRAT->area.get_area()+ iRRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ iRRAT->area.get_area()); + + //RRAT for FP + data = int(ceil(coredynp.phy_freg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].archi_Regs_FRF_size*2;//HACK to make it as least 64B + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = coredynp.fp_decodeW; + interface_ip.num_wr_ports 
= coredynp.fp_decodeW; + interface_ip.num_se_rd_ports = 0; + fRRAT = new ArrayST(&interface_ip, "Int RetireRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + fRRAT->area.set_area(fRRAT->area.get_area()+ fRRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ fRRAT->area.get_area()); + + //Freelist of renaming unit always RAM based + //Recycle happens at two places: 1)when DCL check there are WAW, the Phyregisters/ROB directly recycles into freelist + // 2)When instruction commits the Phyregisters/ROB needed to be recycled. + //therefore num_wr port = decode-1(-1 means at least one phy reg will be used for the current renaming group) + commit width + data = int(ceil(coredynp.phy_ireg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*coredynp.num_ifreelist_entries; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//TODO + interface_ip.num_rd_ports = coredynp.decodeW; + interface_ip.num_wr_ports = coredynp.decodeW -1 + XML->sys.core[ithCore].commit_width; + //every cycle, (coredynp.decodeW -1) inst may need to send back it dest tags, committW insts needs to update freelist buffers + interface_ip.num_se_rd_ports = 0; + ifreeL = new ArrayST(&interface_ip, "Int Free List", Core_device, coredynp.opt_local, coredynp.core_ty); + ifreeL->area.set_area(ifreeL->area.get_area()+ ifreeL->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ ifreeL->area.get_area()); + + //freelist for FP + data = 
int(ceil(coredynp.phy_freg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*coredynp.num_ffreelist_entries; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = coredynp.fp_decodeW; + interface_ip.num_wr_ports = coredynp.fp_decodeW -1 + XML->sys.core[ithCore].commit_width; + interface_ip.num_se_rd_ports = 0; + ffreeL = new ArrayST(&interface_ip, "Int Free List", Core_device, coredynp.opt_local, coredynp.core_ty); + ffreeL->area.set_area(ffreeL->area.get_area()+ ffreeL->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ ffreeL->area.get_area()); + + idcl = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_ireg_width);//TODO:Separate 2 sections See TR + fdcl = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_freg_width); + + } + else if (coredynp.scheu_ty==ReservationStation){ + if (coredynp.rm_ty ==RAMbased){ + /* + * however, RAT needs to do associative search in RAT, when instruction commits and ROB release the entry, + * to make sure all the renamings associated with the ROB to be released are updated to ARF at the same time. + * RAM based RAT for RS base OOO does not save the search operations. Its advantage is to have less entries than + * CAM based RAT so that it is more scalable as number of ROB/physical regs increases. 
+ */ + tag = coredynp.phy_ireg_width; + data = int(ceil(coredynp.phy_ireg_width*(1+coredynp.globalCheckpoint)/8.0)); + out_w = int(ceil(coredynp.phy_ireg_width/8.0)); + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].archi_Regs_IRF_size; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//the extra one port is for GCs + interface_ip.num_rd_ports = 2*coredynp.decodeW; + interface_ip.num_wr_ports = coredynp.decodeW; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports= coredynp.commitW;//TODO + iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + iFRAT->local_result.adjust_area(); + iFRAT->area.set_area(iFRAT->area.get_area()+ iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ iFRAT->area.get_area()); + + //FP + tag = coredynp.phy_freg_width; + data = int(ceil(coredynp.phy_freg_width*(1+coredynp.globalCheckpoint)/8.0)); + out_w = int(ceil(coredynp.phy_freg_width/8.0)); + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].archi_Regs_FRF_size; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + 
interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//the extra one port is for GCs + interface_ip.num_rd_ports = 2*coredynp.fp_decodeW; + interface_ip.num_wr_ports = coredynp.fp_decodeW; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports= coredynp.fp_decodeW;//actually is fp commit width + fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + fFRAT->local_result.adjust_area(); + fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ fFRAT->area.get_area()); + + } + else if ((coredynp.rm_ty ==CAMbased)) + { + //FRAT + tag = coredynp.arch_ireg_width; + data = int(ceil (coredynp.arch_ireg_width+1*coredynp.globalCheckpoint/8.0));//the address of CAM needed to be sent out + out_w = int(ceil (coredynp.arch_ireg_width/8.0)); + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].phy_Regs_IRF_size; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//for GCs + interface_ip.num_rd_ports = XML->sys.core[ithCore].decode_width;//0;TODO + interface_ip.num_wr_ports = XML->sys.core[ithCore].decode_width; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports= 2*XML->sys.core[ithCore].decode_width; + iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + iFRAT->area.set_area(iFRAT->area.get_area()+ 
iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ iFRAT->area.get_area()); + + //FRAT + tag = coredynp.arch_freg_width; + data = int(ceil (coredynp.arch_freg_width+1*coredynp.globalCheckpoint/8.0));//the address of CAM needed to be sent out + out_w = int(ceil (coredynp.arch_freg_width/8.0)); + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*XML->sys.core[ithCore].phy_Regs_FRF_size; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = out_w*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.access_mode = 2; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//for GCs + interface_ip.num_rd_ports = XML->sys.core[ithCore].decode_width;//0;TODO; + interface_ip.num_wr_ports = coredynp.fp_decodeW; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports= 2*coredynp.fp_decodeW; + fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device, coredynp.opt_local, coredynp.core_ty); + fFRAT->area.set_area(fFRAT->area.get_area()+ fFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ fFRAT->area.get_area()); + + } + //No RRAT for RS based OOO + //Freelist of renaming unit of RS based OOO is unifed for both int and fp renaming unit since the ROB is unified + data = int(ceil(coredynp.phy_ireg_width/8.0)); + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.line_sz = data; + interface_ip.cache_sz = data*coredynp.num_ifreelist_entries; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = 
interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/clockRate; + interface_ip.latency = 1.0/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//TODO + interface_ip.num_rd_ports = XML->sys.core[ithCore].decode_width; + interface_ip.num_wr_ports = XML->sys.core[ithCore].decode_width -1 + XML->sys.core[ithCore].commit_width; + interface_ip.num_se_rd_ports = 0; + ifreeL = new ArrayST(&interface_ip, "Unified Free List", Core_device, coredynp.opt_local, coredynp.core_ty); + ifreeL->area.set_area(ifreeL->area.get_area()+ ifreeL->local_result.area*XML->sys.core[ithCore].number_hardware_threads); + area.set_area(area.get_area()+ ifreeL->area.get_area()); + + idcl = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_ireg_width);//TODO:Separate 2 sections See TR + fdcl = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_freg_width); + } + +} + if (coredynp.core_ty==Inorder&& coredynp.issueW>1) + { + /* Dependency check logic will only present when decode(issue) width>1. + * Multiple issue in order processor can do without renaming, but dcl is a must. + */ + idcl = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_ireg_width);//TODO:Separate 2 sections See TR + fdcl = new dep_resource_conflict_check(&interface_ip,coredynp,coredynp.phy_freg_width); + } +} + +Core::Core(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + ifu (0), + lsu (0), + mmu (0), + exu (0), + rnu (0), + corepipe (0), + undiffCore (0), + l2cache (0) +{ + /* + * initialize, compute and optimize individual components. 
+ */ + + double pipeline_area_per_unit; + if (XML->sys.Private_L2) + { + l2cache = new SharedCache(XML,ithCore, &interface_ip); + + } +// interface_ip.wire_is_mat_type = 2; +// interface_ip.wire_os_mat_type = 2; +// interface_ip.wt =Global_30; + set_core_param(); + clockRate = coredynp.clockRate; + executionTime = coredynp.executionTime; + ifu = new InstFetchU(XML, ithCore, &interface_ip,coredynp); + lsu = new LoadStoreU(XML, ithCore, &interface_ip,coredynp); + mmu = new MemManU (XML, ithCore, &interface_ip,coredynp); + exu = new EXECU (XML, ithCore, &interface_ip,lsu->lsq_height, coredynp); + undiffCore = new UndiffCore(XML, ithCore, &interface_ip,coredynp); + if (coredynp.core_ty==OOO) + { + rnu = new RENAMINGU(XML, ithCore, &interface_ip,coredynp); + } + corepipe = new Pipeline(&interface_ip,coredynp); + + if (coredynp.core_ty==OOO) + { + pipeline_area_per_unit = (corepipe->area.get_area()*coredynp.num_pipelines)/5.0; + if (rnu->exist) + { + rnu->area.set_area(rnu->area.get_area() + pipeline_area_per_unit); + } + } + else { + pipeline_area_per_unit = (corepipe->area.get_area()*coredynp.num_pipelines)/4.0; + } + + //area.set_area(area.get_area()+ corepipe->area.get_area()); + if (ifu->exist) + { + ifu->area.set_area(ifu->area.get_area() + pipeline_area_per_unit); + area.set_area(area.get_area() + ifu->area.get_area()); + } + if (lsu->exist) + { + lsu->area.set_area(lsu->area.get_area() + pipeline_area_per_unit); + area.set_area(area.get_area() + lsu->area.get_area()); + } + if (exu->exist) + { + exu->area.set_area(exu->area.get_area() + pipeline_area_per_unit); + area.set_area(area.get_area()+exu->area.get_area()); + } + if (mmu->exist) + { + mmu->area.set_area(mmu->area.get_area() + pipeline_area_per_unit); + area.set_area(area.get_area()+mmu->area.get_area()); + } + + if (coredynp.core_ty==OOO) + { + if (rnu->exist) + { + + area.set_area(area.get_area() + rnu->area.get_area()); + } + } + + if (undiffCore->exist) + { + area.set_area(area.get_area() + 
undiffCore->area.get_area()); + } + + if (XML->sys.Private_L2) + { + area.set_area(area.get_area() + l2cache->area.get_area()); + + } +// //clock power +// clockNetwork.init_wire_external(is_default, &interface_ip); +// clockNetwork.clk_area =area*1.1;//10% of placement overhead. rule of thumb +// clockNetwork.end_wiring_level =5;//toplevel metal +// clockNetwork.start_wiring_level =5;//toplevel metal +// clockNetwork.num_regs = corepipe.tot_stage_vector; +// clockNetwork.optimize_wire(); +} + + +void BranchPredictor::computeEnergy(bool is_tdp) +{ + if (!exist) return; + double r_access; + double w_access; + if (is_tdp) + { + r_access = coredynp.predictionW*coredynp.BR_duty_cycle; + w_access = 0*coredynp.BR_duty_cycle; + globalBPT->stats_t.readAc.access = r_access; + globalBPT->stats_t.writeAc.access = w_access; + globalBPT->tdp_stats = globalBPT->stats_t; + + L1_localBPT->stats_t.readAc.access = r_access; + L1_localBPT->stats_t.writeAc.access = w_access; + L1_localBPT->tdp_stats = L1_localBPT->stats_t; + + L2_localBPT->stats_t.readAc.access = r_access; + L2_localBPT->stats_t.writeAc.access = w_access; + L2_localBPT->tdp_stats = L2_localBPT->stats_t; + + chooser->stats_t.readAc.access = r_access; + chooser->stats_t.writeAc.access = w_access; + chooser->tdp_stats = chooser->stats_t; + + RAS->stats_t.readAc.access = r_access; + RAS->stats_t.writeAc.access = w_access; + RAS->tdp_stats = RAS->stats_t; + } + else + { + //The resolution of BPT accesses is coarse, but this is + //because most simulators cannot track finer grained details + r_access = XML->sys.core[ithCore].branch_instructions; + w_access = XML->sys.core[ithCore].branch_mispredictions + 0.1*XML->sys.core[ithCore].branch_instructions;//10% of BR will flip internal bits//0 + globalBPT->stats_t.readAc.access = r_access; + globalBPT->stats_t.writeAc.access = w_access; + globalBPT->rtp_stats = globalBPT->stats_t; + + L1_localBPT->stats_t.readAc.access = r_access; + L1_localBPT->stats_t.writeAc.access = 
w_access; + L1_localBPT->rtp_stats = L1_localBPT->stats_t; + + L2_localBPT->stats_t.readAc.access = r_access; + L2_localBPT->stats_t.writeAc.access = w_access; + L2_localBPT->rtp_stats = L2_localBPT->stats_t; + + chooser->stats_t.readAc.access = r_access; + chooser->stats_t.writeAc.access = w_access; + chooser->rtp_stats = chooser->stats_t; + + RAS->stats_t.readAc.access = XML->sys.core[ithCore].function_calls; + RAS->stats_t.writeAc.access = XML->sys.core[ithCore].function_calls; + RAS->rtp_stats = RAS->stats_t; + } + + globalBPT->power_t.reset(); + L1_localBPT->power_t.reset(); + L2_localBPT->power_t.reset(); + chooser->power_t.reset(); + RAS->power_t.reset(); + + globalBPT->power_t.readOp.dynamic += globalBPT->local_result.power.readOp.dynamic*globalBPT->stats_t.readAc.access + + globalBPT->stats_t.writeAc.access*globalBPT->local_result.power.writeOp.dynamic; + L1_localBPT->power_t.readOp.dynamic += L1_localBPT->local_result.power.readOp.dynamic*L1_localBPT->stats_t.readAc.access + + L1_localBPT->stats_t.writeAc.access*L1_localBPT->local_result.power.writeOp.dynamic; + + L2_localBPT->power_t.readOp.dynamic += L2_localBPT->local_result.power.readOp.dynamic*L2_localBPT->stats_t.readAc.access + + L2_localBPT->stats_t.writeAc.access*L2_localBPT->local_result.power.writeOp.dynamic; + + chooser->power_t.readOp.dynamic += chooser->local_result.power.readOp.dynamic*chooser->stats_t.readAc.access + + chooser->stats_t.writeAc.access*chooser->local_result.power.writeOp.dynamic; + RAS->power_t.readOp.dynamic += RAS->local_result.power.readOp.dynamic*RAS->stats_t.readAc.access + + RAS->stats_t.writeAc.access*RAS->local_result.power.writeOp.dynamic; + + if (is_tdp) + { + globalBPT->power = globalBPT->power_t + globalBPT->local_result.power*pppm_lkg; + L1_localBPT->power = L1_localBPT->power_t + L1_localBPT->local_result.power*pppm_lkg; + L2_localBPT->power = L2_localBPT->power_t + L2_localBPT->local_result.power*pppm_lkg; + chooser->power = chooser->power_t + 
chooser->local_result.power*pppm_lkg; + RAS->power = RAS->power_t + RAS->local_result.power*coredynp.pppm_lkg_multhread; + + power = power + globalBPT->power + L1_localBPT->power + chooser->power + RAS->power; + } + else + { + globalBPT->rt_power = globalBPT->power_t + globalBPT->local_result.power*pppm_lkg; + L1_localBPT->rt_power = L1_localBPT->power_t + L1_localBPT->local_result.power*pppm_lkg; + L2_localBPT->rt_power = L2_localBPT->power_t + L2_localBPT->local_result.power*pppm_lkg; + chooser->rt_power = chooser->power_t + chooser->local_result.power*pppm_lkg; + RAS->rt_power = RAS->power_t + RAS->local_result.power*coredynp.pppm_lkg_multhread; + rt_power = rt_power + globalBPT->rt_power + L1_localBPT->rt_power + chooser->rt_power + RAS->rt_power; + } +} + +void BranchPredictor::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + if (is_tdp) + { + cout << indent_str<< "Global Predictor:" << endl; + cout << indent_str_next << "Area = " << globalBPT->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << globalBPT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
globalBPT->power.readOp.longer_channel_leakage:globalBPT->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << globalBPT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << globalBPT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str << "Local Predictor:" << endl; + cout << indent_str << "L1_Local Predictor:" << endl; + cout << indent_str_next << "Area = " << L1_localBPT->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << L1_localBPT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? L1_localBPT->power.readOp.longer_channel_leakage:L1_localBPT->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << L1_localBPT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << L1_localBPT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str << "L2_Local Predictor:" << endl; + cout << indent_str_next << "Area = " << L2_localBPT->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << L2_localBPT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
L2_localBPT->power.readOp.longer_channel_leakage:L2_localBPT->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << L2_localBPT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << L2_localBPT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + + cout << indent_str << "Chooser:" << endl; + cout << indent_str_next << "Area = " << chooser->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << chooser->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? chooser->power.readOp.longer_channel_leakage:chooser->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << chooser->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << chooser->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str << "RAS:" << endl; + cout << indent_str_next << "Area = " << RAS->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << RAS->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
RAS->power.readOp.longer_channel_leakage:RAS->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << RAS->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << RAS->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else + { +// cout << indent_str_next << "Global Predictor Peak Dynamic = " << globalBPT->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Global Predictor Subthreshold Leakage = " << globalBPT->rt_power.readOp.leakage <<" W" << endl; +// cout << indent_str_next << "Global Predictor Gate Leakage = " << globalBPT->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Local Predictor Peak Dynamic = " << L1_localBPT->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Local Predictor Subthreshold Leakage = " << L1_localBPT->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Local Predictor Gate Leakage = " << L1_localBPT->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Chooser Peak Dynamic = " << chooser->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Chooser Subthreshold Leakage = " << chooser->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Chooser Gate Leakage = " << chooser->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "RAS Peak Dynamic = " << RAS->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "RAS Subthreshold Leakage = " << RAS->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "RAS Gate Leakage = " << RAS->rt_power.readOp.gate_leakage << " W" << endl; + } + +} + +void InstFetchU::computeEnergy(bool is_tdp) +{ + if (!exist) return; + if (is_tdp) + { + //init stats for Peak + icache.caches->stats_t.readAc.access = 
icache.caches->l_ip.num_rw_ports*coredynp.IFU_duty_cycle; + icache.caches->stats_t.readAc.miss = 0; + icache.caches->stats_t.readAc.hit = icache.caches->stats_t.readAc.access - icache.caches->stats_t.readAc.miss; + icache.caches->tdp_stats = icache.caches->stats_t; + + icache.missb->stats_t.readAc.access = icache.missb->stats_t.readAc.hit= icache.missb->l_ip.num_search_ports; + icache.missb->stats_t.writeAc.access = icache.missb->stats_t.writeAc.hit= icache.missb->l_ip.num_search_ports; + icache.missb->tdp_stats = icache.missb->stats_t; + + icache.ifb->stats_t.readAc.access = icache.ifb->stats_t.readAc.hit= icache.ifb->l_ip.num_search_ports; + icache.ifb->stats_t.writeAc.access = icache.ifb->stats_t.writeAc.hit= icache.ifb->l_ip.num_search_ports; + icache.ifb->tdp_stats = icache.ifb->stats_t; + + icache.prefetchb->stats_t.readAc.access = icache.prefetchb->stats_t.readAc.hit= icache.prefetchb->l_ip.num_search_ports; + icache.prefetchb->stats_t.writeAc.access = icache.ifb->stats_t.writeAc.hit= icache.ifb->l_ip.num_search_ports; + icache.prefetchb->tdp_stats = icache.prefetchb->stats_t; + + IB->stats_t.readAc.access = IB->stats_t.writeAc.access = XML->sys.core[ithCore].peak_issue_width; + IB->tdp_stats = IB->stats_t; + + if (coredynp.predictionW>0) + { + BTB->stats_t.readAc.access = coredynp.predictionW;//XML->sys.core[ithCore].BTB.read_accesses; + BTB->stats_t.writeAc.access = 0;//XML->sys.core[ithCore].BTB.write_accesses; + } + + ID_inst->stats_t.readAc.access = coredynp.decodeW; + ID_operand->stats_t.readAc.access = coredynp.decodeW; + ID_misc->stats_t.readAc.access = coredynp.decodeW; + ID_inst->tdp_stats = ID_inst->stats_t; + ID_operand->tdp_stats = ID_operand->stats_t; + ID_misc->tdp_stats = ID_misc->stats_t; + + + } + else + { + //init stats for Runtime Dynamic (RTP) + icache.caches->stats_t.readAc.access = XML->sys.core[ithCore].icache.read_accesses; + icache.caches->stats_t.readAc.miss = XML->sys.core[ithCore].icache.read_misses; + 
icache.caches->stats_t.readAc.hit = icache.caches->stats_t.readAc.access - icache.caches->stats_t.readAc.miss; + icache.caches->rtp_stats = icache.caches->stats_t; + + icache.missb->stats_t.readAc.access = icache.caches->stats_t.readAc.miss; + icache.missb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss; + icache.missb->rtp_stats = icache.missb->stats_t; + + icache.ifb->stats_t.readAc.access = icache.caches->stats_t.readAc.miss; + icache.ifb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss; + icache.ifb->rtp_stats = icache.ifb->stats_t; + + icache.prefetchb->stats_t.readAc.access = icache.caches->stats_t.readAc.miss; + icache.prefetchb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss; + icache.prefetchb->rtp_stats = icache.prefetchb->stats_t; + + IB->stats_t.readAc.access = IB->stats_t.writeAc.access = XML->sys.core[ithCore].total_instructions; + IB->rtp_stats = IB->stats_t; + + if (coredynp.predictionW>0) + { + BTB->stats_t.readAc.access = XML->sys.core[ithCore].BTB.read_accesses;//XML->sys.core[ithCore].branch_instructions; + BTB->stats_t.writeAc.access = XML->sys.core[ithCore].BTB.write_accesses;//XML->sys.core[ithCore].branch_mispredictions; + BTB->rtp_stats = BTB->stats_t; + } + + ID_inst->stats_t.readAc.access = XML->sys.core[ithCore].total_instructions; + ID_operand->stats_t.readAc.access = XML->sys.core[ithCore].total_instructions; + ID_misc->stats_t.readAc.access = XML->sys.core[ithCore].total_instructions; + ID_inst->rtp_stats = ID_inst->stats_t; + ID_operand->rtp_stats = ID_operand->stats_t; + ID_misc->rtp_stats = ID_misc->stats_t; + + } + + icache.power_t.reset(); + IB->power_t.reset(); +// ID_inst->power_t.reset(); +// ID_operand->power_t.reset(); +// ID_misc->power_t.reset(); + if (coredynp.predictionW>0) + { + BTB->power_t.reset(); + } + + icache.power_t.readOp.dynamic += (icache.caches->stats_t.readAc.hit*icache.caches->local_result.power.readOp.dynamic+ + 
//icache.caches->stats_t.readAc.miss*icache.caches->local_result.tag_array2->power.readOp.dynamic+ + icache.caches->stats_t.readAc.miss*icache.caches->local_result.power.readOp.dynamic+ //assume tag data accessed in parallel + icache.caches->stats_t.readAc.miss*icache.caches->local_result.power.writeOp.dynamic); //read miss in Icache cause a write to Icache + icache.power_t.readOp.dynamic += icache.missb->stats_t.readAc.access*icache.missb->local_result.power.searchOp.dynamic + + icache.missb->stats_t.writeAc.access*icache.missb->local_result.power.writeOp.dynamic;//each access to missb involves a CAM and a write + icache.power_t.readOp.dynamic += icache.ifb->stats_t.readAc.access*icache.ifb->local_result.power.searchOp.dynamic + + icache.ifb->stats_t.writeAc.access*icache.ifb->local_result.power.writeOp.dynamic; + icache.power_t.readOp.dynamic += icache.prefetchb->stats_t.readAc.access*icache.prefetchb->local_result.power.searchOp.dynamic + + icache.prefetchb->stats_t.writeAc.access*icache.prefetchb->local_result.power.writeOp.dynamic; + + IB->power_t.readOp.dynamic += IB->local_result.power.readOp.dynamic*IB->stats_t.readAc.access + + IB->stats_t.writeAc.access*IB->local_result.power.writeOp.dynamic; + + if (coredynp.predictionW>0) + { + BTB->power_t.readOp.dynamic += BTB->local_result.power.readOp.dynamic*BTB->stats_t.readAc.access + + BTB->stats_t.writeAc.access*BTB->local_result.power.writeOp.dynamic; + + BPT->computeEnergy(is_tdp); + } + + if (is_tdp) + { +// icache.power = icache.power_t + +// (icache.caches->local_result.power)*pppm_lkg + +// (icache.missb->local_result.power + +// icache.ifb->local_result.power + +// icache.prefetchb->local_result.power)*pppm_Isub; + icache.power = icache.power_t + + (icache.caches->local_result.power + + icache.missb->local_result.power + + icache.ifb->local_result.power + + icache.prefetchb->local_result.power)*pppm_lkg; + + IB->power = IB->power_t + IB->local_result.power*pppm_lkg; + power = power + icache.power + 
IB->power; + if (coredynp.predictionW>0) + { + BTB->power = BTB->power_t + BTB->local_result.power*pppm_lkg; + power = power + BTB->power + BPT->power; + } + + ID_inst->power_t.readOp.dynamic = ID_inst->power.readOp.dynamic; + ID_operand->power_t.readOp.dynamic = ID_operand->power.readOp.dynamic; + ID_misc->power_t.readOp.dynamic = ID_misc->power.readOp.dynamic; + + ID_inst->power.readOp.dynamic *= ID_inst->tdp_stats.readAc.access; + ID_operand->power.readOp.dynamic *= ID_operand->tdp_stats.readAc.access; + ID_misc->power.readOp.dynamic *= ID_misc->tdp_stats.readAc.access; + + power = power + (ID_inst->power + + ID_operand->power + + ID_misc->power); + } + else + { +// icache.rt_power = icache.power_t + +// (icache.caches->local_result.power)*pppm_lkg + +// (icache.missb->local_result.power + +// icache.ifb->local_result.power + +// icache.prefetchb->local_result.power)*pppm_Isub; + + icache.rt_power = icache.power_t + + (icache.caches->local_result.power + + icache.missb->local_result.power + + icache.ifb->local_result.power + + icache.prefetchb->local_result.power)*pppm_lkg; + + IB->rt_power = IB->power_t + IB->local_result.power*pppm_lkg; + rt_power = rt_power + icache.rt_power + IB->rt_power; + if (coredynp.predictionW>0) + { + BTB->rt_power = BTB->power_t + BTB->local_result.power*pppm_lkg; + rt_power = rt_power + BTB->rt_power + BPT->rt_power; + } + + ID_inst->rt_power.readOp.dynamic = ID_inst->power_t.readOp.dynamic*ID_inst->rtp_stats.readAc.access; + ID_operand->rt_power.readOp.dynamic = ID_operand->power_t.readOp.dynamic * ID_operand->rtp_stats.readAc.access; + ID_misc->rt_power.readOp.dynamic = ID_misc->power_t.readOp.dynamic * ID_misc->rtp_stats.readAc.access; + + rt_power = rt_power + (ID_inst->rt_power + + ID_operand->rt_power + + ID_misc->rt_power); + } +} + +void InstFetchU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = 
XML->sys.longer_channel_device; + + + if (is_tdp) + { + + cout << indent_str<< "Instruction Cache:" << endl; + cout << indent_str_next << "Area = " << icache.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << icache.power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? icache.power.readOp.longer_channel_leakage:icache.power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << icache.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << icache.rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (coredynp.predictionW>0) + { + cout << indent_str<< "Branch Target Buffer:" << endl; + cout << indent_str_next << "Area = " << BTB->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << BTB->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? BTB->power.readOp.longer_channel_leakage:BTB->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << BTB->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << BTB->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (BPT->exist) + { + cout << indent_str<< "Branch Predictor:" << endl; + cout << indent_str_next << "Area = " << BPT->area.get_area() *1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << BPT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
BPT->power.readOp.longer_channel_leakage:BPT->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << BPT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << BPT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel>3) + { + BPT->displayEnergy(indent+4, plevel, is_tdp); + } + } + } + cout << indent_str<< "Instruction Buffer:" << endl; + cout << indent_str_next << "Area = " << IB->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << IB->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? IB->power.readOp.longer_channel_leakage:IB->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << IB->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << IB->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< "Instruction Decoder:" << endl; + cout << indent_str_next << "Area = " << (ID_inst->area.get_area() + + ID_operand->area.get_area() + + ID_misc->area.get_area())*coredynp.decodeW*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << (ID_inst->power.readOp.dynamic + + ID_operand->power.readOp.dynamic + + ID_misc->power.readOp.dynamic)*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
(ID_inst->power.readOp.longer_channel_leakage + + ID_operand->power.readOp.longer_channel_leakage + + ID_misc->power.readOp.longer_channel_leakage): + (ID_inst->power.readOp.leakage + + ID_operand->power.readOp.leakage + + ID_misc->power.readOp.leakage)) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << (ID_inst->power.readOp.gate_leakage + + ID_operand->power.readOp.gate_leakage + + ID_misc->power.readOp.gate_leakage) << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << (ID_inst->rt_power.readOp.dynamic + + ID_operand->rt_power.readOp.dynamic + + ID_misc->rt_power.readOp.dynamic)/executionTime << " W" << endl; + cout <<endl; + } + else + { +// cout << indent_str_next << "Instruction Cache Peak Dynamic = " << icache.rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Instruction Cache Subthreshold Leakage = " << icache.rt_power.readOp.leakage <<" W" << endl; +// cout << indent_str_next << "Instruction Cache Gate Leakage = " << icache.rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Instruction Buffer Peak Dynamic = " << IB->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Instruction Buffer Subthreshold Leakage = " << IB->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Instruction Buffer Gate Leakage = " << IB->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Branch Target Buffer Peak Dynamic = " << BTB->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Branch Target Buffer Subthreshold Leakage = " << BTB->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Branch Target Buffer Gate Leakage = " << BTB->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Branch Predictor Peak Dynamic = " << BPT->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Branch Predictor Subthreshold 
Leakage = " << BPT->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Branch Predictor Gate Leakage = " << BPT->rt_power.readOp.gate_leakage << " W" << endl; + } + +} + +void RENAMINGU::computeEnergy(bool is_tdp) +{ + if (!exist) return; + double pppm_t[4] = {1,1,1,1}; + if (is_tdp) + {//init stats for Peak + if (coredynp.core_ty==OOO){ + if (coredynp.scheu_ty==PhysicalRegFile) + { + if (coredynp.rm_ty ==RAMbased) + { + iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_rd_ports; + iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports; + iFRAT->tdp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_rd_ports; + fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports; + fFRAT->tdp_stats = fFRAT->stats_t; + + } + else if ((coredynp.rm_ty ==CAMbased)) + { + iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_search_ports; + iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports; + iFRAT->tdp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_search_ports; + fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports; + fFRAT->tdp_stats = fFRAT->stats_t; + } + + iRRAT->stats_t.readAc.access = iRRAT->l_ip.num_rd_ports; + iRRAT->stats_t.writeAc.access = iRRAT->l_ip.num_wr_ports; + iRRAT->tdp_stats = iRRAT->stats_t; + + fRRAT->stats_t.readAc.access = fRRAT->l_ip.num_rd_ports; + fRRAT->stats_t.writeAc.access = fRRAT->l_ip.num_wr_ports; + fRRAT->tdp_stats = fRRAT->stats_t; + + ifreeL->stats_t.readAc.access = coredynp.decodeW;//ifreeL->l_ip.num_rd_ports;; + ifreeL->stats_t.writeAc.access = coredynp.decodeW;//ifreeL->l_ip.num_wr_ports; + ifreeL->tdp_stats = ifreeL->stats_t; + + ffreeL->stats_t.readAc.access = coredynp.decodeW;//ffreeL->l_ip.num_rd_ports; + ffreeL->stats_t.writeAc.access = coredynp.decodeW;//ffreeL->l_ip.num_wr_ports; + ffreeL->tdp_stats = ffreeL->stats_t; + } + else if (coredynp.scheu_ty==ReservationStation){ + if (coredynp.rm_ty ==RAMbased) + { + iFRAT->stats_t.readAc.access = 
iFRAT->l_ip.num_rd_ports; + iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports; + iFRAT->stats_t.searchAc.access = iFRAT->l_ip.num_search_ports; + iFRAT->tdp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_rd_ports; + fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports; + fFRAT->stats_t.searchAc.access = fFRAT->l_ip.num_search_ports; + fFRAT->tdp_stats = fFRAT->stats_t; + + } + else if ((coredynp.rm_ty ==CAMbased)) + { + iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_search_ports; + iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports; + iFRAT->tdp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_search_ports; + fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports; + fFRAT->tdp_stats = fFRAT->stats_t; + } + //Unified free list for both int and fp + ifreeL->stats_t.readAc.access = coredynp.decodeW;//ifreeL->l_ip.num_rd_ports; + ifreeL->stats_t.writeAc.access = coredynp.decodeW;//ifreeL->l_ip.num_wr_ports; + ifreeL->tdp_stats = ifreeL->stats_t; + } + idcl->stats_t.readAc.access = coredynp.decodeW; + fdcl->stats_t.readAc.access = coredynp.decodeW; + idcl->tdp_stats = idcl->stats_t; + fdcl->tdp_stats = fdcl->stats_t; + } + else + { + if (coredynp.issueW>1) + { + idcl->stats_t.readAc.access = coredynp.decodeW; + fdcl->stats_t.readAc.access = coredynp.decodeW; + idcl->tdp_stats = idcl->stats_t; + fdcl->tdp_stats = fdcl->stats_t; + } + } + + } + else + {//init stats for Runtime Dynamic (RTP) + if (coredynp.core_ty==OOO){ + if (coredynp.scheu_ty==PhysicalRegFile) + { + if (coredynp.rm_ty ==RAMbased) + { + iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads; + iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes; + iFRAT->rtp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads; + fFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].fp_rename_writes; + fFRAT->rtp_stats = fFRAT->stats_t; + } + else if ((coredynp.rm_ty 
==CAMbased)) + { + iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads; + iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes; + iFRAT->rtp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads; + fFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].fp_rename_writes; + fFRAT->rtp_stats = fFRAT->stats_t; + } + + iRRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_writes;//Hack, should be (context switch + branch mispredictions)*16 + iRRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes; + iRRAT->rtp_stats = iRRAT->stats_t; + + fRRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_writes;//Hack, should be (context switch + branch mispredictions)*16 + fRRAT->stats_t.writeAc.access = XML->sys.core[ithCore].fp_rename_writes; + fRRAT->rtp_stats = fRRAT->stats_t; + + ifreeL->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads; + ifreeL->stats_t.writeAc.access = 2*XML->sys.core[ithCore].rename_writes; + ifreeL->rtp_stats = ifreeL->stats_t; + + ffreeL->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads; + ffreeL->stats_t.writeAc.access = 2*XML->sys.core[ithCore].fp_rename_writes; + ffreeL->rtp_stats = ffreeL->stats_t; + } + else if (coredynp.scheu_ty==ReservationStation){ + if (coredynp.rm_ty ==RAMbased) + { + iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads; + iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes; + iFRAT->stats_t.searchAc.access = XML->sys.core[ithCore].committed_int_instructions;//hack: not all committed instructions use regs. 
+ iFRAT->rtp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads; + fFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].fp_rename_writes; + fFRAT->stats_t.searchAc.access = XML->sys.core[ithCore].committed_fp_instructions; + fFRAT->rtp_stats = fFRAT->stats_t; + } + else if ((coredynp.rm_ty ==CAMbased)) + { + iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads; + iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes; + iFRAT->rtp_stats = iFRAT->stats_t; + + fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads; + fFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].fp_rename_writes; + fFRAT->rtp_stats = fFRAT->stats_t; + } + //Unified free list for both int and fp since the ROB act as physcial registers + ifreeL->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads + + XML->sys.core[ithCore].fp_rename_reads; + ifreeL->stats_t.writeAc.access = 2*(XML->sys.core[ithCore].rename_writes + + XML->sys.core[ithCore].fp_rename_writes);//HACK: 2-> since some of renaming in the same group + //are terminated early + ifreeL->rtp_stats = ifreeL->stats_t; + } + idcl->stats_t.readAc.access = 3*coredynp.decodeW*coredynp.decodeW*XML->sys.core[ithCore].rename_reads; + fdcl->stats_t.readAc.access = 3*coredynp.fp_issueW*coredynp.fp_issueW*XML->sys.core[ithCore].fp_rename_writes; + idcl->rtp_stats = idcl->stats_t; + fdcl->rtp_stats = fdcl->stats_t; + } + else + { + if (coredynp.issueW>1) + { + idcl->stats_t.readAc.access = 2*XML->sys.core[ithCore].int_instructions; + fdcl->stats_t.readAc.access = XML->sys.core[ithCore].fp_instructions; + idcl->rtp_stats = idcl->stats_t; + fdcl->rtp_stats = fdcl->stats_t; + } + } + + } + /* Compute engine */ + if (coredynp.core_ty==OOO) + { + if (coredynp.scheu_ty==PhysicalRegFile) + { + if (coredynp.rm_ty ==RAMbased) + { + iFRAT->power_t.reset(); + fFRAT->power_t.reset(); + + iFRAT->power_t.readOp.dynamic += (iFRAT->stats_t.readAc.access + 
*(iFRAT->local_result.power.readOp.dynamic + idcl->power.readOp.dynamic) + +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic); + fFRAT->power_t.readOp.dynamic += (fFRAT->stats_t.readAc.access + *(fFRAT->local_result.power.readOp.dynamic + fdcl->power.readOp.dynamic) + +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic); + } + else if ((coredynp.rm_ty ==CAMbased)) + { + iFRAT->power_t.reset(); + fFRAT->power_t.reset(); + iFRAT->power_t.readOp.dynamic += (iFRAT->stats_t.readAc.access + *(iFRAT->local_result.power.searchOp.dynamic + idcl->power.readOp.dynamic) + +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic); + fFRAT->power_t.readOp.dynamic += (fFRAT->stats_t.readAc.access + *(fFRAT->local_result.power.searchOp.dynamic + fdcl->power.readOp.dynamic) + +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic); + } + + iRRAT->power_t.reset(); + fRRAT->power_t.reset(); + ifreeL->power_t.reset(); + ffreeL->power_t.reset(); + + iRRAT->power_t.readOp.dynamic += (iRRAT->stats_t.readAc.access*iRRAT->local_result.power.readOp.dynamic + +iRRAT->stats_t.writeAc.access*iRRAT->local_result.power.writeOp.dynamic); + fRRAT->power_t.readOp.dynamic += (fRRAT->stats_t.readAc.access*fRRAT->local_result.power.readOp.dynamic + +fRRAT->stats_t.writeAc.access*fRRAT->local_result.power.writeOp.dynamic); + ifreeL->power_t.readOp.dynamic += (ifreeL->stats_t.readAc.access*ifreeL->local_result.power.readOp.dynamic + +ifreeL->stats_t.writeAc.access*ifreeL->local_result.power.writeOp.dynamic); + ffreeL->power_t.readOp.dynamic += (ffreeL->stats_t.readAc.access*ffreeL->local_result.power.readOp.dynamic + +ffreeL->stats_t.writeAc.access*ffreeL->local_result.power.writeOp.dynamic); + + } + else if (coredynp.scheu_ty==ReservationStation) + { + if (coredynp.rm_ty ==RAMbased) + { + iFRAT->power_t.reset(); + fFRAT->power_t.reset(); + + iFRAT->power_t.readOp.dynamic += (iFRAT->stats_t.readAc.access + 
*(iFRAT->local_result.power.readOp.dynamic + idcl->power.readOp.dynamic) + +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic + +iFRAT->stats_t.searchAc.access*iFRAT->local_result.power.searchOp.dynamic); + fFRAT->power_t.readOp.dynamic += (fFRAT->stats_t.readAc.access + *(fFRAT->local_result.power.readOp.dynamic + fdcl->power.readOp.dynamic) + +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic + +fFRAT->stats_t.searchAc.access*fFRAT->local_result.power.searchOp.dynamic); + } + else if ((coredynp.rm_ty ==CAMbased)) + { + iFRAT->power_t.reset(); + fFRAT->power_t.reset(); + iFRAT->power_t.readOp.dynamic += (iFRAT->stats_t.readAc.access + *(iFRAT->local_result.power.searchOp.dynamic + idcl->power.readOp.dynamic) + +iFRAT->stats_t.writeAc.access*iFRAT->local_result.power.writeOp.dynamic); + fFRAT->power_t.readOp.dynamic += (fFRAT->stats_t.readAc.access + *(fFRAT->local_result.power.searchOp.dynamic + fdcl->power.readOp.dynamic) + +fFRAT->stats_t.writeAc.access*fFRAT->local_result.power.writeOp.dynamic); + } + ifreeL->power_t.reset(); + ifreeL->power_t.readOp.dynamic += (ifreeL->stats_t.readAc.access*ifreeL->local_result.power.readOp.dynamic + +ifreeL->stats_t.writeAc.access*ifreeL->local_result.power.writeOp.dynamic); + } + + } + else + { + if (coredynp.issueW>1) + { + idcl->power_t.reset(); + fdcl->power_t.reset(); + set_pppm(pppm_t, idcl->stats_t.readAc.access, coredynp.num_hthreads, coredynp.num_hthreads, idcl->stats_t.readAc.access); + idcl->power_t = idcl->power * pppm_t; + set_pppm(pppm_t, fdcl->stats_t.readAc.access, coredynp.num_hthreads, coredynp.num_hthreads, idcl->stats_t.readAc.access); + fdcl->power_t = fdcl->power * pppm_t; + } + + } + + //assign value to tpd and rtp + if (is_tdp) + { + if (coredynp.core_ty==OOO) + { + if (coredynp.scheu_ty==PhysicalRegFile) + { + iFRAT->power = iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t; + fFRAT->power = fFRAT->power_t + 
(fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t; + iRRAT->power = iRRAT->power_t + iRRAT->local_result.power * coredynp.pppm_lkg_multhread; + fRRAT->power = fRRAT->power_t + fRRAT->local_result.power * coredynp.pppm_lkg_multhread; + ifreeL->power = ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread; + ffreeL->power = ffreeL->power_t + ffreeL->local_result.power * coredynp.pppm_lkg_multhread; + power = power + (iFRAT->power + fFRAT->power) + + (iRRAT->power + fRRAT->power) + + (ifreeL->power + ffreeL->power); + } + else if (coredynp.scheu_ty==ReservationStation) + { + iFRAT->power = iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t; + fFRAT->power = fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t; + ifreeL->power = ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread; + power = power + (iFRAT->power + fFRAT->power) + + ifreeL->power; + } + } + else + { + power = power + idcl->power_t + fdcl->power_t; + } + + } + else + { + if (coredynp.core_ty==OOO) + { + if (coredynp.scheu_ty==PhysicalRegFile) + { + iFRAT->rt_power = iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t; + fFRAT->rt_power = fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t; + iRRAT->rt_power = iRRAT->power_t + iRRAT->local_result.power * coredynp.pppm_lkg_multhread; + fRRAT->rt_power = fRRAT->power_t + fRRAT->local_result.power * coredynp.pppm_lkg_multhread; + ifreeL->rt_power = ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread; + ffreeL->rt_power = ffreeL->power_t + ffreeL->local_result.power * coredynp.pppm_lkg_multhread; + rt_power = rt_power + (iFRAT->rt_power + fFRAT->rt_power) + + (iRRAT->rt_power + fRRAT->rt_power) + + (ifreeL->rt_power + ffreeL->rt_power); + } + else if (coredynp.scheu_ty==ReservationStation) + { + iFRAT->rt_power 
= iFRAT->power_t + (iFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + idcl->power_t; + fFRAT->rt_power = fFRAT->power_t + (fFRAT->local_result.power ) * coredynp.pppm_lkg_multhread + fdcl->power_t; + ifreeL->rt_power = ifreeL->power_t + ifreeL->local_result.power * coredynp.pppm_lkg_multhread; + rt_power = rt_power + (iFRAT->rt_power + fFRAT->rt_power) + + ifreeL->rt_power; + } + } + else + { + rt_power = rt_power + idcl->power_t + fdcl->power_t; + } + + } +} + +void RENAMINGU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + + if (is_tdp) + { + + if (coredynp.core_ty==OOO) + { + cout << indent_str<< "Int Front End RAT:" << endl; + cout << indent_str_next << "Area = " << iFRAT->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << iFRAT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? iFRAT->power.readOp.longer_channel_leakage:iFRAT->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << iFRAT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << iFRAT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< "FP Front End RAT:" << endl; + cout << indent_str_next << "Area = " << fFRAT->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << fFRAT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
fFRAT->power.readOp.longer_channel_leakage:fFRAT->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << fFRAT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << fFRAT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<<"Free List:" << endl; + cout << indent_str_next << "Area = " << ifreeL->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << ifreeL->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? ifreeL->power.readOp.longer_channel_leakage:ifreeL->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << ifreeL->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << ifreeL->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + + if (coredynp.scheu_ty==PhysicalRegFile) + { + cout << indent_str<< "Int Retire RAT: " << endl; + cout << indent_str_next << "Area = " << iRRAT->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << iRRAT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? iRRAT->power.readOp.longer_channel_leakage:iRRAT->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << iRRAT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << iRRAT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< "FP Retire RAT:" << endl; + cout << indent_str_next << "Area = " << fRRAT->area.get_area() *1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << fRRAT->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
fRRAT->power.readOp.longer_channel_leakage:fRRAT->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << fRRAT->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << fRRAT->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< "FP Free List:" << endl; + cout << indent_str_next << "Area = " << ffreeL->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << ffreeL->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? ffreeL->power.readOp.longer_channel_leakage:ffreeL->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << ffreeL->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << ffreeL->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + } + else + { + cout << indent_str<< "Int DCL:" << endl; + cout << indent_str_next << "Peak Dynamic = " << idcl->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? idcl->power.readOp.longer_channel_leakage:idcl->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << idcl->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << idcl->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout << indent_str<<"FP DCL:" << endl; + cout << indent_str_next << "Peak Dynamic = " << fdcl->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
fdcl->power.readOp.longer_channel_leakage:fdcl->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << fdcl->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << fdcl->rt_power.readOp.dynamic/executionTime << " W" << endl; + } + } + else + { + if (coredynp.core_ty==OOO) + { + cout << indent_str_next << "Int Front End RAT Peak Dynamic = " << iFRAT->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Int Front End RAT Subthreshold Leakage = " << iFRAT->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Int Front End RAT Gate Leakage = " << iFRAT->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "FP Front End RAT Peak Dynamic = " << fFRAT->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "FP Front End RAT Subthreshold Leakage = " << fFRAT->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "FP Front End RAT Gate Leakage = " << fFRAT->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Free List Peak Dynamic = " << ifreeL->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Free List Subthreshold Leakage = " << ifreeL->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Free List Gate Leakage = " << fFRAT->rt_power.readOp.gate_leakage << " W" << endl; + if (coredynp.scheu_ty==PhysicalRegFile) + { + cout << indent_str_next << "Int Retire RAT Peak Dynamic = " << iRRAT->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Int Retire RAT Subthreshold Leakage = " << iRRAT->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Int Retire RAT Gate Leakage = " << iRRAT->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "FP Retire RAT Peak Dynamic = " << fRRAT->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << 
"FP Retire RAT Subthreshold Leakage = " << fRRAT->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "FP Retire RAT Gate Leakage = " << fRRAT->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "FP Free List Peak Dynamic = " << ffreeL->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "FP Free List Subthreshold Leakage = " << ffreeL->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "FP Free List Gate Leakage = " << fFRAT->rt_power.readOp.gate_leakage << " W" << endl; + } + } + else + { + cout << indent_str_next << "Int DCL Peak Dynamic = " << idcl->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Int DCL Subthreshold Leakage = " << idcl->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Int DCL Gate Leakage = " << idcl->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "FP DCL Peak Dynamic = " << fdcl->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "FP DCL Subthreshold Leakage = " << fdcl->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "FP DCL Gate Leakage = " << fdcl->rt_power.readOp.gate_leakage << " W" << endl; + } + } + +} + + +void SchedulerU::computeEnergy(bool is_tdp) +{ + if (!exist) return; + double ROB_duty_cycle; +// ROB_duty_cycle = ((coredynp.ALU_duty_cycle + coredynp.num_muls>0?coredynp.MUL_duty_cycle:0 +// + coredynp.num_fpus>0?coredynp.FPU_duty_cycle:0))*1.1<1 ? 
(coredynp.ALU_duty_cycle + coredynp.num_muls>0?coredynp.MUL_duty_cycle:0 +// + coredynp.num_fpus>0?coredynp.FPU_duty_cycle:0)*1.1:1; + ROB_duty_cycle = 1; + //init stats + if (is_tdp) + { + if (coredynp.core_ty==OOO) + { + int_inst_window->stats_t.readAc.access = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_search_ports; + int_inst_window->stats_t.writeAc.access = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_wr_ports; + int_inst_window->stats_t.searchAc.access = coredynp.issueW*coredynp.num_pipelines; + int_inst_window->tdp_stats = int_inst_window->stats_t; + fp_inst_window->stats_t.readAc.access = fp_inst_window->l_ip.num_rd_ports*coredynp.num_fp_pipelines; + fp_inst_window->stats_t.writeAc.access = fp_inst_window->l_ip.num_wr_ports*coredynp.num_fp_pipelines; + fp_inst_window->stats_t.searchAc.access = fp_inst_window->l_ip.num_search_ports*coredynp.num_fp_pipelines; + fp_inst_window->tdp_stats = fp_inst_window->stats_t; + + if (XML->sys.core[ithCore].ROB_size >0) + { + ROB->stats_t.readAc.access = coredynp.commitW*coredynp.num_pipelines*ROB_duty_cycle; + ROB->stats_t.writeAc.access = coredynp.issueW*coredynp.num_pipelines*ROB_duty_cycle; + ROB->tdp_stats = ROB->stats_t; + + /* + * When inst commits, ROB must be read. + * Because for Physcial register based cores, physical register tag in ROB + * need to be read out and write into RRAT/CAM based RAT. + * For RS based cores, register content that stored in ROB must be + * read out and stored in architectural registers. + * + * if no-register is involved, the ROB read out operation when instruction commits can be ignored. + * assuming 20% insts. belong this type. 
+ * TODO: ROB duty_cycle need to be revisited + */ + } + + } + else if (coredynp.multithreaded) + { + int_inst_window->stats_t.readAc.access = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_search_ports; + int_inst_window->stats_t.writeAc.access = coredynp.issueW*coredynp.num_pipelines;//int_inst_window->l_ip.num_wr_ports; + int_inst_window->stats_t.searchAc.access = coredynp.issueW*coredynp.num_pipelines; + int_inst_window->tdp_stats = int_inst_window->stats_t; + } + + } + else + {//rtp + if (coredynp.core_ty==OOO) + { + int_inst_window->stats_t.readAc.access = XML->sys.core[ithCore].inst_window_reads; + int_inst_window->stats_t.writeAc.access = XML->sys.core[ithCore].inst_window_writes; + int_inst_window->stats_t.searchAc.access = XML->sys.core[ithCore].inst_window_wakeup_accesses; + int_inst_window->rtp_stats = int_inst_window->stats_t; + fp_inst_window->stats_t.readAc.access = XML->sys.core[ithCore].fp_inst_window_reads; + fp_inst_window->stats_t.writeAc.access = XML->sys.core[ithCore].fp_inst_window_writes; + fp_inst_window->stats_t.searchAc.access = XML->sys.core[ithCore].fp_inst_window_wakeup_accesses; + fp_inst_window->rtp_stats = fp_inst_window->stats_t; + + if (XML->sys.core[ithCore].ROB_size >0) + { + + ROB->stats_t.readAc.access = XML->sys.core[ithCore].ROB_reads; + ROB->stats_t.writeAc.access = XML->sys.core[ithCore].ROB_writes; + /* ROB need to be updated in RS based OOO when new values are produced, + * this update may happen before the commit stage when ROB entry is released + * 1. ROB write at instruction inserted in + * 2. ROB write as results produced (for RS based OOO only) + * 3. ROB read as instruction committed. 
For RS based OOO, data values are read out and sent to ARF + * For Physical reg based OOO, no data stored in ROB, but register tags need to be + * read out and used to set the RRAT and to recycle the register tag to free list buffer + */ + ROB->rtp_stats = ROB->stats_t; + } + + } + else if (coredynp.multithreaded) + { + int_inst_window->stats_t.readAc.access = XML->sys.core[ithCore].int_instructions + XML->sys.core[ithCore].fp_instructions; + int_inst_window->stats_t.writeAc.access = XML->sys.core[ithCore].int_instructions + XML->sys.core[ithCore].fp_instructions; + int_inst_window->stats_t.searchAc.access = 2*(XML->sys.core[ithCore].int_instructions + XML->sys.core[ithCore].fp_instructions); + int_inst_window->rtp_stats = int_inst_window->stats_t; + } + } + + //computation engine + if (coredynp.core_ty==OOO) + { + int_inst_window->power_t.reset(); + fp_inst_window->power_t.reset(); + + /* each instruction needs to write to scheduler, read out when all resources and source operands are ready + * two search ops with one for each source operand + * + */ + int_inst_window->power_t.readOp.dynamic += int_inst_window->local_result.power.readOp.dynamic * int_inst_window->stats_t.readAc.access + + int_inst_window->local_result.power.searchOp.dynamic * int_inst_window->stats_t.searchAc.access + + int_inst_window->local_result.power.writeOp.dynamic * int_inst_window->stats_t.writeAc.access + + int_inst_window->stats_t.readAc.access * instruction_selection->power.readOp.dynamic; + + fp_inst_window->power_t.readOp.dynamic += fp_inst_window->local_result.power.readOp.dynamic * fp_inst_window->stats_t.readAc.access + + fp_inst_window->local_result.power.searchOp.dynamic * fp_inst_window->stats_t.searchAc.access + + fp_inst_window->local_result.power.writeOp.dynamic * fp_inst_window->stats_t.writeAc.access + + fp_inst_window->stats_t.writeAc.access * instruction_selection->power.readOp.dynamic; + + if (XML->sys.core[ithCore].ROB_size >0) + { + ROB->power_t.reset(); + 
ROB->power_t.readOp.dynamic += ROB->local_result.power.readOp.dynamic*ROB->stats_t.readAc.access + + ROB->stats_t.writeAc.access*ROB->local_result.power.writeOp.dynamic; + } + + + + + } + else if (coredynp.multithreaded) + { + int_inst_window->power_t.reset(); + int_inst_window->power_t.readOp.dynamic += int_inst_window->local_result.power.readOp.dynamic * int_inst_window->stats_t.readAc.access + + int_inst_window->local_result.power.searchOp.dynamic * int_inst_window->stats_t.searchAc.access + + int_inst_window->local_result.power.writeOp.dynamic * int_inst_window->stats_t.writeAc.access + + int_inst_window->stats_t.writeAc.access * instruction_selection->power.readOp.dynamic; + } + + //assign values + if (is_tdp) + { + if (coredynp.core_ty==OOO) + { + int_inst_window->power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg; + fp_inst_window->power = fp_inst_window->power_t + (fp_inst_window->local_result.power +instruction_selection->power) *pppm_lkg; + power = power + int_inst_window->power + fp_inst_window->power; + if (XML->sys.core[ithCore].ROB_size >0) + { + ROB->power = ROB->power_t + ROB->local_result.power*pppm_lkg; + power = power + ROB->power; + } + + } + else if (coredynp.multithreaded) + { + // set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1); + int_inst_window->power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg; + power = power + int_inst_window->power; + } + + } + else + {//rtp + if (coredynp.core_ty==OOO) + { + int_inst_window->rt_power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg; + fp_inst_window->rt_power = fp_inst_window->power_t + (fp_inst_window->local_result.power +instruction_selection->power) *pppm_lkg; + rt_power = rt_power + int_inst_window->rt_power + fp_inst_window->rt_power; + if (XML->sys.core[ithCore].ROB_size >0) + { + ROB->rt_power = 
ROB->power_t + ROB->local_result.power*pppm_lkg; + rt_power = rt_power + ROB->rt_power; + } + + } + else if (coredynp.multithreaded) + { + // set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1); + int_inst_window->rt_power = int_inst_window->power_t + (int_inst_window->local_result.power +instruction_selection->power) *pppm_lkg; + rt_power = rt_power + int_inst_window->rt_power; + } + } +// set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1); +// cout<<"Scheduler power="<<power.readOp.dynamic<<"leakage="<<power.readOp.leakage<<endl; +// cout<<"IW="<<int_inst_window->local_result.power.searchOp.dynamic * int_inst_window->stats_t.readAc.access + +// + int_inst_window->local_result.power.writeOp.dynamic * int_inst_window->stats_t.writeAc.access<<"leakage="<<int_inst_window->local_result.power.readOp.leakage<<endl; +// cout<<"selection"<<instruction_selection->power.readOp.dynamic<<"leakage"<<instruction_selection->power.readOp.leakage<<endl; +} + +void SchedulerU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + + if (is_tdp) + { + if (coredynp.core_ty==OOO) + { + cout << indent_str << "Instruction Window:" << endl; + cout << indent_str_next << "Area = " << int_inst_window->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << int_inst_window->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
int_inst_window->power.readOp.longer_channel_leakage:int_inst_window->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << int_inst_window->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << int_inst_window->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str << "FP Instruction Window:" << endl; + cout << indent_str_next << "Area = " << fp_inst_window->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << fp_inst_window->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? fp_inst_window->power.readOp.longer_channel_leakage:fp_inst_window->power.readOp.leakage ) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << fp_inst_window->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << fp_inst_window->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (XML->sys.core[ithCore].ROB_size >0) + { + cout << indent_str<<"ROB:" << endl; + cout << indent_str_next << "Area = " << ROB->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << ROB->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
ROB->power.readOp.longer_channel_leakage:ROB->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << ROB->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << ROB->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + } + else if (coredynp.multithreaded) + { + cout << indent_str << "Instruction Window:" << endl; + cout << indent_str_next << "Area = " << int_inst_window->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << int_inst_window->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? int_inst_window->power.readOp.longer_channel_leakage:int_inst_window->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << int_inst_window->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << int_inst_window->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + } + else + { + if (coredynp.core_ty==OOO) + { + cout << indent_str_next << "Instruction Window Peak Dynamic = " << int_inst_window->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Instruction Window Subthreshold Leakage = " << int_inst_window->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Instruction Window Gate Leakage = " << int_inst_window->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "FP Instruction Window Peak Dynamic = " << fp_inst_window->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "FP Instruction Window Subthreshold Leakage = " << fp_inst_window->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "FP Instruction Window Gate Leakage = " << fp_inst_window->rt_power.readOp.gate_leakage << " W" << endl; + if (XML->sys.core[ithCore].ROB_size >0) + { + cout << indent_str_next << 
"ROB Peak Dynamic = " << ROB->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "ROB Subthreshold Leakage = " << ROB->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "ROB Gate Leakage = " << ROB->rt_power.readOp.gate_leakage << " W" << endl; + } + } + else if (coredynp.multithreaded) + { + cout << indent_str_next << "Instruction Window Peak Dynamic = " << int_inst_window->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Instruction Window Subthreshold Leakage = " << int_inst_window->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Instruction Window Gate Leakage = " << int_inst_window->rt_power.readOp.gate_leakage << " W" << endl; + } + } + +} + +void LoadStoreU::computeEnergy(bool is_tdp) +{ + if (!exist) return; + if (is_tdp) + { + //init stats for Peak + dcache.caches->stats_t.readAc.access = 0.67*dcache.caches->l_ip.num_rw_ports*coredynp.LSU_duty_cycle; + dcache.caches->stats_t.readAc.miss = 0; + dcache.caches->stats_t.readAc.hit = dcache.caches->stats_t.readAc.access - dcache.caches->stats_t.readAc.miss; + dcache.caches->stats_t.writeAc.access = 0.33*dcache.caches->l_ip.num_rw_ports*coredynp.LSU_duty_cycle; + dcache.caches->stats_t.writeAc.miss = 0; + dcache.caches->stats_t.writeAc.hit = dcache.caches->stats_t.writeAc.access - dcache.caches->stats_t.writeAc.miss; + dcache.caches->tdp_stats = dcache.caches->stats_t; + + dcache.missb->stats_t.readAc.access = dcache.missb->l_ip.num_search_ports; + dcache.missb->stats_t.writeAc.access = dcache.missb->l_ip.num_search_ports; + dcache.missb->tdp_stats = dcache.missb->stats_t; + + dcache.ifb->stats_t.readAc.access = dcache.ifb->l_ip.num_search_ports; + dcache.ifb->stats_t.writeAc.access = dcache.ifb->l_ip.num_search_ports; + dcache.ifb->tdp_stats = dcache.ifb->stats_t; + + dcache.prefetchb->stats_t.readAc.access = dcache.prefetchb->l_ip.num_search_ports; + dcache.prefetchb->stats_t.writeAc.access = 
dcache.ifb->l_ip.num_search_ports; + dcache.prefetchb->tdp_stats = dcache.prefetchb->stats_t; + if (cache_p==Write_back) + { + dcache.wbb->stats_t.readAc.access = dcache.wbb->l_ip.num_search_ports; + dcache.wbb->stats_t.writeAc.access = dcache.wbb->l_ip.num_search_ports; + dcache.wbb->tdp_stats = dcache.wbb->stats_t; + } + + LSQ->stats_t.readAc.access = LSQ->stats_t.writeAc.access = LSQ->l_ip.num_search_ports*coredynp.LSU_duty_cycle; + LSQ->tdp_stats = LSQ->stats_t; + if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) + { + LoadQ->stats_t.readAc.access = LoadQ->stats_t.writeAc.access = LoadQ->l_ip.num_search_ports*coredynp.LSU_duty_cycle; + LoadQ->tdp_stats = LoadQ->stats_t; + } + } + else + { + //init stats for Runtime Dynamic (RTP) + dcache.caches->stats_t.readAc.access = XML->sys.core[ithCore].dcache.read_accesses; + dcache.caches->stats_t.readAc.miss = XML->sys.core[ithCore].dcache.read_misses; + dcache.caches->stats_t.readAc.hit = dcache.caches->stats_t.readAc.access - dcache.caches->stats_t.readAc.miss; + dcache.caches->stats_t.writeAc.access = XML->sys.core[ithCore].dcache.write_accesses; + dcache.caches->stats_t.writeAc.miss = XML->sys.core[ithCore].dcache.write_misses; + dcache.caches->stats_t.writeAc.hit = dcache.caches->stats_t.writeAc.access - dcache.caches->stats_t.writeAc.miss; + dcache.caches->rtp_stats = dcache.caches->stats_t; + + if (cache_p==Write_back) + { + dcache.missb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.missb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.missb->rtp_stats = dcache.missb->stats_t; + + dcache.ifb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.ifb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.ifb->rtp_stats = dcache.ifb->stats_t; + + dcache.prefetchb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.prefetchb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss; + 
dcache.prefetchb->rtp_stats = dcache.prefetchb->stats_t; + + dcache.wbb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.wbb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss; + dcache.wbb->rtp_stats = dcache.wbb->stats_t; + } + else + { + dcache.missb->stats_t.readAc.access = dcache.caches->stats_t.readAc.miss; + dcache.missb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss; + dcache.missb->rtp_stats = dcache.missb->stats_t; + + dcache.ifb->stats_t.readAc.access = dcache.caches->stats_t.readAc.miss; + dcache.ifb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss; + dcache.ifb->rtp_stats = dcache.ifb->stats_t; + + dcache.prefetchb->stats_t.readAc.access = dcache.caches->stats_t.readAc.miss; + dcache.prefetchb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss; + dcache.prefetchb->rtp_stats = dcache.prefetchb->stats_t; + } + + LSQ->stats_t.readAc.access = (XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions)*2;//flush overhead considered + LSQ->stats_t.writeAc.access = (XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions)*2; + LSQ->rtp_stats = LSQ->stats_t; + + if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) + { + LoadQ->stats_t.readAc.access = XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions; + LoadQ->stats_t.writeAc.access = XML->sys.core[ithCore].load_instructions + XML->sys.core[ithCore].store_instructions; + LoadQ->rtp_stats = LoadQ->stats_t; + } + + } + + dcache.power_t.reset(); + LSQ->power_t.reset(); + dcache.power_t.readOp.dynamic += (dcache.caches->stats_t.readAc.hit*dcache.caches->local_result.power.readOp.dynamic+ + dcache.caches->stats_t.readAc.miss*dcache.caches->local_result.power.readOp.dynamic+ + dcache.caches->stats_t.writeAc.miss*dcache.caches->local_result.tag_array2->power.readOp.dynamic+ + 
dcache.caches->stats_t.writeAc.access*dcache.caches->local_result.power.writeOp.dynamic); + + if (cache_p==Write_back) + {//write miss will generate a write later + dcache.power_t.readOp.dynamic += dcache.caches->stats_t.writeAc.miss*dcache.caches->local_result.power.writeOp.dynamic; + } + + dcache.power_t.readOp.dynamic += dcache.missb->stats_t.readAc.access*dcache.missb->local_result.power.searchOp.dynamic + + dcache.missb->stats_t.writeAc.access*dcache.missb->local_result.power.writeOp.dynamic;//each access to missb involves a CAM and a write + dcache.power_t.readOp.dynamic += dcache.ifb->stats_t.readAc.access*dcache.ifb->local_result.power.searchOp.dynamic + + dcache.ifb->stats_t.writeAc.access*dcache.ifb->local_result.power.writeOp.dynamic; + dcache.power_t.readOp.dynamic += dcache.prefetchb->stats_t.readAc.access*dcache.prefetchb->local_result.power.searchOp.dynamic + + dcache.prefetchb->stats_t.writeAc.access*dcache.prefetchb->local_result.power.writeOp.dynamic; + if (cache_p==Write_back) + { + dcache.power_t.readOp.dynamic += dcache.wbb->stats_t.readAc.access*dcache.wbb->local_result.power.searchOp.dynamic + + dcache.wbb->stats_t.writeAc.access*dcache.wbb->local_result.power.writeOp.dynamic; + } + + if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) + { + LoadQ->power_t.reset(); + LoadQ->power_t.readOp.dynamic += LoadQ->stats_t.readAc.access*(LoadQ->local_result.power.searchOp.dynamic+ LoadQ->local_result.power.readOp.dynamic)+ + LoadQ->stats_t.writeAc.access*LoadQ->local_result.power.writeOp.dynamic;//every memory access invloves at least two operations on LoadQ + + LSQ->power_t.readOp.dynamic += LSQ->stats_t.readAc.access*(LSQ->local_result.power.searchOp.dynamic + LSQ->local_result.power.readOp.dynamic) + + LSQ->stats_t.writeAc.access*LSQ->local_result.power.writeOp.dynamic;//every memory access invloves at least two operations on LSQ + + } + else + { + LSQ->power_t.readOp.dynamic += 
LSQ->stats_t.readAc.access*(LSQ->local_result.power.searchOp.dynamic + LSQ->local_result.power.readOp.dynamic) + + LSQ->stats_t.writeAc.access*LSQ->local_result.power.writeOp.dynamic;//every memory access invloves at least two operations on LSQ + + } + + if (is_tdp) + { +// dcache.power = dcache.power_t + (dcache.caches->local_result.power)*pppm_lkg + +// (dcache.missb->local_result.power + +// dcache.ifb->local_result.power + +// dcache.prefetchb->local_result.power + +// dcache.wbb->local_result.power)*pppm_Isub; + dcache.power = dcache.power_t + (dcache.caches->local_result.power + + dcache.missb->local_result.power + + dcache.ifb->local_result.power + + dcache.prefetchb->local_result.power) *pppm_lkg; + if (cache_p==Write_back) + { + dcache.power = dcache.power + dcache.wbb->local_result.power*pppm_lkg; + } + + LSQ->power = LSQ->power_t + LSQ->local_result.power *pppm_lkg; + power = power + dcache.power + LSQ->power; + + if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) + { + LoadQ->power = LoadQ->power_t + LoadQ->local_result.power *pppm_lkg; + power = power + LoadQ->power; + } + } + else + { +// dcache.rt_power = dcache.power_t + (dcache.caches->local_result.power + +// dcache.missb->local_result.power + +// dcache.ifb->local_result.power + +// dcache.prefetchb->local_result.power + +// dcache.wbb->local_result.power)*pppm_lkg; + dcache.rt_power = dcache.power_t + (dcache.caches->local_result.power + + dcache.missb->local_result.power + + dcache.ifb->local_result.power + + dcache.prefetchb->local_result.power )*pppm_lkg; + + if (cache_p==Write_back) + { + dcache.rt_power = dcache.rt_power + dcache.wbb->local_result.power*pppm_lkg; + } + + LSQ->rt_power = LSQ->power_t + LSQ->local_result.power *pppm_lkg; + rt_power = rt_power + dcache.rt_power + LSQ->rt_power; + + if ((coredynp.core_ty==OOO) && (XML->sys.core[ithCore].load_buffer_size >0)) + { + LoadQ->rt_power = LoadQ->power_t + LoadQ->local_result.power *pppm_lkg; + rt_power = 
rt_power + LoadQ->rt_power; + } + } +} + + +void LoadStoreU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + + if (is_tdp) + { + cout << indent_str << "Data Cache:" << endl; + cout << indent_str_next << "Area = " << dcache.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << dcache.power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? dcache.power.readOp.longer_channel_leakage:dcache.power.readOp.leakage )<<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << dcache.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << dcache.rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (coredynp.core_ty==Inorder) + { + cout << indent_str << "Load/Store Queue:" << endl; + cout << indent_str_next << "Area = " << LSQ->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << LSQ->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? LSQ->power.readOp.longer_channel_leakage:LSQ->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << LSQ->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << LSQ->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else + + { + if (XML->sys.core[ithCore].load_buffer_size >0) + { + cout << indent_str << "LoadQ:" << endl; + cout << indent_str_next << "Area = " << LoadQ->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << LoadQ->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
LoadQ->power.readOp.longer_channel_leakage:LoadQ->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << LoadQ->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << LoadQ->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + cout << indent_str<< "StoreQ:" << endl; + cout << indent_str_next << "Area = " << LSQ->area.get_area() *1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << LSQ->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? LSQ->power.readOp.longer_channel_leakage:LSQ->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << LSQ->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << LSQ->rt_power.readOp.dynamic/executionTime<< " W" << endl; + cout <<endl; + } + } + else + { + cout << indent_str_next << "Data Cache Peak Dynamic = " << dcache.rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Data Cache Subthreshold Leakage = " << dcache.rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Data Cache Gate Leakage = " << dcache.rt_power.readOp.gate_leakage << " W" << endl; + if (coredynp.core_ty==Inorder) + { + cout << indent_str_next << "Load/Store Queue Peak Dynamic = " << LSQ->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Load/Store Queue Subthreshold Leakage = " << LSQ->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Load/Store Queue Gate Leakage = " << LSQ->rt_power.readOp.gate_leakage << " W" << endl; + } + else + { + cout << indent_str_next << "LoadQ Peak Dynamic = " << LoadQ->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "LoadQ Subthreshold Leakage = " << LoadQ->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "LoadQ Gate Leakage = 
" << LoadQ->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "StoreQ Peak Dynamic = " << LSQ->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "StoreQ Subthreshold Leakage = " << LSQ->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "StoreQ Gate Leakage = " << LSQ->rt_power.readOp.gate_leakage << " W" << endl; + } + } + +} + +void MemManU::computeEnergy(bool is_tdp) +{ + + if (!exist) return; + if (is_tdp) + { + //init stats for Peak + itlb->stats_t.readAc.access = itlb->l_ip.num_search_ports; + itlb->stats_t.readAc.miss = 0; + itlb->stats_t.readAc.hit = itlb->stats_t.readAc.access - itlb->stats_t.readAc.miss; + itlb->tdp_stats = itlb->stats_t; + + dtlb->stats_t.readAc.access = dtlb->l_ip.num_search_ports*coredynp.LSU_duty_cycle; + dtlb->stats_t.readAc.miss = 0; + dtlb->stats_t.readAc.hit = dtlb->stats_t.readAc.access - dtlb->stats_t.readAc.miss; + dtlb->tdp_stats = dtlb->stats_t; + } + else + { + //init stats for Runtime Dynamic (RTP) + itlb->stats_t.readAc.access = XML->sys.core[ithCore].itlb.total_accesses; + itlb->stats_t.readAc.miss = XML->sys.core[ithCore].itlb.total_misses; + itlb->stats_t.readAc.hit = itlb->stats_t.readAc.access - itlb->stats_t.readAc.miss; + itlb->rtp_stats = itlb->stats_t; + + dtlb->stats_t.readAc.access = XML->sys.core[ithCore].dtlb.total_accesses; + dtlb->stats_t.readAc.miss = XML->sys.core[ithCore].dtlb.total_misses; + dtlb->stats_t.readAc.hit = dtlb->stats_t.readAc.access - dtlb->stats_t.readAc.miss; + dtlb->rtp_stats = dtlb->stats_t; + } + + itlb->power_t.reset(); + dtlb->power_t.reset(); + itlb->power_t.readOp.dynamic += itlb->stats_t.readAc.access*itlb->local_result.power.searchOp.dynamic//FA spent most power in tag, so use total access not hits + +itlb->stats_t.readAc.miss*itlb->local_result.power.writeOp.dynamic; + dtlb->power_t.readOp.dynamic += dtlb->stats_t.readAc.access*dtlb->local_result.power.searchOp.dynamic//FA spent most power in tag, so use 
total access not hits + +dtlb->stats_t.readAc.miss*dtlb->local_result.power.writeOp.dynamic; + + if (is_tdp) + { + itlb->power = itlb->power_t + itlb->local_result.power *pppm_lkg; + dtlb->power = dtlb->power_t + dtlb->local_result.power *pppm_lkg; + power = power + itlb->power + dtlb->power; + } + else + { + itlb->rt_power = itlb->power_t + itlb->local_result.power *pppm_lkg; + dtlb->rt_power = dtlb->power_t + dtlb->local_result.power *pppm_lkg; + rt_power = rt_power + itlb->rt_power + dtlb->rt_power; + } +} + +void MemManU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + + + + if (is_tdp) + { + cout << indent_str << "Itlb:" << endl; + cout << indent_str_next << "Area = " << itlb->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << itlb->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? itlb->power.readOp.longer_channel_leakage:itlb->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << itlb->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << itlb->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< "Dtlb:" << endl; + cout << indent_str_next << "Area = " << dtlb->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << dtlb->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
dtlb->power.readOp.longer_channel_leakage:dtlb->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << dtlb->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << dtlb->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else + { + cout << indent_str_next << "Itlb Peak Dynamic = " << itlb->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Itlb Subthreshold Leakage = " << itlb->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Itlb Gate Leakage = " << itlb->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Dtlb Peak Dynamic = " << dtlb->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Dtlb Subthreshold Leakage = " << dtlb->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Dtlb Gate Leakage = " << dtlb->rt_power.readOp.gate_leakage << " W" << endl; + } + +} + +void RegFU::computeEnergy(bool is_tdp) +{ +/* + * Architecture RF and physical RF cannot be present at the same time. + * Therefore, the RF stats can only refer to either ARF or PRF; + * And the same stats can be used for both. 
+ */ + if (!exist) return; + if (is_tdp) + { + //init stats for Peak + IRF->stats_t.readAc.access = coredynp.issueW*2*(coredynp.ALU_duty_cycle*1.1+ + (coredynp.num_muls>0?coredynp.MUL_duty_cycle:0))*coredynp.num_pipelines; + IRF->stats_t.writeAc.access = coredynp.issueW*(coredynp.ALU_duty_cycle*1.1+ + (coredynp.num_muls>0?coredynp.MUL_duty_cycle:0))*coredynp.num_pipelines; + //Rule of Thumb: about 10% RF related instructions do not need to access ALUs + IRF->tdp_stats = IRF->stats_t; + + FRF->stats_t.readAc.access = FRF->l_ip.num_rd_ports*coredynp.FPU_duty_cycle*1.05*coredynp.num_fp_pipelines; + FRF->stats_t.writeAc.access = FRF->l_ip.num_wr_ports*coredynp.FPU_duty_cycle*1.05*coredynp.num_fp_pipelines; + FRF->tdp_stats = FRF->stats_t; + if (coredynp.regWindowing) + { + RFWIN->stats_t.readAc.access = 0;//0.5*RFWIN->l_ip.num_rw_ports; + RFWIN->stats_t.writeAc.access = 0;//0.5*RFWIN->l_ip.num_rw_ports; + RFWIN->tdp_stats = RFWIN->stats_t; + } + } + else + { + //init stats for Runtime Dynamic (RTP) + IRF->stats_t.readAc.access = XML->sys.core[ithCore].int_regfile_reads;//TODO: no diff on archi and phy + IRF->stats_t.writeAc.access = XML->sys.core[ithCore].int_regfile_writes; + IRF->rtp_stats = IRF->stats_t; + + FRF->stats_t.readAc.access = XML->sys.core[ithCore].float_regfile_reads; + FRF->stats_t.writeAc.access = XML->sys.core[ithCore].float_regfile_writes; + FRF->rtp_stats = FRF->stats_t; + if (coredynp.regWindowing) + { + RFWIN->stats_t.readAc.access = XML->sys.core[ithCore].function_calls*16; + RFWIN->stats_t.writeAc.access = XML->sys.core[ithCore].function_calls*16; + RFWIN->rtp_stats = RFWIN->stats_t; + + IRF->stats_t.readAc.access = XML->sys.core[ithCore].int_regfile_reads + + XML->sys.core[ithCore].function_calls*16; + IRF->stats_t.writeAc.access = XML->sys.core[ithCore].int_regfile_writes + + XML->sys.core[ithCore].function_calls*16; + IRF->rtp_stats = IRF->stats_t; + + FRF->stats_t.readAc.access = XML->sys.core[ithCore].float_regfile_reads + + 
XML->sys.core[ithCore].function_calls*16;; + FRF->stats_t.writeAc.access = XML->sys.core[ithCore].float_regfile_writes+ + XML->sys.core[ithCore].function_calls*16;; + FRF->rtp_stats = FRF->stats_t; + } + } + IRF->power_t.reset(); + FRF->power_t.reset(); + IRF->power_t.readOp.dynamic += (IRF->stats_t.readAc.access*IRF->local_result.power.readOp.dynamic + +IRF->stats_t.writeAc.access*IRF->local_result.power.writeOp.dynamic); + FRF->power_t.readOp.dynamic += (FRF->stats_t.readAc.access*FRF->local_result.power.readOp.dynamic + +FRF->stats_t.writeAc.access*FRF->local_result.power.writeOp.dynamic); + if (coredynp.regWindowing) + { + RFWIN->power_t.reset(); + RFWIN->power_t.readOp.dynamic += (RFWIN->stats_t.readAc.access*RFWIN->local_result.power.readOp.dynamic + + RFWIN->stats_t.writeAc.access*RFWIN->local_result.power.writeOp.dynamic); + } + + if (is_tdp) + { + IRF->power = IRF->power_t + IRF->local_result.power *coredynp.pppm_lkg_multhread; + FRF->power = FRF->power_t + FRF->local_result.power *coredynp.pppm_lkg_multhread; + power = power + (IRF->power + FRF->power); + if (coredynp.regWindowing) + { + RFWIN->power = RFWIN->power_t + RFWIN->local_result.power *pppm_lkg; + power = power + RFWIN->power; + } + } + else + { + IRF->rt_power = IRF->power_t + IRF->local_result.power *coredynp.pppm_lkg_multhread; + FRF->rt_power = FRF->power_t + FRF->local_result.power *coredynp.pppm_lkg_multhread; + rt_power = rt_power + (IRF->power_t + FRF->power_t); + if (coredynp.regWindowing) + { + RFWIN->rt_power = RFWIN->power_t + RFWIN->local_result.power *pppm_lkg; + rt_power = rt_power + RFWIN->rt_power; + } + } +} + + +void RegFU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { cout << indent_str << "Integer RF:" << endl; + cout << indent_str_next << "Area = " << IRF->area.get_area()*1e-6<< " mm^2" << endl; 
+ cout << indent_str_next << "Peak Dynamic = " << IRF->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? IRF->power.readOp.longer_channel_leakage:IRF->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << IRF->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << IRF->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< "Floating Point RF:" << endl; + cout << indent_str_next << "Area = " << FRF->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << FRF->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? FRF->power.readOp.longer_channel_leakage:FRF->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << FRF->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << FRF->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (coredynp.regWindowing) + { + cout << indent_str << "Register Windows:" << endl; + cout << indent_str_next << "Area = " << RFWIN->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << RFWIN->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
RFWIN->power.readOp.longer_channel_leakage:RFWIN->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << RFWIN->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << RFWIN->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + } + else + { + cout << indent_str_next << "Integer RF Peak Dynamic = " << IRF->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Integer RF Subthreshold Leakage = " << IRF->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Integer RF Gate Leakage = " << IRF->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Floating Point RF Peak Dynamic = " << FRF->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Floating Point RF Subthreshold Leakage = " << FRF->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Floating Point RF Gate Leakage = " << FRF->rt_power.readOp.gate_leakage << " W" << endl; + if (coredynp.regWindowing) + { + cout << indent_str_next << "Register Windows Peak Dynamic = " << RFWIN->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Register Windows Subthreshold Leakage = " << RFWIN->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Register Windows Gate Leakage = " << RFWIN->rt_power.readOp.gate_leakage << " W" << endl; + } + } +} + + +void EXECU::computeEnergy(bool is_tdp) +{ + if (!exist) return; + double pppm_t[4] = {1,1,1,1}; +// rfu->power.reset(); +// rfu->rt_power.reset(); +// scheu->power.reset(); +// scheu->rt_power.reset(); +// exeu->power.reset(); +// exeu->rt_power.reset(); + + rfu->computeEnergy(is_tdp); + scheu->computeEnergy(is_tdp); + exeu->computeEnergy(is_tdp); + if (coredynp.num_fpus >0) + { + fp_u->computeEnergy(is_tdp); + } + if (coredynp.num_muls >0) + { + mul->computeEnergy(is_tdp); + } + + if (is_tdp) + { + set_pppm(pppm_t, 
2*coredynp.ALU_cdb_duty_cycle, 2, 2, 2*coredynp.ALU_cdb_duty_cycle);//2 means two source operands needs to be passed for each int instruction. + bypass.power = bypass.power + intTagBypass->power*pppm_t + int_bypass->power*pppm_t; + if (coredynp.num_muls >0) + { + set_pppm(pppm_t, 2*coredynp.MUL_cdb_duty_cycle, 2, 2, 2*coredynp.MUL_cdb_duty_cycle);//2 means two source operands needs to be passed for each int instruction. + bypass.power = bypass.power + intTag_mul_Bypass->power*pppm_t + int_mul_bypass->power*pppm_t; + power = power + mul->power; + } + if (coredynp.num_fpus>0) + { + set_pppm(pppm_t, 3*coredynp.FPU_cdb_duty_cycle, 3, 3, 3*coredynp.FPU_cdb_duty_cycle);//3 means three source operands needs to be passed for each fp instruction. + bypass.power = bypass.power + fp_bypass->power*pppm_t + fpTagBypass->power*pppm_t ; + power = power + fp_u->power; + } + + power = power + rfu->power + exeu->power + bypass.power + scheu->power; + } + else + { + set_pppm(pppm_t, XML->sys.core[ithCore].cdb_alu_accesses, 2, 2, XML->sys.core[ithCore].cdb_alu_accesses); + bypass.rt_power = bypass.rt_power + intTagBypass->power*pppm_t; + bypass.rt_power = bypass.rt_power + int_bypass->power*pppm_t; + + if (coredynp.num_muls >0) + { + set_pppm(pppm_t, XML->sys.core[ithCore].cdb_mul_accesses, 2, 2, XML->sys.core[ithCore].cdb_mul_accesses);//2 means two source operands needs to be passed for each int instruction. 
+ bypass.rt_power = bypass.rt_power + intTag_mul_Bypass->power*pppm_t + int_mul_bypass->power*pppm_t; + rt_power = rt_power + mul->rt_power; + } + + if (coredynp.num_fpus>0) + { + set_pppm(pppm_t, XML->sys.core[ithCore].cdb_fpu_accesses, 3, 3, XML->sys.core[ithCore].cdb_fpu_accesses); + bypass.rt_power = bypass.rt_power + fp_bypass->power*pppm_t; + bypass.rt_power = bypass.rt_power + fpTagBypass->power*pppm_t; + rt_power = rt_power + fp_u->rt_power; + } + rt_power = rt_power + rfu->rt_power + exeu->rt_power + bypass.rt_power + scheu->rt_power; + } +} + +void EXECU::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + if (!exist) return; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + +// cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl; + if (is_tdp) + { + cout << indent_str << "Register Files:" << endl; + cout << indent_str_next << "Area = " << rfu->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << rfu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? rfu->power.readOp.longer_channel_leakage:rfu->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << rfu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << rfu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel>3){ + rfu->displayEnergy(indent+4,is_tdp); + } + cout << indent_str << "Instruction Scheduler:" << endl; + cout << indent_str_next << "Area = " << scheu->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << scheu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
scheu->power.readOp.longer_channel_leakage:scheu->power.readOp.leakage) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << scheu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << scheu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel>3){ + scheu->displayEnergy(indent+4,is_tdp); + } + exeu->displayEnergy(indent,is_tdp); + if (coredynp.num_fpus>0) + { + fp_u->displayEnergy(indent,is_tdp); + } + if (coredynp.num_muls >0) + { + mul->displayEnergy(indent,is_tdp); + } + cout << indent_str << "Results Broadcast Bus:" << endl; + cout << indent_str_next << "Area Overhead = " << bypass.area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << bypass.power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? bypass.power.readOp.longer_channel_leakage:bypass.power.readOp.leakage ) << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << bypass.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << bypass.rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else + { + cout << indent_str_next << "Register Files Peak Dynamic = " << rfu->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Register Files Subthreshold Leakage = " << rfu->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Register Files Gate Leakage = " << rfu->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Instruction Sheduler Peak Dynamic = " << scheu->rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Instruction Sheduler Subthreshold Leakage = " << scheu->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Instruction Sheduler Gate Leakage = " << scheu->rt_power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next 
<< "Results Broadcast Bus Peak Dynamic = " << bypass.rt_power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Results Broadcast Bus Subthreshold Leakage = " << bypass.rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Results Broadcast Bus Gate Leakage = " << bypass.rt_power.readOp.gate_leakage << " W" << endl; + } + +} + +void Core::computeEnergy(bool is_tdp) +{ + //power_point_product_masks + double pppm_t[4] = {1,1,1,1}; + double rtp_pipeline_coe; + double num_units = 4.0; + if (is_tdp) + { + ifu->computeEnergy(is_tdp); + lsu->computeEnergy(is_tdp); + mmu->computeEnergy(is_tdp); + exu->computeEnergy(is_tdp); + + if (coredynp.core_ty==OOO) + { + num_units = 5.0; + rnu->computeEnergy(is_tdp); + set_pppm(pppm_t, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); + if (rnu->exist) + { + rnu->power = rnu->power + corepipe->power*pppm_t; + power = power + rnu->power; + } + } + + if (ifu->exist) + { + set_pppm(pppm_t, coredynp.num_pipelines/num_units*coredynp.IFU_duty_cycle, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); +// cout << "IFU = " << ifu->power.readOp.dynamic*clockRate << " W" << endl; + ifu->power = ifu->power + corepipe->power*pppm_t; +// cout << "IFU = " << ifu->power.readOp.dynamic*clockRate << " W" << endl; +// cout << "1/4 pipe = " << corepipe->power.readOp.dynamic*clockRate/num_units << " W" << endl; + power = power + ifu->power; +// cout << "core = " << power.readOp.dynamic*clockRate << " W" << endl; + } + if (lsu->exist) + { + set_pppm(pppm_t, coredynp.num_pipelines/num_units*coredynp.LSU_duty_cycle, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); + lsu->power = lsu->power + corepipe->power*pppm_t; +// cout << "LSU = " << lsu->power.readOp.dynamic*clockRate << " W" << endl; + power = power + lsu->power; +// cout 
<< "core = " << power.readOp.dynamic*clockRate << " W" << endl; + } + if (exu->exist) + { + set_pppm(pppm_t, coredynp.num_pipelines/num_units*coredynp.ALU_duty_cycle, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); + exu->power = exu->power + corepipe->power*pppm_t; +// cout << "EXE = " << exu->power.readOp.dynamic*clockRate << " W" << endl; + power = power + exu->power; +// cout << "core = " << power.readOp.dynamic*clockRate << " W" << endl; + } + if (mmu->exist) + { + set_pppm(pppm_t, coredynp.num_pipelines/num_units*(0.5+0.5*coredynp.LSU_duty_cycle), coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); + mmu->power = mmu->power + corepipe->power*pppm_t; +// cout << "MMU = " << mmu->power.readOp.dynamic*clockRate << " W" << endl; + power = power + mmu->power; +// cout << "core = " << power.readOp.dynamic*clockRate << " W" << endl; + } + + power = power + undiffCore->power; + + if (XML->sys.Private_L2) + { + + l2cache->computeEnergy(is_tdp); + set_pppm(pppm_t,l2cache->cachep.clockRate/clockRate, 1,1,1); + //l2cache->power = l2cache->power*pppm_t; + power = power + l2cache->power*pppm_t; + } + } + else + { + ifu->computeEnergy(is_tdp); + lsu->computeEnergy(is_tdp); + mmu->computeEnergy(is_tdp); + exu->computeEnergy(is_tdp); + if (coredynp.core_ty==OOO) + { + num_units = 5.0; + rnu->computeEnergy(is_tdp); + set_pppm(pppm_t, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); + if (rnu->exist) + { + rnu->rt_power = rnu->rt_power + corepipe->power*pppm_t; + + rt_power = rt_power + rnu->rt_power; + } + } + else + { + if (XML->sys.homogeneous_cores==1) + { + rtp_pipeline_coe = coredynp.pipeline_duty_cycle * XML->sys.total_cycles * XML->sys.number_of_cores; + } + else + { + rtp_pipeline_coe = coredynp.pipeline_duty_cycle * coredynp.total_cycles; + } + set_pppm(pppm_t, 
coredynp.num_pipelines*rtp_pipeline_coe/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units, coredynp.num_pipelines/num_units); + } + + if (ifu->exist) + { + ifu->rt_power = ifu->rt_power + corepipe->power*pppm_t; + rt_power = rt_power + ifu->rt_power ; + } + if (lsu->exist) + { + lsu->rt_power = lsu->rt_power + corepipe->power*pppm_t; + rt_power = rt_power + lsu->rt_power; + } + if (exu->exist) + { + exu->rt_power = exu->rt_power + corepipe->power*pppm_t; + rt_power = rt_power + exu->rt_power; + } + if (mmu->exist) + { + mmu->rt_power = mmu->rt_power + corepipe->power*pppm_t; + rt_power = rt_power + mmu->rt_power ; + } + + rt_power = rt_power + undiffCore->power; +// cout << "EXE = " << exu->power.readOp.dynamic*clockRate << " W" << endl; + if (XML->sys.Private_L2) + { + + l2cache->computeEnergy(is_tdp); + //set_pppm(pppm_t,1/l2cache->cachep.executionTime, 1,1,1); + //l2cache->rt_power = l2cache->rt_power*pppm_t; + rt_power = rt_power + l2cache->rt_power; + } + } + +} + +void Core::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + if (is_tdp) + { + cout << "Core:" << endl; + cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str << "Subthreshold Leakage = " + << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; + cout<<endl; + if (ifu->exist) + { + cout << indent_str << "Instruction Fetch Unit:" << endl; + cout << indent_str_next << "Area = " << ifu->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << ifu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? ifu->power.readOp.longer_channel_leakage:ifu->power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << ifu->power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << ifu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << ifu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel >2){ + ifu->displayEnergy(indent+4,plevel,is_tdp); + } + } + if (coredynp.core_ty==OOO) + { + if (rnu->exist) + { + cout << indent_str<< "Renaming Unit:" << endl; + cout << indent_str_next << "Area = " << rnu->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << rnu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
rnu->power.readOp.longer_channel_leakage:rnu->power.readOp.leakage) << " W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << rnu->power.readOp.longer_channel_leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << rnu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << rnu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel >2){ + rnu->displayEnergy(indent+4,plevel,is_tdp); + } + } + + } + if (lsu->exist) + { + cout << indent_str<< "Load Store Unit:" << endl; + cout << indent_str_next << "Area = " << lsu->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << lsu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? lsu->power.readOp.longer_channel_leakage:lsu->power.readOp.leakage ) << " W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << lsu->power.readOp.longer_channel_leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << lsu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << lsu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel >2){ + lsu->displayEnergy(indent+4,plevel,is_tdp); + } + } + if (mmu->exist) + { + cout << indent_str<< "Memory Management Unit:" << endl; + cout << indent_str_next << "Area = " << mmu->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << mmu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
mmu->power.readOp.longer_channel_leakage:mmu->power.readOp.leakage) << " W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << mmu->power.readOp.longer_channel_leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << mmu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << mmu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel >2){ + mmu->displayEnergy(indent+4,plevel,is_tdp); + } + } + if (exu->exist) + { + cout << indent_str<< "Execution Unit:" << endl; + cout << indent_str_next << "Area = " << exu->area.get_area() *1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << exu->power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? exu->power.readOp.longer_channel_leakage:exu->power.readOp.leakage) << " W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << exu->power.readOp.longer_channel_leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << exu->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << exu->rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + if (plevel >2){ + exu->displayEnergy(indent+4,plevel,is_tdp); + } + } +// if (plevel >2) +// { +// if (undiffCore->exist) +// { +// cout << indent_str << "Undifferentiated Core" << endl; +// cout << indent_str_next << "Area = " << undiffCore->area.get_area()*1e-6<< " mm^2" << endl; +// cout << indent_str_next << "Peak Dynamic = " << undiffCore->power.readOp.dynamic*clockRate << " W" << endl; +//// cout << indent_str_next << "Subthreshold Leakage = " << undiffCore->power.readOp.leakage <<" W" << endl; +// cout << indent_str_next << "Subthreshold Leakage = " +// << (long_channel? 
undiffCore->power.readOp.longer_channel_leakage:undiffCore->power.readOp.leakage) << " W" << endl; +// cout << indent_str_next << "Gate Leakage = " << undiffCore->power.readOp.gate_leakage << " W" << endl; +// // cout << indent_str_next << "Runtime Dynamic = " << undiffCore->rt_power.readOp.dynamic/executionTime << " W" << endl; +// cout <<endl; +// } +// } + if (XML->sys.Private_L2) + { + + l2cache->displayEnergy(4,is_tdp); + } + + } + else + { +// cout << indent_str_next << "Instruction Fetch Unit Peak Dynamic = " << ifu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Instruction Fetch Unit Subthreshold Leakage = " << ifu->rt_power.readOp.leakage <<" W" << endl; +// cout << indent_str_next << "Instruction Fetch Unit Gate Leakage = " << ifu->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Load Store Unit Peak Dynamic = " << lsu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Load Store Unit Subthreshold Leakage = " << lsu->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Load Store Unit Gate Leakage = " << lsu->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Memory Management Unit Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Memory Management Unit Subthreshold Leakage = " << mmu->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Memory Management Unit Gate Leakage = " << mmu->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Execution Unit Peak Dynamic = " << exu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Execution Unit Subthreshold Leakage = " << exu->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Execution Unit Gate Leakage = " << exu->rt_power.readOp.gate_leakage << " W" << endl; + } +} +InstFetchU ::~InstFetchU(){ + + if (!exist) 
return; + if(IB) {delete IB; IB = 0;} + if(ID_inst) {delete ID_inst; ID_inst = 0;} + if(ID_operand) {delete ID_operand; ID_operand = 0;} + if(ID_misc) {delete ID_misc; ID_misc = 0;} + if (coredynp.predictionW>0) + { + if(BTB) {delete BTB; BTB = 0;} + if(BPT) {delete BPT; BPT = 0;} + } +} + +BranchPredictor ::~BranchPredictor(){ + + if (!exist) return; + if(globalBPT) {delete globalBPT; globalBPT = 0;} + if(localBPT) {delete localBPT; localBPT = 0;} + if(L1_localBPT) {delete L1_localBPT; L1_localBPT = 0;} + if(L2_localBPT) {delete L2_localBPT; L2_localBPT = 0;} + if(chooser) {delete chooser; chooser = 0;} + if(RAS) {delete RAS; RAS = 0;} + } + +RENAMINGU ::~RENAMINGU(){ + + if (!exist) return; + if(iFRAT ) {delete iFRAT; iFRAT = 0;} + if(fFRAT ) {delete fFRAT; fFRAT =0;} + if(iRRAT) {delete iRRAT; iRRAT = 0;} + if(iFRAT) {delete iFRAT; iFRAT = 0;} + if(ifreeL) {delete ifreeL;ifreeL= 0;} + if(ffreeL) {delete ffreeL;ffreeL= 0;} + if(idcl) {delete idcl; idcl = 0;} + if(fdcl) {delete fdcl; fdcl = 0;} + if(RAHT) {delete RAHT; RAHT = 0;} + } + +LoadStoreU ::~LoadStoreU(){ + + if (!exist) return; + if(LSQ) {delete LSQ; LSQ = 0;} + } + +MemManU ::~MemManU(){ + + if (!exist) return; + if(itlb) {delete itlb; itlb = 0;} + if(dtlb) {delete dtlb; dtlb = 0;} + } + +RegFU ::~RegFU(){ + + if (!exist) return; + if(IRF) {delete IRF; IRF = 0;} + if(FRF) {delete FRF; FRF = 0;} + if(RFWIN) {delete RFWIN; RFWIN = 0;} + } + +SchedulerU ::~SchedulerU(){ + + if (!exist) return; + if(int_inst_window) {delete int_inst_window; int_inst_window = 0;} + if(fp_inst_window) {delete int_inst_window; int_inst_window = 0;} + if(ROB) {delete ROB; ROB = 0;} + if(instruction_selection) {delete instruction_selection;instruction_selection = 0;} + } + +EXECU ::~EXECU(){ + + if (!exist) return; + if(int_bypass) {delete int_bypass; int_bypass = 0;} + if(intTagBypass) {delete intTagBypass; intTagBypass =0;} + if(int_mul_bypass) {delete int_mul_bypass; int_mul_bypass = 0;} + if(intTag_mul_Bypass) {delete 
intTag_mul_Bypass; intTag_mul_Bypass =0;} + if(fp_bypass) {delete fp_bypass;fp_bypass = 0;} + if(fpTagBypass) {delete fpTagBypass;fpTagBypass = 0;} + if(fp_u) {delete fp_u;fp_u = 0;} + if(exeu) {delete exeu;exeu = 0;} + if(mul) {delete mul;mul = 0;} + if(rfu) {delete rfu;rfu = 0;} + if(scheu) {delete scheu; scheu = 0;} + } + +Core ::~Core(){ + + if(ifu) {delete ifu; ifu = 0;} + if(lsu) {delete lsu; lsu = 0;} + if(rnu) {delete rnu; rnu = 0;} + if(mmu) {delete mmu; mmu = 0;} + if(exu) {delete exu; exu = 0;} + if(corepipe) {delete corepipe; corepipe = 0;} + if(undiffCore) {delete undiffCore;undiffCore = 0;} + if(l2cache) {delete l2cache;l2cache = 0;} + } + +void Core::set_core_param() +{ + coredynp.opt_local = XML->sys.core[ithCore].opt_local; + coredynp.x86 = XML->sys.core[ithCore].x86; + coredynp.Embedded = XML->sys.Embedded; + coredynp.core_ty = (enum Core_type)XML->sys.core[ithCore].machine_type; + coredynp.rm_ty = (enum Renaming_type)XML->sys.core[ithCore].rename_scheme; + coredynp.fetchW = XML->sys.core[ithCore].fetch_width; + coredynp.decodeW = XML->sys.core[ithCore].decode_width; + coredynp.issueW = XML->sys.core[ithCore].issue_width; + coredynp.peak_issueW = XML->sys.core[ithCore].peak_issue_width; + coredynp.commitW = XML->sys.core[ithCore].commit_width; + coredynp.peak_commitW = XML->sys.core[ithCore].peak_issue_width; + coredynp.predictionW = XML->sys.core[ithCore].prediction_width; + coredynp.fp_issueW = XML->sys.core[ithCore].fp_issue_width; + coredynp.fp_decodeW = XML->sys.core[ithCore].fp_issue_width; + coredynp.num_alus = XML->sys.core[ithCore].ALU_per_core; + coredynp.num_fpus = XML->sys.core[ithCore].FPU_per_core; + coredynp.num_muls = XML->sys.core[ithCore].MUL_per_core; + + + coredynp.num_hthreads = XML->sys.core[ithCore].number_hardware_threads; + coredynp.multithreaded = coredynp.num_hthreads>1? 
true:false; + coredynp.instruction_length = XML->sys.core[ithCore].instruction_length; + coredynp.pc_width = XML->sys.virtual_address_width; + + coredynp.opcode_length = XML->sys.core[ithCore].opcode_width; + coredynp.micro_opcode_length = XML->sys.core[ithCore].micro_opcode_width; + coredynp.num_pipelines = XML->sys.core[ithCore].pipelines_per_core[0]; + coredynp.pipeline_stages = XML->sys.core[ithCore].pipeline_depth[0]; + coredynp.num_fp_pipelines = XML->sys.core[ithCore].pipelines_per_core[1]; + coredynp.fp_pipeline_stages = XML->sys.core[ithCore].pipeline_depth[1]; + coredynp.int_data_width = int(ceil(XML->sys.machine_bits/32.0))*32; + coredynp.fp_data_width = coredynp.int_data_width; + coredynp.v_address_width = XML->sys.virtual_address_width; + coredynp.p_address_width = XML->sys.physical_address_width; + + coredynp.scheu_ty = (enum Scheduler_type)XML->sys.core[ithCore].instruction_window_scheme; + coredynp.arch_ireg_width = int(ceil(log2(XML->sys.core[ithCore].archi_Regs_IRF_size))); + coredynp.arch_freg_width = int(ceil(log2(XML->sys.core[ithCore].archi_Regs_FRF_size))); + coredynp.num_IRF_entry = XML->sys.core[ithCore].archi_Regs_IRF_size; + coredynp.num_FRF_entry = XML->sys.core[ithCore].archi_Regs_FRF_size; + coredynp.pipeline_duty_cycle = XML->sys.core[ithCore].pipeline_duty_cycle; + coredynp.total_cycles = XML->sys.core[ithCore].total_cycles; + coredynp.busy_cycles = XML->sys.core[ithCore].busy_cycles; + coredynp.idle_cycles = XML->sys.core[ithCore].idle_cycles; + + //Max power duty cycle for peak power estimation +// if (coredynp.core_ty==OOO) +// { +// coredynp.IFU_duty_cycle = 1; +// coredynp.LSU_duty_cycle = 1; +// coredynp.MemManU_I_duty_cycle =1; +// coredynp.MemManU_D_duty_cycle =1; +// coredynp.ALU_duty_cycle =1; +// coredynp.MUL_duty_cycle =1; +// coredynp.FPU_duty_cycle =1; +// coredynp.ALU_cdb_duty_cycle =1; +// coredynp.MUL_cdb_duty_cycle =1; +// coredynp.FPU_cdb_duty_cycle =1; +// } +// else +// { + coredynp.IFU_duty_cycle = 
XML->sys.core[ithCore].IFU_duty_cycle; + coredynp.BR_duty_cycle = XML->sys.core[ithCore].BR_duty_cycle; + coredynp.LSU_duty_cycle = XML->sys.core[ithCore].LSU_duty_cycle; + coredynp.MemManU_I_duty_cycle = XML->sys.core[ithCore].MemManU_I_duty_cycle; + coredynp.MemManU_D_duty_cycle = XML->sys.core[ithCore].MemManU_D_duty_cycle; + coredynp.ALU_duty_cycle = XML->sys.core[ithCore].ALU_duty_cycle; + coredynp.MUL_duty_cycle = XML->sys.core[ithCore].MUL_duty_cycle; + coredynp.FPU_duty_cycle = XML->sys.core[ithCore].FPU_duty_cycle; + coredynp.ALU_cdb_duty_cycle = XML->sys.core[ithCore].ALU_cdb_duty_cycle; + coredynp.MUL_cdb_duty_cycle = XML->sys.core[ithCore].MUL_cdb_duty_cycle; + coredynp.FPU_cdb_duty_cycle = XML->sys.core[ithCore].FPU_cdb_duty_cycle; +// } + + + if (!((coredynp.core_ty==OOO)||(coredynp.core_ty==Inorder))) + { + cout<<"Invalid Core Type"<<endl; + exit(0); + } +// if (coredynp.core_ty==OOO) +// { +// cout<<"OOO processor models are being updated and will be available in next release"<<endl; +// exit(0); +// } + if (!((coredynp.scheu_ty==PhysicalRegFile)||(coredynp.scheu_ty==ReservationStation))) + { + cout<<"Invalid OOO Scheduler Type"<<endl; + exit(0); + } + + if (!((coredynp.rm_ty ==RAMbased)||(coredynp.rm_ty ==CAMbased))) + { + cout<<"Invalid OOO Renaming Type"<<endl; + exit(0); + } + +if (coredynp.core_ty==OOO) +{ + if (coredynp.scheu_ty==PhysicalRegFile) + { + coredynp.phy_ireg_width = int(ceil(log2(XML->sys.core[ithCore].phy_Regs_IRF_size))); + coredynp.phy_freg_width = int(ceil(log2(XML->sys.core[ithCore].phy_Regs_FRF_size))); + coredynp.num_ifreelist_entries = coredynp.num_IRF_entry = XML->sys.core[ithCore].phy_Regs_IRF_size; + coredynp.num_ffreelist_entries = coredynp.num_FRF_entry = XML->sys.core[ithCore].phy_Regs_FRF_size; + } + else if (coredynp.scheu_ty==ReservationStation) + {//ROB serves as Phy RF in RS based OOO + coredynp.phy_ireg_width = int(ceil(log2(XML->sys.core[ithCore].ROB_size))); + coredynp.phy_freg_width = 
int(ceil(log2(XML->sys.core[ithCore].ROB_size))); + coredynp.num_ifreelist_entries = XML->sys.core[ithCore].ROB_size; + coredynp.num_ffreelist_entries = XML->sys.core[ithCore].ROB_size; + + } + +} + coredynp.globalCheckpoint = 32;//best check pointing entries for a 4~8 issue OOO should be 16~48;See TR for reference. + coredynp.perThreadState = 8; + coredynp.instruction_length = 32; + coredynp.clockRate = XML->sys.core[ithCore].clock_rate; + coredynp.clockRate *= 1e6; + coredynp.regWindowing= (XML->sys.core[ithCore].register_windows_size>0&&coredynp.core_ty==Inorder)?true:false; + coredynp.executionTime = XML->sys.total_cycles/coredynp.clockRate; + set_pppm(coredynp.pppm_lkg_multhread, 0, coredynp.num_hthreads, coredynp.num_hthreads, 0); +} diff --git a/ext/mcpat/core.h b/ext/mcpat/core.h new file mode 100644 index 000000000..8ef3babdd --- /dev/null +++ b/ext/mcpat/core.h @@ -0,0 +1,262 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef CORE_H_
+#define CORE_H_
+
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "interconnect.h"
+#include "logic.h"
+#include "parameter.h"
+#include "sharedcache.h"
+
+// Every class in this header derives from Component and exposes the same
+// public shape: a constructor taking the parsed XML configuration, the index
+// of the core being modelled and an InputParameter to copy from, plus
+// computeEnergy(is_tdp) and displayEnergy(indent, plevel, is_tdp), all with
+// is_tdp defaulting to true.
+// NOTE(review): the definitions live in the matching .cc file, which is not
+// visible here; what is_tdp selects and who creates/frees the pointed-to
+// sub-models should be confirmed there.
+
+// Branch-predictor unit of one core; the ArrayST pointers are its tables.
+class BranchPredictor :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    CoreDynParam coredynp;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    ArrayST * globalBPT;
+    ArrayST * localBPT;
+    ArrayST * L1_localBPT;
+    ArrayST * L2_localBPT;
+    ArrayST * chooser;
+    ArrayST * RAS;
+    bool exist;
+
+    // NOTE(review): the last parameter is spelled "exsit" here but "exist_"
+    // in the sibling classes below; harmless in a declaration.
+    BranchPredictor(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exsit=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~BranchPredictor();
+};
+
+
+// Instruction-fetch unit of one core: holds an InstCache by value and
+// pointers to the IB/BTB arrays, a BranchPredictor and three inst_decoder
+// instances.
+class InstFetchU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    CoreDynParam coredynp;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    enum Cache_policy cache_p;
+    InstCache icache;
+    ArrayST * IB;
+    ArrayST * BTB;
+    BranchPredictor * BPT;
+    inst_decoder * ID_inst;
+    inst_decoder * ID_operand;
+    inst_decoder * ID_misc;
+    bool exist;
+
+    // NOTE(review): "exsit" spelling, same as in BranchPredictor above.
+    InstFetchU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exsit=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~InstFetchU();
+};
+
+
+// Scheduler unit of one core: integer and floating-point instruction window
+// arrays, a ROB array and a selection_logic instance. The *_height fields
+// are presumably layout heights consumed by other units -- TODO confirm.
+class SchedulerU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    CoreDynParam coredynp;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    double Iw_height, fp_Iw_height,ROB_height;
+    ArrayST * int_inst_window;
+    ArrayST * fp_inst_window;
+    ArrayST * ROB;
+    selection_logic * instruction_selection;
+    bool exist;
+
+    SchedulerU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~SchedulerU();
+};
+
+// Register-renaming unit of one core: i*/f* members come in integer and
+// floating-point pairs (FRAT, RRAT, free list, dependency-check logic).
+class RENAMINGU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    double clockRate,executionTime;
+    CoreDynParam coredynp;
+    ArrayST * iFRAT;
+    ArrayST * fFRAT;
+    ArrayST * iRRAT;
+    ArrayST * fRRAT;
+    ArrayST * ifreeL;
+    ArrayST * ffreeL;
+    dep_resource_conflict_check * idcl;
+    dep_resource_conflict_check * fdcl;
+    ArrayST * RAHT;//register alias history table Used to store GC
+    bool exist;
+
+
+    RENAMINGU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~RENAMINGU();
+};
+
+// Load/store unit of one core: holds a DataCache by value and the queue
+// arrays.
+class LoadStoreU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    CoreDynParam coredynp;
+    enum Cache_policy cache_p;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    double lsq_height;
+    DataCache dcache;
+    ArrayST * LSQ;//it is actually the store queue but for inorder processors it serves as both loadQ and StoreQ
+    ArrayST * LoadQ;
+    bool exist;
+
+    LoadStoreU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~LoadStoreU();
+};
+
+// Memory-management unit of one core: instruction and data TLB arrays.
+class MemManU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    CoreDynParam coredynp;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    ArrayST * itlb;
+    ArrayST * dtlb;
+    bool exist;
+
+    MemManU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~MemManU();
+};
+
+// Register-file unit of one core: integer (IRF) and floating-point (FRF)
+// register file arrays plus RFWIN (presumably register windows -- TODO
+// confirm against the .cc file).
+class RegFU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    CoreDynParam coredynp;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    double int_regfile_height, fp_regfile_height;
+    ArrayST * IRF;
+    ArrayST * FRF;
+    ArrayST * RFWIN;
+    bool exist;
+
+    RegFU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~RegFU();
+};
+
+// Execution unit of one core: points to the register-file and scheduler
+// units, three FunctionalUnit models and the bypass interconnects; `bypass`
+// is a plain Component held by value. Unlike its siblings, the constructor
+// also takes lsq_height_.
+class EXECU :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    double lsq_height;
+    CoreDynParam coredynp;
+    RegFU * rfu;
+    SchedulerU * scheu;
+    FunctionalUnit * fp_u;
+    FunctionalUnit * exeu;
+    FunctionalUnit * mul;
+    interconnect * int_bypass;
+    interconnect * intTagBypass;
+    interconnect * int_mul_bypass;
+    interconnect * intTag_mul_Bypass;
+    interconnect * fp_bypass;
+    interconnect * fpTagBypass;
+
+    Component bypass;
+    bool exist;
+
+    EXECU(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_, double lsq_height_,const CoreDynParam & dyn_p_, bool exist_=true);
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~EXECU();
+};
+
+
+// Top-level model of one core: points to the per-unit models declared above
+// plus Pipeline, UndiffCore and SharedCache (l2cache) objects. It keeps its
+// own CoreDynParam, and its constructor takes no CoreDynParam argument;
+// set_core_param() presumably fills coredynp from the XML -- TODO confirm.
+class Core :public Component {
+  public:
+
+    ParseXML *XML;
+    int ithCore;
+    InputParameter interface_ip;
+    double clockRate,executionTime;
+    double scktRatio, chip_PR_overhead, macro_PR_overhead;
+    InstFetchU * ifu;
+    LoadStoreU * lsu;
+    MemManU * mmu;
+    EXECU * exu;
+    RENAMINGU * rnu;
+    Pipeline * corepipe;
+    UndiffCore * undiffCore;
+    SharedCache * l2cache;
+    CoreDynParam coredynp;
+    //full_decoder inst_decoder;
+    //clock_network clockNetwork;
+    Core(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_);
+    void set_core_param();
+    void computeEnergy(bool is_tdp=true);
+    void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true);
+    ~Core();
+};
+
+#endif /* CORE_H_ */
diff --git a/ext/mcpat/globalvar.h b/ext/mcpat/globalvar.h
new file mode 100644
index 000000000..953257653
--- /dev/null
+++ b/ext/mcpat/globalvar.h
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * McPAT
+ * SOFTWARE LICENSE AGREEMENT
+ * Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+/*
+ * Single home for process-wide flags that are shared between translation
+ * units.
+ */
+#ifndef GLOBALVAR_H_
+#define GLOBALVAR_H_
+/*
+ * EXTERN expands to nothing when GLOBALVAR is defined before this header is
+ * included, so the declarations below become definitions in that one
+ * translation unit. Everywhere else it expands to `extern`, leaving plain
+ * declarations.
+ */
+#ifdef GLOBALVAR
+#define EXTERN
+#else
+#define EXTERN extern
+#endif
+/*
+ * Read by the interconnect constructor: together with the per-wire
+ * opt_local flag it enables widening a wire until it meets its timing
+ * target.
+ */
+EXTERN bool opt_for_clk;
+
+#endif /* GLOBALVAR_H_ */
+
+
+
+
diff --git a/ext/mcpat/interconnect.cc b/ext/mcpat/interconnect.cc
new file mode 100644
index 000000000..ba502b6a8
--- /dev/null
+++ b/ext/mcpat/interconnect.cc
@@ -0,0 +1,222 @@
+/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#include <cassert> +#include <iostream> + +#include "globalvar.h" +#include "interconnect.h" +#include "wire.h" + +interconnect::interconnect( + string name_, + enum Device_ty device_ty_, + double base_w, double base_h, + int data_w, double len,const InputParameter *configure_interface, + int start_wiring_level_, + bool pipelinable_ , + double route_over_perc_ , + bool opt_local_, + enum Core_type core_ty_, + enum Wire_type wire_model, + double width_s, double space_s, + TechnologyParameter::DeviceType *dt +) + :name(name_), + device_ty(device_ty_), + in_rise_time(0), + out_rise_time(0), + base_width(base_w), + base_height(base_h), + data_width(data_w), + wt(wire_model), + width_scaling(width_s), + space_scaling(space_s), + start_wiring_level(start_wiring_level_), + length(len), + //interconnect_latency(1e-12), + //interconnect_throughput(1e-12), + opt_local(opt_local_), + core_ty(core_ty_), + pipelinable(pipelinable_), + route_over_perc(route_over_perc_), + deviceType(dt) +{ + + wt = Global; + l_ip=*configure_interface; + local_result = init_interface(&l_ip); + + + max_unpipelined_link_delay = 0; //TODO + min_w_nmos = g_tp.min_w_nmos_; + min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos; + + + + latency = l_ip.latency; + throughput = l_ip.throughput; + latency_overflow=false; + throughput_overflow=false; + + /* + * TODO: Add wiring 
option from semi-global to global automatically + * And directly jump to global if semi-global cannot satisfy timing + * Fat wires only available for global wires, thus + * if signal wiring layer starts from semi-global, + * the next layer up will be global, i.e., semi-global does + * not have fat wires. + */ + if (pipelinable == false) + //Non-pipelinable wires, such as bypass logic, care latency + { + compute(); + if (opt_for_clk && opt_local) + { + while (delay > latency && width_scaling<3.0) + { + width_scaling *= 2; + space_scaling *= 2; + Wire winit(width_scaling, space_scaling); + compute(); + } + if (delay > latency) + { + latency_overflow=true; + } + } + } + else //Pipelinable wires, such as bus, does not care latency but throughput + { + /* + * TODO: Add pipe regs power, area, and timing; + * Pipelinable wires optimize latency first. + */ + compute(); + if (opt_for_clk && opt_local) + { + while (delay > throughput && width_scaling<3.0) + { + width_scaling *= 2; + space_scaling *= 2; + Wire winit(width_scaling, space_scaling); + compute(); + } + if (delay > throughput) + // insert pipeline stages + { + num_pipe_stages = (int)ceil(delay/throughput); + assert(num_pipe_stages>0); + delay = delay/num_pipe_stages + num_pipe_stages*0.05*delay; + } + } + } + + power_bit = power; + power.readOp.dynamic *= data_width; + power.readOp.leakage *= data_width; + power.readOp.gate_leakage *= data_width; + area.set_area(area.get_area()*data_width); + no_device_under_wire_area.h *= data_width; + + if (latency_overflow==true) + cout<< "Warning: "<< name <<" wire structure cannot satisfy latency constraint." 
<< endl; + + + assert(power.readOp.dynamic > 0); + assert(power.readOp.leakage > 0); + assert(power.readOp.gate_leakage > 0); + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + power.readOp.longer_channel_leakage = + power.readOp.leakage*long_channel_device_reduction; + + if (pipelinable)//Only global wires has the option to choose whether routing over or not + area.set_area(area.get_area()*route_over_perc + no_device_under_wire_area.get_area()*(1-route_over_perc)); + + Wire wreset(); +} + + + +void +interconnect::compute() +{ + + Wire *wtemp1 = 0; + wtemp1 = new Wire(wt, length, 1, width_scaling, space_scaling); + delay = wtemp1->delay; + power.readOp.dynamic = wtemp1->power.readOp.dynamic; + power.readOp.leakage = wtemp1->power.readOp.leakage; + power.readOp.gate_leakage = wtemp1->power.readOp.gate_leakage; + + area.set_area(wtemp1->area.get_area()); + no_device_under_wire_area.h = (wtemp1->wire_width + wtemp1->wire_spacing); + no_device_under_wire_area.w = length; + + if (wtemp1) + delete wtemp1; + +} + +void interconnect::leakage_feedback(double temperature) +{ + l_ip.temp = (unsigned int)round(temperature/10.0)*10; + uca_org_t init_result = init_interface(&l_ip); // init_result is dummy + + compute(); + + power_bit = power; + power.readOp.dynamic *= data_width; + power.readOp.leakage *= data_width; + power.readOp.gate_leakage *= data_width; + + assert(power.readOp.dynamic > 0); + assert(power.readOp.leakage > 0); + assert(power.readOp.gate_leakage > 0); + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + power.readOp.longer_channel_leakage = 
power.readOp.leakage*long_channel_device_reduction; +} + diff --git a/ext/mcpat/interconnect.h b/ext/mcpat/interconnect.h new file mode 100644 index 000000000..4cf42dafd --- /dev/null +++ b/ext/mcpat/interconnect.h @@ -0,0 +1,111 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __INTERCONNECT_H__
+#define __INTERCONNECT_H__
+
+// NOTE(review): <cassert> would be the conventional spelling of this include.
+#include "assert.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "parameter.h"
+#include "subarray.h"
+#include "wire.h"
+
+// leakage power includes entire htree in a bank (when uca_tree == false)
+// leakage power includes only part to one bank when uca_tree == true
+
+// Model of a bundle of data_width parallel wires of a given length.
+// The constructor evaluates the wire once and, when both opt_local and the
+// global opt_for_clk are set, widens it to meet the latency target
+// (non-pipelinable) or the throughput target (pipelinable); results land in
+// the inherited Component fields and in the public members below.
+class interconnect : public Component
+{
+  public:
+    // See interconnect.cc for the meaning of each argument. Note that the
+    // constructor body sets wt to Global, so wire_model has no lasting
+    // effect.
+    interconnect(
+        string name_,
+        enum Device_ty device_ty_,
+        double base_w, double base_h, int data_w, double len,
+        const InputParameter *configure_interface, int start_wiring_level_,
+        bool pipelinable_ = false,
+        double route_over_perc_ =0.5,
+        bool opt_local_=true,
+        enum Core_type core_ty_=Inorder,
+        enum Wire_type wire_model=Global,
+        double width_s=1.0, double space_s=1.0,
+        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
+    );
+
+    ~interconnect() {};
+
+    // Evaluates a single wire with the current wt/length/scaling and stores
+    // its delay, per-bit read power, area and footprint.
+    void compute();
+    string name;                      // printed in the latency warning
+    enum Device_ty device_ty;
+    double in_rise_time, out_rise_time;
+    InputParameter l_ip;              // private copy of the caller's parameters
+    uca_org_t local_result;           // result of init_interface(&l_ip)
+    Area no_device_under_wire_area;   // w = length, h = (width + spacing) * bits
+    void set_in_rise_time(double rt)
+    {
+        in_rise_time = rt;
+    }
+
+    // Re-runs the power computation for a new temperature, which is rounded
+    // to the nearest multiple of 10 first.
+    void leakage_feedback(double temperature);
+    double max_unpipelined_link_delay;
+    powerDef power_bit;               // per-bit power before scaling by data_width
+
+    // NOTE(review): wire_bw, init_wire_bw, interconnect_latency and
+    // interconnect_throughput are not written anywhere in interconnect.cc.
+    double wire_bw;
+    double init_wire_bw; // bus width at root
+    double base_width;
+    double base_height;
+    int data_width;
+    enum Wire_type wt;
+    double width_scaling, space_scaling;
+    int start_wiring_level;
+    double length;
+    double min_w_nmos;
+    double min_w_pmos;
+    double latency, throughput;       // timing targets copied from l_ip
+    bool latency_overflow;            // set when widening could not meet latency
+    bool throughput_overflow;         // only ever set to false in interconnect.cc
+    double interconnect_latency;
+    double interconnect_throughput;
+    bool opt_local;
+    enum Core_type core_ty;
+    bool pipelinable;
+    double route_over_perc;
+    int num_pipe_stages;              // stage count chosen by the constructor for
+                                      // pipelinable wires that miss the throughput target
+
+  private:
+    TechnologyParameter::DeviceType *deviceType;
+
+};
+
+#endif
+
diff --git a/ext/mcpat/iocontrollers.cc b/ext/mcpat/iocontrollers.cc
new file mode 100644
index 000000000..70b0f2dcb
--- /dev/null
+++ b/ext/mcpat/iocontrollers.cc
@@ -0,0 +1,446 @@
+/*****************************************************************************
+ * McPAT
+ * SOFTWARE LICENSE AGREEMENT
+ * Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+#include "XML_Parse.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "const.h"
+#include "io.h"
+#include "iocontrollers.h"
+#include "logic.h"
+#include "parameter.h"
+
+/*
+SUN Niagara 2 I/O power analysis:
+total signal bits: 711
+Total FBDIMM bits: (14+10)*2*8= 384
+PCIe bits: (8 + 8)*2 = 32
+10Gb NIC: (4*2+4*2)*2 = 32
+Debug I/Os: 168
+Other I/Os: 711- 32-32 - 384 - 168 = 95
+
+According to "Implementation of an 8-Core, 64-Thread, Power-Efficient SPARC Server on a Chip"
+90% of I/Os are SerDers (the calculation is (384+64)/(711-168)=83%, about the same as the 90% reported in the paper)
+--> around 80 pins are common I/Os.
+Common I/Os consume 71mW/Gb/s according to Cadence ChipEstimate @65nm
+Niagara 2 I/O clock is 1/4 of core clock. --> 87pin (<--((711-168)*17%)) * 71mW/Gb/s *0.25*1.4Ghz = 2.17W
+
+Total dynamic power of FBDIMM, NIC, PCIe = 84*0.132 + 84*0.049*0.132 = 11.14 - 2.17 = 8.98
+Further, if assuming I/O logic power is about 50% of I/Os then Total energy of FBDIMM, NIC, PCIe = 11.14 - 2.17*1.5 = 7.89
+ */
+
+/*
+ * A bug in Cadence ChipEstimator: After updating the clock rate in the clock tab, a user
+ * needs to re-select the IP clock (the same clk) and then click Estimate.
if not reselect + * the new clock rate may not be propogate into the IPs. + * + */ + +NIUController::NIUController(ParseXML *XML_interface,InputParameter* interface_ip_) +:XML(XML_interface), + interface_ip(*interface_ip_) + { + local_result = init_interface(&interface_ip); + + double frontend_area, phy_area, mac_area, SerDer_area; + double frontend_dyn, mac_dyn, SerDer_dyn; + double frontend_gates, mac_gates, SerDer_gates; + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + double NMOS_sizing, PMOS_sizing; + + set_niu_param(); + + if (niup.type == 0) //high performance NIU + { + //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate using 65nm. + mac_area = (1.53 + 0.3)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Area estimation based on average of die photo from Niagara 2, ISSCC "An 800mW 10Gb Ethernet Transceiver in 0.13μm CMOS" + //and"A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique" Frontend is PCS + frontend_area = (9.8 + (6 + 18)*65/130*65/130)/3 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm. + //SerDer is very hard to scale + SerDer_area = (1.39 + 0.36) * (interface_ip.F_sz_um/0.065);//* (interface_ip.F_sz_um/0.065); + phy_area = frontend_area + SerDer_area; + //total area + area.set_area((mac_area + frontend_area + SerDer_area)*1e6); + //Power + //Cadence ChipEstimate using 65nm (mac, front_end are all energy. 
E=P*T = P/F = 1.37/1Ghz = 1.37e-9); + mac_dyn = 2.19e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm + //Cadence ChipEstimate using 65nm soft IP; + frontend_dyn = 0.27e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; + //according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..." ISSCC 2006 + //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm + SerDer_dyn = 0.01*10*sqrt(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2; + SerDer_dyn /= niup.clockRate;//covert to energy per clock cycle of whole NIU + + //Cadence ChipEstimate using 65nm + mac_gates = 111700; + frontend_gates = 320000; + SerDer_gates = 200000; + NMOS_sizing = 5*g_tp.min_w_nmos_; + PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + + + } + else + {//Low power implementations are mostly from Cadence ChipEstimator; Ignore the multiple IP effect + // ---When there are multiple IP (same kind or not) selected, Cadence ChipEstimator results are not + // a simple summation of all IPs. Ignore this effect + mac_area = 0.24 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + frontend_area = 0.1 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065);//Frontend is the PCS layer + SerDer_area = 0.35 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Compare 130um implementation in "A 1.2-V-Only 900-mW 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning Technique" + //and the ChipEstimator XAUI PHY hard IP, confirm that even PHY can scale perfectly with the technology + //total area + area.set_area((mac_area + frontend_area + SerDer_area)*1e6); + //Power + //Cadence ChipEstimate using 65nm (mac, front_end are all energy. 
E=P*T = P/F = 1.37/1Ghz = 1.37e-9); + mac_dyn = 1.257e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; //2.19W@1GHz fully active according to Cadence ChipEstimate @65nm + //Cadence ChipEstimate using 65nm soft IP; + frontend_dyn = 0.6e-9*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);//niup.clockRate; + //SerDer_dyn is power not energy, scaling from 216mw/10Gb/s @130nm + SerDer_dyn = 0.0216*10*(interface_ip.F_sz_um/0.13)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2; + SerDer_dyn /= niup.clockRate;//covert to energy per clock cycle of whole NIU + + mac_gates = 111700; + frontend_gates = 52000; + SerDer_gates = 199260; + + NMOS_sizing = g_tp.min_w_nmos_; + PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + + } + + power_t.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn; + power_t.readOp.leakage = (mac_gates + frontend_gates + frontend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device); + power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction; + power_t.readOp.gate_leakage = (mac_gates + frontend_gates + frontend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + } + +void NIUController::computeEnergy(bool is_tdp) +{ + if (is_tdp) + { + + + power = power_t; + power.readOp.dynamic *= niup.duty_cycle; + + } + else + { + rt_power = power_t; + rt_power.readOp.dynamic *= niup.perc_load; + } +} + +void NIUController::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { + cout << "NIU:" << endl; + cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Dynamic = " << 
power.readOp.dynamic*niup.clockRate << " W" << endl; + cout << indent_str<< "Subthreshold Leakage = " + << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*niup.clockRate << " W" << endl; + cout<<endl; + } + else + { + + } + +} + +void NIUController::set_niu_param() +{ + niup.clockRate = XML->sys.niu.clockrate; + niup.clockRate *= 1e6; + niup.num_units = XML->sys.niu.number_units; + niup.duty_cycle = XML->sys.niu.duty_cycle; + niup.perc_load = XML->sys.niu.total_load_perc; + niup.type = XML->sys.niu.type; +// niup.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); +} + +PCIeController::PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_) +:XML(XML_interface), + interface_ip(*interface_ip_) + { + local_result = init_interface(&interface_ip); + double frontend_area, phy_area, ctrl_area, SerDer_area; + double ctrl_dyn, frontend_dyn, SerDer_dyn; + double ctrl_gates,frontend_gates, SerDer_gates; + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + double NMOS_sizing, PMOS_sizing; + + /* Assuming PCIe is bit-slice based architecture + * This is the reason for /8 in both area and power calculation + * to get per lane numbers + */ + + set_pcie_param(); + if (pciep.type == 0) //high performance NIU + { + //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate @ 65nm. + ctrl_area = (5.2 + 0.5)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm. 
+ frontend_area = (5.2 + 0.1)/2 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Area estimation based on average of die photo from Niagara 2 and Cadence ChipEstimate hard IP @65nm. + //SerDer is very hard to scale + SerDer_area = (3.03 + 0.36) * (interface_ip.F_sz_um/0.065);//* (interface_ip.F_sz_um/0.065); + phy_area = frontend_area + SerDer_area; + //total area + //Power + //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer + ctrl_dyn = 3.75e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0); + // //Cadence ChipEstimate using 65nm soft IP; + // frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0); + //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm + SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s + SerDer_dyn /= pciep.clockRate;//covert to energy per clock cycle + + //power_t.readOp.dynamic = (ctrl_dyn)*pciep.num_channels; + //Cadence ChipEstimate using 65nm + ctrl_gates = 900000/8*pciep.num_channels; + // frontend_gates = 120000/8; + // SerDer_gates = 200000/8; + NMOS_sizing = 5*g_tp.min_w_nmos_; + PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + } + else + { + ctrl_area = 0.412 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Area estimation based on average of die photo from Niagara 2, and Cadence ChipEstimate @ 65nm. 
+ SerDer_area = 0.36 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //total area + //Power + //Cadence ChipEstimate using 65nm the controller includes everything: the PHY, the data link and transaction layer + ctrl_dyn = 2.21e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0); + // //Cadence ChipEstimate using 65nm soft IP; + // frontend_dyn = 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0); + //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm + SerDer_dyn = 0.01*4*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;//PCIe 2.0 max per lane speed is 4Gb/s + SerDer_dyn /= pciep.clockRate;//covert to energy per clock cycle + + //Cadence ChipEstimate using 65nm + ctrl_gates = 200000/8*pciep.num_channels; + // frontend_gates = 120000/8; + SerDer_gates = 200000/8*pciep.num_channels; + NMOS_sizing = g_tp.min_w_nmos_; + PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + + } + area.set_area(((ctrl_area + (pciep.withPHY? SerDer_area:0))/8*pciep.num_channels)*1e6); + power_t.readOp.dynamic = (ctrl_dyn + (pciep.withPHY? SerDer_dyn:0))*pciep.num_channels; + power_t.readOp.leakage = (ctrl_gates + (pciep.withPHY? SerDer_gates:0))*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device); + power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction; + power_t.readOp.gate_leakage = (ctrl_gates + (pciep.withPHY? 
SerDer_gates:0))*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + } + +void PCIeController::computeEnergy(bool is_tdp) +{ + if (is_tdp) + { + + + power = power_t; + power.readOp.dynamic *= pciep.duty_cycle; + + } + else + { + rt_power = power_t; + rt_power.readOp.dynamic *= pciep.perc_load; + } +} + +void PCIeController::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { + cout << "PCIe:" << endl; + cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*pciep.clockRate << " W" << endl; + cout << indent_str<< "Subthreshold Leakage = " + << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic*pciep.clockRate << " W" << endl; + cout<<endl; + } + else + { + + } + +} + +void PCIeController::set_pcie_param() +{ + pciep.clockRate = XML->sys.pcie.clockrate; + pciep.clockRate *= 1e6; + pciep.num_units = XML->sys.pcie.number_units; + pciep.num_channels = XML->sys.pcie.num_channels; + pciep.duty_cycle = XML->sys.pcie.duty_cycle; + pciep.perc_load = XML->sys.pcie.total_load_perc; + pciep.type = XML->sys.pcie.type; + pciep.withPHY = XML->sys.pcie.withPHY; +// pciep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + +} + +FlashController::FlashController(ParseXML *XML_interface,InputParameter* interface_ip_) +:XML(XML_interface), + interface_ip(*interface_ip_) + { + local_result = init_interface(&interface_ip); + double frontend_area, phy_area, ctrl_area, SerDer_area; + double ctrl_dyn, 
frontend_dyn, SerDer_dyn; + double ctrl_gates,frontend_gates, SerDer_gates; + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + double NMOS_sizing, PMOS_sizing; + + /* Assuming PCIe is bit-slice based architecture + * This is the reason for /8 in both area and power calculation + * to get per lane numbers + */ + + set_fc_param(); + if (fcp.type == 0) //high performance NIU + { + cout<<"Current McPAT does not support high performance flash contorller since even low power designs are enough for maintain throughput"<<endl; + exit(0); + NMOS_sizing = 5*g_tp.min_w_nmos_; + PMOS_sizing = 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + } + else + { + ctrl_area = 0.243 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //Area estimation based on Cadence ChipEstimate @ 65nm: NANDFLASH-CTRL from CAST + SerDer_area = 0.36/8 * (interface_ip.F_sz_um/0.065)* (interface_ip.F_sz_um/0.065); + //based On PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm, it support 8x lanes with each lane + //speed up to 250MB/s (PCIe1.1x) This is already saturate the 200MB/s of the flash controller core above. + ctrl_gates = 129267; + SerDer_gates = 200000/8; + NMOS_sizing = g_tp.min_w_nmos_; + PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + + //Power + //Cadence ChipEstimate using 65nm the controller 125mW for every 200MB/s This is power not energy! + ctrl_dyn = 0.125*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0); + //SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm + SerDer_dyn = 0.01*1.6*(interface_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2; + //max Per controller speed is 1.6Gb/s (200MB/s) + } + double number_channel = 1+(fcp.num_channels-1)*0.2; + area.set_area((ctrl_area + (fcp.withPHY? SerDer_area:0))*1e6*number_channel); + power_t.readOp.dynamic = (ctrl_dyn + (fcp.withPHY? SerDer_dyn:0))*number_channel; + power_t.readOp.leakage = ((ctrl_gates + (fcp.withPHY? 
SerDer_gates:0))*number_channel)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device); + power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction; + power_t.readOp.gate_leakage = ((ctrl_gates + (fcp.withPHY? SerDer_gates:0))*number_channel)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + } + +void FlashController::computeEnergy(bool is_tdp) +{ + if (is_tdp) + { + + + power = power_t; + power.readOp.dynamic *= fcp.duty_cycle; + + } + else + { + rt_power = power_t; + rt_power.readOp.dynamic *= fcp.perc_load; + } +} + +void FlashController::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { + cout << "Flash Controller:" << endl; + cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;//no multiply of clock since this is power already + cout << indent_str<< "Subthreshold Leakage = " + << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl; + cout<<endl; + } + else + { + + } + +} + +void FlashController::set_fc_param() +{ +// fcp.clockRate = XML->sys.flashc.mc_clock; +// fcp.clockRate *= 1e6; + fcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate; + fcp.num_channels = ceil(fcp.peakDataTransferRate/200); + fcp.num_mcs = XML->sys.flashc.number_mcs; + fcp.duty_cycle = XML->sys.flashc.duty_cycle; + fcp.perc_load = XML->sys.flashc.total_load_perc; + fcp.type = XML->sys.flashc.type; + fcp.withPHY = XML->sys.flashc.withPHY; +// flashcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + +} diff --git a/ext/mcpat/iocontrollers.h b/ext/mcpat/iocontrollers.h new file mode 100644 index 000000000..818580abb --- /dev/null +++ b/ext/mcpat/iocontrollers.h @@ -0,0 +1,87 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ +#ifndef IOCONTROLLERS_H_ +#define IOCONTROLLERS_H_ + + +#endif /* IOCONTROLLERS_H_ */ + +#include "XML_Parse.h" +#include "parameter.h" +//#include "io.h" +#include "array.h" +//#include "Undifferentiated_Core_Area.h" +#include <vector> + +#include "basic_components.h" + +class NIUController : public Component { + public: + ParseXML *XML; + InputParameter interface_ip; + NIUParam 
niup; + powerDef power_t; + uca_org_t local_result; + NIUController(ParseXML *XML_interface,InputParameter* interface_ip_); + void set_niu_param(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~NIUController(){}; +}; + +class PCIeController : public Component { + public: + ParseXML *XML; + InputParameter interface_ip; + PCIeParam pciep; + powerDef power_t; + uca_org_t local_result; + PCIeController(ParseXML *XML_interface,InputParameter* interface_ip_); + void set_pcie_param(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~PCIeController(){}; +}; + +class FlashController : public Component { + public: + ParseXML *XML; + InputParameter interface_ip; + MCParam fcp; + powerDef power_t; + uca_org_t local_result; + FlashController(ParseXML *XML_interface,InputParameter* interface_ip_); + void set_fc_param(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~FlashController(){}; +}; + diff --git a/ext/mcpat/logic.cc b/ext/mcpat/logic.cc new file mode 100644 index 000000000..11519d863 --- /dev/null +++ b/ext/mcpat/logic.cc @@ -0,0 +1,1014 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include "logic.h" + + +//selection_logic +selection_logic::selection_logic( + bool _is_default, + int win_entries_, + int issue_width_, + const InputParameter *configure_interface, + enum Device_ty device_ty_, + enum Core_type core_ty_) + //const ParseXML *_XML_interface) + :is_default(_is_default), + win_entries(win_entries_), + issue_width(issue_width_), + 
device_ty(device_ty_), + core_ty(core_ty_) + { + //uca_org_t result2; + l_ip=*configure_interface; + local_result = init_interface(&l_ip); + //init_tech_params(l_ip.F_sz_um, false); + //win_entries=numIBEntries;//IQentries; + //issue_width=issueWidth; + selection_power(); + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + } + +void selection_logic::selection_power() +{//based on cost effective superscalar processor TR pp27-31 + double Ctotal, Cor, Cpencode; + int num_arbiter; + double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp; + + //TODO: the 0.8um process data is used. + WSelORn = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process + WSelORprequ = 50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process + WSelPn = 12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process + WSelPp = 18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process + WSelEnn = 6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process + WSelEnp = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process + + + Ctotal=0; + num_arbiter=1; + while(win_entries > 4) + { + win_entries = (int)ceil((double)win_entries / 4.0); + num_arbiter += win_entries; + } + //the 4-input OR logic to generate anyreq + Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def); + power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd; + + //The total capacity of the 4-bit priority encoder + Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) + + 2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, 
g_tp.cell_h_def) + + 3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) + + 4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic + 2*4*gate_C(WSelEnn+WSelEnp,20.0)+ + 4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic + (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal + + Ctotal += issue_width * num_arbiter*(Cor+Cpencode); + + power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip + power.readOp.leakage = issue_width * num_arbiter * + (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p + + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p + + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p + + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic + + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals + )*g_tp.peri_global.Vdd; + power.readOp.gate_leakage = issue_width * num_arbiter * + (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p + + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p + + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p + + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic + + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals + )*g_tp.peri_global.Vdd; +} + + +dep_resource_conflict_check::dep_resource_conflict_check( + const InputParameter *configure_interface, + const CoreDynParam & dyn_p_, + int compare_bits_, + bool _is_default) + : l_ip(*configure_interface), + coredynp(dyn_p_), + compare_bits(compare_bits_), + is_default(_is_default) +{ + Wcompn = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process + Wevalinvp = 25 * l_ip.F_sz_um;//this was 20.0 micron 
for the 0.8 micron process + Wevalinvn = 100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process + Wcomppreequ = 50 * l_ip.F_sz_um;//this was 40.0 micron for the 0.8 micron process + WNORn = 6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process + WNORp = 38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process + + local_result = init_interface(&l_ip); + + if (coredynp.core_ty==Inorder) + compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator + else + compare_bits += 16 + 8 + 8; + + conflict_check_power(); + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + +} + +void dep_resource_conflict_check::conflict_check_power() +{ + double Ctotal; + int num_comparators; + num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision. 
+ //When decode-width ==1, no dcl logic + + Ctotal = num_comparators * compare_cap(); + //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme); + + power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/; + power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false); + + double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos); + +} + +/* estimate comparator power consumption (this comparator is similar + to the tag-match structure in a CAM */ +double dep_resource_conflict_check::compare_cap() +{ + double c1, c2; + + WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in. + /* bottom part of comparator */ + c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+ + drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def); + + /* top part of comparator */ + c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+ + drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) + gate_C(WNORn + WNORp,10.0) + + drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def); + return(c1 + c2); + +} + +void dep_resource_conflict_check::leakage_feedback(double temperature) +{ + l_ip.temp = (unsigned int)round(temperature/10.0)*10; + uca_org_t init_result = init_interface(&l_ip); // init_result is dummy + + // This is part of conflict_check_power() + int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest 
comparision. + power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false); + + double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos); +} + +//TODO: add inverter and transmission gate base DFF. + +DFFCell::DFFCell( + bool _is_dram, + double _WdecNANDn, + double _WdecNANDp, + double _cell_load, + const InputParameter *configure_interface) +:is_dram(_is_dram), +cell_load(_cell_load), +WdecNANDn(_WdecNANDn), +WdecNANDp(_WdecNANDp) +{//this model is based on the NAND2 based DFF. + l_ip=*configure_interface; +// area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um); + area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def) + + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def)); + + +} + + +double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) +{ + double Ctotal = 0; + //printf("WdecNANDn = %E\n", WdecNANDn); + + /* part 1: drain cap of NAND gate */ + Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram); + + /* part 2: gate cap of NAND gates */ + Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); + + return Ctotal; +} + + +void DFFCell::compute_DFF_cell() +{ + double c1, c2, c3, c4, c5, c6; + /* node 5 and node 6 are identical to node 1 in capacitance */ + c1 = c5 = c6 = fpfp_node_cap(2, 1); + c2 = fpfp_node_cap(2, 3); + c3 = fpfp_node_cap(3, 2); + c4 = fpfp_node_cap(2, 2); + + //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 + clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); + e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; + + /* no 1/2 
for e_keep and e_clock because clock signal switches twice in one cycle */ + e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; + e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; + e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; + + /* static power */ + e_switch.readOp.leakage += (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF + + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd; + e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF + + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd; + //printf("leakage =%E\n",cmos_Ileak(1, is_dram) ); +} + +Pipeline::Pipeline( + const InputParameter *configure_interface, + const CoreDynParam & dyn_p_, + enum Device_ty device_ty_, + bool _is_core_pipeline, + bool _is_default) +: l_ip(*configure_interface), + coredynp(dyn_p_), + device_ty(device_ty_), + is_core_pipeline(_is_core_pipeline), + is_default(_is_default), + num_piperegs(0.0) + + { + local_result = init_interface(&l_ip); + if (!coredynp.Embedded) + process_ind = true; + else + process_ind = false; + WNANDn = (process_ind)? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was 20 micron for the 0.8 micron process + WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was 30 micron for the 0.8 micron process + load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false); + compute(); + +} + +void Pipeline::compute() +{ + compute_stage_vector(); + DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip); + pipe_reg.compute_DFF_cell(); + + double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; + //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. 
A better way to do it is to consider + //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. + double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg; + double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; + double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage; + power.readOp.dynamic +=pipe_reg_power; + power.readOp.leakage +=pipe_reg_leakage; + power.readOp.gate_leakage +=pipe_reg_gate_leakage; + area.set_area(num_piperegs * pipe_reg.area.get_area()); + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + + + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + double macro_layout_overhead = g_tp.macro_layout_overhead; + if (!coredynp.Embedded) + area.set_area(area.get_area()*macro_layout_overhead); +} + +void Pipeline::compute_stage_vector() +{ + double num_stages, tot_stage_vector, per_stage_vector; + int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length; + //Hthread = thread_clock_gated? 
1:num_thread; + + if (!is_core_pipeline) + { + num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput + } + else + { + if (coredynp.core_ty==Inorder) + { + /* assume 6 pipe stages and try to estimate bits per pipe stage */ + /* pipe stage 0/IF */ + num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads; + /* pipe stage IF/ID */ + num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads; + /* pipe stage IF/ThreadSEL */ + if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states + /* pipe stage ID/EXE */ + num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads; + /* pipe stage EXE/MEM */ + num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/); + /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/ + num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/); +// /* pipe stage 5/6 */ +// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/); +// /* pipe stage 6/7 */ +// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/); +// /* pipe stage 7/8 */ +// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/); +// /* assume 50% extra in control signals (rule of thumb) */ + num_stages=6; + + } + else + { + /* assume 12 stage pipe stages and try to estimate bits per pipe stage */ + /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */ + + /* pipe stage 0/1F*/ + num_piperegs += 
coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC + /* pipe stage IF/ID */ + num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID + /* pipe stage 1D/Renaming*/ + num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage. + /* pipe stage Renaming/wire_drive */ + num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width); + /* pipe stage Renaming/IssueQ */ + num_piperegs += coredynp.issueW*(coredynp.instruction_length + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest + /* pipe stage IssueQ/Dispatch */ + num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width); + /* pipe stage Dispatch/EXE */ + + num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); + /* 2^opcode_length means the total decoded signal for the opcode*/ + num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); + /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/ + num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); + /* pipe stage EXE/MEM, data need to be read/write, address*/ + num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed + /* pipe stage MEM/WB; result data, writeback regs */ + num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/); + /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve 
memory ops in ROB's top*/ + num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads; +// if (multithreaded) +// { +// +// } + num_stages=12; + + } + + /* assume 50% extra in control registers and interrupt registers (rule of thumb) */ + num_piperegs = num_piperegs * 1.5; + tot_stage_vector=num_piperegs; + per_stage_vector=tot_stage_vector/num_stages; + + if (coredynp.core_ty==Inorder) + { + if (coredynp.pipeline_stages>6) + num_piperegs= per_stage_vector*coredynp.pipeline_stages; + } + else//OOO + { + if (coredynp.pipeline_stages>12) + num_piperegs= per_stage_vector*coredynp.pipeline_stages; + } + } + +} + +FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + coredynp(dyn_p_), + fu_type(fu_type_) +{ + double area_t;//, leakage, gate_leakage; + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + clockRate = coredynp.clockRate; + executionTime = coredynp.executionTime; + + //XML_interface=_XML_interface; + uca_org_t result2; + result2 = init_interface(&interface_ip); + if (XML->sys.Embedded) + { + if (fu_type == FPU) + { + num_fu=coredynp.num_fpus; + //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 + area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number + //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60% + if (g_ip->F_sz_nm>90) + area_t = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t 
*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. +// base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) +// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); + base_energy = 0; + per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ) + //FPU power from Sandia's processor sizing tech report + FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data + } + else if (fu_type == ALU) + { + num_fu=coredynp.num_alus; + area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; +// base_energy = coredynp.core_ty==Inorder? 
0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) +// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); + base_energy = 0; + per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) + FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU + + } + else if (fu_type == MUL) + { + num_fu=coredynp.num_muls; + area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; +// base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) +// base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); + base_energy = 0; + per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch + FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data + } + else + { + cout<<"Unknown Functional Unit Type"<<endl; + exit(0); + } + per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy + } + else + { + if (fu_type == FPU) + { + num_fu=coredynp.num_fpus; + //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 + area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 + if (g_ip->F_sz_nm>90) + area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 + leakage = 
area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. + base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) + base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); + per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ) + FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data + } + else if (fu_type == ALU) + { + num_fu=coredynp.num_alus; + area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; + base_energy = coredynp.core_ty==Inorder? 
0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) + base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); + per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) + FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU + + } + else if (fu_type == MUL) + { + num_fu=coredynp.num_muls; + area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; + base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) + base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); + per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch + FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data + } + else + { + cout<<"Unknown Functional Unit Type"<<endl; + exit(0); + } + } + //IEXEU, simple ALU and FPU + // double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation. 
+ // + // C_ALU = 0.025e-9;//F + // C_EXEU = 0.05e-9; //F + // C_FPU = 0.35e-9;//F + area.set_area(area_t*num_fu); + leakage *= num_fu; + gate_leakage *=num_fu; + double macro_layout_overhead = g_tp.macro_layout_overhead; +// if (!XML->sys.Embedded) + area.set_area(area.get_area()*macro_layout_overhead); +} + +void FunctionalUnit::computeEnergy(bool is_tdp) +{ + double pppm_t[4] = {1,1,1,1}; + double FU_duty_cycle; + if (is_tdp) + { + + + set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction. + if (fu_type == FPU) + { + stats_t.readAc.access = num_fu; + tdp_stats = stats_t; + FU_duty_cycle = coredynp.FPU_duty_cycle; + } + else if (fu_type == ALU) + { + stats_t.readAc.access = 1*num_fu; + tdp_stats = stats_t; + FU_duty_cycle = coredynp.ALU_duty_cycle; + } + else if (fu_type == MUL) + { + stats_t.readAc.access = num_fu; + tdp_stats = stats_t; + FU_duty_cycle = coredynp.MUL_duty_cycle; + } + + //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access; + power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate; + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation*FU_duty_cycle; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + power.readOp.leakage = leakage; + power.readOp.gate_leakage = gate_leakage; + double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + + } + else + { + if (fu_type == FPU) + { + stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses; + rtp_stats = stats_t; + } + else if (fu_type == ALU) + { + stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses; + rtp_stats = stats_t; + } + else if (fu_type == MUL) + { + stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses; + rtp_stats = stats_t; + } + + //rt_power.readOp.dynamic = base_energy*executionTime 
+ energy*stats_t.readAc.access; + rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime; + double sckRation = g_tp.sckt_co_eff; + rt_power.readOp.dynamic *= sckRation; + rt_power.writeOp.dynamic *= sckRation; + rt_power.searchOp.dynamic *= sckRation; + + } + + +} + +void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + +// cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl; + if (is_tdp) + { + if (fu_type == FPU) + { + cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl; + cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; + cout << indent_str_next<< "Subthreshold Leakage = " + << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else if (fu_type == ALU) + { + cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl; + cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; + cout << indent_str_next<< "Subthreshold Leakage = " + << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else if (fu_type == MUL) + { + cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl; + cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; + cout << indent_str_next<< "Subthreshold Leakage = " + << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + + } + + } + else + { + } + +} +
/*
 * Recompute this functional unit's leakage for a new temperature.
 *
 * The temperature is rounded to the nearest multiple of 10 and stored in
 * interface_ip.temp, init_interface() is re-run, and the per-unit
 * subthreshold and gate leakage are re-derived from the unit area.
 * Totals for num_fu units are written to power.readOp.leakage,
 * power.readOp.gate_leakage and power.readOp.longer_channel_leakage.
 *
 * Only power.readOp is updated; the `leakage` / `gate_leakage` data members
 * that computeEnergy() copies into power.readOp are not modified here.
 *
 * Terminates the process with exit(1) for an unknown fu_type.
 *
 * NOTE(review): the area constants below do not distinguish embedded from
 * general-purpose cores. The FPU uses 4.47e6 while the constructor's final
 * (else) branch uses 8.47e6, and ALU/MUL use the 280*260*2 figure from that
 * same branch rather than the 280*260 figure of the other branch. Confirm
 * which variant is intended before relying on absolute values.
 */
void FunctionalUnit::leakage_feedback(double temperature)
{
    // Update the temperature and initialize the global interfaces.
    interface_ip.temp = (unsigned int)round(temperature/10.0)*10;

    uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy

    // This is part of FunctionalUnit()
    const double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();

    // Area of ONE unit (um^2) and the device width, in multiples of the
    // minimum NMOS width, that is fed to the leakage model.
    double unit_area = 0;
    double width_factor = 0;

    if (fu_type == FPU)
    {
        unit_area = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
        if (g_ip->F_sz_nm>90)
            unit_area = 4.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
        width_factor = 5;
    }
    else if (fu_type == ALU)
    {
        // Per-unit area, as in the constructor. The unit count is applied
        // exactly once when the totals are stored below; folding num_fu in
        // here as well made ALU leakage grow with num_fu squared.
        unit_area = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
        width_factor = 20;
    }
    else if (fu_type == MUL)
    {
        // Per-unit area; see the ALU branch for why num_fu is not applied here.
        unit_area = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
        width_factor = 20;
    }
    else
    {
        cout<<"Unknown Functional Unit Type"<<endl;
        exit(1);
    }

    const double nmos_width = width_factor*g_tp.min_w_nmos_;
    const double pmos_width = nmos_width*pmos_to_nmos_sizing_r;

    // Per-unit leakage power, unit W.
    const double unit_leakage = unit_area *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(nmos_width, pmos_width, 1, inv)*g_tp.peri_global.Vdd/2;
    const double unit_gate_leakage = unit_area *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(nmos_width, pmos_width, 1, inv)*g_tp.peri_global.Vdd/2;

    power.readOp.leakage = unit_leakage*num_fu;
    power.readOp.gate_leakage = unit_gate_leakage*num_fu;

    // longer_channel_device_reduction() yields a scaling factor, not a power:
    // apply it to the subthreshold leakage, as computeEnergy() does.
    const double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
    power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
}

+UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_, bool embedded_) +:XML(XML_interface), + ithCore(ithCore_), + interface_ip(*interface_ip_), + coredynp(dyn_p_), + core_ty(coredynp.core_ty), + embedded(XML->sys.Embedded), + pipeline_stage(coredynp.pipeline_stages), + num_hthreads(coredynp.num_hthreads), + issue_width(coredynp.issueW), + exist(exist_) +// is_default(_is_default) +{ + if (!exist) return; + double undifferentiated_core=0; + double core_tx_density=0; + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + double undifferentiated_core_coe; + //XML_interface=_XML_interface; + uca_org_t result2; + result2 = init_interface(&interface_ip); + + //Compute undifferentiated core area at 90nm. + if (embedded==false) + { + //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements + if (core_ty==OOO) + { + //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO + undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0; + } + else if (core_ty==Inorder) + { + //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder + undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0; + } + else + { + cout<<"invalid core type"<<endl; + exit(0); + } + undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716); + } + else + { + //Based on the results in paper "parametrized processor models" Sandia Labs + if (XML->sys.opt_clockrate) + undifferentiated_core_coe = 0.05; + else + undifferentiated_core_coe = 0; + undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe; + undifferentiated_core *= (1+ 
logtwo(num_hthreads)* 0.0426); + } + + undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2 + core_tx_density = g_tp.scaling_factor.core_tx_density; + //undifferentiated_core = 3*1e6; + //undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*; + power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W + power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd; + + double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + area.set_area(undifferentiated_core); + + scktRatio = g_tp.sckt_co_eff; + power.readOp.dynamic *= scktRatio; + power.writeOp.dynamic *= scktRatio; + power.searchOp.dynamic *= scktRatio; + macro_PR_overhead = g_tp.macro_layout_overhead; + area.set_area(area.get_area()*macro_PR_overhead); + + + +// double vt=g_tp.peri_global.Vth; +// double velocity_index=1.1; +// double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false); +// double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in; +// double w_nmos=g_tp.min_w_nmos_; +// double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; +// double i_on_n=1.0; +// double i_on_p=1.0; +// double i_on_n_in=1.0; +// double i_on_p_in=1; +// double vdd=g_tp.peri_global.Vdd; + +// power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); +// power.readOp.dynamic=c_out*vdd*vdd/2; + +// cout<<power.readOp.dynamic << "dynamic" <<endl; +// 
cout<<power.readOp.sc << "sc" << endl; + +// power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); +// power.readOp.dynamic=c_out*vdd*vdd/2; +// +// cout<<power.readOp.dynamic << "dynamic" <<endl; +// cout<<power.readOp.sc << "sc" << endl; + + + +} + + +void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { + cout << indent_str << "UndiffCore:" << endl; + cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; + cout << indent_str_next<< "Subthreshold Leakage = " + << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + else + { + cout << indent_str << "UndiffCore:" << endl; + cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; + cout <<endl; + } + +} + +inst_decoder::inst_decoder( + bool _is_default, + const InputParameter *configure_interface, + int opcode_length_, + int num_decoders_, + bool x86_, + enum Device_ty device_ty_, + enum Core_type 
core_ty_) +:is_default(_is_default), + opcode_length(opcode_length_), + num_decoders(num_decoders_), + x86(x86_), + device_ty(device_ty_), + core_ty(core_ty_) + { + /* + * Instruction decoder is different from n to 2^n decoders + * that are commonly used in row decoders in memory arrays. + * The RISC instruction decoder is typically a very simple device. + * We can decode an instruction by simply + * separating the machine word into small parts using wire slices + * The RISC instruction decoder can be approximate by the n to 2^n decoders, + * although this approximation usually underestimate power since each decoded + * instruction normally has more than 1 active signal. + * + * However, decoding a CISC instruction word is much more difficult + * than the RISC case. A CISC decoder is typically set up as a state machine. + * The machine reads the opcode field to determine + * what type of instruction it is, + * and where the other data values are. + * The instruction word is read in piece by piece, + * and decisions are made at each stage as to + * how the remainder of the instruction word will be read. + * (sequencer and ROM are usually needed) + * An x86 decoder can be even more complex since + * it involve both decoding instructions into u-ops and + * merge u-ops when doing micro-ops fusion. 
+ */ + bool is_dram=false; + double pmos_to_nmos_sizing_r; + double load_nmos_width, load_pmos_width; + double C_driver_load, R_wire_load; + Area cell; + + l_ip=*configure_interface; + local_result = init_interface(&l_ip); + cell.h =g_tp.cell_h_def; + cell.w =g_tp.cell_h_def; + + num_decoder_segments = (int)ceil(opcode_length/18.0); + if (opcode_length > 18) opcode_length = 18; + num_decoded_signals= (int)pow(2.0,opcode_length); + pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + load_nmos_width=g_tp.max_w_nmos_ /2; + load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; + C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited + R_wire_load = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um; + + final_dec = new Decoder( + num_decoded_signals, + false, + C_driver_load, + R_wire_load, + false/*is_fa*/, + false/*is_dram*/, + false/*wl_tr*/, //to use peri device + cell); + + PredecBlk * predec_blk1 = new PredecBlk( + num_decoded_signals, + final_dec, + 0,//Assuming predec and dec are back to back + 0, + 1,//Each Predec only drives one final dec + false/*is_dram*/, + true); + PredecBlk * predec_blk2 = new PredecBlk( + num_decoded_signals, + final_dec, + 0,//Assuming predec and dec are back to back + 0, + 1,//Each Predec only drives one final dec + false/*is_dram*/, + false); + + PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); + PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false); + + pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2); + + double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders; + //double w_decoder = area_decoder / area.get_h(); + double area_pre_dec = (predec_blk_drv1->area.get_area() + + predec_blk_drv2->area.get_area() + + predec_blk1->area.get_area() + + predec_blk2->area.get_area())* + num_decoder_segments*num_decoders; + area.set_area(area.get_area()+ area_decoder + 
area_pre_dec); + double macro_layout_overhead = g_tp.macro_layout_overhead; + double chip_PR_overhead = g_tp.chip_layout_overhead; + area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead); + + inst_decoder_delay_power(); + + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; + +} + +void inst_decoder::inst_decoder_delay_power() +{ + + double dec_outrisetime; + double inrisetime=0, outrisetime; + double pppm_t[4] = {1,1,1,1}; + double squencer_passes = x86?2:1; + + outrisetime = pre_dec->compute_delays(inrisetime); + dec_outrisetime = final_dec->compute_delays(outrisetime); + set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); + power = power + pre_dec->power*pppm_t; + set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals, + num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); + power = power + final_dec->power*pppm_t; +} +void inst_decoder::leakage_feedback(double temperature) +{ + l_ip.temp = (unsigned int)round(temperature/10.0)*10; + uca_org_t init_result = init_interface(&l_ip); // init_result is dummy + + final_dec->leakage_feedback(temperature); + pre_dec->leakage_feedback(temperature); + + double pppm_t[4] = {1,1,1,1}; + double squencer_passes = x86?2:1; + + set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); + power = pre_dec->power*pppm_t; + + set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); + 
power = power + final_dec->power*pppm_t; + + double sckRation = g_tp.sckt_co_eff; + + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; +} + +inst_decoder::~inst_decoder() +{ + local_result.cleanup(); + + delete final_dec; + + delete pre_dec->blk1; + delete pre_dec->blk2; + delete pre_dec->drv1; + delete pre_dec->drv2; + delete pre_dec; +} diff --git a/ext/mcpat/logic.h b/ext/mcpat/logic.h new file mode 100644 index 000000000..e2a35e845 --- /dev/null +++ b/ext/mcpat/logic.h @@ -0,0 +1,233 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ +#ifndef LOGIC_H_ +#define LOGIC_H_ + +#include <cassert> +#include <cmath> +#include <cstring> +#include <iostream> + +#include "XML_Parse.h" +#include "arch_const.h" +#include "basic_circuit.h" +#include "basic_components.h" +#include "cacti_interface.h" +#include "component.h" +#include "const.h" +#include "decoder.h" +#include "parameter.h" +#include "xmlParser.h" + +using namespace std; + +class selection_logic : public Component{ +public: + selection_logic(bool _is_default, int win_entries_, + int issue_width_, const InputParameter *configure_interface, + enum Device_ty device_ty_=Core_device, + enum Core_type core_ty_=Inorder);//, const ParseXML *_XML_interface); + bool is_default; + InputParameter l_ip; + uca_org_t local_result; + const ParseXML *XML_interface; + int win_entries; + int issue_width; + int num_threads; + enum Device_ty device_ty; + enum Core_type core_ty; + + void selection_power(); + void leakage_feedback(double temperature); // TODO +}; + +class dep_resource_conflict_check : public Component{ +public: + dep_resource_conflict_check(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, int compare_bits_, bool _is_default=true); + InputParameter l_ip; + uca_org_t local_result; + double WNORn, WNORp, Wevalinvp, Wevalinvn, Wcompn, Wcompp, Wcomppreequ; + CoreDynParam coredynp; + int compare_bits; + bool is_default; 
+ statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + + void conflict_check_power(); + double compare_cap(); + ~dep_resource_conflict_check(){ + local_result.cleanup(); + } + + void leakage_feedback(double temperature); +}; + +class inst_decoder: public Component{ +public: + inst_decoder(bool _is_default, const InputParameter *configure_interface, + int opcode_length_, + int num_decoders_, + bool x86_, + enum Device_ty device_ty_=Core_device, + enum Core_type core_ty_=Inorder); + inst_decoder(); + bool is_default; + int opcode_length; + int num_decoders; + bool x86; + int num_decoder_segments; + int num_decoded_signals; + InputParameter l_ip; + uca_org_t local_result; + enum Device_ty device_ty; + enum Core_type core_ty; + + Decoder * final_dec; + Predec * pre_dec; + + statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + void inst_decoder_delay_power(); + ~inst_decoder(); + void leakage_feedback(double temperature); +}; + +class DFFCell : public Component { +public: + DFFCell(bool _is_dram, double _WdecNANDn, double _WdecNANDp,double _cell_load, + const InputParameter *configure_interface); + InputParameter l_ip; + bool is_dram; + double cell_load; + double WdecNANDn; + double WdecNANDp; + double clock_cap; + int model; + int n_switch; + int n_keep_1; + int n_keep_0; + int n_clock; + powerDef e_switch; + powerDef e_keep_1; + powerDef e_keep_0; + powerDef e_clock; + + double fpfp_node_cap(unsigned int fan_in, unsigned int fan_out); + void compute_DFF_cell(void); + }; + +class Pipeline : public Component{ +public: + Pipeline(const InputParameter *configure_interface, const CoreDynParam & dyn_p_, enum Device_ty device_ty_=Core_device, bool _is_core_pipeline=true, bool _is_default=true); + InputParameter l_ip; + uca_org_t local_result; + CoreDynParam coredynp; + enum Device_ty device_ty; + bool is_core_pipeline, is_default; + double num_piperegs; +// int pipeline_stages; +// int tot_stage_vector, 
per_stage_vector; + bool process_ind; + double WNANDn ; + double WNANDp; + double load_per_pipeline_stage; +// int Hthread, num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length; +// int PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width; +// bool thread_clock_gated; +// bool in_order, multithreaded; + void compute_stage_vector(); + void compute(); + ~Pipeline(){ + local_result.cleanup(); + }; + +}; + +//class core_pipeline :public pipeline{ +//public: +// int Hthread, num_thread, fetchWidth, decodeWidth, issueWidth, commitWidth, instruction_length; +// int PC_width, opcode_length, num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width; +// bool thread_clock_gated; +// bool in_order, multithreaded; +// core_pipeline(bool _is_default, const InputParameter *configure_interface); +// virtual void compute_stage_vector(); +// +//}; + +class FunctionalUnit :public Component{ +public: + ParseXML *XML; + int ithCore; + InputParameter interface_ip; + CoreDynParam coredynp; + double FU_height; + double clockRate,executionTime; + double num_fu; + double energy, base_energy,per_access_energy, leakage, gate_leakage; + bool is_default; + enum FU_type fu_type; + statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + + FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + void leakage_feedback(double temperature); + +}; + +class UndiffCore :public Component{ +public: + UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_=true, bool embedded_=false); + ParseXML *XML; + int ithCore; + InputParameter interface_ip; + CoreDynParam coredynp; + double clockRate,executionTime; + double scktRatio, chip_PR_overhead, 
macro_PR_overhead; + enum Core_type core_ty; + bool opt_performance, embedded; + double pipeline_stage,num_hthreads,issue_width; + bool is_default; + + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~UndiffCore(){}; + bool exist; + + +}; +#endif /* LOGIC_H_ */ diff --git a/ext/mcpat/main.cc b/ext/mcpat/main.cc new file mode 100644 index 000000000..8acce8d23 --- /dev/null +++ b/ext/mcpat/main.cc @@ -0,0 +1,101 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ +#include <iostream> + +#include "XML_Parse.h" +#include "globalvar.h" +#include "io.h" +#include "processor.h" +#include "version.h" +#include "xmlParser.h" + +using namespace std; + +void print_usage(char * argv0); +
// Entry point: parse the command line, build the processor model from the
// XML description and print its report.
//
// Recognized options, each followed by exactly one value:
//   -infile <file>      XML input file (required)
//   -print_level <n>    level of detail passed to Processor::displayEnergy
//   -opt_for_clk <0|1>  stored in the global opt_for_clk
// Unrecognized arguments are ignored. print_usage() ends the process with
// exit(1), so it is used for every malformed-command-line case.
int main(int argc,char *argv[])
{
    char * fb = NULL;
    bool infile_specified = false;
    int plevel = 2;
    opt_for_clk =true;
    //cout.precision(10);
    if (argc <= 1 || argv[1] == string("-h") || argv[1] == string("--help"))
    {
        print_usage(argv[0]);
    }

    // Start at 1: argv[0] is the program name, not an option.
    for (int32_t i = 1; i < argc; i++)
    {
        const string option(argv[i]);
        const bool is_infile = (option == "-infile");
        const bool is_print_level = (option == "-print_level");
        const bool is_opt_for_clk = (option == "-opt_for_clk");

        if (!is_infile && !is_print_level && !is_opt_for_clk)
        {
            continue;
        }

        // Every recognized option consumes the next argument. Without this
        // check a trailing option would read argv[argc], which is a null
        // pointer, and hand it to a string comparison or to atoi().
        if (i + 1 >= argc)
        {
            cerr << "Missing value for option " << option << endl;
            print_usage(argv[0]);
        }
        char * value = argv[++i];

        if (is_infile)
        {
            infile_specified = true;
            fb = value;
        }
        else if (is_print_level)
        {
            plevel = atoi(value);
        }
        else
        {
            opt_for_clk = (bool)atoi(value);
        }
    }
    if (infile_specified == false)
    {
        print_usage(argv[0]);
    }


    cout<<"McPAT (version "<< VER_MAJOR <<"."<< VER_MINOR
        << " of " << VER_UPDATE << ") is computing the target processor...\n "<<endl;

    //parse XML-based interface
    ParseXML *p1= new ParseXML();
    p1->parse(fb);
    Processor proc(p1);
    proc.displayEnergy(2, plevel);
    delete p1;
    return 0;
}

+void print_usage(char * argv0) +{ + cerr << "How to use McPAT:" << endl; + cerr << " mcpat -infile <input file name> -print_level < level of details 0~5 > -opt_for_clk < 0 (optimize for ED^2P only)/1 
(optimzed for target clock rate)>"<< endl; + //cerr << " Note:default print level is at processor level, please increase it to see the details" << endl; + exit(1); +} diff --git a/ext/mcpat/makefile b/ext/mcpat/makefile new file mode 100644 index 000000000..27f213fa5 --- /dev/null +++ b/ext/mcpat/makefile @@ -0,0 +1,28 @@ +TAR = mcpat + +.PHONY: dbg opt depend clean clean_dbg clean_opt + +all: opt + +dbg: $(TAR).mk obj_dbg + @$(MAKE) TAG=dbg -C . -f $(TAR).mk + +opt: $(TAR).mk obj_opt + @$(MAKE) TAG=opt -C . -f $(TAR).mk + +obj_dbg: + mkdir $@ + +obj_opt: + mkdir $@ + +clean: clean_dbg clean_opt + +clean_dbg: obj_dbg + @$(MAKE) TAG=dbg -C . -f $(TAR).mk clean + rm -rf $< + +clean_opt: obj_opt + @$(MAKE) TAG=opt -C . -f $(TAR).mk clean + rm -rf $< + diff --git a/ext/mcpat/mcpat.mk b/ext/mcpat/mcpat.mk new file mode 100644 index 000000000..9aacbe0e6 --- /dev/null +++ b/ext/mcpat/mcpat.mk @@ -0,0 +1,81 @@ +TARGET = mcpat +SHELL = /bin/sh +.PHONY: all depend clean +.SUFFIXES: .cc .o + +ifndef NTHREADS + NTHREADS = 4 +endif + + +LIBS = +INCS = -lm + +ifeq ($(TAG),dbg) + DBG = -Wall + OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti +else + DBG = + OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti + #OPT = -O0 -DNTHREADS=$(NTHREADS) +endif + +#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) +CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) +CXX = g++ -m32 +CC = gcc -m32 + +VPATH = cacti + +SRCS = \ + Ucache.cc \ + XML_Parse.cc \ + arbiter.cc \ + area.cc \ + array.cc \ + bank.cc \ + basic_circuit.cc \ + basic_components.cc \ + cacti_interface.cc \ + component.cc \ + core.cc \ + crossbar.cc \ + decoder.cc \ + htree2.cc \ + interconnect.cc \ + io.cc \ + iocontrollers.cc \ + logic.cc \ + main.cc \ + mat.cc \ + memoryctrl.cc \ + noc.cc \ + nuca.cc \ + parameter.cc \ + processor.cc \ + router.cc \ + sharedcache.cc \ + subarray.cc \ + technology.cc \ + uca.cc \ + wire.cc \ + xmlParser.cc + +OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS)) + +all: obj_$(TAG)/$(TARGET) 
+ cp -f obj_$(TAG)/$(TARGET) $(TARGET) + +obj_$(TAG)/$(TARGET) : $(OBJS) + $(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread + +#obj_$(TAG)/%.o : %.cc +# $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $< + +obj_$(TAG)/%.o : %.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ + +clean: + -rm -f *.o $(TARGET) + + diff --git a/ext/mcpat/mcpatXeonCore.mk b/ext/mcpat/mcpatXeonCore.mk new file mode 100644 index 000000000..20cf0ddc8 --- /dev/null +++ b/ext/mcpat/mcpatXeonCore.mk @@ -0,0 +1,81 @@ +TARGET = mcpatXeonCore +SHELL = /bin/sh +.PHONY: all depend clean +.SUFFIXES: .cc .o + +ifndef NTHREADS + NTHREADS = 4 +endif + + +LIBS = +INCS = -lm + +ifeq ($(TAG),dbg) + DBG = -Wall + OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti +else + DBG = + OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti + #OPT = -O0 -DNTHREADS=$(NTHREADS) +endif + +#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) +CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) +CXX = g++ -m32 +CC = gcc -m32 + +VPATH = cacti + +SRCS = \ + Ucache.cc \ + XML_Parse.cc \ + arbiter.cc \ + area.cc \ + array.cc \ + bank.cc \ + basic_circuit.cc \ + basic_components.cc \ + cacti_interface.cc \ + component.cc \ + core.cc \ + crossbar.cc \ + decoder.cc \ + htree2.cc \ + interconnect.cc \ + io.cc \ + iocontrollers.cc \ + logic.cc \ + main.cc \ + mat.cc \ + memoryctrl.cc \ + noc.cc \ + nuca.cc \ + parameter.cc \ + processor.cc \ + router.cc \ + sharedcache.cc \ + subarray.cc \ + technology_xeon_core.cc \ + uca.cc \ + wire.cc \ + xmlParser.cc + +OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS)) + +all: obj_$(TAG)/$(TARGET) + cp -f obj_$(TAG)/$(TARGET) $(TARGET) + +obj_$(TAG)/$(TARGET) : $(OBJS) + $(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread + +#obj_$(TAG)/%.o : %.cc +# $(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $< + +obj_$(TAG)/%.o : %.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ + +clean: + -rm -f *.o $(TARGET) + + diff --git a/ext/mcpat/memoryctrl.cc b/ext/mcpat/memoryctrl.cc new file mode 100644 index 000000000..ae3bc75ec --- /dev/null 
+++ b/ext/mcpat/memoryctrl.cc @@ -0,0 +1,736 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ +#include <algorithm> +#include <cassert> +#include <cmath> +#include <iostream> +#include <string> + +#include "XML_Parse.h" +#include "basic_circuit.h" +#include "basic_components.h" +#include "const.h" +#include "io.h" +#include "logic.h" +#include "memoryctrl.h" +#include "parameter.h" + +/* overview of MC models: + * McPAT memory controllers are modeled according to large number of industrial data points. + * The Basic memory controller architecture is base on the Synopsis designs + * (DesignWare DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers) + * as in Cadence ChipEstimator Tool + * + * An MC has 3 parts as shown in this design. McPAT models both high performance MC + * based on Niagara processor designs and curving and low power MC based on data points in + * Cadence ChipEstimator Tool. 
+ * + * The frontend is modeled analytically, the backend is modeled empirically according to + * DDR2/DDR3-Lite protocol controllers in Cadence ChipEstimator Tool + * The PHY is modeled based on + * "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006, + * and A 14mW 6.25Gb/s Transceiver in 90nm CMOS for Serial Chip-to-Chip Communication," ISSCC 2007 + * + * In Cadence ChipEstimator Tool there are two types of memory controllers: the full memory controllers + * that includes the frontend as the DesignWare DDR2/DDR3-Lite memory controllers and the backend only + * memory controllers as the DDR2/DDR3-Lite protocol controllers (except DesignWare DDR2/DDR3-Lite memory + * controllers, all memory controller IP in Cadence ChipEstimator Tool are backend memory controllers such as + * DDRC 1600A and DDRC 800A). Thus,to some extend the area and power difference between DesignWare + * DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers can be an estimation to the + * frontend power and area, which is very close the analitically modeled results of the frontend for Niagara2@65nm + * + */ + +MCBackend::MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_) +:l_ip(*interface_ip_), + mc_type(mc_type_), + mcp(mcp_) +{ + + local_result = init_interface(&l_ip); + compute(); + +} + + +void MCBackend::compute() +{ + //double max_row_addr_width = 20.0;//Current address 12~18bits + double C_MCB, mc_power, backend_dyn, backend_gates;//, refresh_period,refresh_freq;//Equivalent per bit Cap for backend, + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + double NMOS_sizing, PMOS_sizing; + + if (mc_type == MC) + { + if (mcp.type == 0) + { + //area = (2.2927*log(peakDataTransferRate)-14.504)*memDataWidth/144.0*(l_ip.F_sz_um/0.09); + area.set_area((2.7927*log(mcp.peakDataTransferRate*2)-19.862)/2.0*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6);//um^2 + //assuming 
the approximately same scaling factor as seen in processors. + //C_MCB=0.2/1.3/1.3/266/64/0.09*g_ip.F_sz_um;//based on AMD Geode processor which has a very basic mc on chip. + //C_MCB = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power numbers.The base power (W) is divided by device frequency and vdd and scale to target process. + //mc_power = 0.0291*2;//29.1mW@200MHz @130nm From Power Analysis of SystemLevel OnChip Communication Architectures by Lahiri et + mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend + C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065; + power_t.readOp.dynamic = C_MCB*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(mcp.dataBusWidth/*+mcp.addressBusWidth*/);//per access energy in memory controller + power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W + power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W + + } + else + { NMOS_sizing = g_tp.min_w_nmos_; + PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + area.set_area(0.15*mcp.dataBusWidth/72.0*(l_ip.F_sz_um/0.065)* (l_ip.F_sz_um/0.065)*mcp.num_channels*1e6);//um^2 + backend_dyn = 0.9e-9/800e6*mcp.clockRate/12800*mcp.peakDataTransferRate*mcp.dataBusWidth/72.0*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(l_ip.F_sz_nm/65.0);//Average on DDR2/3 protocol controller and DDRC 1600/800A in Cadence ChipEstimate + //Scaling to technology and DIMM feature. 
The base IP support DDR3-1600(PC3 12800)
+          backend_gates = 50000*mcp.dataBusWidth/64.0;//5000 is from Cadence ChipEstimator
+
+          power_t.readOp.dynamic = backend_dyn;
+          power_t.readOp.leakage = (backend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = (backend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+
+        }
+  }
+  else
+  {//skip old model
+          // NOTE: exit(0) terminates the program here, so the remaining statements
+          // of this branch (which read mc_power without it being assigned) never run.
+          cout<<"Unknown memory controllers"<<endl;exit(0);
+          area.set_area(0.243*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
+          //mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
+          C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
+          power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.dynamic *= 1.2;
+          power_t.readOp.leakage *= 1.2;
+          power_t.readOp.gate_leakage *= 1.2;
+          //flash controller has about 20% more backend power since BCH ECC in flash is complex and power hungry
+  }
+  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+/*
+ * Fills in the backend's peak (is_tdp == true) or runtime (is_tdp == false)
+ * power numbers from power_t, which compute() has already populated.
+ *
+ * Peak:    access counts are 0.5 read + 0.5 write per channel; `power` becomes
+ *          power_t with its dynamic term scaled by the total access count.
+ * Runtime: access counts come from mcp.reads / mcp.writes; each access is
+ *          scaled by llcBlockSize*8/dataBusWidth, leakage is added through
+ *          pppm_lkg, and 10% of the peak dynamic figure is added on top.
+ *          The last step reads `power`, so the peak pass has to run first.
+ */
+void MCBackend::computeEnergy(bool is_tdp)
+{
+  //backend uses internal data buswidth
+  if (is_tdp)
+  {
+      //init stats for Peak
+      stats_t.readAc.access  = 0.5*mcp.num_channels;
+      stats_t.writeAc.access = 0.5*mcp.num_channels;
+      tdp_stats = stats_t;
+  }
+  else
+  {
+      //init stats for runtime power (RTP)
+      stats_t.readAc.access  = mcp.reads;
+      stats_t.writeAc.access = mcp.writes;
+      // Runtime counts go to rtp_stats so the peak snapshot in tdp_stats is
+      // not overwritten (MCFrontEnd::computeEnergy does the same for its buffers).
+      // NOTE(review): assumes MCBackend declares rtp_stats next to tdp_stats - confirm in memoryctrl.h
+      rtp_stats = stats_t;
+  }
+  if (is_tdp)
+  {
+      power = power_t;
+      power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*power_t.readOp.dynamic;
+
+  }
+  else
+  {
+      rt_power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*mcp.llcBlockSize*8.0/mcp.dataBusWidth*power_t.readOp.dynamic;
+      rt_power = rt_power + power_t*pppm_lkg;
+      rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+      //Assume 10% of peak power is consumed by routine job including memory refreshing and scrubbing
+  }
+}
+
+
+MCPHY::MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
+:l_ip(*interface_ip_),
+ mc_type(mc_type_),
+ mcp(mcp_)
+{
+
+  local_result = init_interface(&l_ip);
+  compute();
+}
+
+void MCPHY::compute()
+{
+  //PHY uses internal data buswidth but the actuall off-chip datawidth is 64bits + ecc
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio() ;
+  /*
+   * according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006;
+   * From Cadence ChipEstimator for normal I/O around 0.4~0.8 mW/Gb/s
+   */
+  double power_per_gb_per_s, phy_dyn,phy_gates, NMOS_sizing, PMOS_sizing;
+
+  if (mc_type == MC)
+  {
+          if (mcp.type == 0)
+          {
+                  power_per_gb_per_s = mcp.LVDS? 0.01:0.04;
+                  //Based on die photos from Niagara 1 and 2.
+                  //TODO merge this into undifferentiated core.PHY only achieves square root of the ideal scaling.
+                  //area = (6.4323*log(peakDataTransferRate)-34.76)*memDataWidth/128.0*(l_ip.F_sz_um/0.09);
+                  area.set_area((6.4323*log(mcp.peakDataTransferRate*2)-48.134)*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6/2);//TODO:/2
+                  //This is from curve fitting based on Niagara 1 and 2's PHY die photo.
+                  //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
+                  //power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change from Bytes to bits.
+ power_t.readOp.dynamic = power_per_gb_per_s*sqrt(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2; + power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W + power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W + + } + else + { + NMOS_sizing = g_tp.min_w_nmos_; + PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; + //Designware/synopsis 16bit DDR3 PHY is 1.3mm (WITH IOs) at 40nm for upto DDR3 2133 (PC3 17066) + double non_IO_percentage = 0.2; + area.set_area(1.3*non_IO_percentage/2133.0e6*mcp.clockRate/17066*mcp.peakDataTransferRate*mcp.dataBusWidth/16.0*(l_ip.F_sz_um/0.040)* (l_ip.F_sz_um/0.040)*mcp.num_channels*1e6);//um^2 + phy_gates = 200000*mcp.dataBusWidth/64.0; + power_per_gb_per_s = 0.01; + //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down + power_t.readOp.dynamic = power_per_gb_per_s*(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2; + power_t.readOp.leakage = (mcp.withPHY? phy_gates:0)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W + power_t.readOp.gate_leakage = (mcp.withPHY? 
phy_gates:0)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          }
+
+  }
+  else
+  {
+          area.set_area(0.4e6/2*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
+  }
+
+//  double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power numbers are based on 72 bit DIMM interface
+//  power_t.readOp.dynamic *= phy_factor;
+//  power_t.readOp.leakage *= phy_factor;
+//  power_t.readOp.gate_leakage *= phy_factor;
+
+  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+
+/*
+ * Fills in the PHY's peak (is_tdp == true) or runtime (is_tdp == false)
+ * power numbers from power_t, which compute() has already populated.
+ *
+ * Peak:    `power` becomes power_t with its dynamic term scaled by the peak
+ *          transfer rate (converted to Gb/s), the bus width relative to the
+ *          transfer unit (72 for MC, 16 otherwise) and the channel count,
+ *          then divided by the clock rate.
+ * Runtime: `rt_power` becomes power_t with its dynamic term scaled by the
+ *          transferred bits (accesses * llcBlockSize * 8) / 1e9, plus 10% of
+ *          the peak dynamic figure. The last step reads `power`, so the peak
+ *          pass has to run first.
+ */
+void MCPHY::computeEnergy(bool is_tdp)
+{
+  if (is_tdp)
+  {
+      //init stats for Peak
+      stats_t.readAc.access  = 0.5*mcp.num_channels; //time share on buses
+      stats_t.writeAc.access = 0.5*mcp.num_channels;
+      tdp_stats = stats_t;
+  }
+  else
+  {
+      //init stats for runtime power (RTP)
+      stats_t.readAc.access  = mcp.reads;
+      stats_t.writeAc.access = mcp.writes;
+      // Runtime counts go to rtp_stats so the peak snapshot in tdp_stats is
+      // not overwritten (MCFrontEnd::computeEnergy does the same for its buffers).
+      // NOTE(review): assumes MCPHY declares rtp_stats next to tdp_stats - confirm in memoryctrl.h
+      rtp_stats = stats_t;
+  }
+
+  if (is_tdp)
+  {
+      double data_transfer_unit = (mc_type == MC)? 72:16;/*DIMM data width*/
+      power = power_t;
+      power.readOp.dynamic = power.readOp.dynamic * (mcp.peakDataTransferRate*8*1e6/1e9/*change to Gbs*/)*mcp.dataBusWidth/data_transfer_unit*mcp.num_channels/mcp.clockRate;
+      // dividing by the clock rate matches the final computation, where the value is multiplied by the clock rate again
+      //(stats_t.readAc.access*power_t.readOp.dynamic+
+//        stats_t.writeAc.access*power_t.readOp.dynamic);
+
+  }
+  else
+  {
+      rt_power = power_t;
+//    rt_power.readOp.dynamic = (stats_t.readAc.access*power_t.readOp.dynamic+
+//                               stats_t.writeAc.access*power_t.readOp.dynamic);
+
+      rt_power.readOp.dynamic=power_t.readOp.dynamic*(stats_t.readAc.access + stats_t.writeAc.access)*(mcp.llcBlockSize)*8/1e9/mcp.executionTime*(mcp.executionTime);
+      rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+  }
+}
+
+MCFrontEnd::MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_),
+ mc_type(mc_type_),
+ mcp(mcp_),
+ MC_arb(0),
+ frontendBuffer(0),
+ readBuffer(0),
+ writeBuffer(0)
+{
+  /* All computations are for a single MC
+   *
+   */
+
+  int tag, data;
+  bool is_default =true;//indication for default setup
+
+  /* MC frontend engine channels share the same engines but logically partitioned
+   * For all hardware inside MC. different channels do not share resources.
+   * TODO: add docodeing/mux stage to steer memory requests to different channels.
+ */ + + //memory request reorder buffer + tag = mcp.addressBusWidth + EXTRA_TAG_BITS + mcp.opcodeW; + data = int(ceil((XML->sys.physical_address_width + mcp.opcodeW)/8.0)); + interface_ip.cache_sz = data*XML->sys.mc.req_window_size_per_channel; + interface_ip.line_sz = data; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.access_mode = 0; + interface_ip.throughput = 1.0/mcp.clockRate; + interface_ip.latency = 1.0/mcp.clockRate; + interface_ip.is_cache = true; + interface_ip.pure_cam = false; + interface_ip.pure_ram = false; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc; + interface_ip.num_wr_ports = interface_ip.num_rd_ports; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports = XML->sys.mc.memory_channels_per_mc; + frontendBuffer = new ArrayST(&interface_ip, "MC ReorderBuffer", Uncore_device); + frontendBuffer->area.set_area(frontendBuffer->area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc); + area.set_area(area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc); + + //selection and arbitration logic + MC_arb = new selection_logic(is_default, XML->sys.mc.req_window_size_per_channel,1,&interface_ip, Uncore_device); + + //read buffers. 
+ data = (int)ceil(mcp.dataBusWidth/8.0);//Support key words first operation //8 means converting bit to Byte + interface_ip.cache_sz = data*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize; + interface_ip.line_sz = data; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 1; + interface_ip.throughput = 1.0/mcp.clockRate; + interface_ip.latency = 1.0/mcp.clockRate; + interface_ip.is_cache = false; + interface_ip.pure_cam = false; + interface_ip.pure_ram = true; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0;//XML->sys.mc.memory_channels_per_mc*2>2?2:XML->sys.mc.memory_channels_per_mc*2; + interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc; + interface_ip.num_wr_ports = interface_ip.num_rd_ports; + interface_ip.num_se_rd_ports = 0; + readBuffer = new ArrayST(&interface_ip, "MC ReadBuffer", Uncore_device); + readBuffer->area.set_area(readBuffer->area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc); + area.set_area(area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc); + + //write buffer + data = (int)ceil(mcp.dataBusWidth/8.0);//Support key words first operation //8 means converting bit to Byte + interface_ip.cache_sz = data*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize; + interface_ip.line_sz = data; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = 1.0/mcp.clockRate; + interface_ip.latency = 1.0/mcp.clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 0; + interface_ip.num_rd_ports = 
XML->sys.mc.memory_channels_per_mc; + interface_ip.num_wr_ports = interface_ip.num_rd_ports; + interface_ip.num_se_rd_ports = 0; + writeBuffer = new ArrayST(&interface_ip, "MC writeBuffer", Uncore_device); + writeBuffer->area.set_area(writeBuffer->area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc); + area.set_area(area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc); +} + +void MCFrontEnd::computeEnergy(bool is_tdp) +{ + if (is_tdp) + { + //init stats for Peak + frontendBuffer->stats_t.readAc.access = frontendBuffer->l_ip.num_search_ports; + frontendBuffer->stats_t.writeAc.access = frontendBuffer->l_ip.num_wr_ports; + frontendBuffer->tdp_stats = frontendBuffer->stats_t; + + readBuffer->stats_t.readAc.access = readBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle; + readBuffer->stats_t.writeAc.access = readBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle; + readBuffer->tdp_stats = readBuffer->stats_t; + + writeBuffer->stats_t.readAc.access = writeBuffer->l_ip.num_rd_ports*mcp.frontend_duty_cycle; + writeBuffer->stats_t.writeAc.access = writeBuffer->l_ip.num_wr_ports*mcp.frontend_duty_cycle; + writeBuffer->tdp_stats = writeBuffer->stats_t; + + } + else + { + //init stats for runtime power (RTP) + frontendBuffer->stats_t.readAc.access = XML->sys.mc.memory_reads *mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72; + //For each channel, each memory word need to check the address data to achieve best scheduling results. 
+ //and this need to be done on all physical DIMMs in each logical memory DIMM *mcp.dataBusWidth/72 + frontendBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72; + frontendBuffer->rtp_stats = frontendBuffer->stats_t; + + readBuffer->stats_t.readAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first + readBuffer->stats_t.writeAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first + readBuffer->rtp_stats = readBuffer->stats_t; + + writeBuffer->stats_t.readAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth; + writeBuffer->stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth; + writeBuffer->rtp_stats = writeBuffer->stats_t; + } + + frontendBuffer->power_t.reset(); + readBuffer->power_t.reset(); + writeBuffer->power_t.reset(); + +// frontendBuffer->power_t.readOp.dynamic += (frontendBuffer->stats_t.readAc.access* +// (frontendBuffer->local_result.power.searchOp.dynamic+frontendBuffer->local_result.power.readOp.dynamic)+ +// frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic); + + frontendBuffer->power_t.readOp.dynamic += (frontendBuffer->stats_t.readAc.access + + frontendBuffer->stats_t.writeAc.access)*frontendBuffer->local_result.power.searchOp.dynamic + + frontendBuffer->stats_t.readAc.access * frontendBuffer->local_result.power.readOp.dynamic + + frontendBuffer->stats_t.writeAc.access*frontendBuffer->local_result.power.writeOp.dynamic; + + readBuffer->power_t.readOp.dynamic += (readBuffer->stats_t.readAc.access* + readBuffer->local_result.power.readOp.dynamic+ + readBuffer->stats_t.writeAc.access*readBuffer->local_result.power.writeOp.dynamic); + writeBuffer->power_t.readOp.dynamic += (writeBuffer->stats_t.readAc.access* + writeBuffer->local_result.power.readOp.dynamic+ + 
writeBuffer->stats_t.writeAc.access*writeBuffer->local_result.power.writeOp.dynamic); + + if (is_tdp) + { + power = power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t + + (frontendBuffer->local_result.power + + readBuffer->local_result.power + + writeBuffer->local_result.power)*pppm_lkg; + + } + else + { + rt_power = rt_power + frontendBuffer->power_t + readBuffer->power_t + writeBuffer->power_t + + (frontendBuffer->local_result.power + + readBuffer->local_result.power + + writeBuffer->local_result.power)*pppm_lkg; + rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime; + } +} + +void MCFrontEnd::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + + if (is_tdp) + { + cout << indent_str << "Front End ROB:" << endl; + cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << frontendBuffer->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << frontendBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + + cout <<endl; + cout << indent_str<< "Read Buffer:" << endl; + cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << readBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << readBuffer->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " 
<< readBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + cout <<endl; + cout << indent_str << "Write Buffer:" << endl; + cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << writeBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << writeBuffer->power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << writeBuffer->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << writeBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + cout <<endl; + } + else + { + cout << indent_str << "Front End ROB:" << endl; + cout << indent_str_next << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << frontendBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << frontendBuffer->rt_power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << frontendBuffer->rt_power.readOp.gate_leakage << " W" << endl; + cout <<endl; + cout << indent_str<< "Read Buffer:" << endl; + cout << indent_str_next << "Area = " << readBuffer->area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << readBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << readBuffer->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << readBuffer->rt_power.readOp.gate_leakage << " W" << endl; + cout <<endl; + cout << indent_str << "Write Buffer:" << endl; + cout << indent_str_next << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << writeBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << 
indent_str_next << "Subthreshold Leakage = " << writeBuffer->rt_power.readOp.leakage << " W" << endl; + cout << indent_str_next << "Gate Leakage = " << writeBuffer->rt_power.readOp.gate_leakage << " W" << endl; + } + +} + + +MemoryController::MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_) +:XML(XML_interface), + interface_ip(*interface_ip_), + mc_type(mc_type_), + frontend(0), + transecEngine(0), + PHY(0), + pipeLogic(0) +{ + /* All computations are for a single MC + * + */ + interface_ip.wire_is_mat_type = 2; + interface_ip.wire_os_mat_type = 2; + interface_ip.wt =Global; + set_mc_param(); + frontend = new MCFrontEnd(XML, &interface_ip, mcp, mc_type); + area.set_area(area.get_area()+ frontend->area.get_area()); + transecEngine = new MCBackend(&interface_ip, mcp, mc_type); + area.set_area(area.get_area()+ transecEngine->area.get_area()); + if (mcp.type==0 || (mcp.type==1&&mcp.withPHY)) + { + PHY = new MCPHY(&interface_ip, mcp, mc_type); + area.set_area(area.get_area()+ PHY->area.get_area()); + } + //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc. 
+// transecEngine.initialize(&interface_ip); +// transecEngine.peakDataTransferRate = XML->sys.mem.peak_transfer_rate; +// transecEngine.memDataWidth = dataBusWidth; +// transecEngine.memRank = XML->sys.mem.number_ranks; +// //transecEngine.memAccesses=XML->sys.mc.memory_accesses; +// //transecEngine.llcBlocksize=llcBlockSize; +// transecEngine.compute(); +// transecEngine.area.set_area(XML->sys.mc.memory_channels_per_mc*transecEngine.area.get_area()) ; +// area.set_area(area.get_area()+ transecEngine.area.get_area()); +// ///cout<<"area="<<area<<endl; +//// +// //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers +// PHY.initialize(&interface_ip); +// PHY.peakDataTransferRate = XML->sys.mem.peak_transfer_rate; +// PHY.memDataWidth = dataBusWidth; +// //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power +// //PHY.llcBlocksize=llcBlockSize; +// PHY.compute(); +// PHY.area.set_area(XML->sys.mc.memory_channels_per_mc*PHY.area.get_area()) ; +// area.set_area(area.get_area()+ PHY.area.get_area()); + ///cout<<"area="<<area<<endl; +// +// interface_ip.pipeline_stages = 5;//normal memory controller has five stages in the pipeline. +// interface_ip.per_stage_vector = addressBusWidth + XML->sys.core[0].opcode_width + dataBusWidth; +// pipeLogic = new pipeline(is_default, &interface_ip); +// //pipeLogic.init_pipeline(is_default, &interface_ip); +// pipeLogic->compute_pipeline(); +// area.set_area(area.get_area()+ pipeLogic->area.get_area()*1e-6); +// area.set_area((area.get_area()+mc_area*1e-6)*1.1);//placement and routing overhead +// +// +//// //clock +//// clockNetwork.init_wire_external(is_default, &interface_ip); +//// clockNetwork.clk_area =area*1.1;//10% of placement overhead. 
rule of thumb +//// clockNetwork.end_wiring_level =5;//toplevel metal +//// clockNetwork.start_wiring_level =5;//toplevel metal +//// clockNetwork.num_regs = pipeLogic.tot_stage_vector; +//// clockNetwork.optimize_wire(); + + +} +/* Runs computeEnergy(is_tdp) on the front end and the transaction engine, and on the PHY when mcp.type==0 or (mcp.type==1 and mcp.withPHY), then adds their results onto this controller's totals: `power` when is_tdp is true, `rt_power` otherwise. The totals are added to their current values, not reset. */ +void MemoryController::computeEnergy(bool is_tdp) +{ + + frontend->computeEnergy(is_tdp); + transecEngine->computeEnergy(is_tdp); + if (mcp.type==0 || (mcp.type==1&&mcp.withPHY)) + { + PHY->computeEnergy(is_tdp); + } + if (is_tdp) + { + power = power + frontend->power + transecEngine->power; + if (mcp.type==0 || (mcp.type==1&&mcp.withPHY)) + { + power = power + PHY->power; + } + } + else + { + rt_power = rt_power + frontend->rt_power + transecEngine->rt_power; + if (mcp.type==0 || (mcp.type==1&&mcp.withPHY)) + { + rt_power = rt_power + PHY->rt_power; + } + } +} + +/* Prints the memory controller's area and power figures to cout. `indent` sets the width of the leading blank prefix; subthreshold leakage is taken from longer_channel_leakage when XML->sys.longer_channel_device is set. */ +void MemoryController::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { + cout << "Memory Controller:" << endl; + cout << indent_str<< "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str<< "Subthreshold Leakage = " + << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str<< "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str<< "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + cout<<endl; + cout << indent_str << "Front End Engine:" << endl; + cout << indent_str_next << "Area = " << frontend->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << frontend->power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? frontend->power.readOp.longer_channel_leakage:frontend->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << frontend->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << frontend->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + cout <<endl; + if (plevel >2){ + /* MCFrontEnd::displayEnergy takes (indent, plevel, is_tdp); forward plevel explicitly so is_tdp is not consumed as the print level. */ + frontend->displayEnergy(indent+4,plevel,is_tdp); + } + cout << indent_str << "Transaction Engine:" << endl; + cout << indent_str_next << "Area = " << transecEngine->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << transecEngine->power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
transecEngine->power.readOp.longer_channel_leakage:transecEngine->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << transecEngine->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << transecEngine->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + cout <<endl; + if (mcp.type==0 || (mcp.type==1&&mcp.withPHY)) + { + cout << indent_str << "PHY:" << endl; + cout << indent_str_next << "Area = " << PHY->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << PHY->power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? PHY->power.readOp.longer_channel_leakage:PHY->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << PHY->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << PHY->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl; + cout <<endl; + } + } + else + { + cout << "Memory Controller:" << endl; + cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout<<endl; + } + +} + +void MemoryController::set_mc_param() +{ + + if (mc_type==MC) + { + mcp.clockRate =XML->sys.mc.mc_clock*2;//DDR double pumped + mcp.clockRate *= 1e6; + mcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + + mcp.llcBlockSize =int(ceil(XML->sys.mc.llc_line_length/8.0))+XML->sys.mc.llc_line_length;//ecc overhead + mcp.dataBusWidth =int(ceil(XML->sys.mc.databus_width/8.0)) + XML->sys.mc.databus_width; + mcp.addressBusWidth 
=int(ceil(XML->sys.mc.addressbus_width));//XML->sys.physical_address_width; + mcp.opcodeW =16; + mcp.num_mcs = XML->sys.mc.number_mcs; + mcp.num_channels = XML->sys.mc.memory_channels_per_mc; + mcp.reads = XML->sys.mc.memory_reads; + mcp.writes = XML->sys.mc.memory_writes; + //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc. + mcp.peakDataTransferRate = XML->sys.mc.peak_transfer_rate; + mcp.memRank = XML->sys.mc.number_ranks; + //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers + //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power + //PHY.llcBlocksize=llcBlockSize; + mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared + mcp.LVDS = XML->sys.mc.LVDS; + mcp.type = XML->sys.mc.type; + mcp.withPHY = XML->sys.mc.withPHY; + } +// else if (mc_type==FLASHC) +// { +// mcp.clockRate =XML->sys.flashc.mc_clock*2;//DDR double pumped +// mcp.clockRate *= 1e6; +// mcp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); +// +// mcp.llcBlockSize =int(ceil(XML->sys.flashc.llc_line_length/8.0))+XML->sys.flashc.llc_line_length;//ecc overhead +// mcp.dataBusWidth =int(ceil(XML->sys.flashc.databus_width/8.0)) + XML->sys.flashc.databus_width; +// mcp.addressBusWidth =int(ceil(XML->sys.flashc.addressbus_width));//XML->sys.physical_address_width; +// mcp.opcodeW =16; +// mcp.num_mcs = XML->sys.flashc.number_mcs; +// mcp.num_channels = XML->sys.flashc.memory_channels_per_mc; +// mcp.reads = XML->sys.flashc.memory_reads; +// mcp.writes = XML->sys.flashc.memory_writes; +// //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc. 
+// mcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate; +// mcp.memRank = XML->sys.flashc.number_ranks; +// //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers +// //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power +// //PHY.llcBlocksize=llcBlockSize; +// mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared +// mcp.LVDS = XML->sys.flashc.LVDS; +// mcp.type = XML->sys.flashc.type; +// } + else + { + cout<<"Unknown memory controller type: neither DRAM controller nor Flash controller" <<endl; + exit(0); + } +} + +MCFrontEnd ::~MCFrontEnd(){ + + if(MC_arb) {delete MC_arb; MC_arb = 0;} + if(frontendBuffer) {delete frontendBuffer; frontendBuffer = 0;} + if(readBuffer) {delete readBuffer; readBuffer = 0;} + if(writeBuffer) {delete writeBuffer; writeBuffer = 0;} +} + +MemoryController ::~MemoryController(){ + + if(frontend) {delete frontend; frontend = 0;} + if(transecEngine) {delete transecEngine; transecEngine = 0;} + if(PHY) {delete PHY; PHY = 0;} + if(pipeLogic) {delete pipeLogic; pipeLogic = 0;} +} + diff --git a/ext/mcpat/memoryctrl.h b/ext/mcpat/memoryctrl.h new file mode 100644 index 000000000..65be20a8f --- /dev/null +++ b/ext/mcpat/memoryctrl.h @@ -0,0 +1,113 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef MEMORYCTRL_H_ +#define MEMORYCTRL_H_ + +#include "XML_Parse.h" +#include "parameter.h" +//#include "io.h" +#include "array.h" +//#include "Undifferentiated_Core_Area.h" +#include <vector> + +#include "basic_components.h" + +class MCBackend : public Component { + public: + InputParameter l_ip; + uca_org_t local_result; + enum MemoryCtrl_type mc_type; + MCParam mcp; + 
statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + MCBackend(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_); + void compute(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~MCBackend(){}; +}; + +class MCPHY : public Component { + public: + InputParameter l_ip; + uca_org_t local_result; + enum MemoryCtrl_type mc_type; + MCParam mcp; + statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + MCPHY(InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_); + void compute(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~MCPHY(){}; +}; + +class MCFrontEnd : public Component { + public: + ParseXML *XML; + InputParameter interface_ip; + enum MemoryCtrl_type mc_type; + MCParam mcp; + selection_logic * MC_arb; + ArrayST * frontendBuffer; + ArrayST * readBuffer; + ArrayST * writeBuffer; + + MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~MCFrontEnd(); +}; + +class MemoryController : public Component { + public: + ParseXML *XML; + InputParameter interface_ip; + enum MemoryCtrl_type mc_type; + MCParam mcp; + MCFrontEnd * frontend; + MCBackend * transecEngine; + MCPHY * PHY; + Pipeline * pipeLogic; + + //clock_network clockNetwork; + MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_); + void set_mc_param(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~MemoryController(); +}; +#endif /* MEMORYCTRL_H_ */ diff --git a/ext/mcpat/noc.cc b/ext/mcpat/noc.cc new file mode 100644 index 
000000000..d5dfbb137 --- /dev/null +++ b/ext/mcpat/noc.cc @@ -0,0 +1,355 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <iostream> +#include <string> + +#include "XML_Parse.h" +#include "basic_circuit.h" +#include "const.h" +#include "io.h" +#include "noc.h" +#include "parameter.h" + +NoC::NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_, double link_len_) +:XML(XML_interface), +ithNoC(ithNoC_), +interface_ip(*interface_ip_), +router(0), +link_bus(0), +link_bus_exist(false), +router_exist(false), +M_traffic_pattern(M_traffic_pattern_) +{ + /* + * initialize, compute and optimize individual components. 
+ */ + + if (XML->sys.Embedded) + { + interface_ip.wt =Global_30; + interface_ip.wire_is_mat_type = 0; + interface_ip.wire_os_mat_type = 1; + } + else + { + interface_ip.wt =Global; + interface_ip.wire_is_mat_type = 2; + interface_ip.wire_os_mat_type = 2; + } + set_noc_param(); + local_result=init_interface(&interface_ip); + scktRatio = g_tp.sckt_co_eff; + + if (nocdynp.type) + {/* + * if NOC compute router, router links must be computed separately + * and called from external + * since total chip area must be known first + */ + init_router(); + } + else + { + init_link_bus(link_len_); //if bus compute bus + } + + // //clock power + // clockNetwork.init_wire_external(is_default, &interface_ip); + // clockNetwork.clk_area =area*1.1;//10% of placement overhead. rule of thumb + // clockNetwork.end_wiring_level =5;//toplevel metal + // clockNetwork.start_wiring_level =5;//toplevel metal + // clockNetwork.num_regs = corepipe.tot_stage_vector; + // clockNetwork.optimize_wire(); +} + +void NoC::init_router() +{ + router = new Router(nocdynp.flit_size, + nocdynp.virtual_channel_per_port*nocdynp.input_buffer_entries_per_vc, + nocdynp.virtual_channel_per_port, &(g_tp.peri_global), + nocdynp.input_ports,nocdynp.output_ports, M_traffic_pattern); + //router->print_router(); + area.set_area(area.get_area()+ router->area.get_area()*nocdynp.total_nodes); + + double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device); + router->power.readOp.longer_channel_leakage = router->power.readOp.leakage * long_channel_device_reduction; + router->buffer.power.readOp.longer_channel_leakage = router->buffer.power.readOp.leakage * long_channel_device_reduction; + router->crossbar.power.readOp.longer_channel_leakage = router->crossbar.power.readOp.leakage * long_channel_device_reduction; + router->arbiter.power.readOp.longer_channel_leakage = router->arbiter.power.readOp.leakage * long_channel_device_reduction; + router_exist = true; +} + +void NoC 
::init_link_bus(double link_len_) +{ + /* Builds the link/bus interconnect model: picks the display name from nocdynp.type, derives the per-link length from link_len_ (which must be > 0), creates link_bus, and adds the link area (per router, then times total_nodes) to this component's area. */ + + +// if (nocdynp.min_ports==1 ) + if (nocdynp.type) + link_name = "Links"; + else + link_name = "Bus"; + + link_len=link_len_; + assert(link_len>0); + + interface_ip.throughput = nocdynp.link_throughput/nocdynp.clockRate; + interface_ip.latency = nocdynp.link_latency/nocdynp.clockRate; + + link_len /= (nocdynp.horizontal_nodes + nocdynp.vertical_nodes)/2; + /* NOTE(review): if horizontal_nodes and vertical_nodes are integer fields, the divisor above truncates (e.g. 1 + 2 gives 1, not 1.5) - confirm whether that is intended. */ + + if (nocdynp.total_nodes >1) link_len /=2; //All links are shared by neighbors + link_bus = new interconnect(name, Uncore_device, 1, 1, nocdynp.flit_size, + link_len, &interface_ip, 3, true/*pipelinable*/, nocdynp.route_over_perc); + + link_bus_tot_per_Router.area.set_area(link_bus_tot_per_Router.area.get_area()+ link_bus->area.get_area() + * nocdynp.global_linked_ports); + + area.set_area(area.get_area()+ link_bus_tot_per_Router.area.get_area()* nocdynp.total_nodes); + link_bus_exist = true; +} +/* Adds router and link/bus contributions onto `power` when is_tdp is true, or onto `rt_power` otherwise. Note that the TDP branch rescales router->power in place before accumulating it. */ +void NoC::computeEnergy(bool is_tdp) +{ + //power_point_product_masks + double pppm_t[4] = {1,1,1,1}; + double M=nocdynp.duty_cycle; + if (is_tdp) + { + //init stats for TDP + stats_t.readAc.access = M; + tdp_stats = stats_t; + if (router_exist) + { + set_pppm(pppm_t, 1*M, 1, 1, 1);//reset traffic pattern + router->power = router->power*pppm_t; + set_pppm(pppm_t, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes, nocdynp.total_nodes); + power = power + router->power*pppm_t; + } + if (link_bus_exist) + { + if (nocdynp.type) + set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports -1), nocdynp.global_linked_ports, + nocdynp.global_linked_ports, nocdynp.global_linked_ports); + //reset traffic pattern; local port do not have router links + else + set_pppm(pppm_t, 1*M_traffic_pattern*M*(nocdynp.min_ports), nocdynp.global_linked_ports, + nocdynp.global_linked_ports, nocdynp.global_linked_ports);//reset traffic pattern + + link_bus_tot_per_Router.power = link_bus->power*pppm_t; + + set_pppm(pppm_t, nocdynp.total_nodes, + nocdynp.total_nodes, + nocdynp.total_nodes, + 
nocdynp.total_nodes); + power = power + link_bus_tot_per_Router.power*pppm_t; + + } + } + else + { + //init stats for runtime power (RTP) + stats_t.readAc.access = XML->sys.NoC[ithNoC].total_accesses; + rtp_stats = stats_t; + set_pppm(pppm_t, 1, 0 , 0, 0); + if (router_exist) + { + router->buffer.rt_power.readOp.dynamic = (router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic)*rtp_stats.readAc.access ; + router->crossbar.rt_power.readOp.dynamic = router->crossbar.power.readOp.dynamic*rtp_stats.readAc.access ; + router->arbiter.rt_power.readOp.dynamic = router->arbiter.power.readOp.dynamic*rtp_stats.readAc.access ; + + router->rt_power = router->rt_power + (router->buffer.rt_power + router->crossbar.rt_power + router->arbiter.rt_power)*pppm_t + + router->power*pppm_lkg;//TDP power must be calculated first! + rt_power = rt_power + router->rt_power; + } + if (link_bus_exist) + { + set_pppm(pppm_t, rtp_stats.readAc.access, 1 , 1, rtp_stats.readAc.access); + link_bus->rt_power = link_bus->power * pppm_t; + rt_power = rt_power + link_bus->rt_power; + } + + } +} + + +void NoC::displayEnergy(uint32_t indent,int plevel,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + double M =M_traffic_pattern*nocdynp.duty_cycle; + /*only router as a whole has been applied the M_traffic_pattern(0.6 by default) factor in router.cc; + * When power of crossbars, arbiters, etc need to be displayed, the M_traffic_pattern factor need to + * be applied together with McPAT's extra traffic pattern. + * */ + if (is_tdp) + { + cout << name << endl; + cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str<< "Peak Dynamic = " << power.readOp.dynamic*nocdynp.clockRate << " W" << endl; + cout << indent_str << "Subthreshold Leakage = " + << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str<< "Runtime Dynamic = " << rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl; + cout<<endl; + + if (router_exist) + { + cout << indent_str << "Router: " << endl; + cout << indent_str_next << "Area = " << router->area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next<< "Peak Dynamic = " << router->power.readOp.dynamic*nocdynp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? router->power.readOp.longer_channel_leakage:router->power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << router->power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next<< "Runtime Dynamic = " << router->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl; + cout<<endl; + if (plevel >2){ + cout << indent_str<< indent_str << "Virtual Channel Buffer:" << endl; + cout << indent_str<< indent_str_next << "Area = " << router->buffer.area.get_area()*1e-6*nocdynp.input_ports<< " mm^2" << endl; + cout << indent_str<< indent_str_next << "Peak Dynamic = " <<(router->buffer.power.readOp.dynamic + router->buffer.power.writeOp.dynamic) + *nocdynp.min_ports*M*nocdynp.clockRate << " W" << endl; + cout << indent_str<< indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
router->buffer.power.readOp.longer_channel_leakage*nocdynp.input_ports:router->buffer.power.readOp.leakage*nocdynp.input_ports) <<" W" << endl; + cout << indent_str<< indent_str_next << "Gate Leakage = " << router->buffer.power.readOp.gate_leakage*nocdynp.input_ports << " W" << endl; + cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->buffer.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< indent_str<< "Crossbar:" << endl; + cout << indent_str<< indent_str_next << "Area = " << router->crossbar.area.get_area()*1e-6 << " mm^2" << endl; + cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->crossbar.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M << " W" << endl; + cout << indent_str<< indent_str_next << "Subthreshold Leakage = " + << (long_channel? router->crossbar.power.readOp.longer_channel_leakage:router->crossbar.power.readOp.leakage) << " W" << endl; + cout << indent_str<< indent_str_next << "Gate Leakage = " << router->crossbar.power.readOp.gate_leakage << " W" << endl; + cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->crossbar.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl; + cout <<endl; + cout << indent_str<< indent_str<< "Arbiter:" << endl; + cout << indent_str<< indent_str_next << "Peak Dynamic = " << router->arbiter.power.readOp.dynamic*nocdynp.clockRate*nocdynp.min_ports*M << " W" << endl; + cout << indent_str<< indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
router->arbiter.power.readOp.longer_channel_leakage:router->arbiter.power.readOp.leakage) << " W" << endl; + cout << indent_str<< indent_str_next << "Gate Leakage = " << router->arbiter.power.readOp.gate_leakage << " W" << endl; + cout << indent_str<< indent_str_next << "Runtime Dynamic = " << router->arbiter.rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl; + cout <<endl; + } + } + if (link_bus_exist) + { + cout << indent_str << (nocdynp.type? "Per Router ":"") << link_name<<": " << endl; + cout << indent_str_next << "Area = " << link_bus_tot_per_Router.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next<< "Peak Dynamic = " << link_bus_tot_per_Router.power.readOp.dynamic* + nocdynp.clockRate << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? link_bus_tot_per_Router.power.readOp.longer_channel_leakage:link_bus_tot_per_Router.power.readOp.leakage) + <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << link_bus_tot_per_Router.power.readOp.gate_leakage + << " W" << endl; + cout << indent_str_next<< "Runtime Dynamic = " << link_bus->rt_power.readOp.dynamic/nocdynp.executionTime << " W" << endl; + cout<<endl; + + } + } + else + { +// cout << indent_str_next << "Instruction Fetch Unit Peak Dynamic = " << ifu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Instruction Fetch Unit Subthreshold Leakage = " << ifu->rt_power.readOp.leakage <<" W" << endl; +// cout << indent_str_next << "Instruction Fetch Unit Gate Leakage = " << ifu->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Load Store Unit Peak Dynamic = " << lsu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Load Store Unit Subthreshold Leakage = " << lsu->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Load Store Unit Gate Leakage = " << lsu->rt_power.readOp.gate_leakage << " W" << endl; +// cout << 
indent_str_next << "Memory Management Unit Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Memory Management Unit Subthreshold Leakage = " << mmu->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Memory Management Unit Gate Leakage = " << mmu->rt_power.readOp.gate_leakage << " W" << endl; +// cout << indent_str_next << "Execution Unit Peak Dynamic = " << exu->rt_power.readOp.dynamic*clockRate << " W" << endl; +// cout << indent_str_next << "Execution Unit Subthreshold Leakage = " << exu->rt_power.readOp.leakage << " W" << endl; +// cout << indent_str_next << "Execution Unit Gate Leakage = " << exu->rt_power.readOp.gate_leakage << " W" << endl; + } +} + +void NoC::set_noc_param() +{ + + nocdynp.type = XML->sys.NoC[ithNoC].type; + nocdynp.clockRate =XML->sys.NoC[ithNoC].clockrate; + nocdynp.clockRate *= 1e6; + nocdynp.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + + nocdynp.flit_size = XML->sys.NoC[ithNoC].flit_bits; + if (nocdynp.type) + { + nocdynp.input_ports = XML->sys.NoC[ithNoC].input_ports; + nocdynp.output_ports = XML->sys.NoC[ithNoC].output_ports;//later minus 1 + nocdynp.min_ports = min(nocdynp.input_ports,nocdynp.output_ports); + nocdynp.global_linked_ports = (nocdynp.input_ports-1) + (nocdynp.output_ports-1); + /* + * Except local i/o ports, all ports needs links( global_linked_ports); + * However only min_ports can be fully active simultaneously + * since the fewer number of ports (input or output ) is the bottleneck. 
+ */ + } + else + { + nocdynp.input_ports = 1; + nocdynp.output_ports = 1; + nocdynp.min_ports = min(nocdynp.input_ports,nocdynp.output_ports); + nocdynp.global_linked_ports = 1; + } + + nocdynp.virtual_channel_per_port = XML->sys.NoC[ithNoC].virtual_channel_per_port; + nocdynp.input_buffer_entries_per_vc = XML->sys.NoC[ithNoC].input_buffer_entries_per_vc; + + nocdynp.horizontal_nodes = XML->sys.NoC[ithNoC].horizontal_nodes; + nocdynp.vertical_nodes = XML->sys.NoC[ithNoC].vertical_nodes; + nocdynp.total_nodes = nocdynp.horizontal_nodes*nocdynp.vertical_nodes; + nocdynp.duty_cycle = XML->sys.NoC[ithNoC].duty_cycle; + nocdynp.has_global_link = XML->sys.NoC[ithNoC].has_global_link; + nocdynp.link_throughput = XML->sys.NoC[ithNoC].link_throughput; + nocdynp.link_latency = XML->sys.NoC[ithNoC].link_latency; + nocdynp.chip_coverage = XML->sys.NoC[ithNoC].chip_coverage; + nocdynp.route_over_perc = XML->sys.NoC[ithNoC].route_over_perc; + + assert (nocdynp.chip_coverage <=1); + assert (nocdynp.route_over_perc <=1); + + if (nocdynp.type) + name = "NOC"; + else + name = "BUSES"; + +} + + +NoC ::~NoC(){ + + if(router) {delete router; router = 0;} + if(link_bus) {delete link_bus; link_bus = 0;} +} diff --git a/ext/mcpat/noc.h b/ext/mcpat/noc.h new file mode 100644 index 000000000..31b5b3b2e --- /dev/null +++ b/ext/mcpat/noc.h @@ -0,0 +1,75 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef NOC_H_ +#define NOC_H_ +#include "XML_Parse.h" +#include "array.h" +#include "basic_components.h" +#include "interconnect.h" +#include "logic.h" +#include "parameter.h" +#include "router.h" + +class NoC :public Component { + public: + + ParseXML *XML; + int ithNoC; + InputParameter interface_ip; + double link_len; + double executionTime; + double scktRatio, 
chip_PR_overhead, macro_PR_overhead; + Router * router; + interconnect * link_bus; + NoCParam nocdynp; + uca_org_t local_result; + statsDef tdp_stats; + statsDef rtp_stats; + statsDef stats_t; + powerDef power_t; + Component link_bus_tot_per_Router; + bool link_bus_exist; + bool router_exist; + string name, link_name; + double M_traffic_pattern; + NoC(ParseXML *XML_interface, int ithNoC_, InputParameter* interface_ip_, double M_traffic_pattern_ = 0.6,double link_len_=0); + void set_noc_param(); + void computeEnergy(bool is_tdp=true); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + void init_link_bus(double link_len_); + void init_router(); + void computeEnergy_link_bus(bool is_tdp=true); + void displayEnergy_link_bus(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + ~NoC(); +}; + +#endif /* NOC_H_ */ diff --git a/ext/mcpat/processor.cc b/ext/mcpat/processor.cc new file mode 100644 index 000000000..8520c9633 --- /dev/null +++ b/ext/mcpat/processor.cc @@ -0,0 +1,839 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <iostream> + +#include "XML_Parse.h" +#include "array.h" +#include "basic_circuit.h" +#include "const.h" +#include "parameter.h" +#include "processor.h" +#include "version.h" + +Processor::Processor(ParseXML *XML_interface) +:XML(XML_interface),//TODO: using one global copy may have problems. + mc(0), + niu(0), + pcie(0), + flashcontroller(0) +{ + /* + * placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm + * There is no point to have heterogeneous memory controller on chip, + * thus McPAT only support homogeneous memory controllers. + */ + int i; + double pppm_t[4] = {1,1,1,1}; + set_proc_param(); + if (procdynp.homoCore) + numCore = procdynp.numCore==0? 0:1; + else + numCore = procdynp.numCore; + + if (procdynp.homoL2) + numL2 = procdynp.numL2==0? 
0:1; + else + numL2 = procdynp.numL2; + + if (XML->sys.Private_L2 && numCore != numL2) + { + cout<<"Number of private L2 does not match number of cores"<<endl; + exit(0); + } + + if (procdynp.homoL3) + numL3 = procdynp.numL3==0? 0:1; + else + numL3 = procdynp.numL3; + + if (procdynp.homoNOC) + numNOC = procdynp.numNOC==0? 0:1; + else + numNOC = procdynp.numNOC; + +// if (!procdynp.homoNOC) +// { +// cout<<"Current McPAT does not support heterogeneous NOC"<<endl; +// exit(0); +// } + + if (procdynp.homoL1Dir) + numL1Dir = procdynp.numL1Dir==0? 0:1; + else + numL1Dir = procdynp.numL1Dir; + + if (procdynp.homoL2Dir) + numL2Dir = procdynp.numL2Dir==0? 0:1; + else + numL2Dir = procdynp.numL2Dir; + + for (i = 0;i < numCore; i++) + { + cores.push_back(new Core(XML,i, &interface_ip)); + cores[i]->computeEnergy(); + cores[i]->computeEnergy(false); + if (procdynp.homoCore){ + core.area.set_area(core.area.get_area() + cores[i]->area.get_area()*procdynp.numCore); + set_pppm(pppm_t,cores[i]->clockRate*procdynp.numCore, procdynp.numCore,procdynp.numCore,procdynp.numCore); + core.power = core.power + cores[i]->power*pppm_t; + set_pppm(pppm_t,1/cores[i]->executionTime, procdynp.numCore,procdynp.numCore,procdynp.numCore); + core.rt_power = core.rt_power + cores[i]->rt_power*pppm_t; + area.set_area(area.get_area() + core.area.get_area());//placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm + power = power + core.power; + rt_power = rt_power + core.rt_power; + } + else{ + core.area.set_area(core.area.get_area() + cores[i]->area.get_area()); + area.set_area(area.get_area() + cores[i]->area.get_area());//placement and routing overhead is 10%, core scales worse than cache 40% is accumulated from 90 to 22nm + + set_pppm(pppm_t,cores[i]->clockRate, 1, 1, 1); + core.power = core.power + cores[i]->power*pppm_t; + power = power + cores[i]->power*pppm_t; + + set_pppm(pppm_t,1/cores[i]->executionTime, 1, 1, 1); + core.rt_power = core.rt_power 
+ cores[i]->rt_power*pppm_t; + rt_power = rt_power + cores[i]->rt_power*pppm_t; + } + } + + if (!XML->sys.Private_L2) + { + if (numL2 >0) + for (i = 0;i < numL2; i++) + { + l2array.push_back(new SharedCache(XML,i, &interface_ip)); + l2array[i]->computeEnergy(); + l2array[i]->computeEnergy(false); + if (procdynp.homoL2){ + l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area()*procdynp.numL2); + set_pppm(pppm_t,l2array[i]->cachep.clockRate*procdynp.numL2, procdynp.numL2,procdynp.numL2,procdynp.numL2); + l2.power = l2.power + l2array[i]->power*pppm_t; + set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, procdynp.numL2,procdynp.numL2,procdynp.numL2); + l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t; + area.set_area(area.get_area() + l2.area.get_area());//placement and routing overhead is 10%, l2 scales worse than cache 40% is accumulated from 90 to 22nm + power = power + l2.power; + rt_power = rt_power + l2.rt_power; + } + else{ + l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area()); + area.set_area(area.get_area() + l2array[i]->area.get_area());//placement and routing overhead is 10%, l2 scales worse than cache 40% is accumulated from 90 to 22nm + + set_pppm(pppm_t,l2array[i]->cachep.clockRate, 1, 1, 1); + l2.power = l2.power + l2array[i]->power*pppm_t; + power = power + l2array[i]->power*pppm_t;; + set_pppm(pppm_t,1/l2array[i]->cachep.executionTime, 1, 1, 1); + l2.rt_power = l2.rt_power + l2array[i]->rt_power*pppm_t; + rt_power = rt_power + l2array[i]->rt_power*pppm_t; + } + } + } + + if (numL3 >0) + for (i = 0;i < numL3; i++) + { + l3array.push_back(new SharedCache(XML,i, &interface_ip, L3)); + l3array[i]->computeEnergy(); + l3array[i]->computeEnergy(false); + if (procdynp.homoL3){ + l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area()*procdynp.numL3); + set_pppm(pppm_t,l3array[i]->cachep.clockRate*procdynp.numL3, procdynp.numL3,procdynp.numL3,procdynp.numL3); + l3.power = l3.power + l3array[i]->power*pppm_t; + 
set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, procdynp.numL3,procdynp.numL3,procdynp.numL3); + l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t; + area.set_area(area.get_area() + l3.area.get_area());//placement and routing overhead is 10%, l3 scales worse than cache 40% is accumulated from 90 to 22nm + power = power + l3.power; + rt_power = rt_power + l3.rt_power; + + } + else{ + l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area()); + area.set_area(area.get_area() + l3array[i]->area.get_area());//placement and routing overhead is 10%, l3 scales worse than cache 40% is accumulated from 90 to 22nm + set_pppm(pppm_t,l3array[i]->cachep.clockRate, 1, 1, 1); + l3.power = l3.power + l3array[i]->power*pppm_t; + power = power + l3array[i]->power*pppm_t; + set_pppm(pppm_t,1/l3array[i]->cachep.executionTime, 1, 1, 1); + l3.rt_power = l3.rt_power + l3array[i]->rt_power*pppm_t; + rt_power = rt_power + l3array[i]->rt_power*pppm_t; + + } + } + if (numL1Dir >0) + for (i = 0;i < numL1Dir; i++) + { + l1dirarray.push_back(new SharedCache(XML,i, &interface_ip, L1Directory)); + l1dirarray[i]->computeEnergy(); + l1dirarray[i]->computeEnergy(false); + if (procdynp.homoL1Dir){ + l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area()*procdynp.numL1Dir); + set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate*procdynp.numL1Dir, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir); + l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t; + set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, procdynp.numL1Dir,procdynp.numL1Dir,procdynp.numL1Dir); + l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t; + area.set_area(area.get_area() + l1dir.area.get_area());//placement and routing overhead is 10%, l1dir scales worse than cache 40% is accumulated from 90 to 22nm + power = power + l1dir.power; + rt_power = rt_power + l1dir.rt_power; + + } + else{ + l1dir.area.set_area(l1dir.area.get_area() + l1dirarray[i]->area.get_area()); + 
area.set_area(area.get_area() + l1dirarray[i]->area.get_area()); + set_pppm(pppm_t,l1dirarray[i]->cachep.clockRate, 1, 1, 1); + l1dir.power = l1dir.power + l1dirarray[i]->power*pppm_t; + power = power + l1dirarray[i]->power; + set_pppm(pppm_t,1/l1dirarray[i]->cachep.executionTime, 1, 1, 1); + l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power*pppm_t; + rt_power = rt_power + l1dirarray[i]->rt_power; + } + } + + if (numL2Dir >0) + for (i = 0;i < numL2Dir; i++) + { + l2dirarray.push_back(new SharedCache(XML,i, &interface_ip, L2Directory)); + l2dirarray[i]->computeEnergy(); + l2dirarray[i]->computeEnergy(false); + if (procdynp.homoL2Dir){ + l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area()*procdynp.numL2Dir); + set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate*procdynp.numL2Dir, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir); + l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t; + set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, procdynp.numL2Dir,procdynp.numL2Dir,procdynp.numL2Dir); + l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t; + area.set_area(area.get_area() + l2dir.area.get_area());//placement and routing overhead is 10%, l2dir scales worse than cache 40% is accumulated from 90 to 22nm + power = power + l2dir.power; + rt_power = rt_power + l2dir.rt_power; + + } + else{ + l2dir.area.set_area(l2dir.area.get_area() + l2dirarray[i]->area.get_area()); + area.set_area(area.get_area() + l2dirarray[i]->area.get_area()); + set_pppm(pppm_t,l2dirarray[i]->cachep.clockRate, 1, 1, 1); + l2dir.power = l2dir.power + l2dirarray[i]->power*pppm_t; + power = power + l2dirarray[i]->power*pppm_t; + set_pppm(pppm_t,1/l2dirarray[i]->cachep.executionTime, 1, 1, 1); + l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power*pppm_t; + rt_power = rt_power + l2dirarray[i]->rt_power*pppm_t; + } + } + + if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0) + { + mc = new MemoryController(XML, &interface_ip, 
MC); + mc->computeEnergy(); + mc->computeEnergy(false); + mcs.area.set_area(mcs.area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs); + area.set_area(area.get_area()+mc->area.get_area()*XML->sys.mc.number_mcs); + set_pppm(pppm_t,XML->sys.mc.number_mcs*mc->mcp.clockRate, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs); + mcs.power = mc->power*pppm_t; + power = power + mcs.power; + set_pppm(pppm_t,1/mc->mcp.executionTime, XML->sys.mc.number_mcs,XML->sys.mc.number_mcs,XML->sys.mc.number_mcs); + mcs.rt_power = mc->rt_power*pppm_t; + rt_power = rt_power + mcs.rt_power; + + } + + if (XML->sys.flashc.number_mcs >0 )//flash controller + { + flashcontroller = new FlashController(XML, &interface_ip); + flashcontroller->computeEnergy(); + flashcontroller->computeEnergy(false); + double number_fcs = flashcontroller->fcp.num_mcs; + flashcontrollers.area.set_area(flashcontrollers.area.get_area()+flashcontroller->area.get_area()*number_fcs); + area.set_area(area.get_area()+flashcontrollers.area.get_area()); + set_pppm(pppm_t,number_fcs, number_fcs ,number_fcs, number_fcs ); + flashcontrollers.power = flashcontroller->power*pppm_t; + power = power + flashcontrollers.power; + set_pppm(pppm_t,number_fcs , number_fcs ,number_fcs ,number_fcs ); + flashcontrollers.rt_power = flashcontroller->rt_power*pppm_t; + rt_power = rt_power + flashcontrollers.rt_power; + + } + + if (XML->sys.niu.number_units >0) + { + niu = new NIUController(XML, &interface_ip); + niu->computeEnergy(); + niu->computeEnergy(false); + nius.area.set_area(nius.area.get_area()+niu->area.get_area()*XML->sys.niu.number_units); + area.set_area(area.get_area()+niu->area.get_area()*XML->sys.niu.number_units); + set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units); + nius.power = niu->power*pppm_t; + power = power + nius.power; + set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate, 
XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units); + nius.rt_power = niu->rt_power*pppm_t; + rt_power = rt_power + nius.rt_power; + + } + + if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels >0) + { + pcie = new PCIeController(XML, &interface_ip); + pcie->computeEnergy(); + pcie->computeEnergy(false); + pcies.area.set_area(pcies.area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units); + area.set_area(area.get_area()+pcie->area.get_area()*XML->sys.pcie.number_units); + set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units); + pcies.power = pcie->power*pppm_t; + power = power + pcies.power; + set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate, XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units); + pcies.rt_power = pcie->rt_power*pppm_t; + rt_power = rt_power + pcies.rt_power; + + } + + if (numNOC >0) + { + for (i = 0;i < numNOC; i++) + { + if (XML->sys.NoC[i].type) + {//First add up area of routers if NoC is used + nocs.push_back(new NoC(XML,i, &interface_ip, 1)); + if (procdynp.homoNOC) + { + noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC); + area.set_area(area.get_area() + noc.area.get_area()); + } + else + { + noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()); + area.set_area(area.get_area() + nocs[i]->area.get_area()); + } + } + else + {//Bus based interconnect + nocs.push_back(new NoC(XML,i, &interface_ip, 1, sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage))); + if (procdynp.homoNOC){ + noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()*procdynp.numNOC); + area.set_area(area.get_area() + noc.area.get_area()); + } + else + { + noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area()); + area.set_area(area.get_area() + nocs[i]->area.get_area()); + } + } + } + + /* + * Compute global links associated with 
each NOC, if any. This must be done at the end (even after the NOC router part) since the total chip + * area must be obtain to decide the link routing + */ + for (i = 0;i < numNOC; i++) + { + if (nocs[i]->nocdynp.has_global_link && XML->sys.NoC[i].type) + { + nocs[i]->init_link_bus(sqrt(area.get_area()*XML->sys.NoC[i].chip_coverage));//compute global links + if (procdynp.homoNOC) + { + noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area() + * nocs[i]->nocdynp.total_nodes + * procdynp.numNOC); + area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area() + * nocs[i]->nocdynp.total_nodes + * procdynp.numNOC); + } + else + { + noc.area.set_area(noc.area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area() + * nocs[i]->nocdynp.total_nodes); + area.set_area(area.get_area() + nocs[i]->link_bus_tot_per_Router.area.get_area() + * nocs[i]->nocdynp.total_nodes); + } + } + } + //Compute energy of NoC (w or w/o links) or buses + for (i = 0;i < numNOC; i++) + { + nocs[i]->computeEnergy(); + nocs[i]->computeEnergy(false); + if (procdynp.homoNOC){ + set_pppm(pppm_t,procdynp.numNOC*nocs[i]->nocdynp.clockRate, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC); + noc.power = noc.power + nocs[i]->power*pppm_t; + set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, procdynp.numNOC,procdynp.numNOC,procdynp.numNOC); + noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t; + power = power + noc.power; + rt_power = rt_power + noc.rt_power; + } + else + { + set_pppm(pppm_t,nocs[i]->nocdynp.clockRate, 1, 1, 1); + noc.power = noc.power + nocs[i]->power*pppm_t; + power = power + nocs[i]->power*pppm_t; + set_pppm(pppm_t,1/nocs[i]->nocdynp.executionTime, 1, 1, 1); + noc.rt_power = noc.rt_power + nocs[i]->rt_power*pppm_t; + rt_power = rt_power + nocs[i]->rt_power*pppm_t; + + + } + } + } + +// //clock power +// globalClock.init_wire_external(is_default, &interface_ip); +// globalClock.clk_area =area*1e6; //change it from mm^2 to um^2 
+// globalClock.end_wiring_level =5;//toplevel metal +// globalClock.start_wiring_level =5;//toplevel metal +// globalClock.l_ip.with_clock_grid=false;//global clock does not drive local final nodes +// globalClock.optimize_wire(); + +} + +void Processor::displayDeviceType(int device_type_, uint32_t indent) +{ + string indent_str(indent, ' '); + + switch ( device_type_ ) { + + case 0 : + cout <<indent_str<<"Device Type= "<<"ITRS high performance device type"<<endl; + break; + case 1 : + cout <<indent_str<<"Device Type= "<<"ITRS low standby power device type"<<endl; + break; + case 2 : + cout <<indent_str<<"Device Type= "<<"ITRS low operating power device type"<<endl; + break; + case 3 : + cout <<indent_str<<"Device Type= "<<"LP-DRAM device type"<<endl; + break; + case 4 : + cout <<indent_str<<"Device Type= "<<"COMM-DRAM device type"<<endl; + break; + default : + { + cout <<indent_str<<"Unknown Device Type"<<endl; + exit(0); + } + } +} + +void Processor::displayInterconnectType(int interconnect_type_, uint32_t indent) +{ + string indent_str(indent, ' '); + + switch ( interconnect_type_ ) { + + case 0 : + cout <<indent_str<<"Interconnect metal projection= "<<"aggressive interconnect technology projection"<<endl; + break; + case 1 : + cout <<indent_str<<"Interconnect metal projection= "<<"conservative interconnect technology projection"<<endl; + break; + default : + { + cout <<indent_str<<"Unknown Interconnect Projection Type"<<endl; + exit(0); + } + } +} + +void Processor::displayEnergy(uint32_t indent, int plevel, bool is_tdp) +{ + int i; + bool long_channel = XML->sys.longer_channel_device; + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + if (is_tdp) + { + + if (plevel<5) + { + cout<<"\nMcPAT (version "<< VER_MAJOR <<"."<< VER_MINOR + << " of " << VER_UPDATE << ") results (current print level is "<< plevel + <<", please increase print level to see the details in components): "<<endl; + } + else + { + cout<<"\nMcPAT (version "<< VER_MAJOR 
<<"."<< VER_MINOR + << " of " << VER_UPDATE << ") results (current print level is 5)"<< endl; + } + cout <<"*****************************************************************************************"<<endl; + cout <<indent_str<<"Technology "<<XML->sys.core_tech_node<<" nm"<<endl; + //cout <<indent_str<<"Device Type= "<<XML->sys.device_type<<endl; + if (long_channel) + cout <<indent_str<<"Using Long Channel Devices When Appropriate"<<endl; + //cout <<indent_str<<"Interconnect metal projection= "<<XML->sys.interconnect_projection_type<<endl; + displayInterconnectType(XML->sys.interconnect_projection_type, indent); + cout <<indent_str<<"Core clock Rate(MHz) "<<XML->sys.core[0].clock_rate<<endl; + cout <<endl; + cout <<"*****************************************************************************************"<<endl; + cout <<"Processor: "<<endl; + cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Power = " << power.readOp.dynamic + + (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl; + cout << indent_str << "Total Leakage = " << + (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) + power.readOp.gate_leakage <<" W" << endl; + cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl; + cout << indent_str << "Subthreshold Leakage = " << (long_channel? 
power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + if (numCore >0){ + cout <<indent_str<<"Total Cores: "<<XML->sys.number_of_cores << " cores "<<endl; + displayDeviceType(XML->sys.device_type,indent); + cout << indent_str_next << "Area = " << core.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << core.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? core.power.readOp.longer_channel_leakage:core.power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << core.power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << core.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << core.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (!XML->sys.Private_L2) + { + if (numL2 >0){ + cout <<indent_str<<"Total L2s: "<<endl; + displayDeviceType(XML->sys.L2[0].device_type,indent); + cout << indent_str_next << "Area = " << l2.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << l2.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
l2.power.readOp.longer_channel_leakage:l2.power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << l2.power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << l2.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << l2.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + } + if (numL3 >0){ + cout <<indent_str<<"Total L3s: "<<endl; + displayDeviceType(XML->sys.L3[0].device_type, indent); + cout << indent_str_next << "Area = " << l3.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << l3.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? l3.power.readOp.longer_channel_leakage:l3.power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << l3.power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << l3.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << l3.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (numL1Dir >0){ + cout <<indent_str<<"Total First Level Directory: "<<endl; + displayDeviceType(XML->sys.L1Directory[0].device_type, indent); + cout << indent_str_next << "Area = " << l1dir.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << l1dir.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
l1dir.power.readOp.longer_channel_leakage:l1dir.power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << l1dir.power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << l1dir.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << l1dir.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (numL2Dir >0){ + cout <<indent_str<<"Total First Level Directory: "<<endl; + displayDeviceType(XML->sys.L1Directory[0].device_type, indent); + cout << indent_str_next << "Area = " << l2dir.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << l2dir.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? l2dir.power.readOp.longer_channel_leakage:l2dir.power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << l2dir.power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << l2dir.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << l2dir.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (numNOC >0){ + cout <<indent_str<<"Total NoCs (Network/Bus): "<<endl; + displayDeviceType(XML->sys.device_type, indent); + cout << indent_str_next << "Area = " << noc.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << noc.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
noc.power.readOp.longer_channel_leakage:noc.power.readOp.leakage) <<" W" << endl; + //cout << indent_str_next << "Subthreshold Leakage = " << noc.power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << noc.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << noc.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0) + { + cout <<indent_str<<"Total MCs: "<<XML->sys.mc.number_mcs << " Memory Controllers "<<endl; + displayDeviceType(XML->sys.device_type, indent); + cout << indent_str_next << "Area = " << mcs.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << mcs.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? mcs.power.readOp.longer_channel_leakage:mcs.power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << mcs.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << mcs.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (XML->sys.flashc.number_mcs >0) + { + cout <<indent_str<<"Total Flash/SSD Controllers: "<<flashcontroller->fcp.num_mcs << " Flash/SSD Controllers "<<endl; + displayDeviceType(XML->sys.device_type, indent); + cout << indent_str_next << "Area = " << flashcontrollers.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << flashcontrollers.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
flashcontrollers.power.readOp.longer_channel_leakage:flashcontrollers.power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << flashcontrollers.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << flashcontrollers.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (XML->sys.niu.number_units >0 ) + { + cout <<indent_str<<"Total NIUs: "<<niu->niup.num_units << " Network Interface Units "<<endl; + displayDeviceType(XML->sys.device_type, indent); + cout << indent_str_next << "Area = " << nius.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << nius.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? nius.power.readOp.longer_channel_leakage:nius.power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << nius.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << nius.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0) + { + cout <<indent_str<<"Total PCIes: "<<pcie->pciep.num_units << " PCIe Controllers "<<endl; + displayDeviceType(XML->sys.device_type, indent); + cout << indent_str_next << "Area = " << pcies.area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str_next << "Peak Dynamic = " << pcies.power.readOp.dynamic << " W" << endl; + cout << indent_str_next << "Subthreshold Leakage = " + << (long_channel? 
pcies.power.readOp.longer_channel_leakage:pcies.power.readOp.leakage) <<" W" << endl; + cout << indent_str_next << "Gate Leakage = " << pcies.power.readOp.gate_leakage << " W" << endl; + cout << indent_str_next << "Runtime Dynamic = " << pcies.rt_power.readOp.dynamic << " W" << endl; + cout <<endl; + } + cout <<"*****************************************************************************************"<<endl; + if (plevel >1) + { + for (i = 0;i < numCore; i++) + { + cores[i]->displayEnergy(indent+4,plevel,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + if (!XML->sys.Private_L2) + { + for (i = 0;i < numL2; i++) + { + l2array[i]->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + } + for (i = 0;i < numL3; i++) + { + l3array[i]->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + for (i = 0;i < numL1Dir; i++) + { + l1dirarray[i]->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + for (i = 0;i < numL2Dir; i++) + { + l2dirarray[i]->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + if (XML->sys.mc.number_mcs >0 && XML->sys.mc.memory_channels_per_mc>0) + { + mc->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + if (XML->sys.flashc.number_mcs >0 && XML->sys.flashc.memory_channels_per_mc>0) + { + flashcontroller->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + if (XML->sys.niu.number_units >0 ) + { + niu->displayEnergy(indent+4,is_tdp); + 
cout <<"*****************************************************************************************"<<endl; + } + if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels>0) + { + pcie->displayEnergy(indent+4,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + + for (i = 0;i < numNOC; i++) + { + nocs[i]->displayEnergy(indent+4,plevel,is_tdp); + cout <<"*****************************************************************************************"<<endl; + } + } + } + else + { + + } + +} + +void Processor::set_proc_param() +{ + bool debug = false; + + procdynp.homoCore = bool(debug?1:XML->sys.homogeneous_cores); + procdynp.homoL2 = bool(debug?1:XML->sys.homogeneous_L2s); + procdynp.homoL3 = bool(debug?1:XML->sys.homogeneous_L3s); + procdynp.homoNOC = bool(debug?1:XML->sys.homogeneous_NoCs); + procdynp.homoL1Dir = bool(debug?1:XML->sys.homogeneous_L1Directories); + procdynp.homoL2Dir = bool(debug?1:XML->sys.homogeneous_L2Directories); + + procdynp.numCore = XML->sys.number_of_cores; + procdynp.numL2 = XML->sys.number_of_L2s; + procdynp.numL3 = XML->sys.number_of_L3s; + procdynp.numNOC = XML->sys.number_of_NoCs; + procdynp.numL1Dir = XML->sys.number_of_L1Directories; + procdynp.numL2Dir = XML->sys.number_of_L2Directories; + procdynp.numMC = XML->sys.mc.number_mcs; + procdynp.numMCChannel = XML->sys.mc.memory_channels_per_mc; + +// if (procdynp.numCore<1) +// { +// cout<<" The target processor should at least have one core on chip." 
<<endl; +// exit(0); +// } + + // if (numNOCs<0 || numNOCs>2) + // { + // cout <<"number of NOCs must be 1 (only global NOCs) or 2 (both global and local NOCs)"<<endl; + // exit(0); + // } + + /* Basic parameters*/ + interface_ip.data_arr_ram_cell_tech_type = debug?0:XML->sys.device_type; + interface_ip.data_arr_peri_global_tech_type = debug?0:XML->sys.device_type; + interface_ip.tag_arr_ram_cell_tech_type = debug?0:XML->sys.device_type; + interface_ip.tag_arr_peri_global_tech_type = debug?0:XML->sys.device_type; + + interface_ip.ic_proj_type = debug?0:XML->sys.interconnect_projection_type; + interface_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied. + interface_ip.area_wt = 0;//Fixed number, This is used to exhaustive search for individual components. + interface_ip.dynamic_power_wt = 100;//Fixed number, This is used to exhaustive search for individual components. + interface_ip.leakage_power_wt = 0; + interface_ip.cycle_time_wt = 0; + + interface_ip.delay_dev = 10000;//Fixed number, make sure timing can be satisfied. + interface_ip.area_dev = 10000;//Fixed number, This is used to exhaustive search for individual components. + interface_ip.dynamic_power_dev = 10000;//Fixed number, This is used to exhaustive search for individual components. + interface_ip.leakage_power_dev = 10000; + interface_ip.cycle_time_dev = 10000; + + interface_ip.ed = 2; + interface_ip.burst_len = 1;//parameters are fixed for processor section, since memory is processed separately + interface_ip.int_prefetch_w = 1; + interface_ip.page_sz_bits = 0; + interface_ip.temp = debug?360: XML->sys.temperature; + interface_ip.F_sz_nm = debug?90:XML->sys.core_tech_node;//XML->sys.core_tech_node; + interface_ip.F_sz_um = interface_ip.F_sz_nm / 1000; + + //***********This section of code does not have real meaning, they are just to ensure all data will have initial value to prevent errors. 
+ //They will be overridden during each components initialization + interface_ip.cache_sz =64; + interface_ip.line_sz = 1; + interface_ip.assoc = 1; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.specific_tag = 1; + interface_ip.tag_w = 64; + interface_ip.access_mode = 2; + + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + + interface_ip.is_main_mem = false; + interface_ip.rpters_in_htree = true ; + interface_ip.ver_htree_wires_over_array = 0; + interface_ip.broadcast_addr_din_over_ver_htrees = 0; + + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports = 1; + interface_ip.nuca = 0; + interface_ip.nuca_bank_count = 0; + interface_ip.is_cache =true; + interface_ip.pure_ram =false; + interface_ip.pure_cam =false; + interface_ip.force_cache_config =false; + if (XML->sys.Embedded) + { + interface_ip.wt =Global_30; + interface_ip.wire_is_mat_type = 0; + interface_ip.wire_os_mat_type = 0; + } + else + { + interface_ip.wt =Global; + interface_ip.wire_is_mat_type = 2; + interface_ip.wire_os_mat_type = 2; + } + interface_ip.force_wiretype = false; + interface_ip.print_detail = 1; + interface_ip.add_ecc_b_ =true; +} + +Processor::~Processor(){ + while (!cores.empty()) + { + delete cores.back(); + cores.pop_back(); + } + while (!l2array.empty()) + { + delete l2array.back(); + l2array.pop_back(); + } + while (!l3array.empty()) + { + delete l3array.back(); + l3array.pop_back(); + } + while (!nocs.empty()) + { + delete nocs.back(); + nocs.pop_back(); + } + if (!mc) + { + delete mc; + } + if (!niu) + { + delete niu; + } + if (!pcie) + { + delete pcie; + } + if (!flashcontroller) + { + delete flashcontroller; + } +}; diff --git a/ext/mcpat/processor.h b/ext/mcpat/processor.h new file mode 100644 index 000000000..5a7a2f7f5 
--- /dev/null +++ b/ext/mcpat/processor.h @@ -0,0 +1,79 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ +#ifndef PROCESSOR_H_ +#define PROCESSOR_H_ + +#include <vector> + +#include "XML_Parse.h" +#include "arbiter.h" +#include "area.h" +#include "array.h" +#include "basic_components.h" +#include "core.h" +#include "decoder.h" +#include "iocontrollers.h" +#include "memoryctrl.h" +#include "noc.h" +#include "parameter.h" +#include "router.h" +#include "sharedcache.h" + +class Processor : public Component +{ + public: + ParseXML *XML; + vector<Core *> cores; + vector<SharedCache *> l2array; + vector<SharedCache *> l3array; + vector<SharedCache *> l1dirarray; + vector<SharedCache *> l2dirarray; + vector<NoC *> nocs; + MemoryController * mc; + NIUController * niu; + PCIeController * pcie; + FlashController * flashcontroller; + InputParameter interface_ip; + ProcParam procdynp; + //wire globalInterconnect; + //clock_network globalClock; + Component core, l2, l3, l1dir, l2dir, noc, mcs, cc, nius, pcies,flashcontrollers; + int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir; + Processor(ParseXML *XML_interface); + void compute(); + void set_proc_param(); + void displayEnergy(uint32_t indent = 0,int plevel = 100, bool is_tdp=true); + void displayDeviceType(int device_type_, uint32_t indent = 0); + void displayInterconnectType(int interconnect_type_, uint32_t indent = 0); + ~Processor(); +}; + +#endif /* PROCESSOR_H_ */ diff --git a/ext/mcpat/results/A9_2000 
b/ext/mcpat/results/A9_2000 new file mode 100644 index 000000000..e91243f6c --- /dev/null +++ b/ext/mcpat/results/A9_2000 @@ -0,0 +1,321 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + +Warning: Branch Target Buffer array structure cannot satisfy latency constraint. + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 40 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= conservative interconnect technology projection + Core clock Rate(MHz) 2000 + +***************************************************************************************** +Processor: + Area = 5.83937 mm^2 + Peak Power = 1.32283 W + Total Leakage = 0.182558 W + Peak Dynamic = 1.14027 W + Subthreshold Leakage = 0.0869601 W + Gate Leakage = 0.095598 W + Runtime Dynamic = 2.86361 W + + Total Cores: + Device Type= ITRS low operating power device type + Area = 5.33485 mm^2 + Peak Dynamic = 1.07823 W + Subthreshold Leakage = 0.0827641 W + Gate Leakage = 0.0887315 W + Runtime Dynamic = 0.975395 W + + Total First Level Directory: + Device Type= ITRS low operating power device type + Area = 0.489711 mm^2 + Peak Dynamic = 0.0449752 W + Subthreshold Leakage = 0.00397708 W + Gate Leakage = 0.00655632 W + Runtime Dynamic = 1.80289 W + + Total NoCs (Network/Bus): + Device Type= ITRS low operating power device type + Area = 0.0148119 mm^2 + Peak Dynamic = 0.0170648 W + Subthreshold Leakage = 0.000218992 W + Gate Leakage = 0.000310207 W + Runtime Dynamic = 0.0853239 W + +***************************************************************************************** +Core: + Area = 2.66742 mm^2 + Peak Dynamic = 0.539116 W + Subthreshold Leakage = 0.041382 W + Gate Leakage = 0.0443657 W + Runtime Dynamic = 0.975395 W + + Instruction Fetch Unit: + Area = 0.565848 mm^2 + Peak Dynamic = 0.184724 W + Subthreshold Leakage = 0.00572394 W + Gate Leakage 
= 0.00380598 W + Runtime Dynamic = 0.283222 W + + Instruction Cache: + Area = 0.235613 mm^2 + Peak Dynamic = 0.0310428 W + Subthreshold Leakage = 0.00309635 W + Gate Leakage = 0.00216385 W + Runtime Dynamic = 0.0461626 W + + Branch Target Buffer: + Area = 0.251259 mm^2 + Peak Dynamic = 0.0174433 W + Subthreshold Leakage = 0.00170231 W + Gate Leakage = 0.000908123 W + Runtime Dynamic = 0.0697733 W + + Branch Predictor: + Area = 0.064441 mm^2 + Peak Dynamic = 0.00815792 W + Subthreshold Leakage = 0.00070444 W + Gate Leakage = 0.000477387 W + Runtime Dynamic = 0.0113878 W + + Global Predictor: + Area = 0.0313969 mm^2 + Peak Dynamic = 0.00374527 W + Subthreshold Leakage = 0.00034631 W + Gate Leakage = 0.000233555 W + Runtime Dynamic = 0.00545806 W + + Local Predictor: + Area = 0.000711939 mm^2 + Peak Dynamic = 0.000301014 W + Subthreshold Leakage = 6.13457e-06 W + Gate Leakage = 5.63471e-06 W + Runtime Dynamic = 0.000471566 W + + Area = 0.000650815 mm^2 + Peak Dynamic = 0.000230123 W + Subthreshold Leakage = 5.7769e-06 W + Gate Leakage = 4.75075e-06 W + Runtime Dynamic = 0.000354988 W + + Chooser: + Area = 0.0313969 mm^2 + Peak Dynamic = 0.00374527 W + Subthreshold Leakage = 0.00034631 W + Gate Leakage = 0.000233555 W + Runtime Dynamic = 0.00545806 W + + RAS: + Area = 0.000996272 mm^2 + Peak Dynamic = 0.000366372 W + Subthreshold Leakage = 5.68653e-06 W + Gate Leakage = 4.64147e-06 W + Runtime Dynamic = 6.23994e-08 W + + Instruction Buffer: + Area = 0.00820192 mm^2 + Peak Dynamic = 0.0669878 W + Subthreshold Leakage = 6.33536e-05 W + Gate Leakage = 4.34841e-05 W + Runtime Dynamic = 0.0382787 W + + Instruction Decoder: + Area = 0.00468731 mm^2 + Peak Dynamic = 0.05881 W + Subthreshold Leakage = 0.000127696 W + Gate Leakage = 0.000115494 W + Runtime Dynamic = 0.11762 W + + Renaming Unit: + Area = 0.0903068 mm^2 + Peak Dynamic = 0.0451514 W + Subthreshold Leakage = 0.000345688 W + Gate Leakage = 0.00032022 W + Runtime Dynamic = 0.0731287 W + + Int Front End RAT: + Area = 
0.0543672 mm^2 + Peak Dynamic = 0.0237617 W + Subthreshold Leakage = 0.000175223 W + Gate Leakage = 0.000121525 W + Runtime Dynamic = 0.0475234 W + + FP Front End RAT: + Area = 0.0185325 mm^2 + Peak Dynamic = 0.00949419 W + Subthreshold Leakage = 0.000100325 W + Gate Leakage = 6.76251e-05 W + Runtime Dynamic = 0.00949419 W + + Free List: + Area = 0.00599955 mm^2 + Peak Dynamic = 0.00225065 W + Subthreshold Leakage = 1.24363e-05 W + Gate Leakage = 1.00844e-05 W + Runtime Dynamic = 0.0090026 W + + Int Retire RAT: + Area = 0.00605969 mm^2 + Peak Dynamic = 0.00448392 W + Subthreshold Leakage = 1.33231e-05 W + Gate Leakage = 1.16235e-05 W + Runtime Dynamic = 0.00448392 W + + FP Retire RAT: + Area = 0.000650815 mm^2 + Peak Dynamic = 0.00067334 W + Subthreshold Leakage = 5.7769e-06 W + Gate Leakage = 4.75075e-06 W + Runtime Dynamic = 0.00067334 W + + FP Free List: + Area = 0.00305098 mm^2 + Peak Dynamic = 0.00195124 W + Subthreshold Leakage = 8.81712e-06 W + Gate Leakage = 6.96054e-06 W + Runtime Dynamic = 0.00195124 W + + Load Store Unit: + Area = 0.274913 mm^2 + Peak Dynamic = 0.0347482 W + Subthreshold Leakage = 0.0032012 W + Gate Leakage = 0.00235752 W + Runtime Dynamic = 0.195304 W + + Data Cache: + Area = 0.240878 mm^2 + Peak Dynamic = 0.0293665 W + Subthreshold Leakage = 0.00312878 W + Gate Leakage = 0.00220794 W + Runtime Dynamic = 0.19026 W + + StoreQ: + Area = 0.00754674 mm^2 + Peak Dynamic = 0.00358087 W + Subthreshold Leakage = 4.2633e-05 W + Gate Leakage = 5.19212e-05 W + Runtime Dynamic = 0.00504348 W + + Memory Management Unit: + Area = 0.021508 mm^2 + Peak Dynamic = 0.0127337 W + Subthreshold Leakage = 0.000210621 W + Gate Leakage = 0.000290666 W + Runtime Dynamic = 0.037071 W + + Itlb: + Area = 0.00993091 mm^2 + Peak Dynamic = 0.00617846 W + Subthreshold Leakage = 9.04168e-05 W + Gate Leakage = 9.65082e-05 W + Runtime Dynamic = 0.012357 W + + Dtlb: + Area = 0.00993091 mm^2 + Peak Dynamic = 0.00438671 W + Subthreshold Leakage = 9.04168e-05 W + Gate Leakage 
= 9.65082e-05 W + Runtime Dynamic = 0.0247139 W + + Execution Unit: + Area = 1.65498 mm^2 + Peak Dynamic = 0.261758 W + Subthreshold Leakage = 0.0305522 W + Gate Leakage = 0.0360036 W + Runtime Dynamic = 0.386669 W + + Register Files: + Area = 0.203203 mm^2 + Peak Dynamic = 0.0763282 W + Subthreshold Leakage = 0.000197046 W + Gate Leakage = 0.00016338 W + Runtime Dynamic = 0.0386066 W + + Integer RF: + Area = 0.146073 mm^2 + Peak Dynamic = 0.0763282 W + Subthreshold Leakage = 0.000120303 W + Gate Leakage = 9.97867e-05 W + Runtime Dynamic = 0.0345689 W + + Floating Point RF: + Area = 0.05713 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 7.67427e-05 W + Gate Leakage = 6.35938e-05 W + Runtime Dynamic = 0.00403765 W + + Instruction Scheduler: + Area = 0.0582889 mm^2 + Peak Dynamic = 0.0522571 W + Subthreshold Leakage = 0.000128698 W + Gate Leakage = 0.000185714 W + Runtime Dynamic = 0.0787473 W + + Instruction Window: + Area = 0.053925 mm^2 + Peak Dynamic = 0.0445895 W + Subthreshold Leakage = 9.52936e-05 W + Gate Leakage = 0.000130718 W + Runtime Dynamic = 0.0602231 W + + FP Instruction Window: + Area = 0.00436388 mm^2 + Peak Dynamic = 0.00766759 W + Subthreshold Leakage = 3.34043e-05 W + Gate Leakage = 5.49962e-05 W + Runtime Dynamic = 0.0185242 W + + Integer ALUs (Count: 3 ): + Area = 0.312404 mm^2 + Peak Dynamic = 0.0283684 W + Subthreshold Leakage = 0.0140724 W + Gate Leakage = 0.0165703 W + Runtime Dynamic = 0.0373268 W + + Floating Point Units (FPUs) (Count: 1 ): + Area = 0.971259 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.0109377 W + Gate Leakage = 0.0128792 W + Runtime Dynamic = 0.0373268 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.104135 mm^2 + Peak Dynamic = 0.0204053 W + Subthreshold Leakage = 0.00469079 W + Gate Leakage = 0.00552345 W + Runtime Dynamic = 0.049769 W + + Results Broadcast Bus: + Area Overhead = 0.00404385 mm^2 + Peak Dynamic = 0.0824719 W + Subthreshold Leakage = 0.000495836 W + Gate Leakage = 0.000583852 W + Runtime 
Dynamic = 0.144892 W + +***************************************************************************************** +First Level Directory + Area = 0.244856 mm^2 + Peak Dynamic = 0.0224876 W + Subthreshold Leakage = 0.00198854 W + Gate Leakage = 0.00327816 W + Runtime Dynamic = 1.80289 W + +***************************************************************************************** +BUSES + Area = 0.0148119 mm^2 + Peak Dynamic = 0.0170648 W + Subthreshold Leakage = 0.000218992 W + Gate Leakage = 0.000310207 W + Runtime Dynamic = 0.0853239 W + + Bus: + Area = 0.0148119 mm^2 + Peak Dynamic = 0.0170648 W + Subthreshold Leakage = 0.000218992 W + Gate Leakage = 0.000310207 W + Runtime Dynamic = 0.0853239 W + +***************************************************************************************** diff --git a/ext/mcpat/results/A9_2000_withIOC b/ext/mcpat/results/A9_2000_withIOC new file mode 100644 index 000000000..b47509320 --- /dev/null +++ b/ext/mcpat/results/A9_2000_withIOC @@ -0,0 +1,410 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + +Warning: Branch Target Buffer array structure cannot satisfy latency constraint. 
+SerDer_dyn 0.00216115 +ctrl_dyn 0.0278216 +ctrl_dyn 6.14856e-11 +SerDer_dyn 1.54368e-11 + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 40 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= conservative interconnect technology projection + Core clock Rate(MHz) 2000 + +***************************************************************************************** +Processor: + Area = 7.05775 mm^2 + Peak Power = 2.06734 W + Total Leakage = 0.204814 W + Peak Dynamic = 1.86253 W + Subthreshold Leakage = 0.0916805 W + Gate Leakage = 0.113134 W + Runtime Dynamic = 5.3744 W + + Total Cores: 2 cores + Device Type= ITRS low operating power device type + Area = 5.33485 mm^2 + Peak Dynamic = 1.07823 W + Subthreshold Leakage = 0.0827641 W + Gate Leakage = 0.0887315 W + Runtime Dynamic = 0.975395 W + + Total First Level Directory: + Device Type= ITRS low operating power device type + Area = 0.489711 mm^2 + Peak Dynamic = 0.0449752 W + Subthreshold Leakage = 0.00397708 W + Gate Leakage = 0.00655632 W + Runtime Dynamic = 1.80289 W + + Total NoCs (Network/Bus): + Device Type= ITRS low operating power device type + Area = 0.0162858 mm^2 + Peak Dynamic = 0.0187629 W + Subthreshold Leakage = 0.000240784 W + Gate Leakage = 0.000341076 W + Runtime Dynamic = 0.0938146 W + + Total MCs: 1 Memory Controllers + Device Type= ITRS low operating power device type + Area = 0.554183 mm^2 + Peak Dynamic = 0.31033 W + Subthreshold Leakage = 0.0020922 W + Gate Leakage = 0.00751531 W + Runtime Dynamic = 2.21514 W + + Total Flash/SSD Controllers: 1 Flash/SSD Controllers + Device Type= ITRS low operating power device type + Area = 0.109065 mm^2 + Peak Dynamic = 0.0299827 W + Subthreshold Leakage = 0.000522213 W + Gate Leakage = 0.0020015 W + Runtime Dynamic = 0.0209879 W + + Total NIUs: 1 Network Interface Units + Device Type= ITRS low operating power 
device type + Area = 0.261302 mm^2 + Peak Dynamic = 0.164859 W + Subthreshold Leakage = 0.000730171 W + Gate Leakage = 0.00279855 W + Runtime Dynamic = 0.115402 W + + Total PCIes: 1 PCIe Controllers + Device Type= ITRS low operating power device type + Area = 0.292355 mm^2 + Peak Dynamic = 0.215383 W + Subthreshold Leakage = 0.00135405 W + Gate Leakage = 0.00518971 W + Runtime Dynamic = 0.150768 W + +***************************************************************************************** +Core: + Area = 2.66742 mm^2 + Peak Dynamic = 0.539116 W + Subthreshold Leakage = 0.041382 W + Gate Leakage = 0.0443657 W + Runtime Dynamic = 0.975395 W + + Instruction Fetch Unit: + Area = 0.565848 mm^2 + Peak Dynamic = 0.184724 W + Subthreshold Leakage = 0.00572394 W + Gate Leakage = 0.00380598 W + Runtime Dynamic = 0.283222 W + + Instruction Cache: + Area = 0.235613 mm^2 + Peak Dynamic = 0.0310428 W + Subthreshold Leakage = 0.00309635 W + Gate Leakage = 0.00216385 W + Runtime Dynamic = 0.0461626 W + + Branch Target Buffer: + Area = 0.251259 mm^2 + Peak Dynamic = 0.0174433 W + Subthreshold Leakage = 0.00170231 W + Gate Leakage = 0.000908123 W + Runtime Dynamic = 0.0697733 W + + Branch Predictor: + Area = 0.064441 mm^2 + Peak Dynamic = 0.00815792 W + Subthreshold Leakage = 0.00070444 W + Gate Leakage = 0.000477387 W + Runtime Dynamic = 0.0113878 W + + Global Predictor: + Area = 0.0313969 mm^2 + Peak Dynamic = 0.00374527 W + Subthreshold Leakage = 0.00034631 W + Gate Leakage = 0.000233555 W + Runtime Dynamic = 0.00545806 W + + Local Predictor: + Area = 0.000711939 mm^2 + Peak Dynamic = 0.000301014 W + Subthreshold Leakage = 6.13457e-06 W + Gate Leakage = 5.63471e-06 W + Runtime Dynamic = 0.000471566 W + + Area = 0.000650815 mm^2 + Peak Dynamic = 0.000230123 W + Subthreshold Leakage = 5.7769e-06 W + Gate Leakage = 4.75075e-06 W + Runtime Dynamic = 0.000354988 W + + Chooser: + Area = 0.0313969 mm^2 + Peak Dynamic = 0.00374527 W + Subthreshold Leakage = 0.00034631 W + Gate Leakage = 
0.000233555 W + Runtime Dynamic = 0.00545806 W + + RAS: + Area = 0.000996272 mm^2 + Peak Dynamic = 0.000366372 W + Subthreshold Leakage = 5.68653e-06 W + Gate Leakage = 4.64147e-06 W + Runtime Dynamic = 6.23994e-08 W + + Instruction Buffer: + Area = 0.00820192 mm^2 + Peak Dynamic = 0.0669878 W + Subthreshold Leakage = 6.33536e-05 W + Gate Leakage = 4.34841e-05 W + Runtime Dynamic = 0.0382787 W + + Instruction Decoder: + Area = 0.00468731 mm^2 + Peak Dynamic = 0.05881 W + Subthreshold Leakage = 0.000127696 W + Gate Leakage = 0.000115494 W + Runtime Dynamic = 0.11762 W + + Renaming Unit: + Area = 0.0903068 mm^2 + Peak Dynamic = 0.0451514 W + Subthreshold Leakage = 0.000345688 W + Gate Leakage = 0.00032022 W + Runtime Dynamic = 0.0731287 W + + Int Front End RAT: + Area = 0.0543672 mm^2 + Peak Dynamic = 0.0237617 W + Subthreshold Leakage = 0.000175223 W + Gate Leakage = 0.000121525 W + Runtime Dynamic = 0.0475234 W + + FP Front End RAT: + Area = 0.0185325 mm^2 + Peak Dynamic = 0.00949419 W + Subthreshold Leakage = 0.000100325 W + Gate Leakage = 6.76251e-05 W + Runtime Dynamic = 0.00949419 W + + Free List: + Area = 0.00599955 mm^2 + Peak Dynamic = 0.00225065 W + Subthreshold Leakage = 1.24363e-05 W + Gate Leakage = 1.00844e-05 W + Runtime Dynamic = 0.0090026 W + + Int Retire RAT: + Area = 0.00605969 mm^2 + Peak Dynamic = 0.00448392 W + Subthreshold Leakage = 1.33231e-05 W + Gate Leakage = 1.16235e-05 W + Runtime Dynamic = 0.00448392 W + + FP Retire RAT: + Area = 0.000650815 mm^2 + Peak Dynamic = 0.00067334 W + Subthreshold Leakage = 5.7769e-06 W + Gate Leakage = 4.75075e-06 W + Runtime Dynamic = 0.00067334 W + + FP Free List: + Area = 0.00305098 mm^2 + Peak Dynamic = 0.00195124 W + Subthreshold Leakage = 8.81712e-06 W + Gate Leakage = 6.96054e-06 W + Runtime Dynamic = 0.00195124 W + + Load Store Unit: + Area = 0.274913 mm^2 + Peak Dynamic = 0.0347482 W + Subthreshold Leakage = 0.0032012 W + Gate Leakage = 0.00235752 W + Runtime Dynamic = 0.195304 W + + Data Cache: + 
Area = 0.240878 mm^2 + Peak Dynamic = 0.0293665 W + Subthreshold Leakage = 0.00312878 W + Gate Leakage = 0.00220794 W + Runtime Dynamic = 0.19026 W + + StoreQ: + Area = 0.00754674 mm^2 + Peak Dynamic = 0.00358087 W + Subthreshold Leakage = 4.2633e-05 W + Gate Leakage = 5.19212e-05 W + Runtime Dynamic = 0.00504348 W + + Memory Management Unit: + Area = 0.021508 mm^2 + Peak Dynamic = 0.0127337 W + Subthreshold Leakage = 0.000210621 W + Gate Leakage = 0.000290666 W + Runtime Dynamic = 0.037071 W + + Itlb: + Area = 0.00993091 mm^2 + Peak Dynamic = 0.00617846 W + Subthreshold Leakage = 9.04168e-05 W + Gate Leakage = 9.65082e-05 W + Runtime Dynamic = 0.012357 W + + Dtlb: + Area = 0.00993091 mm^2 + Peak Dynamic = 0.00438671 W + Subthreshold Leakage = 9.04168e-05 W + Gate Leakage = 9.65082e-05 W + Runtime Dynamic = 0.0247139 W + + Execution Unit: + Area = 1.65498 mm^2 + Peak Dynamic = 0.261758 W + Subthreshold Leakage = 0.0305522 W + Gate Leakage = 0.0360036 W + Runtime Dynamic = 0.386669 W + + Register Files: + Area = 0.203203 mm^2 + Peak Dynamic = 0.0763282 W + Subthreshold Leakage = 0.000197046 W + Gate Leakage = 0.00016338 W + Runtime Dynamic = 0.0386066 W + + Integer RF: + Area = 0.146073 mm^2 + Peak Dynamic = 0.0763282 W + Subthreshold Leakage = 0.000120303 W + Gate Leakage = 9.97867e-05 W + Runtime Dynamic = 0.0345689 W + + Floating Point RF: + Area = 0.05713 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 7.67427e-05 W + Gate Leakage = 6.35938e-05 W + Runtime Dynamic = 0.00403765 W + + Instruction Scheduler: + Area = 0.0582889 mm^2 + Peak Dynamic = 0.0522571 W + Subthreshold Leakage = 0.000128698 W + Gate Leakage = 0.000185714 W + Runtime Dynamic = 0.0787473 W + + Instruction Window: + Area = 0.053925 mm^2 + Peak Dynamic = 0.0445895 W + Subthreshold Leakage = 9.52936e-05 W + Gate Leakage = 0.000130718 W + Runtime Dynamic = 0.0602231 W + + FP Instruction Window: + Area = 0.00436388 mm^2 + Peak Dynamic = 0.00766759 W + Subthreshold Leakage = 3.34043e-05 W + Gate 
Leakage = 5.49962e-05 W + Runtime Dynamic = 0.0185242 W + + Integer ALUs (Count: 3 ): + Area = 0.312404 mm^2 + Peak Dynamic = 0.0283684 W + Subthreshold Leakage = 0.0140724 W + Gate Leakage = 0.0165703 W + Runtime Dynamic = 0.0373268 W + + Floating Point Units (FPUs) (Count: 1 ): + Area = 0.971259 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.0109377 W + Gate Leakage = 0.0128792 W + Runtime Dynamic = 0.0373268 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.104135 mm^2 + Peak Dynamic = 0.0204053 W + Subthreshold Leakage = 0.00469079 W + Gate Leakage = 0.00552345 W + Runtime Dynamic = 0.049769 W + + Results Broadcast Bus: + Area Overhead = 0.00404385 mm^2 + Peak Dynamic = 0.0824719 W + Subthreshold Leakage = 0.000495836 W + Gate Leakage = 0.000583852 W + Runtime Dynamic = 0.144892 W + +***************************************************************************************** +First Level Directory + Area = 0.244856 mm^2 + Peak Dynamic = 0.0224876 W + Subthreshold Leakage = 0.00198854 W + Gate Leakage = 0.00327816 W + Runtime Dynamic = 1.80289 W + +***************************************************************************************** +Memory Controller: + Area = 0.554183 mm^2 + Peak Dynamic = 0.31033 W + Subthreshold Leakage = 0.0020922 W + Gate Leakage = 0.00751531 W + Runtime Dynamic = 2.21514 W + + Front End Engine: + Area = 0.111447 mm^2 + Peak Dynamic = 0.0117646 W + Subthreshold Leakage = 0.000188068 W + Gate Leakage = 0.000217277 W + Runtime Dynamic = 0.0796061 W + + Transaction Engine: + Area = 0.113609 mm^2 + Peak Dynamic = 0.160252 W + Subthreshold Leakage = 0.000380826 W + Gate Leakage = 0.00145961 W + Runtime Dynamic = 1.08436 W + + PHY: + Area = 0.329127 mm^2 + Peak Dynamic = 0.138314 W + Subthreshold Leakage = 0.00152331 W + Gate Leakage = 0.00583843 W + Runtime Dynamic = 1.05117 W + +***************************************************************************************** +Flash Controller: + Area = 0.109065 mm^2 + Peak Dynamic = 
0.0299827 W + Subthreshold Leakage = 0.000522213 W + Gate Leakage = 0.0020015 W + Runtime Dynamic = 0.0209879 W + +***************************************************************************************** +NIU: + Area = 0.261302 mm^2 + Peak Dynamic = 0.164859 W + Subthreshold Leakage = 0.000730171 W + Gate Leakage = 0.00279855 W + Runtime Dynamic = 0.115402 W + +***************************************************************************************** +PCIe: + Area = 0.292355 mm^2 + Peak Dynamic = 0.215383 W + Subthreshold Leakage = 0.00135405 W + Gate Leakage = 0.00518971 W + Runtime Dynamic = 0.150768 W + +***************************************************************************************** +BUSES + Area = 0.0162858 mm^2 + Peak Dynamic = 0.0187629 W + Subthreshold Leakage = 0.000240784 W + Gate Leakage = 0.000341076 W + Runtime Dynamic = 0.0938146 W + + Bus: + Area = 0.0162858 mm^2 + Peak Dynamic = 0.0187629 W + Subthreshold Leakage = 0.000240784 W + Gate Leakage = 0.000341076 W + Runtime Dynamic = 0.0938146 W + +***************************************************************************************** diff --git a/ext/mcpat/results/A9_800 b/ext/mcpat/results/A9_800 new file mode 100644 index 000000000..e8f3301b5 --- /dev/null +++ b/ext/mcpat/results/A9_800 @@ -0,0 +1,320 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... 
+ + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 40 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= conservative interconnect technology projection + Core clock Rate(MHz) 800 + +***************************************************************************************** +Processor: + Area = 5.48929 mm^2 + Peak Power = 0.577263 W + Total Leakage = 0.127046 W + Peak Dynamic = 0.450217 W + Subthreshold Leakage = 0.0608257 W + Gate Leakage = 0.0662198 W + Runtime Dynamic = 1.13304 W + + Total Cores: + Device Type= ITRS low operating power device type + Area = 4.98521 mm^2 + Peak Dynamic = 0.425609 W + Subthreshold Leakage = 0.0577408 W + Gate Leakage = 0.061241 W + Runtime Dynamic = 0.37879 W + + Total First Level Directory: + Device Type= ITRS low operating power device type + Area = 0.489711 mm^2 + Peak Dynamic = 0.0179901 W + Subthreshold Leakage = 0.0029286 W + Gate Leakage = 0.00476045 W + Runtime Dynamic = 0.721156 W + + Total NoCs (Network/Bus): + Device Type= ITRS low operating power device type + Area = 0.0143604 mm^2 + Peak Dynamic = 0.00661787 W + Subthreshold Leakage = 0.000156344 W + Gate Leakage = 0.000218372 W + Runtime Dynamic = 0.0330893 W + +***************************************************************************************** +Core: + Area = 2.49261 mm^2 + Peak Dynamic = 0.212805 W + Subthreshold Leakage = 0.0288704 W + Gate Leakage = 0.0306205 W + Runtime Dynamic = 0.37879 W + + Instruction Fetch Unit: + Area = 0.450898 mm^2 + Peak Dynamic = 0.0710479 W + Subthreshold Leakage = 0.00360576 W + Gate Leakage = 0.00232348 W + Runtime Dynamic = 0.101921 W + + Instruction Cache: + Area = 0.235613 mm^2 + Peak Dynamic = 0.0124171 W + Subthreshold Leakage = 0.00228006 W + Gate Leakage = 0.00157114 W + Runtime Dynamic = 0.018465 W + + Branch Target Buffer: + Area = 0.136309 mm^2 + Peak Dynamic = 
0.00413545 W + Subthreshold Leakage = 0.000644359 W + Gate Leakage = 0.000219381 W + Runtime Dynamic = 0.0165418 W + + Branch Predictor: + Area = 0.064441 mm^2 + Peak Dynamic = 0.00326317 W + Subthreshold Leakage = 0.000518728 W + Gate Leakage = 0.000346624 W + Runtime Dynamic = 0.0045551 W + + Global Predictor: + Area = 0.0313969 mm^2 + Peak Dynamic = 0.00149811 W + Subthreshold Leakage = 0.000255012 W + Gate Leakage = 0.000169581 W + Runtime Dynamic = 0.00218323 W + + Local Predictor: + Area = 0.000711939 mm^2 + Peak Dynamic = 0.000120406 W + Subthreshold Leakage = 4.51731e-06 W + Gate Leakage = 4.09128e-06 W + Runtime Dynamic = 0.000188626 W + + Area = 0.000650815 mm^2 + Peak Dynamic = 9.20494e-05 W + Subthreshold Leakage = 4.25393e-06 W + Gate Leakage = 3.44945e-06 W + Runtime Dynamic = 0.000141995 W + + Chooser: + Area = 0.0313969 mm^2 + Peak Dynamic = 0.00149811 W + Subthreshold Leakage = 0.000255012 W + Gate Leakage = 0.000169581 W + Runtime Dynamic = 0.00218323 W + + RAS: + Area = 0.000996272 mm^2 + Peak Dynamic = 0.000146549 W + Subthreshold Leakage = 4.18739e-06 W + Gate Leakage = 3.3701e-06 W + Runtime Dynamic = 2.49598e-08 W + + Instruction Buffer: + Area = 0.00820192 mm^2 + Peak Dynamic = 0.0267951 W + Subthreshold Leakage = 4.66516e-05 W + Gate Leakage = 3.15732e-05 W + Runtime Dynamic = 0.0153115 W + + Instruction Decoder: + Area = 0.00468731 mm^2 + Peak Dynamic = 0.023524 W + Subthreshold Leakage = 9.40317e-05 W + Gate Leakage = 8.38587e-05 W + Runtime Dynamic = 0.047048 W + + Renaming Unit: + Area = 0.0903068 mm^2 + Peak Dynamic = 0.0180606 W + Subthreshold Leakage = 0.000254554 W + Gate Leakage = 0.000232507 W + Runtime Dynamic = 0.0292515 W + + Int Front End RAT: + Area = 0.0543672 mm^2 + Peak Dynamic = 0.00950468 W + Subthreshold Leakage = 0.000129029 W + Gate Leakage = 8.82378e-05 W + Runtime Dynamic = 0.0190094 W + + FP Front End RAT: + Area = 0.0185325 mm^2 + Peak Dynamic = 0.00379768 W + Subthreshold Leakage = 7.38761e-05 W + Gate Leakage = 
4.91016e-05 W + Runtime Dynamic = 0.00379768 W + + Free List: + Area = 0.00599955 mm^2 + Peak Dynamic = 0.00090026 W + Subthreshold Leakage = 9.15772e-06 W + Gate Leakage = 7.32213e-06 W + Runtime Dynamic = 0.00360104 W + + Int Retire RAT: + Area = 0.00605969 mm^2 + Peak Dynamic = 0.00179357 W + Subthreshold Leakage = 9.8107e-06 W + Gate Leakage = 8.43969e-06 W + Runtime Dynamic = 0.00179357 W + + FP Retire RAT: + Area = 0.000650815 mm^2 + Peak Dynamic = 0.000269336 W + Subthreshold Leakage = 4.25393e-06 W + Gate Leakage = 3.44945e-06 W + Runtime Dynamic = 0.000269336 W + + FP Free List: + Area = 0.00305098 mm^2 + Peak Dynamic = 0.000780497 W + Subthreshold Leakage = 6.49266e-06 W + Gate Leakage = 5.05395e-06 W + Runtime Dynamic = 0.000780497 W + + Load Store Unit: + Area = 0.274913 mm^2 + Peak Dynamic = 0.0138993 W + Subthreshold Leakage = 0.00235727 W + Gate Leakage = 0.00171176 W + Runtime Dynamic = 0.0781216 W + + Data Cache: + Area = 0.240878 mm^2 + Peak Dynamic = 0.0117466 W + Subthreshold Leakage = 0.00230394 W + Gate Leakage = 0.00160316 W + Runtime Dynamic = 0.0761042 W + + StoreQ: + Area = 0.00754674 mm^2 + Peak Dynamic = 0.00143235 W + Subthreshold Leakage = 3.13936e-05 W + Gate Leakage = 3.76992e-05 W + Runtime Dynamic = 0.00201739 W + + Memory Management Unit: + Area = 0.021508 mm^2 + Peak Dynamic = 0.0050935 W + Subthreshold Leakage = 0.000155095 W + Gate Leakage = 0.000211049 W + Runtime Dynamic = 0.0148284 W + + Itlb: + Area = 0.00993091 mm^2 + Peak Dynamic = 0.00247139 W + Subthreshold Leakage = 6.65801e-05 W + Gate Leakage = 7.00732e-05 W + Runtime Dynamic = 0.0049428 W + + Dtlb: + Area = 0.00993091 mm^2 + Peak Dynamic = 0.00175468 W + Subthreshold Leakage = 6.65801e-05 W + Gate Leakage = 7.00732e-05 W + Runtime Dynamic = 0.00988557 W + + Execution Unit: + Area = 1.65498 mm^2 + Peak Dynamic = 0.104703 W + Subthreshold Leakage = 0.0224977 W + Gate Leakage = 0.0261417 W + Runtime Dynamic = 0.154667 W + + Register Files: + Area = 0.203203 mm^2 + Peak 
Dynamic = 0.0305313 W + Subthreshold Leakage = 0.000145099 W + Gate Leakage = 0.000118628 W + Runtime Dynamic = 0.0154426 W + + Integer RF: + Area = 0.146073 mm^2 + Peak Dynamic = 0.0305313 W + Subthreshold Leakage = 8.85877e-05 W + Gate Leakage = 7.24537e-05 W + Runtime Dynamic = 0.0138276 W + + Floating Point RF: + Area = 0.05713 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 5.6511e-05 W + Gate Leakage = 4.61745e-05 W + Runtime Dynamic = 0.00161506 W + + Instruction Scheduler: + Area = 0.0582889 mm^2 + Peak Dynamic = 0.0209028 W + Subthreshold Leakage = 9.47693e-05 W + Gate Leakage = 0.000134844 W + Runtime Dynamic = 0.0314989 W + + Instruction Window: + Area = 0.053925 mm^2 + Peak Dynamic = 0.0178358 W + Subthreshold Leakage = 7.01713e-05 W + Gate Leakage = 9.49122e-05 W + Runtime Dynamic = 0.0240893 W + + FP Instruction Window: + Area = 0.00436388 mm^2 + Peak Dynamic = 0.00306704 W + Subthreshold Leakage = 2.45979e-05 W + Gate Leakage = 3.99319e-05 W + Runtime Dynamic = 0.00740966 W + + Integer ALUs (Count: 3 ): + Area = 0.312404 mm^2 + Peak Dynamic = 0.0113473 W + Subthreshold Leakage = 0.0103625 W + Gate Leakage = 0.0120315 W + Runtime Dynamic = 0.0149307 W + + Floating Point Units (FPUs) (Count: 1 ): + Area = 0.971259 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.00805417 W + Gate Leakage = 0.00935142 W + Runtime Dynamic = 0.0149307 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.104135 mm^2 + Peak Dynamic = 0.00816212 W + Subthreshold Leakage = 0.00345415 W + Gate Leakage = 0.0040105 W + Runtime Dynamic = 0.0199076 W + + Results Broadcast Bus: + Area Overhead = 0.00404385 mm^2 + Peak Dynamic = 0.0329888 W + Subthreshold Leakage = 0.000365119 W + Gate Leakage = 0.000423926 W + Runtime Dynamic = 0.0579569 W + +***************************************************************************************** +First Level Directory + Area = 0.244856 mm^2 + Peak Dynamic = 0.00899504 W + Subthreshold Leakage = 0.0014643 W + Gate Leakage = 0.00238022 W + 
Runtime Dynamic = 0.721156 W + +***************************************************************************************** +BUSES + Area = 0.0143604 mm^2 + Peak Dynamic = 0.00661787 W + Subthreshold Leakage = 0.000156344 W + Gate Leakage = 0.000218372 W + Runtime Dynamic = 0.0330893 W + + Bus: + Area = 0.0143604 mm^2 + Peak Dynamic = 0.00661787 W + Subthreshold Leakage = 0.000156344 W + Gate Leakage = 0.000218372 W + Runtime Dynamic = 0.0330893 W + +***************************************************************************************** diff --git a/ext/mcpat/results/Alpha21364 b/ext/mcpat/results/Alpha21364 new file mode 100644 index 000000000..1b3d9e4bd --- /dev/null +++ b/ext/mcpat/results/Alpha21364 @@ -0,0 +1,441 @@ +McPAT (version 0.7 of May, 2010) is computing the target processor... + +Warning: icache array structure cannot satisfy throughput constraint. +Warning: icache array structure cannot satisfy latency constraint. +Warning: InstBuffer array structure cannot satisfy throughput constraint. +Warning: InstBuffer array structure cannot satisfy latency constraint. +Warning: Branch Target Buffer array structure cannot satisfy throughput constraint. +Warning: Branch Target Buffer array structure cannot satisfy latency constraint. +Warning: Global Predictor array structure cannot satisfy throughput constraint. +Warning: Global Predictor array structure cannot satisfy latency constraint. +Warning: L1 local Predictor array structure cannot satisfy throughput constraint. +Warning: L1 local Predictor array structure cannot satisfy latency constraint. +Warning: L2 local Predictor array structure cannot satisfy throughput constraint. +Warning: L2 local Predictor array structure cannot satisfy latency constraint. +Warning: Predictor Chooser array structure cannot satisfy throughput constraint. +Warning: Predictor Chooser array structure cannot satisfy latency constraint. +Warning: RAS array structure cannot satisfy throughput constraint. 
+Warning: RAS array structure cannot satisfy latency constraint. +Warning: dcache array structure cannot satisfy throughput constraint. +Warning: dcache array structure cannot satisfy latency constraint. +Warning: Integer Register File array structure cannot satisfy throughput constraint. +Warning: Integer Register File array structure cannot satisfy latency constraint. +Warning: Floating point Register File array structure cannot satisfy throughput constraint. +Warning: Floating point Register File array structure cannot satisfy latency constraint. +Warning: ReorderBuffer array structure cannot satisfy throughput constraint. +Warning: ReorderBuffer array structure cannot satisfy latency constraint. +Warning: Int RetireRAT array structure cannot satisfy throughput constraint. +Warning: Int RetireRAT array structure cannot satisfy latency constraint. +Warning: Int RetireRAT array structure cannot satisfy latency constraint. +Warning: Int Free List array structure cannot satisfy throughput constraint. +Warning: Int Free List array structure cannot satisfy latency constraint. +Warning: Int Free List array structure cannot satisfy throughput constraint. +Warning: Int Free List array structure cannot satisfy latency constraint. +Warning: MC ReadBuffer array structure cannot satisfy throughput constraint. +Warning: MC ReadBuffer array structure cannot satisfy latency constraint. +Warning: MC writeBuffer array structure cannot satisfy throughput constraint. +Warning: MC writeBuffer array structure cannot satisfy latency constraint. 
+ +McPAT (version 0.7 of May, 2010) results (current print level is 5) +***************************************************************************************** + Technology 180 nm + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 1200 + +***************************************************************************************** +Processor: + Area = 323.859 mm^2 + Peak Power = 90.0375 W + Total Leakage = 0.156795 W + Peak Dynamic = 89.8807 W + Subthreshold Leakage = 0.151936 W + Gate Leakage = 0.00485969 W + Runtime Dynamic = 85.2036 W + + Total Cores: + Device Type= ITRS high performance device type + Area = 137.839 mm^2 + Peak Dynamic = 60.6776 W + Subthreshold Leakage = 0.067186 W + Gate Leakage = 0.00428355 W + Runtime Dynamic = 73.9555 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 137.063 mm^2 + Peak Dynamic = 3.55835 W + Subthreshold Leakage = 0.0778886 W + Gate Leakage = 0.00016078 W + Runtime Dynamic = 6.34872 W + + Total First Level Directory: + Device Type= ITRS high performance device type + Area = 1.59954 mm^2 + Peak Dynamic = 0.805902 W + Subthreshold Leakage = 0.000311783 W + Gate Leakage = 2.63568e-05 W + Runtime Dynamic = 0.547665 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 29.1057 mm^2 + Peak Dynamic = 16.5188 W + Subthreshold Leakage = 0.00292556 W + Gate Leakage = 0.000166293 W + Runtime Dynamic = 2.54446 W + + Total MCs: + Device Type= ITRS high performance device type + Area = 18.2519 mm^2 + Peak Dynamic = 8.32001 W + Subthreshold Leakage = 0.00362353 W + Gate Leakage = 0.000222708 W + Runtime Dynamic = 1.80731 W + +***************************************************************************************** +Core: + Area = 137.839 mm^2 + Peak Dynamic = 60.6776 W + Subthreshold Leakage = 0.067186 W + Gate Leakage = 0.00428355 W + Runtime Dynamic = 73.9555 W + + Instruction Fetch Unit: + Area = 27.6096 mm^2 + Peak Dynamic = 
9.86655 W + Subthreshold Leakage = 0.00622106 W + Gate Leakage = 0.000344671 W + Runtime Dynamic = 10.0567 W + + Instruction Cache: + Area = 11.4511 mm^2 + Peak Dynamic = 1.53259 W + Subthreshold Leakage = 0.00371341 W + Gate Leakage = 0.000171069 W + Runtime Dynamic = 2.13168 W + + Branch Target Buffer: + Area = 13.3377 mm^2 + Peak Dynamic = 0.56236 W + Subthreshold Leakage = 0.001581 W + Gate Leakage = 9.5198e-05 W + Runtime Dynamic = 2.24944 W + + Branch Predictor: + Area = 2.1618 mm^2 + Peak Dynamic = 0.234643 W + Subthreshold Leakage = 0.000469396 W + Gate Leakage = 2.01907e-05 W + Runtime Dynamic = 0.198646 W + + Global Predictor: + Area = 0.893575 mm^2 + Peak Dynamic = 0.0726984 W + Subthreshold Leakage = 0.000182866 W + Gate Leakage = 7.91951e-06 W + Runtime Dynamic = 0.0726984 W + + Local Predictor: + Area = 0.420241 mm^2 + Peak Dynamic = 0.0532456 W + Subthreshold Leakage = 9.20027e-05 W + Gate Leakage = 3.89162e-06 W + Runtime Dynamic = 0.0532456 W + + Area = 0.291886 mm^2 + Peak Dynamic = 0.0292091 W + Subthreshold Leakage = 5.262e-05 W + Gate Leakage = 2.51093e-06 W + Runtime Dynamic = 0.0292091 W + + Chooser: + Area = 0.893575 mm^2 + Peak Dynamic = 0.0726984 W + Subthreshold Leakage = 0.000182866 W + Gate Leakage = 7.91951e-06 W + Runtime Dynamic = 0.0726984 W + + RAS: + Area = 0.0827607 mm^2 + Peak Dynamic = 0.0360009 W + Subthreshold Leakage = 1.16623e-05 W + Gate Leakage = 4.60036e-07 W + Runtime Dynamic = 3.58028e-06 W + + Instruction Buffer: + Area = 0.465385 mm^2 + Peak Dynamic = 2.10455 W + Subthreshold Leakage = 6.13248e-05 W + Gate Leakage = 4.88113e-06 W + Runtime Dynamic = 1.40303 W + + Instruction Decoder: + Area = 0.146031 mm^2 + Peak Dynamic = 4.07384 W + Subthreshold Leakage = 7.07416e-05 W + Gate Leakage = 3.32268e-06 W + Runtime Dynamic = 4.07384 W + + Renaming Unit: + Area = 11.7262 mm^2 + Peak Dynamic = 12.5584 W + Subthreshold Leakage = 0.000886804 W + Gate Leakage = 9.92419e-05 W + Runtime Dynamic = 9.90647 W + + Int Front End 
RAT: + Area = 8.24345 mm^2 + Peak Dynamic = 8.04227 W + Subthreshold Leakage = 0.000376247 W + Gate Leakage = 3.40623e-05 W + Runtime Dynamic = 8.04227 W + + FP Front End RAT: + Area = 2.549 mm^2 + Peak Dynamic = 2.75082 W + Subthreshold Leakage = 0.000149367 W + Gate Leakage = 1.30084e-05 W + Runtime Dynamic = 1.37541 W + + Free List: + Area = 0.446019 mm^2 + Peak Dynamic = 0.156051 W + Subthreshold Leakage = 1.32133e-05 W + Gate Leakage = 7.4667e-07 W + Runtime Dynamic = 0.312102 W + + Int Retire RAT: + Area = 0.184445 mm^2 + Peak Dynamic = 0.102656 W + Subthreshold Leakage = 8.50239e-06 W + Gate Leakage = 5.28869e-07 W + Runtime Dynamic = 0.102656 W + + FP Retire RAT: + Area = 0.0567228 mm^2 + Peak Dynamic = 0.0367258 W + Subthreshold Leakage = 5.67894e-06 W + Gate Leakage = 3.75578e-07 W + Runtime Dynamic = 0.0183629 W + + FP Free List: + Area = 0.198929 mm^2 + Peak Dynamic = 0.111293 W + Subthreshold Leakage = 8.61952e-06 W + Gate Leakage = 5.10875e-07 W + Runtime Dynamic = 0.0556467 W + + Load Store Unit: + Area = 49.742 mm^2 + Peak Dynamic = 11.7952 W + Subthreshold Leakage = 0.00715349 W + Gate Leakage = 0.00052778 W + Runtime Dynamic = 31.7658 W + + Data Cache: + Area = 36.106 mm^2 + Peak Dynamic = 9.28008 W + Subthreshold Leakage = 0.00663485 W + Gate Leakage = 0.000466572 W + Runtime Dynamic = 31.332 W + + LoadQ: + Area = 2.60005 mm^2 + Peak Dynamic = 0.578279 W + Subthreshold Leakage = 9.67302e-05 W + Gate Leakage = 5.59905e-06 W + Runtime Dynamic = 0.14457 W + + StoreQ: + Area = 2.60005 mm^2 + Peak Dynamic = 0.578279 W + Subthreshold Leakage = 9.67302e-05 W + Gate Leakage = 5.59905e-06 W + Runtime Dynamic = 0.289139 W + + Memory Management Unit: + Area = 8.74543 mm^2 + Peak Dynamic = 3.77198 W + Subthreshold Leakage = 0.00119904 W + Gate Leakage = 0.000127183 W + Runtime Dynamic = 4.82688 W + + Itlb: + Area = 1.97969 mm^2 + Peak Dynamic = 0.537563 W + Subthreshold Leakage = 0.000270576 W + Gate Leakage = 2.0845e-05 W + Runtime Dynamic = 1.07513 W + + 
Dtlb: + Area = 6.71814 mm^2 + Peak Dynamic = 1.87586 W + Subthreshold Leakage = 0.00060329 W + Gate Leakage = 5.63286e-05 W + Runtime Dynamic = 3.75174 W + + Execution Unit: + Area = 31.4918 mm^2 + Peak Dynamic = 22.6855 W + Subthreshold Leakage = 0.0320294 W + Gate Leakage = 0.00198102 W + Runtime Dynamic = 17.3997 W + + Register Files: + Area = 9.9318 mm^2 + Peak Dynamic = 3.92301 W + Subthreshold Leakage = 0.000295352 W + Gate Leakage = 1.33517e-05 W + Runtime Dynamic = 1.7929 W + + Integer RF: + Area = 6.76678 mm^2 + Peak Dynamic = 2.35597 W + Subthreshold Leakage = 0.000185762 W + Gate Leakage = 8.51701e-06 W + Runtime Dynamic = 1.60634 W + + Floating Point RF: + Area = 3.16503 mm^2 + Peak Dynamic = 1.56704 W + Subthreshold Leakage = 0.00010959 W + Gate Leakage = 4.83467e-06 W + Runtime Dynamic = 0.186553 W + + Instruction Scheduler: + Area = 5.20691 mm^2 + Peak Dynamic = 2.77224 W + Subthreshold Leakage = 0.000202187 W + Gate Leakage = 1.05832e-05 W + Runtime Dynamic = 3.11355 W + + Instruction Window: + Area = 1.23862 mm^2 + Peak Dynamic = 0.985117 W + Subthreshold Leakage = 5.55506e-05 W + Gate Leakage = 3.78978e-06 W + Runtime Dynamic = 1.23906 W + + FP Instruction Window: + Area = 0.481718 mm^2 + Peak Dynamic = 0.438839 W + Subthreshold Leakage = 2.5962e-05 W + Gate Leakage = 2.00351e-06 W + Runtime Dynamic = 0.526208 W + + ROB: + Area = 3.48657 mm^2 + Peak Dynamic = 1.34828 W + Subthreshold Leakage = 0.000120674 W + Gate Leakage = 4.78991e-06 W + Runtime Dynamic = 1.34828 W + + Integer ALUs (Count: 4 ): + Area = 3.4944 mm^2 + Peak Dynamic = 4.23312 W + Subthreshold Leakage = 0.016149 W + Gate Leakage = 0.000986885 W + Runtime Dynamic = 3.21343 W + + Floating Point Units (FPUs) (Count: 1 ): + Area = 12.705 mm^2 + Peak Dynamic = 3.52215 W + Subthreshold Leakage = 0.0146787 W + Gate Leakage = 0.000897034 W + Runtime Dynamic = 3.52215 W + + Results Broadcast Bus: + Area Overhead = 0.106062 mm^2 + Peak Dynamic = 6.87645 W + Subthreshold Leakage = 0.000378957 
W + Gate Leakage = 2.31585e-05 W + Runtime Dynamic = 5.75766 W + +***************************************************************************************** +L2 + Area = 137.063 mm^2 + Peak Dynamic = 3.55835 W + Subthreshold Leakage = 0.0778886 W + Gate Leakage = 0.00016078 W + Runtime Dynamic = 6.34872 W + +***************************************************************************************** +Second Level Directory + Area = 1.59954 mm^2 + Peak Dynamic = 0.805902 W + Subthreshold Leakage = 0.000311783 W + Gate Leakage = 2.63568e-05 W + Runtime Dynamic = 0.547665 W + +***************************************************************************************** +Memory Controller: + Area = 9.12595 mm^2 + Peak Dynamic = 4.16 W + Subthreshold Leakage = 0.00181177 W + Gate Leakage = 0.000111354 W + Runtime Dynamic = 1.80731 W + + Front End Engine: + Area = 5.49326 mm^2 + Peak Dynamic = 1.42883 W + Subthreshold Leakage = 0.000132955 W + Gate Leakage = 8.76015e-06 W + Runtime Dynamic = 0.348049 W + + Transaction Engine: + Area = 1.50616 mm^2 + Peak Dynamic = 1.93117 W + Subthreshold Leakage = 0.000696058 W + Gate Leakage = 4.25369e-05 W + Runtime Dynamic = 0.579332 W + + PHY: + Area = 2.12653 mm^2 + Peak Dynamic = 0.8 W + Subthreshold Leakage = 0.000982753 W + Gate Leakage = 6.00571e-05 W + Runtime Dynamic = 0.879928 W + +***************************************************************************************** +NOC + Area = 29.1057 mm^2 + Peak Dynamic = 16.5188 W + Subthreshold Leakage = 0.00292556 W + Gate Leakage = 0.000166293 W + Runtime Dynamic = 2.54446 W + + Router: + Area = 28.4197 mm^2 + Peak Dynamic = 8.76431 W + Subthreshold Leakage = 0.00199965 W + Gate Leakage = 0.000109709 W + Runtime Dynamic = 1.25204 W + + Virtual Channel Buffer: + Area = 17.0424 mm^2 + Peak Dynamic = 7.30291 W + Subthreshold Leakage = 0.00119658 W + Gate Leakage = 4.15511e-05 W + Runtime Dynamic = 1.04327 W + + Crossbar: + Area = 0.357655 mm^2 + Peak Dynamic = 1.27997 W + Subthreshold 
Leakage = 0.000801415 W + Gate Leakage = 6.80527e-05 W + Runtime Dynamic = 0.182853 W + + Arbiter: + Peak Dynamic = 0.18143 W + Subthreshold Leakage = 1.65956e-06 W + Gate Leakage = 1.05559e-07 W + Runtime Dynamic = 0.0259186 W + + Per Router : + Area = 0.685989 mm^2 + Peak Dynamic = 7.75447 W + Subthreshold Leakage = 0.000925911 W + Gate Leakage = 5.65834e-05 W + Runtime Dynamic = 1.29241 W + +***************************************************************************************** diff --git a/ext/mcpat/results/Alpha21364_90nm b/ext/mcpat/results/Alpha21364_90nm new file mode 100644 index 000000000..2a97d7732 --- /dev/null +++ b/ext/mcpat/results/Alpha21364_90nm @@ -0,0 +1,408 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + +Warning: icache array structure cannot satisfy latency constraint. +Warning: dcache array structure cannot satisfy latency constraint. + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 90 nm + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 1200 + +***************************************************************************************** +Processor: + Area = 139.86 mm^2 + Peak Power = 34.9936 W + Total Leakage = 4.16949 W + Peak Dynamic = 30.8241 W + Subthreshold Leakage = 3.86203 W + Gate Leakage = 0.307463 W + Runtime Dynamic = 34.0612 W + + Total Cores: + Device Type= ITRS high performance device type + Area = 61.1957 mm^2 + Peak Dynamic = 19.6269 W + Subthreshold Leakage = 2.04452 W + Gate Leakage = 0.277429 W + Runtime Dynamic = 29.5972 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 62.2653 mm^2 + Peak Dynamic = 1.42987 W + Subthreshold Leakage = 1.65481 W + Gate Leakage = 0.00860545 W + Runtime Dynamic = 2.73329 W + + Total First Level Directory: + Device Type= ITRS high performance device type + Area = 
0.533824 mm^2 + Peak Dynamic = 0.275566 W + Subthreshold Leakage = 0.00929753 W + Gate Leakage = 0.00179126 W + Runtime Dynamic = 0.193681 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 8.77595 mm^2 + Peak Dynamic = 6.17873 W + Subthreshold Leakage = 0.108357 W + Gate Leakage = 0.0139259 W + Runtime Dynamic = 0.963385 W + + Total MCs: + Device Type= ITRS high performance device type + Area = 7.08925 mm^2 + Peak Dynamic = 3.3131 W + Subthreshold Leakage = 0.0450389 W + Gate Leakage = 0.00571171 W + Runtime Dynamic = 0.573656 W + +***************************************************************************************** +Core: + Area = 61.1957 mm^2 + Peak Dynamic = 19.6269 W + Subthreshold Leakage = 2.04452 W + Gate Leakage = 0.277429 W + Runtime Dynamic = 29.5972 W + + Instruction Fetch Unit: + Area = 7.40352 mm^2 + Peak Dynamic = 2.10646 W + Subthreshold Leakage = 0.126581 W + Gate Leakage = 0.0150397 W + Runtime Dynamic = 2.55478 W + + Instruction Cache: + Area = 5.01657 mm^2 + Peak Dynamic = 0.745807 W + Subthreshold Leakage = 0.0906167 W + Gate Leakage = 0.010922 W + Runtime Dynamic = 1.22193 W + + Branch Target Buffer: + Area = 1.63475 mm^2 + Peak Dynamic = 0.0974373 W + Subthreshold Leakage = 0.0188281 W + Gate Leakage = 0.00126965 W + Runtime Dynamic = 0.389749 W + + Branch Predictor: + Area = 0.474272 mm^2 + Peak Dynamic = 0.0682449 W + Subthreshold Leakage = 0.00901262 W + Gate Leakage = 0.00067136 W + Runtime Dynamic = 0.0636543 W + + Global Predictor: + Area = 0.190297 mm^2 + Peak Dynamic = 0.0224229 W + Subthreshold Leakage = 0.00351842 W + Gate Leakage = 0.000260107 W + Runtime Dynamic = 0.0239711 W + + Local Predictor: + Area = 0.0959237 mm^2 + Peak Dynamic = 0.0143301 W + Subthreshold Leakage = 0.00171829 W + Gate Leakage = 0.00012889 W + Runtime Dynamic = 0.015711 W + + Area = 0.0484908 mm^2 + Peak Dynamic = 0.0077514 W + Subthreshold Leakage = 0.000926283 W + Gate Leakage = 7.55051e-05 W + Runtime Dynamic = 
0.00850163 W + + Chooser: + Area = 0.190297 mm^2 + Peak Dynamic = 0.0224229 W + Subthreshold Leakage = 0.00351842 W + Gate Leakage = 0.000260107 W + Runtime Dynamic = 0.0239711 W + + RAS: + Area = 0.0451868 mm^2 + Peak Dynamic = 0.00906891 W + Subthreshold Leakage = 0.00025749 W + Gate Leakage = 2.22565e-05 W + Runtime Dynamic = 1.06361e-06 W + + Instruction Buffer: + Area = 0.11139 mm^2 + Peak Dynamic = 0.30298 W + Subthreshold Leakage = 0.000556928 W + Gate Leakage = 4.34124e-05 W + Runtime Dynamic = 0.201987 W + + Instruction Decoder: + Area = 0.0481902 mm^2 + Peak Dynamic = 0.677465 W + Subthreshold Leakage = 0.00135195 W + Gate Leakage = 0.000132907 W + Runtime Dynamic = 0.677465 W + + Renaming Unit: + Area = 4.5037 mm^2 + Peak Dynamic = 4.11785 W + Subthreshold Leakage = 0.0296009 W + Gate Leakage = 0.00668098 W + Runtime Dynamic = 3.24944 W + + Int Front End RAT: + Area = 2.76467 mm^2 + Peak Dynamic = 2.43279 W + Subthreshold Leakage = 0.0129405 W + Gate Leakage = 0.00255854 W + Runtime Dynamic = 2.43279 W + + FP Front End RAT: + Area = 1.39233 mm^2 + Peak Dynamic = 1.35403 W + Subthreshold Leakage = 0.00981219 W + Gate Leakage = 0.00205621 W + Runtime Dynamic = 0.677017 W + + Free List: + Area = 0.116928 mm^2 + Peak Dynamic = 0.0436483 W + Subthreshold Leakage = 0.000259915 W + Gate Leakage = 2.53395e-05 W + Runtime Dynamic = 0.0872966 W + + Int Retire RAT: + Area = 0.0429772 mm^2 + Peak Dynamic = 0.0318091 W + Subthreshold Leakage = 0.000152798 W + Gate Leakage = 1.86722e-05 W + Runtime Dynamic = 0.0318091 W + + FP Retire RAT: + Area = 0.0153516 mm^2 + Peak Dynamic = 0.00997874 W + Subthreshold Leakage = 8.06509e-05 W + Gate Leakage = 7.17049e-06 W + Runtime Dynamic = 0.00498937 W + + FP Free List: + Area = 0.0530951 mm^2 + Peak Dynamic = 0.0310624 W + Subthreshold Leakage = 0.000140326 W + Gate Leakage = 1.46766e-05 W + Runtime Dynamic = 0.0155312 W + + Load Store Unit: + Area = 20.5622 mm^2 + Peak Dynamic = 5.14439 W + Subthreshold Leakage = 0.207699 W + 
Gate Leakage = 0.0357344 W + Runtime Dynamic = 16.0217 W + + Data Cache: + Area = 15.2468 mm^2 + Peak Dynamic = 4.5468 W + Subthreshold Leakage = 0.19694 W + Gate Leakage = 0.0331746 W + Runtime Dynamic = 15.8781 W + + LoadQ: + Area = 0.863734 mm^2 + Peak Dynamic = 0.191536 W + Subthreshold Leakage = 0.00227213 W + Gate Leakage = 0.000279753 W + Runtime Dynamic = 0.047884 W + + StoreQ: + Area = 0.863734 mm^2 + Peak Dynamic = 0.191536 W + Subthreshold Leakage = 0.00227213 W + Gate Leakage = 0.000279753 W + Runtime Dynamic = 0.0957681 W + + Memory Management Unit: + Area = 3.49533 mm^2 + Peak Dynamic = 1.34391 W + Subthreshold Leakage = 0.0412098 W + Gate Leakage = 0.00931467 W + Runtime Dynamic = 2.25879 W + + Itlb: + Area = 1.12903 mm^2 + Peak Dynamic = 0.425717 W + Subthreshold Leakage = 0.0152632 W + Gate Leakage = 0.00308734 W + Runtime Dynamic = 0.851444 W + + Dtlb: + Area = 2.24796 mm^2 + Peak Dynamic = 0.703668 W + Subthreshold Leakage = 0.0197321 W + Gate Leakage = 0.00422696 W + Runtime Dynamic = 1.40735 W + + Execution Unit: + Area = 18.9802 mm^2 + Peak Dynamic = 6.91426 W + Subthreshold Leakage = 1.01207 W + Gate Leakage = 0.130415 W + Runtime Dynamic = 5.51245 W + + Register Files: + Area = 4.63431 mm^2 + Peak Dynamic = 1.07973 W + Subthreshold Leakage = 0.00557121 W + Gate Leakage = 0.000534421 W + Runtime Dynamic = 0.491409 W + + Integer RF: + Area = 3.11444 mm^2 + Peak Dynamic = 0.64479 W + Subthreshold Leakage = 0.00348926 W + Gate Leakage = 0.000338898 W + Runtime Dynamic = 0.43963 W + + Floating Point RF: + Area = 1.51987 mm^2 + Peak Dynamic = 0.434944 W + Subthreshold Leakage = 0.00208194 W + Gate Leakage = 0.000195523 W + Runtime Dynamic = 0.051779 W + + Instruction Scheduler: + Area = 2.2958 mm^2 + Peak Dynamic = 0.682653 W + Subthreshold Leakage = 0.0043779 W + Gate Leakage = 0.000496354 W + Runtime Dynamic = 0.783433 W + + Instruction Window: + Area = 0.416485 mm^2 + Peak Dynamic = 0.230852 W + Subthreshold Leakage = 0.001531 W + Gate Leakage 
= 0.000214549 W + Runtime Dynamic = 0.308242 W + + FP Instruction Window: + Area = 0.160067 mm^2 + Peak Dynamic = 0.0899719 W + Subthreshold Leakage = 0.000573841 W + Gate Leakage = 9.08104e-05 W + Runtime Dynamic = 0.113361 W + + ROB: + Area = 1.71925 mm^2 + Peak Dynamic = 0.361829 W + Subthreshold Leakage = 0.00227307 W + Gate Leakage = 0.000190995 W + Runtime Dynamic = 0.361829 W + + Integer ALUs (Count: 4 ): + Area = 2.56256 mm^2 + Peak Dynamic = 1.45952 W + Subthreshold Leakage = 0.514377 W + Gate Leakage = 0.0657924 W + Runtime Dynamic = 1.12031 W + + Floating Point Units (FPUs) (Count: 1 ): + Area = 9.317 mm^2 + Peak Dynamic = 1.32571 W + Subthreshold Leakage = 0.467545 W + Gate Leakage = 0.0598023 W + Runtime Dynamic = 1.32571 W + + Results Broadcast Bus: + Area Overhead = 0.0521609 mm^2 + Peak Dynamic = 2.15212 W + Subthreshold Leakage = 0.0139887 W + Gate Leakage = 0.00178925 W + Runtime Dynamic = 1.79159 W + +***************************************************************************************** +L2 + Area = 62.2653 mm^2 + Peak Dynamic = 1.42987 W + Subthreshold Leakage = 1.65481 W + Gate Leakage = 0.00860545 W + Runtime Dynamic = 2.73329 W + +***************************************************************************************** +Second Level Directory + Area = 0.533824 mm^2 + Peak Dynamic = 0.275566 W + Subthreshold Leakage = 0.00929753 W + Gate Leakage = 0.00179126 W + Runtime Dynamic = 0.193681 W + +***************************************************************************************** +Memory Controller: + Area = 3.54463 mm^2 + Peak Dynamic = 1.65655 W + Subthreshold Leakage = 0.0225194 W + Gate Leakage = 0.00285586 W + Runtime Dynamic = 0.573656 W + + Front End Engine: + Area = 1.72828 mm^2 + Peak Dynamic = 0.389588 W + Subthreshold Leakage = 0.00246696 W + Gate Leakage = 0.000291005 W + Runtime Dynamic = 0.0911898 W + + Transaction Engine: + Area = 0.75308 mm^2 + Peak Dynamic = 1.13896 W + Subthreshold Leakage = 0.00831402 W + Gate Leakage = 
0.00106342 W + Runtime Dynamic = 0.341678 W + + PHY: + Area = 1.06326 mm^2 + Peak Dynamic = 0.128 W + Subthreshold Leakage = 0.0117384 W + Gate Leakage = 0.00150143 W + Runtime Dynamic = 0.140788 W + +***************************************************************************************** +NOC + Area = 8.77595 mm^2 + Peak Dynamic = 6.17873 W + Subthreshold Leakage = 0.108357 W + Gate Leakage = 0.0139259 W + Runtime Dynamic = 0.963385 W + + Router: + Area = 8.3047 mm^2 + Peak Dynamic = 2.78895 W + Subthreshold Leakage = 0.0606175 W + Gate Leakage = 0.00781974 W + Runtime Dynamic = 0.398421 W + + Virtual Channel Buffer: + Area = 4.2978 mm^2 + Peak Dynamic = 2.31409 W + Subthreshold Leakage = 0.028002 W + Gate Leakage = 0.00227471 W + Runtime Dynamic = 0.330584 W + + Crossbar: + Area = 0.160538 mm^2 + Peak Dynamic = 0.437862 W + Subthreshold Leakage = 0.0325996 W + Gate Leakage = 0.00554292 W + Runtime Dynamic = 0.0625517 W + + Arbiter: + Peak Dynamic = 0.0370018 W + Subthreshold Leakage = 1.5858e-05 W + Gate Leakage = 2.11117e-06 W + Runtime Dynamic = 0.00528597 W + + Per Router Links: + Area = 0.471256 mm^2 + Peak Dynamic = 3.38978 W + Subthreshold Leakage = 0.0477391 W + Gate Leakage = 0.00610616 W + Runtime Dynamic = 0.564963 W + +***************************************************************************************** diff --git a/ext/mcpat/results/Penryn b/ext/mcpat/results/Penryn new file mode 100644 index 000000000..af39390d1 --- /dev/null +++ b/ext/mcpat/results/Penryn @@ -0,0 +1,315 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... 
+ + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 45 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 3700 + +***************************************************************************************** +Processor: + Area = 92.2661 mm^2 + Peak Power = 61.0228 W + Total Leakage = 10.8609 W + Peak Dynamic = 50.1619 W + Subthreshold Leakage = 10.2773 W + Gate Leakage = 0.583567 W + Runtime Dynamic = 69.6347 W + + Total Cores: 2 cores + Device Type= ITRS high performance device type + Area = 48.2438 mm^2 + Peak Dynamic = 39.6676 W + Subthreshold Leakage = 6.96165 W + Gate Leakage = 0.541077 W + Runtime Dynamic = 51.4987 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 43.1009 mm^2 + Peak Dynamic = 6.43272 W + Subthreshold Leakage = 3.28049 W + Gate Leakage = 0.0386655 W + Runtime Dynamic = 13.716 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 0.921404 mm^2 + Peak Dynamic = 4.06164 W + Subthreshold Leakage = 0.035183 W + Gate Leakage = 0.00382481 W + Runtime Dynamic = 4.42002 W + +***************************************************************************************** +Core: + Area = 24.1219 mm^2 + Peak Dynamic = 19.8338 W + Subthreshold Leakage = 3.48083 W + Gate Leakage = 0.270538 W + Runtime Dynamic = 51.4987 W + + Instruction Fetch Unit: + Area = 3.13582 mm^2 + Peak Dynamic = 2.49774 W + Subthreshold Leakage = 0.421089 W + Gate Leakage = 0.0246791 W + Runtime Dynamic = 2.42869 W + + Instruction Cache: + Area = 0.702441 mm^2 + Peak Dynamic = 0.419702 W + Subthreshold Leakage = 0.0413175 W + Gate Leakage = 0.00175164 W + Runtime Dynamic = 0.487111 W + + Branch Target Buffer: + Area = 0.349484 mm^2 + Peak Dynamic = 0.0903353 W + Subthreshold Leakage = 0.0243658 W + Gate Leakage 
= 0.000966387 W + Runtime Dynamic = 0.361341 W + + Branch Predictor: + Area = 0.153017 mm^2 + Peak Dynamic = 0.0718712 W + Subthreshold Leakage = 0.0142615 W + Gate Leakage = 0.000619154 W + Runtime Dynamic = 0.0647272 W + + Global Predictor: + Area = 0.0475693 mm^2 + Peak Dynamic = 0.0231158 W + Subthreshold Leakage = 0.00544747 W + Gate Leakage = 0.000234591 W + Runtime Dynamic = 0.0245764 W + + Local Predictor: + L1_Local Predictor: + Area = 0.0239764 mm^2 + Peak Dynamic = 0.0142817 W + Subthreshold Leakage = 0.00265926 W + Gate Leakage = 0.00011608 W + Runtime Dynamic = 0.0155731 W + + L2_Local Predictor: + Area = 0.012121 mm^2 + Peak Dynamic = 0.00767395 W + Subthreshold Leakage = 0.00143248 W + Gate Leakage = 6.77717e-05 W + Runtime Dynamic = 0.00837399 W + + Chooser: + Area = 0.0475693 mm^2 + Peak Dynamic = 0.0231158 W + Subthreshold Leakage = 0.00544747 W + Gate Leakage = 0.000234591 W + Runtime Dynamic = 0.0245764 W + + RAS: + Area = 0.0217815 mm^2 + Peak Dynamic = 0.0113578 W + Subthreshold Leakage = 0.000707258 W + Gate Leakage = 3.38921e-05 W + Runtime Dynamic = 1.2459e-06 W + + Instruction Buffer: + Area = 0.0278406 mm^2 + Peak Dynamic = 0.282368 W + Subthreshold Leakage = 0.000861686 W + Gate Leakage = 3.91839e-05 W + Runtime Dynamic = 0.188245 W + + Instruction Decoder: + Area = 1.85799 mm^2 + Peak Dynamic = 1.32726 W + Subthreshold Leakage = 0.325606 W + Gate Leakage = 0.0185411 W + Runtime Dynamic = 1.32726 W + + Renaming Unit: + Area = 1.02517 mm^2 + Peak Dynamic = 2.25746 W + Subthreshold Leakage = 0.042129 W + Gate Leakage = 0.00480502 W + Runtime Dynamic = 1.55315 W + + Int Front End RAT: + Area = 0.59725 mm^2 + Peak Dynamic = 1.25286 W + Subthreshold Leakage = 0.0159587 W + Gate Leakage = 0.00122436 W + Runtime Dynamic = 1.11309 W + + FP Front End RAT: + Area = 0.350662 mm^2 + Peak Dynamic = 0.652971 W + Subthreshold Leakage = 0.0110219 W + Gate Leakage = 0.00079321 W + Runtime Dynamic = 0.326485 W + + Free List: + Area = 0.0322035 mm^2 + Peak 
Dynamic = 0.0454309 W + Subthreshold Leakage = 0.000471802 W + Gate Leakage = 2.57995e-05 W + Runtime Dynamic = 0.113577 W + + Load Store Unit: + Area = 7.24152 mm^2 + Peak Dynamic = 6.57278 W + Subthreshold Leakage = 0.310798 W + Gate Leakage = 0.0358085 W + Runtime Dynamic = 34.9208 W + + Data Cache: + Area = 4.65034 mm^2 + Peak Dynamic = 5.03369 W + Subthreshold Leakage = 0.237004 W + Gate Leakage = 0.0253255 W + Runtime Dynamic = 33.601 W + + LoadQ: + Area = 0.260806 mm^2 + Peak Dynamic = 0.132332 W + Subthreshold Leakage = 0.00523814 W + Gate Leakage = 0.000359005 W + Runtime Dynamic = 0.0661662 W + + StoreQ: + Area = 1.06006 mm^2 + Peak Dynamic = 1.25365 W + Subthreshold Leakage = 0.0538794 W + Gate Leakage = 0.00736236 W + Runtime Dynamic = 1.25365 W + + Memory Management Unit: + Area = 0.363299 mm^2 + Peak Dynamic = 0.610831 W + Subthreshold Leakage = 0.0388017 W + Gate Leakage = 0.00431691 W + Runtime Dynamic = 1.29234 W + + Itlb: + Area = 0.0590462 mm^2 + Peak Dynamic = 0.116192 W + Subthreshold Leakage = 0.00608044 W + Gate Leakage = 0.000398475 W + Runtime Dynamic = 0.232386 W + + Dtlb: + Area = 0.259199 mm^2 + Peak Dynamic = 0.264986 W + Subthreshold Leakage = 0.0180446 W + Gate Leakage = 0.00115678 W + Runtime Dynamic = 1.05995 W + + Execution Unit: + Area = 7.9594 mm^2 + Peak Dynamic = 7.89497 W + Subthreshold Leakage = 1.28761 W + Gate Leakage = 0.0977152 W + Runtime Dynamic = 11.3037 W + + Register Files: + Area = 0.528076 mm^2 + Peak Dynamic = 0.554172 W + Subthreshold Leakage = 0.00459231 W + Gate Leakage = 0.000305031 W + Runtime Dynamic = 0.283985 W + + Integer RF: + Area = 0.336446 mm^2 + Peak Dynamic = 0.461344 W + Subthreshold Leakage = 0.00257976 W + Gate Leakage = 0.00018025 W + Runtime Dynamic = 0.247149 W + + Floating Point RF: + Area = 0.19163 mm^2 + Peak Dynamic = 0.0928276 W + Subthreshold Leakage = 0.00201255 W + Gate Leakage = 0.000124781 W + Runtime Dynamic = 0.0368364 W + + Instruction Scheduler: + Area = 1.97424 mm^2 + Peak 
Dynamic = 1.76421 W + Subthreshold Leakage = 0.0212898 W + Gate Leakage = 0.0014052 W + Runtime Dynamic = 1.96388 W + + Instruction Window: + Area = 0.889691 mm^2 + Peak Dynamic = 0.468182 W + Subthreshold Leakage = 0.0081033 W + Gate Leakage = 0.000620258 W + Runtime Dynamic = 0.601258 W + + FP Instruction Window: + Area = 0.347423 mm^2 + Peak Dynamic = 0.230453 W + Subthreshold Leakage = 0.00381664 W + Gate Leakage = 0.000293336 W + Runtime Dynamic = 0.29704 W + + ROB: + Area = 0.737129 mm^2 + Peak Dynamic = 1.06558 W + Subthreshold Leakage = 0.00936988 W + Gate Leakage = 0.000491606 W + Runtime Dynamic = 1.06558 W + + Integer ALUs (Count: 6 ): + Area = 0.47087 mm^2 + Peak Dynamic = 2.2206 W + Subthreshold Leakage = 0.295671 W + Gate Leakage = 0.0221076 W + Runtime Dynamic = 1.14549 W + + Floating Point Units (FPUs) (Count: 2 ): + Area = 4.6585 mm^2 + Peak Dynamic = 0.708407 W + Subthreshold Leakage = 0.731296 W + Gate Leakage = 0.0546797 W + Runtime Dynamic = 1.28625 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.235435 mm^2 + Peak Dynamic = 0.257249 W + Subthreshold Leakage = 0.147835 W + Gate Leakage = 0.0110538 W + Runtime Dynamic = 1.57424 W + + Results Broadcast Bus: + Area Overhead = 0.0472187 mm^2 + Peak Dynamic = 2.08413 W + Subthreshold Leakage = 0.0722513 W + Gate Leakage = 0.00540229 W + Runtime Dynamic = 5.04986 W + +***************************************************************************************** +L2 + Area = 43.1009 mm^2 + Peak Dynamic = 6.43272 W + Subthreshold Leakage = 3.28049 W + Gate Leakage = 0.0386655 W + Runtime Dynamic = 13.716 W + +***************************************************************************************** +BUSES + Area = 0.921404 mm^2 + Peak Dynamic = 4.06164 W + Subthreshold Leakage = 0.035183 W + Gate Leakage = 0.00382481 W + Runtime Dynamic = 4.42002 W + + Bus: + Area = 0.921404 mm^2 + Peak Dynamic = 4.06164 W + Subthreshold Leakage = 0.035183 W + Gate Leakage = 0.00382481 W + Runtime Dynamic = 4.42002 W + 
+***************************************************************************************** diff --git a/ext/mcpat/results/T1 b/ext/mcpat/results/T1 new file mode 100644 index 000000000..f63e51c81 --- /dev/null +++ b/ext/mcpat/results/T1 @@ -0,0 +1,296 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 90 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 1200 + +***************************************************************************************** +Processor: + Area = 283.287 mm^2 + Peak Power = 55.0318 W + Total Leakage = 9.78078 W + Peak Dynamic = 45.2511 W + Subthreshold Leakage = 8.64906 W + Gate Leakage = 1.13172 W + Runtime Dynamic = 45.5013 W + + Total Cores: + Device Type= ITRS high performance device type + Area = 117.887 mm^2 + Peak Dynamic = 28.1307 W + Subthreshold Leakage = 5.19354 W + Gate Leakage = 0.730037 W + Runtime Dynamic = 18.917 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 116.308 mm^2 + Peak Dynamic = 5.51367 W + Subthreshold Leakage = 2.41316 W + Gate Leakage = 0.242513 W + Runtime Dynamic = 4.00707 W + + Total First Level Directory: + Device Type= ITRS high performance device type + Area = 8.77473 mm^2 + Peak Dynamic = 3.38588 W + Subthreshold Leakage = 0.224524 W + Gate Leakage = 0.0320801 W + Runtime Dynamic = 15.1158 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 8.87598 mm^2 + Peak Dynamic = 3.67515 W + Subthreshold Leakage = 0.488892 W + Gate Leakage = 0.0852308 W + Runtime Dynamic = 2.20509 W + + Total MCs: + Device Type= ITRS high performance device type + Area = 31.441 mm^2 + Peak Dynamic = 4.5457 W + Subthreshold Leakage = 0.328953 W + Gate Leakage = 0.0418558 W 
+ Runtime Dynamic = 5.25637 W + +***************************************************************************************** +Core: + Area = 14.7359 mm^2 + Peak Dynamic = 3.51633 W + Subthreshold Leakage = 0.649192 W + Gate Leakage = 0.0912546 W + Runtime Dynamic = 18.917 W + + Instruction Fetch Unit: + Area = 3.60967 mm^2 + Peak Dynamic = 0.560912 W + Subthreshold Leakage = 0.0396492 W + Gate Leakage = 0.00709504 W + Runtime Dynamic = 3.76593 W + + Instruction Cache: + Area = 3.41818 mm^2 + Peak Dynamic = 0.308492 W + Subthreshold Leakage = 0.0286475 W + Gate Leakage = 0.00418329 W + Runtime Dynamic = 0.95332 W + + Instruction Buffer: + Area = 0.0122742 mm^2 + Peak Dynamic = 0.0121268 W + Subthreshold Leakage = 0.0002042 W + Gate Leakage = 1.78658e-05 W + Runtime Dynamic = 0.0970143 W + + Instruction Decoder: + Area = 0.0229327 mm^2 + Peak Dynamic = 0.169467 W + Subthreshold Leakage = 0.00259055 W + Gate Leakage = 0.000252139 W + Runtime Dynamic = 1.35574 W + + Load Store Unit: + Area = 3.07616 mm^2 + Peak Dynamic = 0.390349 W + Subthreshold Leakage = 0.0362126 W + Gate Leakage = 0.00713432 W + Runtime Dynamic = 3.85623 W + + Data Cache: + Area = 1.47986 mm^2 + Peak Dynamic = 0.191211 W + Subthreshold Leakage = 0.0157454 W + Gate Leakage = 0.00208738 W + Runtime Dynamic = 0.443377 W + + Load/Store Queue: + Area = 1.17458 mm^2 + Peak Dynamic = 0.128312 W + Subthreshold Leakage = 0.0122603 W + Gate Leakage = 0.0024052 W + Runtime Dynamic = 2.05299 W + + Memory Management Unit: + Area = 1.27751 mm^2 + Peak Dynamic = 0.324071 W + Subthreshold Leakage = 0.0192968 W + Gate Leakage = 0.0049902 W + Runtime Dynamic = 2.53591 W + + Itlb: + Area = 0.560615 mm^2 + Peak Dynamic = 0.117604 W + Subthreshold Leakage = 0.00554488 W + Gate Leakage = 0.00117423 W + Runtime Dynamic = 0.940838 W + + Dtlb: + Area = 0.560615 mm^2 + Peak Dynamic = 0.0294011 W + Subthreshold Leakage = 0.00554488 W + Gate Leakage = 0.00117423 W + Runtime Dynamic = 0.235211 W + + Execution Unit: + Area = 
3.47025 mm^2 + Peak Dynamic = 2.241 W + Subthreshold Leakage = 0.222601 W + Gate Leakage = 0.0296426 W + Runtime Dynamic = 8.75894 W + + Register Files: + Area = 1.38355 mm^2 + Peak Dynamic = 0.0746572 W + Subthreshold Leakage = 0.00827136 W + Gate Leakage = 0.000628178 W + Runtime Dynamic = 0.320633 W + + Integer RF: + Area = 0.592652 mm^2 + Peak Dynamic = 0.0582404 W + Subthreshold Leakage = 0.00161128 W + Gate Leakage = 0.000148771 W + Runtime Dynamic = 0.312722 W + + Floating Point RF: + Area = 0.592652 mm^2 + Peak Dynamic = 0.0164168 W + Subthreshold Leakage = 0.00161128 W + Gate Leakage = 0.000148771 W + Runtime Dynamic = 0.00783962 W + + Register Windows: + Area = 0.198243 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.00504879 W + Gate Leakage = 0.000330636 W + Runtime Dynamic = 7.11291e-05 W + + Instruction Scheduler: + Area = 0.04377 mm^2 + Peak Dynamic = 0.0284368 W + Subthreshold Leakage = 0.000336066 W + Gate Leakage = 5.10703e-05 W + Runtime Dynamic = 0.244528 W + + Instruction Window: + Area = 0.04377 mm^2 + Peak Dynamic = 0.0284368 W + Subthreshold Leakage = 0.000336066 W + Gate Leakage = 5.10703e-05 W + Runtime Dynamic = 0.244528 W + + Integer ALUs (Count: 1 ): + Area = 0.16016 mm^2 + Peak Dynamic = 0.305285 W + Subthreshold Leakage = 0.0321485 W + Gate Leakage = 0.00411202 W + Runtime Dynamic = 2.71365 W + + Floating Point Units (FPUs) (Count: 0.125 ): + Area = 1.16463 mm^2 + Peak Dynamic = 0.0508808 W + Subthreshold Leakage = 0.0584431 W + Gate Leakage = 0.00747528 W + Runtime Dynamic = 0.101762 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.48048 mm^2 + Peak Dynamic = 0.339206 W + Subthreshold Leakage = 0.0964456 W + Gate Leakage = 0.0123361 W + Runtime Dynamic = 0.678411 W + + Results Broadcast Bus: + Area Overhead = 0.0813807 mm^2 + Peak Dynamic = 1.18756 W + Subthreshold Leakage = 0.0187498 W + Gate Leakage = 0.00239823 W + Runtime Dynamic = 3.3401 W + 
+***************************************************************************************** +L2 + Area = 29.0771 mm^2 + Peak Dynamic = 1.37842 W + Subthreshold Leakage = 0.603289 W + Gate Leakage = 0.0606283 W + Runtime Dynamic = 4.00707 W + +***************************************************************************************** +First Level Directory + Area = 2.19368 mm^2 + Peak Dynamic = 0.84647 W + Subthreshold Leakage = 0.0561311 W + Gate Leakage = 0.00802003 W + Runtime Dynamic = 15.1158 W + +***************************************************************************************** +Memory Controller: + Area = 7.86025 mm^2 + Peak Dynamic = 1.13642 W + Subthreshold Leakage = 0.0822383 W + Gate Leakage = 0.0104639 W + Runtime Dynamic = 5.25637 W + + Front End Engine: + Area = 0.63078 mm^2 + Peak Dynamic = 0.0549429 W + Subthreshold Leakage = 0.00242476 W + Gate Leakage = 0.00025524 W + Runtime Dynamic = 0.241753 W + + Transaction Engine: + Area = 2.59502 mm^2 + Peak Dynamic = 0.569482 W + Subthreshold Leakage = 0.0286491 W + Gate Leakage = 0.00366442 W + Runtime Dynamic = 2.50577 W + + PHY: + Area = 4.63445 mm^2 + Peak Dynamic = 0.512 W + Subthreshold Leakage = 0.0511644 W + Gate Leakage = 0.00654429 W + Runtime Dynamic = 2.50885 W + +***************************************************************************************** +NOC + Area = 8.87598 mm^2 + Peak Dynamic = 3.67515 W + Subthreshold Leakage = 0.488892 W + Gate Leakage = 0.0852308 W + Runtime Dynamic = 2.20509 W + + Router: + Area = 4.43799 mm^2 + Peak Dynamic = 1.83757 W + Subthreshold Leakage = 0.244446 W + Gate Leakage = 0.0426154 W + Runtime Dynamic = 2.20509 W + + Virtual Channel Buffer: + Area = 1.22928 mm^2 + Peak Dynamic = 0.0508654 W + Subthreshold Leakage = 0.000485491 W + Gate Leakage = 7.24213e-05 W + Runtime Dynamic = 0.0610385 W + + Crossbar: + Area = 1.35717 mm^2 + Peak Dynamic = 1.77185 W + Subthreshold Leakage = 0.243949 W + Gate Leakage = 0.0425414 W + Runtime Dynamic = 2.12622 W + + 
Arbiter: + Peak Dynamic = 0.0148566 W + Subthreshold Leakage = 1.15783e-05 W + Gate Leakage = 1.54103e-06 W + Runtime Dynamic = 0.0178279 W + +***************************************************************************************** diff --git a/ext/mcpat/results/T1_DC_64 b/ext/mcpat/results/T1_DC_64 new file mode 100644 index 000000000..cdb0a1b3c --- /dev/null +++ b/ext/mcpat/results/T1_DC_64 @@ -0,0 +1,270 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + +line64 +size1.04858e+06 +line9 +size1.04858e+06 + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 22 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 3500 + +***************************************************************************************** +Processor: + Area = 322.362 mm^2 + Peak Power = 112.557 W + Total Leakage = 28.0714 W + Peak Dynamic = 84.4853 W + Subthreshold Leakage = 27.7571 W + Gate Leakage = 0.314289 W + Runtime Dynamic = 13.4278 W + + Total Cores: 64 cores + Device Type= ITRS high performance device type + Area = 87.1986 mm^2 + Peak Dynamic = 42.426 W + Subthreshold Leakage = 7.80232 W + Gate Leakage = 0.0799149 W + Runtime Dynamic = 9.61388 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 161.532 mm^2 + Peak Dynamic = 21.1059 W + Subthreshold Leakage = 8.9583 W + Gate Leakage = 0.100733 W + Runtime Dynamic = 1.14063 W + + Total First Level Directory: + Device Type= ITRS high performance device type + Area = 22.1741 mm^2 + Peak Dynamic = 0.831407 W + Subthreshold Leakage = 1.57123 W + Gate Leakage = 0.0148674 W + Runtime Dynamic = 0.175856 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 51.4571 mm^2 + Peak Dynamic = 20.122 W + Subthreshold Leakage = 9.42527 W + Gate Leakage = 
0.118774 W + Runtime Dynamic = 2.49747 W + +***************************************************************************************** +Core: + Area = 1.36248 mm^2 + Peak Dynamic = 0.662906 W + Subthreshold Leakage = 0.121911 W + Gate Leakage = 0.00124867 W + Runtime Dynamic = 9.61388 W + + Instruction Fetch Unit: + Area = 0.140786 mm^2 + Peak Dynamic = 0.0863256 W + Subthreshold Leakage = 0.00636762 W + Gate Leakage = 7.4998e-05 W + Runtime Dynamic = 2.08883 W + + Instruction Cache: + Area = 0.129377 mm^2 + Peak Dynamic = 0.0476007 W + Subthreshold Leakage = 0.00381804 W + Gate Leakage = 2.35266e-05 W + Runtime Dynamic = 0.0698158 W + + Instruction Buffer: + Area = 0.000754971 mm^2 + Peak Dynamic = 0.00238165 W + Subthreshold Leakage = 4.99334e-05 W + Gate Leakage = 3.27157e-07 W + Runtime Dynamic = 0.0190532 W + + Instruction Decoder: + Area = 0.00131543 mm^2 + Peak Dynamic = 0.0246042 W + Subthreshold Leakage = 0.000538954 W + Gate Leakage = 3.91915e-06 W + Runtime Dynamic = 0.196833 W + + Load Store Unit: + Area = 0.0977414 mm^2 + Peak Dynamic = 0.0587123 W + Subthreshold Leakage = 0.00580883 W + Gate Leakage = 7.48788e-05 W + Runtime Dynamic = 2.07447 W + + Data Cache: + Area = 0.0569223 mm^2 + Peak Dynamic = 0.0329939 W + Subthreshold Leakage = 0.00249221 W + Gate Leakage = 1.63814e-05 W + Runtime Dynamic = 0.0476753 W + + Load/Store Queue: + Area = 0.023444 mm^2 + Peak Dynamic = 0.0139792 W + Subthreshold Leakage = 0.00135593 W + Gate Leakage = 1.12722e-05 W + Runtime Dynamic = 0.223667 W + + Memory Management Unit: + Area = 0.0313997 mm^2 + Peak Dynamic = 0.0446647 W + Subthreshold Leakage = 0.0029577 W + Gate Leakage = 5.57335e-05 W + Runtime Dynamic = 1.92566 W + + Itlb: + Area = 0.0110306 mm^2 + Peak Dynamic = 0.0122535 W + Subthreshold Leakage = 0.000498504 W + Gate Leakage = 4.25417e-06 W + Runtime Dynamic = 0.0980282 W + + Dtlb: + Area = 0.0110306 mm^2 + Peak Dynamic = 0.00306337 W + Subthreshold Leakage = 0.000498504 W + Gate Leakage = 4.25417e-06 W + 
Runtime Dynamic = 0.0245072 W + + Execution Unit: + Area = 0.299667 mm^2 + Peak Dynamic = 0.473204 W + Subthreshold Leakage = 0.0379242 W + Gate Leakage = 0.000384077 W + Runtime Dynamic = 3.52491 W + + Register Files: + Area = 0.0598365 mm^2 + Peak Dynamic = 0.0168768 W + Subthreshold Leakage = 0.0020814 W + Gate Leakage = 1.24237e-05 W + Runtime Dynamic = 0.072481 W + + Integer RF: + Area = 0.0240072 mm^2 + Peak Dynamic = 0.0131657 W + Subthreshold Leakage = 0.000449165 W + Gate Leakage = 3.33111e-06 W + Runtime Dynamic = 0.0706931 W + + Floating Point RF: + Area = 0.0240072 mm^2 + Peak Dynamic = 0.00371113 W + Subthreshold Leakage = 0.000449165 W + Gate Leakage = 3.33111e-06 W + Runtime Dynamic = 0.0017722 W + + Register Windows: + Area = 0.0118221 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.00118307 W + Gate Leakage = 5.76149e-06 W + Runtime Dynamic = 1.56951e-05 W + + Instruction Scheduler: + Area = 0.00263062 mm^2 + Peak Dynamic = 0.00540689 W + Subthreshold Leakage = 8.27524e-05 W + Gate Leakage = 9.38261e-07 W + Runtime Dynamic = 0.0464411 W + + Instruction Window: + Area = 0.00263062 mm^2 + Peak Dynamic = 0.00540689 W + Subthreshold Leakage = 8.27524e-05 W + Gate Leakage = 9.38261e-07 W + Runtime Dynamic = 0.0464411 W + + Integer ALUs (Count: 1 ): + Area = 0.0384544 mm^2 + Peak Dynamic = 0.0946992 W + Subthreshold Leakage = 0.00667865 W + Gate Leakage = 6.39207e-05 W + Runtime Dynamic = 0.841771 W + + Floating Point Units (FPUs) (Count: 0.125 ): + Area = 0.0695899 mm^2 + Peak Dynamic = 0.0157832 W + Subthreshold Leakage = 0.00302155 W + Gate Leakage = 2.89189e-05 W + Runtime Dynamic = 0.0315664 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.115363 mm^2 + Peak Dynamic = 0.105221 W + Subthreshold Leakage = 0.020036 W + Gate Leakage = 0.000191762 W + Runtime Dynamic = 0.210443 W + + Results Broadcast Bus: + Area Overhead = 0.00445381 mm^2 + Peak Dynamic = 0.192955 W + Subthreshold Leakage = 0.00406321 W + Gate Leakage = 3.88886e-05 W + Runtime 
Dynamic = 0.519078 W + +***************************************************************************************** +L2 + Area = 2.52394 mm^2 + Peak Dynamic = 0.32978 W + Subthreshold Leakage = 0.139973 W + Gate Leakage = 0.00157395 W + Runtime Dynamic = 1.14063 W + +***************************************************************************************** +Second Level Directory + Area = 2.77176 mm^2 + Peak Dynamic = 0.103926 W + Subthreshold Leakage = 0.196403 W + Gate Leakage = 0.00185842 W + Runtime Dynamic = 0.175856 W + +***************************************************************************************** +NOC + Area = 51.4571 mm^2 + Peak Dynamic = 20.122 W + Subthreshold Leakage = 9.42527 W + Gate Leakage = 0.118774 W + Runtime Dynamic = 2.49747 W + + Router: + Area = 0.578434 mm^2 + Peak Dynamic = 0.184548 W + Subthreshold Leakage = 0.125515 W + Gate Leakage = 0.0016409 W + Runtime Dynamic = 1.32875 W + + Virtual Channel Buffer: + Area = 0.159162 mm^2 + Peak Dynamic = 0.00394081 W + Subthreshold Leakage = 0.000194478 W + Gate Leakage = 1.84946e-06 W + Runtime Dynamic = 0.0283738 W + + Crossbar: + Area = 0.160976 mm^2 + Peak Dynamic = 0.179891 W + Subthreshold Leakage = 0.12532 W + Gate Leakage = 0.00163905 W + Runtime Dynamic = 1.29522 W + + Arbiter: + Peak Dynamic = 0.000716053 W + Subthreshold Leakage = 3.67148e-07 W + Gate Leakage = 3.86991e-09 W + Runtime Dynamic = 0.00515558 W + + Per Router Links: + Area = 0.225583 mm^2 + Peak Dynamic = 0.129858 W + Subthreshold Leakage = 0.0217549 W + Gate Leakage = 0.000214933 W + Runtime Dynamic = 1.16872 W + +***************************************************************************************** diff --git a/ext/mcpat/results/T1_SBT_64 b/ext/mcpat/results/T1_SBT_64 new file mode 100644 index 000000000..ec8968a19 --- /dev/null +++ b/ext/mcpat/results/T1_SBT_64 @@ -0,0 +1,252 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... 
+ +line72 +size1.17965e+06 + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 22 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 3500 + +***************************************************************************************** +Processor: + Area = 321.412 mm^2 + Peak Power = 114.076 W + Total Leakage = 27.4353 W + Peak Dynamic = 86.6406 W + Subthreshold Leakage = 27.1256 W + Gate Leakage = 0.309772 W + Runtime Dynamic = 13.4064 W + + Total Cores: 64 cores + Device Type= ITRS high performance device type + Area = 87.1986 mm^2 + Peak Dynamic = 42.426 W + Subthreshold Leakage = 7.80232 W + Gate Leakage = 0.0799149 W + Runtime Dynamic = 9.61388 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 182.778 mm^2 + Peak Dynamic = 24.1051 W + Subthreshold Leakage = 9.90006 W + Gate Leakage = 0.111104 W + Runtime Dynamic = 1.29686 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 51.4353 mm^2 + Peak Dynamic = 20.1095 W + Subthreshold Leakage = 9.42317 W + Gate Leakage = 0.118753 W + Runtime Dynamic = 2.4957 W + +***************************************************************************************** +Core: + Area = 1.36248 mm^2 + Peak Dynamic = 0.662906 W + Subthreshold Leakage = 0.121911 W + Gate Leakage = 0.00124867 W + Runtime Dynamic = 9.61388 W + + Instruction Fetch Unit: + Area = 0.140786 mm^2 + Peak Dynamic = 0.0863256 W + Subthreshold Leakage = 0.00636762 W + Gate Leakage = 7.4998e-05 W + Runtime Dynamic = 2.08883 W + + Instruction Cache: + Area = 0.129377 mm^2 + Peak Dynamic = 0.0476007 W + Subthreshold Leakage = 0.00381804 W + Gate Leakage = 2.35266e-05 W + Runtime Dynamic = 0.0698158 W + + Instruction Buffer: + Area = 0.000754971 mm^2 + Peak Dynamic = 0.00238165 W + Subthreshold 
Leakage = 4.99334e-05 W + Gate Leakage = 3.27157e-07 W + Runtime Dynamic = 0.0190532 W + + Instruction Decoder: + Area = 0.00131543 mm^2 + Peak Dynamic = 0.0246042 W + Subthreshold Leakage = 0.000538954 W + Gate Leakage = 3.91915e-06 W + Runtime Dynamic = 0.196833 W + + Load Store Unit: + Area = 0.0977414 mm^2 + Peak Dynamic = 0.0587123 W + Subthreshold Leakage = 0.00580883 W + Gate Leakage = 7.48788e-05 W + Runtime Dynamic = 2.07447 W + + Data Cache: + Area = 0.0569223 mm^2 + Peak Dynamic = 0.0329939 W + Subthreshold Leakage = 0.00249221 W + Gate Leakage = 1.63814e-05 W + Runtime Dynamic = 0.0476753 W + + Load/Store Queue: + Area = 0.023444 mm^2 + Peak Dynamic = 0.0139792 W + Subthreshold Leakage = 0.00135593 W + Gate Leakage = 1.12722e-05 W + Runtime Dynamic = 0.223667 W + + Memory Management Unit: + Area = 0.0313997 mm^2 + Peak Dynamic = 0.0446647 W + Subthreshold Leakage = 0.0029577 W + Gate Leakage = 5.57335e-05 W + Runtime Dynamic = 1.92566 W + + Itlb: + Area = 0.0110306 mm^2 + Peak Dynamic = 0.0122535 W + Subthreshold Leakage = 0.000498504 W + Gate Leakage = 4.25417e-06 W + Runtime Dynamic = 0.0980282 W + + Dtlb: + Area = 0.0110306 mm^2 + Peak Dynamic = 0.00306337 W + Subthreshold Leakage = 0.000498504 W + Gate Leakage = 4.25417e-06 W + Runtime Dynamic = 0.0245072 W + + Execution Unit: + Area = 0.299667 mm^2 + Peak Dynamic = 0.473204 W + Subthreshold Leakage = 0.0379242 W + Gate Leakage = 0.000384077 W + Runtime Dynamic = 3.52491 W + + Register Files: + Area = 0.0598365 mm^2 + Peak Dynamic = 0.0168768 W + Subthreshold Leakage = 0.0020814 W + Gate Leakage = 1.24237e-05 W + Runtime Dynamic = 0.072481 W + + Integer RF: + Area = 0.0240072 mm^2 + Peak Dynamic = 0.0131657 W + Subthreshold Leakage = 0.000449165 W + Gate Leakage = 3.33111e-06 W + Runtime Dynamic = 0.0706931 W + + Floating Point RF: + Area = 0.0240072 mm^2 + Peak Dynamic = 0.00371113 W + Subthreshold Leakage = 0.000449165 W + Gate Leakage = 3.33111e-06 W + Runtime Dynamic = 0.0017722 W + + Register 
Windows: + Area = 0.0118221 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.00118307 W + Gate Leakage = 5.76149e-06 W + Runtime Dynamic = 1.56951e-05 W + + Instruction Scheduler: + Area = 0.00263062 mm^2 + Peak Dynamic = 0.00540689 W + Subthreshold Leakage = 8.27524e-05 W + Gate Leakage = 9.38261e-07 W + Runtime Dynamic = 0.0464411 W + + Instruction Window: + Area = 0.00263062 mm^2 + Peak Dynamic = 0.00540689 W + Subthreshold Leakage = 8.27524e-05 W + Gate Leakage = 9.38261e-07 W + Runtime Dynamic = 0.0464411 W + + Integer ALUs (Count: 1 ): + Area = 0.0384544 mm^2 + Peak Dynamic = 0.0946992 W + Subthreshold Leakage = 0.00667865 W + Gate Leakage = 6.39207e-05 W + Runtime Dynamic = 0.841771 W + + Floating Point Units (FPUs) (Count: 0.125 ): + Area = 0.0695899 mm^2 + Peak Dynamic = 0.0157832 W + Subthreshold Leakage = 0.00302155 W + Gate Leakage = 2.89189e-05 W + Runtime Dynamic = 0.0315664 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.115363 mm^2 + Peak Dynamic = 0.105221 W + Subthreshold Leakage = 0.020036 W + Gate Leakage = 0.000191762 W + Runtime Dynamic = 0.210443 W + + Results Broadcast Bus: + Area Overhead = 0.00445381 mm^2 + Peak Dynamic = 0.192955 W + Subthreshold Leakage = 0.00406321 W + Gate Leakage = 3.88886e-05 W + Runtime Dynamic = 0.519078 W + +***************************************************************************************** +L2 + Area = 2.85591 mm^2 + Peak Dynamic = 0.376642 W + Subthreshold Leakage = 0.154688 W + Gate Leakage = 0.001736 W + Runtime Dynamic = 1.29686 W + +***************************************************************************************** +NOC + Area = 51.4353 mm^2 + Peak Dynamic = 20.1095 W + Subthreshold Leakage = 9.42317 W + Gate Leakage = 0.118753 W + Runtime Dynamic = 2.4957 W + + Router: + Area = 0.578434 mm^2 + Peak Dynamic = 0.184548 W + Subthreshold Leakage = 0.125515 W + Gate Leakage = 0.0016409 W + Runtime Dynamic = 1.32875 W + + Virtual Channel Buffer: + Area = 0.159162 mm^2 + Peak Dynamic = 
0.00394081 W + Subthreshold Leakage = 0.000194478 W + Gate Leakage = 1.84946e-06 W + Runtime Dynamic = 0.0283738 W + + Crossbar: + Area = 0.160976 mm^2 + Peak Dynamic = 0.179891 W + Subthreshold Leakage = 0.12532 W + Gate Leakage = 0.00163905 W + Runtime Dynamic = 1.29522 W + + Arbiter: + Peak Dynamic = 0.000716053 W + Subthreshold Leakage = 3.67148e-07 W + Gate Leakage = 3.86991e-09 W + Runtime Dynamic = 0.00515558 W + + Per Router Links: + Area = 0.225243 mm^2 + Peak Dynamic = 0.129662 W + Subthreshold Leakage = 0.0217221 W + Gate Leakage = 0.000214609 W + Runtime Dynamic = 1.16696 W + +***************************************************************************************** diff --git a/ext/mcpat/results/T1_ST_64 b/ext/mcpat/results/T1_ST_64 new file mode 100644 index 000000000..f3d95b541 --- /dev/null +++ b/ext/mcpat/results/T1_ST_64 @@ -0,0 +1,270 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + +line64 +size1.04858e+06 +line9 +size8.38861e+06 + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 22 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 3500 + +***************************************************************************************** +Processor: + Area = 358.016 mm^2 + Peak Power = 168.519 W + Total Leakage = 30.8855 W + Peak Dynamic = 137.634 W + Subthreshold Leakage = 30.5351 W + Gate Leakage = 0.350385 W + Runtime Dynamic = 84.2366 W + + Total Cores: 64 cores + Device Type= ITRS high performance device type + Area = 87.1986 mm^2 + Peak Dynamic = 42.426 W + Subthreshold Leakage = 7.80232 W + Gate Leakage = 0.0799149 W + Runtime Dynamic = 9.61388 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 161.532 mm^2 + Peak Dynamic = 21.1059 W + Subthreshold Leakage = 8.9583 W + Gate 
Leakage = 0.100733 W + Runtime Dynamic = 1.14063 W + + Total First Level Directory: + Device Type= ITRS high performance device type + Area = 57.033 mm^2 + Peak Dynamic = 53.5219 W + Subthreshold Leakage = 4.27249 W + Gate Leakage = 0.050206 W + Runtime Dynamic = 70.9203 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 52.2524 mm^2 + Peak Dynamic = 20.5798 W + Subthreshold Leakage = 9.50197 W + Gate Leakage = 0.119531 W + Runtime Dynamic = 2.56185 W + +***************************************************************************************** +Core: + Area = 1.36248 mm^2 + Peak Dynamic = 0.662906 W + Subthreshold Leakage = 0.121911 W + Gate Leakage = 0.00124867 W + Runtime Dynamic = 9.61388 W + + Instruction Fetch Unit: + Area = 0.140786 mm^2 + Peak Dynamic = 0.0863256 W + Subthreshold Leakage = 0.00636762 W + Gate Leakage = 7.4998e-05 W + Runtime Dynamic = 2.08883 W + + Instruction Cache: + Area = 0.129377 mm^2 + Peak Dynamic = 0.0476007 W + Subthreshold Leakage = 0.00381804 W + Gate Leakage = 2.35266e-05 W + Runtime Dynamic = 0.0698158 W + + Instruction Buffer: + Area = 0.000754971 mm^2 + Peak Dynamic = 0.00238165 W + Subthreshold Leakage = 4.99334e-05 W + Gate Leakage = 3.27157e-07 W + Runtime Dynamic = 0.0190532 W + + Instruction Decoder: + Area = 0.00131543 mm^2 + Peak Dynamic = 0.0246042 W + Subthreshold Leakage = 0.000538954 W + Gate Leakage = 3.91915e-06 W + Runtime Dynamic = 0.196833 W + + Load Store Unit: + Area = 0.0977414 mm^2 + Peak Dynamic = 0.0587123 W + Subthreshold Leakage = 0.00580883 W + Gate Leakage = 7.48788e-05 W + Runtime Dynamic = 2.07447 W + + Data Cache: + Area = 0.0569223 mm^2 + Peak Dynamic = 0.0329939 W + Subthreshold Leakage = 0.00249221 W + Gate Leakage = 1.63814e-05 W + Runtime Dynamic = 0.0476753 W + + Load/Store Queue: + Area = 0.023444 mm^2 + Peak Dynamic = 0.0139792 W + Subthreshold Leakage = 0.00135593 W + Gate Leakage = 1.12722e-05 W + Runtime Dynamic = 0.223667 W + + Memory Management 
Unit: + Area = 0.0313997 mm^2 + Peak Dynamic = 0.0446647 W + Subthreshold Leakage = 0.0029577 W + Gate Leakage = 5.57335e-05 W + Runtime Dynamic = 1.92566 W + + Itlb: + Area = 0.0110306 mm^2 + Peak Dynamic = 0.0122535 W + Subthreshold Leakage = 0.000498504 W + Gate Leakage = 4.25417e-06 W + Runtime Dynamic = 0.0980282 W + + Dtlb: + Area = 0.0110306 mm^2 + Peak Dynamic = 0.00306337 W + Subthreshold Leakage = 0.000498504 W + Gate Leakage = 4.25417e-06 W + Runtime Dynamic = 0.0245072 W + + Execution Unit: + Area = 0.299667 mm^2 + Peak Dynamic = 0.473204 W + Subthreshold Leakage = 0.0379242 W + Gate Leakage = 0.000384077 W + Runtime Dynamic = 3.52491 W + + Register Files: + Area = 0.0598365 mm^2 + Peak Dynamic = 0.0168768 W + Subthreshold Leakage = 0.0020814 W + Gate Leakage = 1.24237e-05 W + Runtime Dynamic = 0.072481 W + + Integer RF: + Area = 0.0240072 mm^2 + Peak Dynamic = 0.0131657 W + Subthreshold Leakage = 0.000449165 W + Gate Leakage = 3.33111e-06 W + Runtime Dynamic = 0.0706931 W + + Floating Point RF: + Area = 0.0240072 mm^2 + Peak Dynamic = 0.00371113 W + Subthreshold Leakage = 0.000449165 W + Gate Leakage = 3.33111e-06 W + Runtime Dynamic = 0.0017722 W + + Register Windows: + Area = 0.0118221 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.00118307 W + Gate Leakage = 5.76149e-06 W + Runtime Dynamic = 1.56951e-05 W + + Instruction Scheduler: + Area = 0.00263062 mm^2 + Peak Dynamic = 0.00540689 W + Subthreshold Leakage = 8.27524e-05 W + Gate Leakage = 9.38261e-07 W + Runtime Dynamic = 0.0464411 W + + Instruction Window: + Area = 0.00263062 mm^2 + Peak Dynamic = 0.00540689 W + Subthreshold Leakage = 8.27524e-05 W + Gate Leakage = 9.38261e-07 W + Runtime Dynamic = 0.0464411 W + + Integer ALUs (Count: 1 ): + Area = 0.0384544 mm^2 + Peak Dynamic = 0.0946992 W + Subthreshold Leakage = 0.00667865 W + Gate Leakage = 6.39207e-05 W + Runtime Dynamic = 0.841771 W + + Floating Point Units (FPUs) (Count: 0.125 ): + Area = 0.0695899 mm^2 + Peak Dynamic = 0.0157832 W 
+ Subthreshold Leakage = 0.00302155 W + Gate Leakage = 2.89189e-05 W + Runtime Dynamic = 0.0315664 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.115363 mm^2 + Peak Dynamic = 0.105221 W + Subthreshold Leakage = 0.020036 W + Gate Leakage = 0.000191762 W + Runtime Dynamic = 0.210443 W + + Results Broadcast Bus: + Area Overhead = 0.00445381 mm^2 + Peak Dynamic = 0.192955 W + Subthreshold Leakage = 0.00406321 W + Gate Leakage = 3.88886e-05 W + Runtime Dynamic = 0.519078 W + +***************************************************************************************** +L2 + Area = 2.52394 mm^2 + Peak Dynamic = 0.32978 W + Subthreshold Leakage = 0.139973 W + Gate Leakage = 0.00157395 W + Runtime Dynamic = 1.14063 W + +***************************************************************************************** +Second Level Directory + Area = 57.033 mm^2 + Peak Dynamic = 53.5219 W + Subthreshold Leakage = 4.27249 W + Gate Leakage = 0.050206 W + Runtime Dynamic = 70.9203 W + +***************************************************************************************** +NOC + Area = 52.2524 mm^2 + Peak Dynamic = 20.5798 W + Subthreshold Leakage = 9.50197 W + Gate Leakage = 0.119531 W + Runtime Dynamic = 2.56185 W + + Router: + Area = 0.578434 mm^2 + Peak Dynamic = 0.184548 W + Subthreshold Leakage = 0.125515 W + Gate Leakage = 0.0016409 W + Runtime Dynamic = 1.32875 W + + Virtual Channel Buffer: + Area = 0.159162 mm^2 + Peak Dynamic = 0.00394081 W + Subthreshold Leakage = 0.000194478 W + Gate Leakage = 1.84946e-06 W + Runtime Dynamic = 0.0283738 W + + Crossbar: + Area = 0.160976 mm^2 + Peak Dynamic = 0.179891 W + Subthreshold Leakage = 0.12532 W + Gate Leakage = 0.00163905 W + Runtime Dynamic = 1.29522 W + + Arbiter: + Peak Dynamic = 0.000716053 W + Subthreshold Leakage = 3.67148e-07 W + Gate Leakage = 3.86991e-09 W + Runtime Dynamic = 0.00515558 W + + Per Router Links: + Area = 0.238009 mm^2 + Peak Dynamic = 0.137011 W + Subthreshold Leakage = 0.0229533 W + Gate Leakage = 
0.000226773 W + Runtime Dynamic = 1.2331 W + +***************************************************************************************** diff --git a/ext/mcpat/results/T2 b/ext/mcpat/results/T2 new file mode 100644 index 000000000..e24701ab2 --- /dev/null +++ b/ext/mcpat/results/T2 @@ -0,0 +1,321 @@ +McPAT (version 0.8 of Aug, 2010) is computing the target processor... + + +McPAT (version 0.8 of Aug, 2010) results (current print level is 5) +***************************************************************************************** + Technology 65 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 1400 + +***************************************************************************************** +Processor: + Area = 277.068 mm^2 + Peak Power = 71.8237 W + Total Leakage = 18.2234 W + Peak Dynamic = 53.6003 W + Subthreshold Leakage = 14.7124 W + Gate Leakage = 3.51096 W + Runtime Dynamic = 48.652 W + + Total Cores: 8 cores + Device Type= ITRS high performance device type + Area = 116.441 mm^2 + Peak Dynamic = 28.0277 W + Subthreshold Leakage = 9.00023 W + Gate Leakage = 1.93139 W + Runtime Dynamic = 27.9237 W + + Total L2s: + Device Type= ITRS high performance device type + Area = 85.0391 mm^2 + Peak Dynamic = 9.87481 W + Subthreshold Leakage = 2.71188 W + Gate Leakage = 0.684324 W + Runtime Dynamic = 3.97632 W + + Total First Level Directory: + Device Type= ITRS high performance device type + Area = 11.6417 mm^2 + Peak Dynamic = 5.32369 W + Subthreshold Leakage = 0.249885 W + Gate Leakage = 0.107486 W + Runtime Dynamic = 5.38275 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 9.56584 mm^2 + Peak Dynamic = 1.07754 W + Subthreshold Leakage = 1.61961 W + Gate Leakage = 0.389994 W + Runtime Dynamic = 1.07754 W + + Total MCs: 4 Memory Controllers + Device Type= ITRS high performance device type + Area = 32.2777 mm^2 + Peak Dynamic = 
5.92507 W + Subthreshold Leakage = 0.559071 W + Gate Leakage = 0.10416 W + Runtime Dynamic = 7.93157 W + + Total NIUs: 2 Network Interface Units + Device Type= ITRS high performance device type + Area = 15.8633 mm^2 + Peak Dynamic = 1.86482 W + Subthreshold Leakage = 0.357626 W + Gate Leakage = 0.183662 W + Runtime Dynamic = 1.30537 W + + Total PCIes: 1 PCIe Controllers + Device Type= ITRS high performance device type + Area = 6.24 mm^2 + Peak Dynamic = 1.5067 W + Subthreshold Leakage = 0.214091 W + Gate Leakage = 0.109948 W + Runtime Dynamic = 1.05469 W + +***************************************************************************************** +Core: + Area = 14.5551 mm^2 + Peak Dynamic = 3.50346 W + Subthreshold Leakage = 1.12503 W + Gate Leakage = 0.241423 W + Runtime Dynamic = 27.9237 W + + Instruction Fetch Unit: + Area = 2.75911 mm^2 + Peak Dynamic = 0.817936 W + Subthreshold Leakage = 0.0912466 W + Gate Leakage = 0.0284483 W + Runtime Dynamic = 4.81754 W + + Instruction Cache: + Area = 2.51671 mm^2 + Peak Dynamic = 0.513783 W + Subthreshold Leakage = 0.062355 W + Gate Leakage = 0.0164185 W + Runtime Dynamic = 1.59033 W + + Instruction Buffer: + Area = 0.0130935 mm^2 + Peak Dynamic = 0.0100268 W + Subthreshold Leakage = 0.000434992 W + Gate Leakage = 6.02581e-05 W + Runtime Dynamic = 0.160429 W + + Instruction Decoder: + Area = 0.0119193 mm^2 + Peak Dynamic = 0.0892213 W + Subthreshold Leakage = 0.00298091 W + Gate Leakage = 0.000408973 W + Runtime Dynamic = 1.42754 W + + Load Store Unit: + Area = 2.14252 mm^2 + Peak Dynamic = 0.487978 W + Subthreshold Leakage = 0.0802768 W + Gate Leakage = 0.0247378 W + Runtime Dynamic = 10.9331 W + + Data Cache: + Area = 0.52868 mm^2 + Peak Dynamic = 0.0991646 W + Subthreshold Leakage = 0.0119043 W + Gate Leakage = 0.00145618 W + Runtime Dynamic = 0.1303 W + + Load/Store Queue: + Area = 1.22144 mm^2 + Peak Dynamic = 0.286361 W + Subthreshold Leakage = 0.0428969 W + Gate Leakage = 0.011721 W + Runtime Dynamic = 9.16355 W + 
+ Memory Management Unit: + Area = 1.1006 mm^2 + Peak Dynamic = 0.399121 W + Subthreshold Leakage = 0.0527367 W + Gate Leakage = 0.0195353 W + Runtime Dynamic = 2.78316 W + + Itlb: + Area = 0.293144 mm^2 + Peak Dynamic = 0.0743045 W + Subthreshold Leakage = 0.00720086 W + Gate Leakage = 0.00218791 W + Runtime Dynamic = 0.594438 W + + Dtlb: + Area = 0.590071 mm^2 + Peak Dynamic = 0.0686851 W + Subthreshold Leakage = 0.0200602 W + Gate Leakage = 0.00578676 W + Runtime Dynamic = 0.549486 W + + Execution Unit: + Area = 6.79584 mm^2 + Peak Dynamic = 1.79843 W + Subthreshold Leakage = 0.610924 W + Gate Leakage = 0.116437 W + Runtime Dynamic = 9.38994 W + + Register Files: + Area = 1.18037 mm^2 + Peak Dynamic = 0.0639548 W + Subthreshold Leakage = 0.00981018 W + Gate Leakage = 0.00106415 W + Runtime Dynamic = 0.401933 W + + Integer RF: + Area = 0.648931 mm^2 + Peak Dynamic = 0.0485174 W + Subthreshold Leakage = 0.00196627 W + Gate Leakage = 0.000259389 W + Runtime Dynamic = 0.392074 W + + Floating Point RF: + Area = 0.324465 mm^2 + Peak Dynamic = 0.0154374 W + Subthreshold Leakage = 0.00196627 W + Gate Leakage = 0.000259389 W + Runtime Dynamic = 0.0098154 W + + Register Windows: + Area = 0.206972 mm^2 + Peak Dynamic = 0 W + Subthreshold Leakage = 0.00587765 W + Gate Leakage = 0.000545372 W + Runtime Dynamic = 4.40062e-05 W + + Instruction Scheduler: + Area = 0.0458096 mm^2 + Peak Dynamic = 0.0333897 W + Subthreshold Leakage = 0.000402487 W + Gate Leakage = 8.61395e-05 W + Runtime Dynamic = 0.287483 W + + Instruction Window: + Area = 0.0458096 mm^2 + Peak Dynamic = 0.0333897 W + Subthreshold Leakage = 0.000402487 W + Gate Leakage = 8.61395e-05 W + Runtime Dynamic = 0.287483 W + + Integer ALUs (Count: 2 ): + Area = 0.448448 mm^2 + Peak Dynamic = 0.425547 W + Subthreshold Leakage = 0.147955 W + Gate Leakage = 0.0266792 W + Runtime Dynamic = 3.78264 W + + Floating Point Units (FPUs) (Count: 1 ): + Area = 4.85979 mm^2 + Peak Dynamic = 0.425547 W + Subthreshold Leakage = 
0.400843 W + Gate Leakage = 0.07228 W + Runtime Dynamic = 0.0709246 W + + Results Broadcast Bus: + Area Overhead = 0.0440413 mm^2 + Peak Dynamic = 0.481158 W + Subthreshold Leakage = 0.0264373 W + Gate Leakage = 0.00476717 W + Runtime Dynamic = 3.20772 W + +***************************************************************************************** +L2 + Area = 10.6299 mm^2 + Peak Dynamic = 1.23435 W + Subthreshold Leakage = 0.338985 W + Gate Leakage = 0.0855405 W + Runtime Dynamic = 3.97632 W + +***************************************************************************************** +First Level Directory + Area = 1.45521 mm^2 + Peak Dynamic = 0.665462 W + Subthreshold Leakage = 0.0312356 W + Gate Leakage = 0.0134358 W + Runtime Dynamic = 5.38275 W + +***************************************************************************************** +Memory Controller: + Area = 8.06942 mm^2 + Peak Dynamic = 1.48127 W + Subthreshold Leakage = 0.139768 W + Gate Leakage = 0.0260401 W + Runtime Dynamic = 7.93157 W + + Front End Engine: + Area = 0.250458 mm^2 + Peak Dynamic = 0.05883 W + Subthreshold Leakage = 0.0029079 W + Gate Leakage = 0.000455875 W + Runtime Dynamic = 0.298069 W + + Transaction Engine: + Area = 2.66058 mm^2 + Peak Dynamic = 0.6912 W + Subthreshold Leakage = 0.0465697 W + Gate Leakage = 0.00870562 W + Runtime Dynamic = 3.50205 W + + PHY: + Area = 5.15838 mm^2 + Peak Dynamic = 0.731237 W + Subthreshold Leakage = 0.0902901 W + Gate Leakage = 0.0168786 W + Runtime Dynamic = 4.13145 W + +***************************************************************************************** +NIU: + Area = 7.93167 mm^2 + Peak Dynamic = 0.93241 W + Subthreshold Leakage = 0.178813 W + Gate Leakage = 0.0918312 W + Runtime Dynamic = 0.652687 W + +***************************************************************************************** +PCIe: + Area = 6.24 mm^2 + Peak Dynamic = 1.5067 W + Subthreshold Leakage = 0.214091 W + Gate Leakage = 0.109948 W + Runtime Dynamic = 1.05469 W + 
+***************************************************************************************** +NOC + Area = 9.56584 mm^2 + Peak Dynamic = 1.07754 W + Subthreshold Leakage = 1.61961 W + Gate Leakage = 0.389994 W + Runtime Dynamic = 1.07754 W + + Router: + Area = 4.78292 mm^2 + Peak Dynamic = 0.538772 W + Subthreshold Leakage = 0.809805 W + Gate Leakage = 0.194997 W + Runtime Dynamic = 1.07754 W + + Virtual Channel Buffer: + Area = 0.827721 mm^2 + Peak Dynamic = 0.0223838 W + Subthreshold Leakage = 0.00314985 W + Gate Leakage = 0.000413272 W + Runtime Dynamic = 0.0447677 W + + Crossbar: + Area = 1.69589 mm^2 + Peak Dynamic = 0.511174 W + Subthreshold Leakage = 0.806641 W + Gate Leakage = 0.194581 W + Runtime Dynamic = 1.02235 W + + Arbiter: + Peak Dynamic = 0.00521447 W + Subthreshold Leakage = 1.42757e-05 W + Gate Leakage = 2.78294e-06 W + Runtime Dynamic = 0.0104289 W + +***************************************************************************************** diff --git a/ext/mcpat/results/Xeon_core b/ext/mcpat/results/Xeon_core new file mode 100644 index 000000000..0cc9ae648 --- /dev/null +++ b/ext/mcpat/results/Xeon_core @@ -0,0 +1,341 @@ +McPAT (version 0.7 of May, 2010) is computing the target processor... 
+ + +McPAT (version 0.7 of May, 2010) results (current print level is 5) +***************************************************************************************** + Technology 65 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 3400 + +***************************************************************************************** +Processor: + Area = 417.445 mm^2 + Peak Power = 142.148 W + Total Leakage = 55.8021 W + Peak Dynamic = 86.3458 W + Subthreshold Leakage = 52.785 W + Gate Leakage = 3.01712 W + Runtime Dynamic = 63.1851 W + + Total Cores: + Device Type= ITRS high performance device type + Area = 133.278 mm^2 + Peak Dynamic = 63.8414 W + Subthreshold Leakage = 32.4393 W + Gate Leakage = 2.72517 W + Runtime Dynamic = 41.616 W + + Total L3s: + Device Type= ITRS high performance device type + Area = 278.612 mm^2 + Peak Dynamic = 6.11346 W + Subthreshold Leakage = 20.1995 W + Gate Leakage = 0.267752 W + Runtime Dynamic = 5.1782 W + + Total NoCs (Network/Bus): + Device Type= ITRS high performance device type + Area = 5.5548 mm^2 + Peak Dynamic = 16.3909 W + Subthreshold Leakage = 0.146229 W + Gate Leakage = 0.0241913 W + Runtime Dynamic = 16.3909 W + +***************************************************************************************** +Core: + Area = 66.6389 mm^2 + Peak Dynamic = 31.9207 W + Subthreshold Leakage = 16.2197 W + Gate Leakage = 1.36259 W + Runtime Dynamic = 41.616 W + + Instruction Fetch Unit: + Area = 7.41271 mm^2 + Peak Dynamic = 5.04492 W + Subthreshold Leakage = 1.26751 W + Gate Leakage = 0.09429 W + Runtime Dynamic = 5.39803 W + + Instruction Cache: + Area = 2.44324 mm^2 + Peak Dynamic = 1.42048 W + Subthreshold Leakage = 0.359444 W + Gate Leakage = 0.0187045 W + Runtime Dynamic = 2.13804 W + + Branch Target Buffer: + Area = 0.729086 mm^2 + Peak Dynamic = 0.161698 W + Subthreshold Leakage = 0.0616324 W + Gate Leakage = 0.00336254 W + Runtime 
Dynamic = 0.646794 W + + Branch Predictor: + Area = 0.430961 mm^2 + Peak Dynamic = 0.188469 W + Subthreshold Leakage = 0.0698834 W + Gate Leakage = 0.00415943 W + Runtime Dynamic = 0.166045 W + + Global Predictor: + Area = 0.174771 mm^2 + Peak Dynamic = 0.0633335 W + Subthreshold Leakage = 0.0274086 W + Gate Leakage = 0.00158249 W + Runtime Dynamic = 0.0633335 W + + Local Predictor: + Area = 0.0735854 mm^2 + Peak Dynamic = 0.0393754 W + Subthreshold Leakage = 0.0111166 W + Gate Leakage = 0.000721196 W + Runtime Dynamic = 0.0393754 W + + Area = 0.0507308 mm^2 + Peak Dynamic = 0.0258383 W + Subthreshold Leakage = 0.00749994 W + Gate Leakage = 0.000498805 W + Runtime Dynamic = 0.0258383 W + + Chooser: + Area = 0.174771 mm^2 + Peak Dynamic = 0.0633335 W + Subthreshold Leakage = 0.0274086 W + Gate Leakage = 0.00158249 W + Runtime Dynamic = 0.0633335 W + + RAS: + Area = 0.0613744 mm^2 + Peak Dynamic = 0.0224266 W + Subthreshold Leakage = 0.00394955 W + Gate Leakage = 0.000273252 W + Runtime Dynamic = 2.51602e-06 W + + Instruction Buffer: + Area = 0.0684348 mm^2 + Peak Dynamic = 0.704461 W + Subthreshold Leakage = 0.00411741 W + Gate Leakage = 0.000240288 W + Runtime Dynamic = 0.46964 W + + Instruction Decoder: + Area = 3.73007 mm^2 + Peak Dynamic = 1.97751 W + Subthreshold Leakage = 0.733056 W + Gate Leakage = 0.0575912 W + Runtime Dynamic = 1.97751 W + + Renaming Unit: + Area = 1.82421 mm^2 + Peak Dynamic = 2.76284 W + Subthreshold Leakage = 0.0765654 W + Gate Leakage = 0.0125478 W + Runtime Dynamic = 1.94438 W + + Int Front End RAT: + Area = 0.875874 mm^2 + Peak Dynamic = 1.249 W + Subthreshold Leakage = 0.0113878 W + Gate Leakage = 0.000693471 W + Runtime Dynamic = 1.249 W + + FP Front End RAT: + Area = 0.405459 mm^2 + Peak Dynamic = 0.610062 W + Subthreshold Leakage = 0.0144803 W + Gate Leakage = 0.000906674 W + Runtime Dynamic = 0.305031 W + + Free List: + Area = 0.297629 mm^2 + Peak Dynamic = 0.137664 W + Subthreshold Leakage = 0.0054316 W + Gate Leakage = 
0.000326171 W + Runtime Dynamic = 0.275328 W + + Int Retire RAT: + Area = 0.0530903 mm^2 + Peak Dynamic = 0.056222 W + Subthreshold Leakage = 0.00135314 W + Gate Leakage = 0.00011607 W + Runtime Dynamic = 0.056222 W + + FP Retire RAT: + Area = 0.018828 mm^2 + Peak Dynamic = 0.0186388 W + Subthreshold Leakage = 0.000788229 W + Gate Leakage = 6.41952e-05 W + Runtime Dynamic = 0.00931941 W + + FP Free List: + Area = 0.162422 mm^2 + Peak Dynamic = 0.0989385 W + Subthreshold Leakage = 0.00375181 W + Gate Leakage = 0.000209083 W + Runtime Dynamic = 0.0494693 W + + Load Store Unit: + Area = 4.35998 mm^2 + Peak Dynamic = 2.94939 W + Subthreshold Leakage = 0.208781 W + Gate Leakage = 0.0232213 W + Runtime Dynamic = 3.60184 W + + Data Cache: + Area = 2.2051 mm^2 + Peak Dynamic = 1.08067 W + Subthreshold Leakage = 0.0877157 W + Gate Leakage = 0.00573003 W + Runtime Dynamic = 2.30478 W + + LoadQ: + Area = 0.637121 mm^2 + Peak Dynamic = 0.551016 W + Subthreshold Leakage = 0.0283256 W + Gate Leakage = 0.00254841 W + Runtime Dynamic = 0.275508 W + + StoreQ: + Area = 0.809965 mm^2 + Peak Dynamic = 1.02155 W + Subthreshold Leakage = 0.053367 W + Gate Leakage = 0.00471074 W + Runtime Dynamic = 1.02155 W + + Memory Management Unit: + Area = 0.517456 mm^2 + Peak Dynamic = 0.979218 W + Subthreshold Leakage = 0.0808171 W + Gate Leakage = 0.0139952 W + Runtime Dynamic = 1.66678 W + + Itlb: + Area = 0.127123 mm^2 + Peak Dynamic = 0.236587 W + Subthreshold Leakage = 0.0160962 W + Gate Leakage = 0.00146431 W + Runtime Dynamic = 0.473177 W + + Dtlb: + Area = 0.379422 mm^2 + Peak Dynamic = 0.298399 W + Subthreshold Leakage = 0.0253484 W + Gate Leakage = 0.00229878 W + Runtime Dynamic = 1.1936 W + + Execution Unit: + Area = 27.5381 mm^2 + Peak Dynamic = 16.9637 W + Subthreshold Leakage = 7.08185 W + Gate Leakage = 0.73316 W + Runtime Dynamic = 22.7198 W + + Register Files: + Area = 11.2548 mm^2 + Peak Dynamic = 3.2925 W + Subthreshold Leakage = 0.11111 W + Gate Leakage = 0.00754256 W + Runtime 
Dynamic = 1.69823 W + + Integer RF: + Area = 7.55916 mm^2 + Peak Dynamic = 2.82012 W + Subthreshold Leakage = 0.0664048 W + Gate Leakage = 0.00458288 W + Runtime Dynamic = 1.51078 W + + Floating Point RF: + Area = 3.69565 mm^2 + Peak Dynamic = 0.472385 W + Subthreshold Leakage = 0.0447053 W + Gate Leakage = 0.00295968 W + Runtime Dynamic = 0.187454 W + + Instruction Scheduler: + Area = 2.08681 mm^2 + Peak Dynamic = 2.1684 W + Subthreshold Leakage = 0.0325294 W + Gate Leakage = 0.00296372 W + Runtime Dynamic = 2.59089 W + + Instruction Window: + Area = 0.287309 mm^2 + Peak Dynamic = 0.929972 W + Subthreshold Leakage = 0.0127376 W + Gate Leakage = 0.00137073 W + Runtime Dynamic = 1.2089 W + + FP Instruction Window: + Area = 0.128977 mm^2 + Peak Dynamic = 0.478661 W + Subthreshold Leakage = 0.00802287 W + Gate Leakage = 0.000873414 W + Runtime Dynamic = 0.622222 W + + ROB: + Area = 1.67052 mm^2 + Peak Dynamic = 0.759764 W + Subthreshold Leakage = 0.0117689 W + Gate Leakage = 0.000719579 W + Runtime Dynamic = 0.759764 W + + Integer ALUs (Count: 6 ): + Area = 4.03603 mm^2 + Peak Dynamic = 4.55818 W + Subthreshold Leakage = 3.9898 W + Gate Leakage = 0.412015 W + Runtime Dynamic = 2.33394 W + + Floating Point Units (FPUs) (Count: 2 ): + Area = 9.71959 mm^2 + Peak Dynamic = 1.43327 W + Subthreshold Leakage = 2.40207 W + Gate Leakage = 0.248054 W + Runtime Dynamic = 2.55333 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.336336 mm^2 + Peak Dynamic = 0.510666 W + Subthreshold Leakage = 0.332484 W + Gate Leakage = 0.0343346 W + Runtime Dynamic = 3.18505 W + + Results Broadcast Bus: + Area Overhead = 0.0936618 mm^2 + Peak Dynamic = 4.4084 W + Subthreshold Leakage = 0.174486 W + Gate Leakage = 0.0180186 W + Runtime Dynamic = 10.3584 W + + L2 + Area = 15.914 mm^2 + Peak Dynamic = 3.22061 W + Subthreshold Leakage = 3.01991 W + Gate Leakage = 0.0223008 W + Runtime Dynamic = 6.28514 W + +***************************************************************************************** 
+ L3 + Area = 278.612 mm^2 + Peak Dynamic = 6.11346 W + Subthreshold Leakage = 20.1995 W + Gate Leakage = 0.267752 W + Runtime Dynamic = 5.1782 W + +***************************************************************************************** +BUSES + Area = 5.5548 mm^2 + Peak Dynamic = 16.3909 W + Subthreshold Leakage = 0.146229 W + Gate Leakage = 0.0241913 W + Runtime Dynamic = 16.3909 W + + Bus: + Area = 5.5548 mm^2 + Peak Dynamic = 16.3909 W + Subthreshold Leakage = 0.146229 W + Gate Leakage = 0.0241913 W + Runtime Dynamic = 16.3909 W + +***************************************************************************************** diff --git a/ext/mcpat/results/Xeon_uncore b/ext/mcpat/results/Xeon_uncore new file mode 100644 index 000000000..558331c25 --- /dev/null +++ b/ext/mcpat/results/Xeon_uncore @@ -0,0 +1,341 @@ +McPAT (version 0.7 of May, 2010) is computing the target processor... + + +McPAT (version 0.7 of May, 2010) results (current print level is 5) +***************************************************************************************** + Technology 65 nm + Using Long Channel Devices When Appropriate + Interconnect metal projection= aggressive interconnect technology projection + Core clock Rate(MHz) 3400 + +***************************************************************************************** +Processor: + Area = 418.629 mm^2 + Peak Power = 96.2032 W + Total Leakage = 27.5568 W + Peak Dynamic = 68.6463 W + Subthreshold Leakage = 25.8287 W + Gate Leakage = 1.72809 W + Runtime Dynamic = 50.332 W + + Total Cores: + Device Type= ITRS high performance device type + Area = 134.217 mm^2 + Peak Dynamic = 50.8677 W + Subthreshold Leakage = 15.0187 W + Gate Leakage = 1.57092 W + Runtime Dynamic = 33.3003 W + + Total L3s: + Device Type= ITRS high performance device type + Area = 278.843 mm^2 + Peak Dynamic = 4.84476 W + Subthreshold Leakage = 10.7416 W + Gate Leakage = 0.144361 W + Runtime Dynamic = 4.09781 W + + Total NoCs (Network/Bus): + Device Type= ITRS high 
performance device type + Area = 5.56828 mm^2 + Peak Dynamic = 12.9339 W + Subthreshold Leakage = 0.0684953 W + Gate Leakage = 0.0128043 W + Runtime Dynamic = 12.9339 W + +***************************************************************************************** +Core: + Area = 67.1085 mm^2 + Peak Dynamic = 25.4338 W + Subthreshold Leakage = 7.50933 W + Gate Leakage = 0.78546 W + Runtime Dynamic = 33.3003 W + + Instruction Fetch Unit: + Area = 7.56843 mm^2 + Peak Dynamic = 4.27305 W + Subthreshold Leakage = 0.571346 W + Gate Leakage = 0.0523885 W + Runtime Dynamic = 4.67953 W + + Instruction Cache: + Area = 2.44678 mm^2 + Peak Dynamic = 1.1785 W + Subthreshold Leakage = 0.151766 W + Gate Leakage = 0.009764 W + Runtime Dynamic = 1.7926 W + + Branch Target Buffer: + Area = 0.718635 mm^2 + Peak Dynamic = 0.151619 W + Subthreshold Leakage = 0.0238082 W + Gate Leakage = 0.0015503 W + Runtime Dynamic = 0.606475 W + + Branch Predictor: + Area = 0.446844 mm^2 + Peak Dynamic = 0.158508 W + Subthreshold Leakage = 0.0293041 W + Gate Leakage = 0.0021362 W + Runtime Dynamic = 0.14087 W + + Global Predictor: + Area = 0.174801 mm^2 + Peak Dynamic = 0.0543932 W + Subthreshold Leakage = 0.0116121 W + Gate Leakage = 0.000827171 W + Runtime Dynamic = 0.0543932 W + + Local Predictor: + Area = 0.0788692 mm^2 + Peak Dynamic = 0.0320817 W + Subthreshold Leakage = 0.00452837 W + Gate Leakage = 0.000354718 W + Runtime Dynamic = 0.0320817 W + + Area = 0.050748 mm^2 + Peak Dynamic = 0.0218669 W + Subthreshold Leakage = 0.00318852 W + Gate Leakage = 0.000264126 W + Runtime Dynamic = 0.0218669 W + + Chooser: + Area = 0.174801 mm^2 + Peak Dynamic = 0.0543932 W + Subthreshold Leakage = 0.0116121 W + Gate Leakage = 0.000827171 W + Runtime Dynamic = 0.0543932 W + + RAS: + Area = 0.0929863 mm^2 + Peak Dynamic = 0.0176394 W + Subthreshold Leakage = 0.00155163 W + Gate Leakage = 0.00012714 W + Runtime Dynamic = 1.96119e-06 W + + Instruction Buffer: + Area = 0.0687233 mm^2 + Peak Dynamic = 0.579633 W + 
Subthreshold Leakage = 0.00177049 W + Gate Leakage = 0.000129185 W + Runtime Dynamic = 0.386422 W + + Instruction Decoder: + Area = 3.87654 mm^2 + Peak Dynamic = 1.75316 W + Subthreshold Leakage = 0.348225 W + Gate Leakage = 0.0335628 W + Runtime Dynamic = 1.75316 W + + Renaming Unit: + Area = 1.83366 mm^2 + Peak Dynamic = 2.16025 W + Subthreshold Leakage = 0.0324638 W + Gate Leakage = 0.00648876 W + Runtime Dynamic = 1.53428 W + + Int Front End RAT: + Area = 0.879521 mm^2 + Peak Dynamic = 0.975897 W + Subthreshold Leakage = 0.00490782 W + Gate Leakage = 0.000372282 W + Runtime Dynamic = 0.975897 W + + FP Front End RAT: + Area = 0.407642 mm^2 + Peak Dynamic = 0.477469 W + Subthreshold Leakage = 0.00619591 W + Gate Leakage = 0.000483134 W + Runtime Dynamic = 0.238735 W + + Free List: + Area = 0.300513 mm^2 + Peak Dynamic = 0.112906 W + Subthreshold Leakage = 0.00233243 W + Gate Leakage = 0.000174984 W + Runtime Dynamic = 0.225813 W + + Int Retire RAT: + Area = 0.0534147 mm^2 + Peak Dynamic = 0.0453154 W + Subthreshold Leakage = 0.00058142 W + Gate Leakage = 6.26682e-05 W + Runtime Dynamic = 0.0453154 W + + FP Retire RAT: + Area = 0.018897 mm^2 + Peak Dynamic = 0.0151716 W + Subthreshold Leakage = 0.000337803 W + Gate Leakage = 3.45545e-05 W + Runtime Dynamic = 0.00758578 W + + FP Free List: + Area = 0.162758 mm^2 + Peak Dynamic = 0.081858 W + Subthreshold Leakage = 0.00163685 W + Gate Leakage = 0.000115075 W + Runtime Dynamic = 0.040929 W + + Load Store Unit: + Area = 4.4281 mm^2 + Peak Dynamic = 2.34722 W + Subthreshold Leakage = 0.0896936 W + Gate Leakage = 0.0121845 W + Runtime Dynamic = 2.89901 W + + Data Cache: + Area = 2.25853 mm^2 + Peak Dynamic = 0.888323 W + Subthreshold Leakage = 0.0382167 W + Gate Leakage = 0.00311455 W + Runtime Dynamic = 1.88387 W + + LoadQ: + Area = 0.638298 mm^2 + Peak Dynamic = 0.435889 W + Subthreshold Leakage = 0.0121526 W + Gate Leakage = 0.00134375 W + Runtime Dynamic = 0.217944 W + + StoreQ: + Area = 0.811765 mm^2 + Peak Dynamic 
= 0.79719 W + Subthreshold Leakage = 0.0228527 W + Gate Leakage = 0.00248017 W + Runtime Dynamic = 0.79719 W + + Memory Management Unit: + Area = 0.518866 mm^2 + Peak Dynamic = 0.760463 W + Subthreshold Leakage = 0.0342246 W + Gate Leakage = 0.00722713 W + Runtime Dynamic = 1.31193 W + + Itlb: + Area = 0.12744 mm^2 + Peak Dynamic = 0.187517 W + Subthreshold Leakage = 0.00686539 W + Gate Leakage = 0.000767441 W + Runtime Dynamic = 0.375037 W + + Dtlb: + Area = 0.380515 mm^2 + Peak Dynamic = 0.234221 W + Subthreshold Leakage = 0.0108877 W + Gate Leakage = 0.00121362 W + Runtime Dynamic = 0.936886 W + + Execution Unit: + Area = 27.5564 mm^2 + Peak Dynamic = 13.34 W + Subthreshold Leakage = 3.35055 W + Gate Leakage = 0.425 W + Runtime Dynamic = 17.8618 W + + Register Files: + Area = 11.2668 mm^2 + Peak Dynamic = 2.65925 W + Subthreshold Leakage = 0.0472795 W + Gate Leakage = 0.00398463 W + Runtime Dynamic = 1.37147 W + + Integer RF: + Area = 7.56635 mm^2 + Peak Dynamic = 2.27672 W + Subthreshold Leakage = 0.0282472 W + Gate Leakage = 0.00241709 W + Runtime Dynamic = 1.21967 W + + Floating Point RF: + Area = 3.70048 mm^2 + Peak Dynamic = 0.382527 W + Subthreshold Leakage = 0.0190323 W + Gate Leakage = 0.00156754 W + Runtime Dynamic = 0.151797 W + + Instruction Scheduler: + Area = 2.09118 mm^2 + Peak Dynamic = 1.7092 W + Subthreshold Leakage = 0.0139125 W + Gate Leakage = 0.00156067 W + Runtime Dynamic = 2.04197 W + + Instruction Window: + Area = 0.287606 mm^2 + Peak Dynamic = 0.721714 W + Subthreshold Leakage = 0.00547415 W + Gate Leakage = 0.000721338 W + Runtime Dynamic = 0.940723 W + + FP Instruction Window: + Area = 0.129287 mm^2 + Peak Dynamic = 0.372875 W + Subthreshold Leakage = 0.0034355 W + Gate Leakage = 0.00045775 W + Runtime Dynamic = 0.486639 W + + ROB: + Area = 1.67428 mm^2 + Peak Dynamic = 0.61461 W + Subthreshold Leakage = 0.00500288 W + Gate Leakage = 0.00038158 W + Runtime Dynamic = 0.61461 W + + Integer ALUs (Count: 6 ): + Area = 4.03603 mm^2 + Peak 
Dynamic = 3.52986 W + Subthreshold Leakage = 1.89726 W + Gate Leakage = 0.240113 W + Runtime Dynamic = 1.8074 W + + Floating Point Units (FPUs) (Count: 2 ): + Area = 9.71959 mm^2 + Peak Dynamic = 1.10993 W + Subthreshold Leakage = 1.14225 W + Gate Leakage = 0.14456 W + Runtime Dynamic = 1.9773 W + + Complex ALUs (Mul/Div) (Count: 1 ): + Area = 0.336336 mm^2 + Peak Dynamic = 0.405148 W + Subthreshold Leakage = 0.158105 W + Gate Leakage = 0.0200094 W + Runtime Dynamic = 2.4988 W + + Results Broadcast Bus: + Area Overhead = 0.0954831 mm^2 + Peak Dynamic = 3.47499 W + Subthreshold Leakage = 0.0752739 W + Gate Leakage = 0.00952648 W + Runtime Dynamic = 8.1649 W + + L2 + Area = 16.1307 mm^2 + Peak Dynamic = 2.55285 W + Subthreshold Leakage = 1.29868 W + Gate Leakage = 0.012304 W + Runtime Dynamic = 5.01368 W + +***************************************************************************************** + L3 + Area = 278.843 mm^2 + Peak Dynamic = 4.84476 W + Subthreshold Leakage = 10.7416 W + Gate Leakage = 0.144361 W + Runtime Dynamic = 4.09781 W + +***************************************************************************************** +BUSES + Area = 5.56828 mm^2 + Peak Dynamic = 12.9339 W + Subthreshold Leakage = 0.0684953 W + Gate Leakage = 0.0128043 W + Runtime Dynamic = 12.9339 W + + Bus: + Area = 5.56828 mm^2 + Peak Dynamic = 12.9339 W + Subthreshold Leakage = 0.0684953 W + Gate Leakage = 0.0128043 W + Runtime Dynamic = 12.9339 W + +***************************************************************************************** diff --git a/ext/mcpat/sharedcache.cc b/ext/mcpat/sharedcache.cc new file mode 100644 index 000000000..3a61e1b6d --- /dev/null +++ b/ext/mcpat/sharedcache.cc @@ -0,0 +1,1162 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstring> +#include <iostream> + +#include "XML_Parse.h" +#include "arbiter.h" +#include "array.h" +#include "basic_circuit.h" +#include "const.h" +#include "io.h" +#include "logic.h" +#include "parameter.h" +#include "sharedcache.h" + +SharedCache::SharedCache(ParseXML* XML_interface, int ithCache_, 
InputParameter* interface_ip_, enum cache_level cacheL_) +:XML(XML_interface), + ithCache(ithCache_), + interface_ip(*interface_ip_), + cacheL(cacheL_), + dir_overhead(0) +{ + int idx; + int tag, data; + bool is_default, debug; + enum Device_ty device_t; + enum Core_type core_t; + double size, line, assoc, banks; + if (cacheL==L2 && XML->sys.Private_L2) + { + device_t=Core_device; + core_t = (enum Core_type)XML->sys.core[ithCache].machine_type; + } + else + { + device_t=LLC_device; + core_t = Inorder; + } + + debug = false; + is_default=true;//indication for default setup + if (XML->sys.Embedded) + { + interface_ip.wt =Global_30; + interface_ip.wire_is_mat_type = 0; + interface_ip.wire_os_mat_type = 1; + } + else + { + interface_ip.wt =Global; + interface_ip.wire_is_mat_type = 2; + interface_ip.wire_os_mat_type = 2; + } + set_cache_param(); + + //All lower level cache are physically indexed and tagged. + size = cachep.capacity; + line = cachep.blockW; + assoc = cachep.assoc; + banks = cachep.nbanks; + if ((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory)) + { + assoc = 0; + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + interface_ip.num_search_ports = 1; + } + else + { + idx = debug?9:int(ceil(log2(size/line/assoc))); + tag = debug?51:XML->sys.physical_address_width-idx-int(ceil(log2(line))) + EXTRA_TAG_BITS; + interface_ip.num_search_ports = 0; + if (cachep.dir_ty==SBT) + { + dir_overhead = ceil(XML->sys.number_of_cores/8.0)*8/(cachep.blockW*8); + line = cachep.blockW*(1+ dir_overhead) ; + size = cachep.capacity*(1+ dir_overhead); + + } + } +// if (XML->sys.first_level_dir==2) +// tag += int(XML->sys.domain_size + 5); + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.cache_sz = (int)size; + interface_ip.line_sz = (int)line; + interface_ip.assoc = (int)assoc; + interface_ip.nbanks = (int)banks; + interface_ip.out_w = interface_ip.line_sz*8/2; + interface_ip.access_mode = 1; + 
interface_ip.throughput = cachep.throughput; + interface_ip.latency = cachep.latency; + interface_ip.is_cache = true; + interface_ip.pure_ram = false; + interface_ip.pure_cam = false; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//lower level cache usually has one port. + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; +// interface_ip.force_cache_config =true; +// interface_ip.ndwl = 4; +// interface_ip.ndbl = 8; +// interface_ip.nspd = 1; +// interface_ip.ndcm =1 ; +// interface_ip.ndsam1 =1; +// interface_ip.ndsam2 =1; + unicache.caches = new ArrayST(&interface_ip, cachep.name + "cache", device_t, true, core_t); + unicache.area.set_area(unicache.area.get_area()+ unicache.caches->local_result.area); + area.set_area(area.get_area()+ unicache.caches->local_result.area); + interface_ip.force_cache_config =false; + + if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))) + { + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + unicache.caches->l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0)); + interface_ip.cache_sz = cachep.missb_size*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.is_cache = true; + interface_ip.pure_ram = false; + interface_ip.pure_cam = false; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8/2; + interface_ip.access_mode = 0; + interface_ip.throughput = cachep.throughput;//means cycle time + interface_ip.latency = cachep.latency;//means access time + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + 
interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + interface_ip.num_search_ports = 1; + unicache.missb = new ArrayST(&interface_ip, cachep.name + "MissB", device_t, true, core_t); + unicache.area.set_area(unicache.area.get_area()+ unicache.missb->local_result.area); + area.set_area(area.get_area()+ unicache.missb->local_result.area); + //fill buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = unicache.caches->l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = data*cachep.fu_size ; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8/2; + interface_ip.access_mode = 0; + interface_ip.throughput = cachep.throughput; + interface_ip.latency = cachep.latency; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + unicache.ifb = new ArrayST(&interface_ip, cachep.name + "FillB", device_t, true, core_t); + unicache.area.set_area(unicache.area.get_area()+ unicache.ifb->local_result.area); + area.set_area(area.get_area()+ unicache.ifb->local_result.area); + //prefetch buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge. + data = unicache.caches->l_ip.line_sz;//separate queue to prevent from cache polution. 
+ interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = cachep.prefetchb_size*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8/2; + interface_ip.access_mode = 0; + interface_ip.throughput = cachep.throughput; + interface_ip.latency = cachep.latency; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + unicache.prefetchb = new ArrayST(&interface_ip, cachep.name + "PrefetchB", device_t, true, core_t); + unicache.area.set_area(unicache.area.get_area()+ unicache.prefetchb->local_result.area); + area.set_area(area.get_area()+ unicache.prefetchb->local_result.area); + //WBB + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = unicache.caches->l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data; + interface_ip.cache_sz = cachep.wbb_size*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8/2; + interface_ip.access_mode = 0; + interface_ip.throughput = cachep.throughput; + interface_ip.latency = cachep.latency; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + unicache.wbb = new ArrayST(&interface_ip, cachep.name + "WBB", device_t, true, core_t); + unicache.area.set_area(unicache.area.get_area()+ unicache.wbb->local_result.area); + area.set_area(area.get_area()+ unicache.wbb->local_result.area); + } + // 
//pipeline +// interface_ip.pipeline_stages = int(ceil(llCache.caches.local_result.access_time/llCache.caches.local_result.cycle_time)); +// interface_ip.per_stage_vector = llCache.caches.l_ip.out_w + llCache.caches.l_ip.tag_w ; +// pipeLogicCache.init_pipeline(is_default, &interface_ip); +// pipeLogicCache.compute_pipeline(); + + /* + if (!((XML->sys.number_of_dir_levels==1 && XML->sys.first_level_dir ==1) + ||(XML->sys.number_of_dir_levels==1 && XML->sys.first_level_dir ==2)))//not single level IC and DIC + { + //directory Now assuming one directory per bank, TODO:should change it later + size = XML->sys.L2directory.L2Dir_config[0]; + line = XML->sys.L2directory.L2Dir_config[1]; + assoc = XML->sys.L2directory.L2Dir_config[2]; + banks = XML->sys.L2directory.L2Dir_config[3]; + tag = debug?51:XML->sys.physical_address_width + EXTRA_TAG_BITS;//TODO: a little bit over estimate + interface_ip.specific_tag = 0; + interface_ip.tag_w = tag; + interface_ip.cache_sz = XML->sys.L2directory.L2Dir_config[0]; + interface_ip.line_sz = XML->sys.L2directory.L2Dir_config[1]; + interface_ip.assoc = XML->sys.L2directory.L2Dir_config[2]; + interface_ip.nbanks = XML->sys.L2directory.L2Dir_config[3]; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5]; + interface_ip.throughput = XML->sys.L2directory.L2Dir_config[4]/clockRate; + interface_ip.latency = XML->sys.L2directory.L2Dir_config[5]/clockRate; + interface_ip.is_cache = true; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//lower level cache usually has one port. 
+ interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + + strcpy(directory.caches.name,"L2 Directory"); + directory.caches.init_cache(&interface_ip); + directory.caches.optimize_array(); + directory.area += directory.caches.local_result.area; + //output_data_csv(directory.caches.local_result); + ///cout<<"area="<<area<<endl; + + //miss buffer Each MSHR contains enough state to handle one or more accesses of any type to a single memory line. + //Due to the generality of the MSHR mechanism, the amount of state involved is non-trivial, + //including the address, pointers to the cache entry and destination register, written data, and various other pieces of state. + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + directory.caches.l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0)); + interface_ip.cache_sz = XML->sys.L2[ithCache].buffer_sizes[0]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate;//means cycle time + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate;//means access time + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory.missb.name,"directoryMissB"); + directory.missb.init_cache(&interface_ip); + directory.missb.optimize_array(); + directory.area += directory.missb.local_result.area; + //output_data_csv(directory.missb.local_result); + ///cout<<"area="<<area<<endl; + + //fill 
buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = directory.caches.l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = data*XML->sys.L2[ithCache].buffer_sizes[1]; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory.ifb.name,"directoryFillB"); + directory.ifb.init_cache(&interface_ip); + directory.ifb.optimize_array(); + directory.area += directory.ifb.local_result.area; + //output_data_csv(directory.ifb.local_result); + ///cout<<"area="<<area<<endl; + + //prefetch buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge. + data = directory.caches.l_ip.line_sz;//separate queue to prevent from cache polution. 
+ interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = XML->sys.L2[ithCache].buffer_sizes[2]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory.prefetchb.name,"directoryPrefetchB"); + directory.prefetchb.init_cache(&interface_ip); + directory.prefetchb.optimize_array(); + directory.area += directory.prefetchb.local_result.area; + //output_data_csv(directory.prefetchb.local_result); + ///cout<<"area="<<area<<endl; + + //WBB + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = directory.caches.l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data; + interface_ip.cache_sz = XML->sys.L2[ithCache].buffer_sizes[3]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory.wbb.name,"directoryWBB"); + 
directory.wbb.init_cache(&interface_ip); + directory.wbb.optimize_array(); + directory.area += directory.wbb.local_result.area; + } + + if (XML->sys.number_of_dir_levels ==2 && XML->sys.first_level_dir==0) + { + //first level directory + size = XML->sys.L2directory.L2Dir_config[0]*XML->sys.domain_size/128; + line = int(ceil(XML->sys.domain_size/8.0)); + assoc = XML->sys.L2directory.L2Dir_config[2]; + banks = XML->sys.L2directory.L2Dir_config[3]; + tag = debug?51:XML->sys.physical_address_width + EXTRA_TAG_BITS;//TODO: a little bit over estimate + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.cache_sz = XML->sys.L2directory.L2Dir_config[0]; + interface_ip.line_sz = XML->sys.L2directory.L2Dir_config[1]; + interface_ip.assoc = XML->sys.L2directory.L2Dir_config[2]; + interface_ip.nbanks = XML->sys.L2directory.L2Dir_config[3]; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5]; + interface_ip.throughput = XML->sys.L2directory.L2Dir_config[4]/clockRate; + interface_ip.latency = XML->sys.L2directory.L2Dir_config[5]/clockRate; + interface_ip.is_cache = true; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1;//lower level cache usually has one port. + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + + strcpy(directory1.caches.name,"first level Directory"); + directory1.caches.init_cache(&interface_ip); + directory1.caches.optimize_array(); + directory1.area += directory1.caches.local_result.area; + //output_data_csv(directory.caches.local_result); + ///cout<<"area="<<area<<endl; + + //miss buffer Each MSHR contains enough state to handle one or more accesses of any type to a single memory line. 
+ //Due to the generality of the MSHR mechanism, the amount of state involved is non-trivial, + //including the address, pointers to the cache entry and destination register, written data, and various other pieces of state. + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = (XML->sys.physical_address_width) + int(ceil(log2(size/line))) + directory1.caches.l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0)); + interface_ip.cache_sz = XML->sys.L2[ithCache].buffer_sizes[0]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate;//means cycle time + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate;//means access time + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory1.missb.name,"directory1MissB"); + directory1.missb.init_cache(&interface_ip); + directory1.missb.optimize_array(); + directory1.area += directory1.missb.local_result.area; + //output_data_csv(directory.missb.local_result); + ///cout<<"area="<<area<<endl; + + //fill buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = directory1.caches.l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = data*XML->sys.L2[ithCache].buffer_sizes[1]; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = 
XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory1.ifb.name,"directory1FillB"); + directory1.ifb.init_cache(&interface_ip); + directory1.ifb.optimize_array(); + directory1.area += directory1.ifb.local_result.area; + //output_data_csv(directory.ifb.local_result); + ///cout<<"area="<<area<<endl; + + //prefetch buffer + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries to decide wthether to merge. + data = directory1.caches.l_ip.line_sz;//separate queue to prevent from cache polution. + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data;//int(pow(2.0,ceil(log2(data)))); + interface_ip.cache_sz = XML->sys.L2[ithCache].buffer_sizes[2]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory1.prefetchb.name,"directory1PrefetchB"); + directory1.prefetchb.init_cache(&interface_ip); + directory1.prefetchb.optimize_array(); + directory1.area += directory1.prefetchb.local_result.area; + //output_data_csv(directory.prefetchb.local_result); + ///cout<<"area="<<area<<endl; + + //WBB + tag = 
XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = directory1.caches.l_ip.line_sz; + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data; + interface_ip.cache_sz = XML->sys.L2[ithCache].buffer_sizes[3]*interface_ip.line_sz; + interface_ip.assoc = 0; + interface_ip.nbanks = 1; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(directory1.wbb.name,"directoryWBB"); + directory1.wbb.init_cache(&interface_ip); + directory1.wbb.optimize_array(); + directory1.area += directory1.wbb.local_result.area; + } + + if (XML->sys.first_level_dir==1)//IC + { + tag = XML->sys.physical_address_width + EXTRA_TAG_BITS; + data = int(ceil(XML->sys.domain_size/8.0)); + interface_ip.specific_tag = 1; + interface_ip.tag_w = tag; + interface_ip.line_sz = data; + interface_ip.cache_sz = XML->sys.domain_size*data*XML->sys.L2[ithCache].L2_config[0]/XML->sys.L2[ithCache].L2_config[1]; + interface_ip.assoc = 0; + interface_ip.nbanks = 1024; + interface_ip.out_w = interface_ip.line_sz*8; + interface_ip.access_mode = 0; + interface_ip.throughput = XML->sys.L2[ithCache].L2_config[4]/clockRate; + interface_ip.latency = XML->sys.L2[ithCache].L2_config[5]/clockRate; + interface_ip.obj_func_dyn_energy = 0; + interface_ip.obj_func_dyn_power = 0; + interface_ip.obj_func_leak_power = 0; + interface_ip.obj_func_cycle_t = 1; + interface_ip.num_rw_ports = 1; + interface_ip.num_rd_ports = 0; + interface_ip.num_wr_ports = 0; + interface_ip.num_se_rd_ports = 0; + strcpy(inv_dir.caches.name,"inv_dir"); + 
inv_dir.caches.init_cache(&interface_ip); + inv_dir.caches.optimize_array(); + inv_dir.area = inv_dir.caches.local_result.area; + + } +*/ +// //pipeline +// interface_ip.pipeline_stages = int(ceil(directory.caches.local_result.access_time/directory.caches.local_result.cycle_time)); +// interface_ip.per_stage_vector = directory.caches.l_ip.out_w + directory.caches.l_ip.tag_w ; +// pipeLogicDirectory.init_pipeline(is_default, &interface_ip); +// pipeLogicDirectory.compute_pipeline(); +// +// //clock power +// clockNetwork.init_wire_external(is_default, &interface_ip); +// clockNetwork.clk_area =area*1.1;//10% of placement overhead. rule of thumb +// clockNetwork.end_wiring_level =5;//toplevel metal +// clockNetwork.start_wiring_level =5;//toplevel metal +// clockNetwork.num_regs = pipeLogicCache.tot_stage_vector + pipeLogicDirectory.tot_stage_vector; +// clockNetwork.optimize_wire(); + +} + + +void SharedCache::computeEnergy(bool is_tdp) +{ + double homenode_data_access = (cachep.dir_ty==SBT)? 
0.9:1.0; + if (is_tdp) + { + if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))) + { + //init stats for Peak + unicache.caches->stats_t.readAc.access = .67*unicache.caches->l_ip.num_rw_ports*cachep.duty_cycle*homenode_data_access; + unicache.caches->stats_t.readAc.miss = 0; + unicache.caches->stats_t.readAc.hit = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss; + unicache.caches->stats_t.writeAc.access = .33*unicache.caches->l_ip.num_rw_ports*cachep.duty_cycle*homenode_data_access; + unicache.caches->stats_t.writeAc.miss = 0; + unicache.caches->stats_t.writeAc.hit = unicache.caches->stats_t.writeAc.access - unicache.caches->stats_t.writeAc.miss; + unicache.caches->tdp_stats = unicache.caches->stats_t; + + if (cachep.dir_ty==SBT) + { + homenode_stats_t.readAc.access = .67*unicache.caches->l_ip.num_rw_ports*cachep.dir_duty_cycle*(1-homenode_data_access); + homenode_stats_t.readAc.miss = 0; + homenode_stats_t.readAc.hit = homenode_stats_t.readAc.access - homenode_stats_t.readAc.miss; + homenode_stats_t.writeAc.access = .67*unicache.caches->l_ip.num_rw_ports*cachep.dir_duty_cycle*(1-homenode_data_access); + homenode_stats_t.writeAc.miss = 0; + homenode_stats_t.writeAc.hit = homenode_stats_t.writeAc.access - homenode_stats_t.writeAc.miss; + homenode_tdp_stats = homenode_stats_t; + } + + unicache.missb->stats_t.readAc.access = unicache.missb->l_ip.num_search_ports; + unicache.missb->stats_t.writeAc.access = unicache.missb->l_ip.num_search_ports; + unicache.missb->tdp_stats = unicache.missb->stats_t; + + unicache.ifb->stats_t.readAc.access = unicache.ifb->l_ip.num_search_ports; + unicache.ifb->stats_t.writeAc.access = unicache.ifb->l_ip.num_search_ports; + unicache.ifb->tdp_stats = unicache.ifb->stats_t; + + unicache.prefetchb->stats_t.readAc.access = unicache.prefetchb->l_ip.num_search_ports; + unicache.prefetchb->stats_t.writeAc.access = unicache.ifb->l_ip.num_search_ports; + 
unicache.prefetchb->tdp_stats = unicache.prefetchb->stats_t; + + unicache.wbb->stats_t.readAc.access = unicache.wbb->l_ip.num_search_ports; + unicache.wbb->stats_t.writeAc.access = unicache.wbb->l_ip.num_search_ports; + unicache.wbb->tdp_stats = unicache.wbb->stats_t; + } + else + { + unicache.caches->stats_t.readAc.access = unicache.caches->l_ip.num_search_ports*cachep.duty_cycle; + unicache.caches->stats_t.readAc.miss = 0; + unicache.caches->stats_t.readAc.hit = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss; + unicache.caches->stats_t.writeAc.access = 0; + unicache.caches->stats_t.writeAc.miss = 0; + unicache.caches->stats_t.writeAc.hit = unicache.caches->stats_t.writeAc.access - unicache.caches->stats_t.writeAc.miss; + unicache.caches->tdp_stats = unicache.caches->stats_t; + + } + + } + else + { + //init stats for runtime power (RTP) + if (cacheL==L2) + { + unicache.caches->stats_t.readAc.access = XML->sys.L2[ithCache].read_accesses; + unicache.caches->stats_t.readAc.miss = XML->sys.L2[ithCache].read_misses; + unicache.caches->stats_t.readAc.hit = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss; + unicache.caches->stats_t.writeAc.access = XML->sys.L2[ithCache].write_accesses; + unicache.caches->stats_t.writeAc.miss = XML->sys.L2[ithCache].write_misses; + unicache.caches->stats_t.writeAc.hit = unicache.caches->stats_t.writeAc.access - unicache.caches->stats_t.writeAc.miss; + unicache.caches->rtp_stats = unicache.caches->stats_t; + + if (cachep.dir_ty==SBT) + { + homenode_rtp_stats.readAc.access = XML->sys.L2[ithCache].homenode_read_accesses; + homenode_rtp_stats.readAc.miss = XML->sys.L2[ithCache].homenode_read_misses; + homenode_rtp_stats.readAc.hit = homenode_rtp_stats.readAc.access - homenode_rtp_stats.readAc.miss; + homenode_rtp_stats.writeAc.access = XML->sys.L2[ithCache].homenode_write_accesses; + homenode_rtp_stats.writeAc.miss = XML->sys.L2[ithCache].homenode_write_misses; + 
homenode_rtp_stats.writeAc.hit = homenode_rtp_stats.writeAc.access - homenode_rtp_stats.writeAc.miss; + } + } + else if (cacheL==L3) + { + unicache.caches->stats_t.readAc.access = XML->sys.L3[ithCache].read_accesses; + unicache.caches->stats_t.readAc.miss = XML->sys.L3[ithCache].read_misses; + unicache.caches->stats_t.readAc.hit = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss; + unicache.caches->stats_t.writeAc.access = XML->sys.L3[ithCache].write_accesses; + unicache.caches->stats_t.writeAc.miss = XML->sys.L3[ithCache].write_misses; + unicache.caches->stats_t.writeAc.hit = unicache.caches->stats_t.writeAc.access - unicache.caches->stats_t.writeAc.miss; + unicache.caches->rtp_stats = unicache.caches->stats_t; + + if (cachep.dir_ty==SBT) + { + homenode_rtp_stats.readAc.access = XML->sys.L3[ithCache].homenode_read_accesses; + homenode_rtp_stats.readAc.miss = XML->sys.L3[ithCache].homenode_read_misses; + homenode_rtp_stats.readAc.hit = homenode_rtp_stats.readAc.access - homenode_rtp_stats.readAc.miss; + homenode_rtp_stats.writeAc.access = XML->sys.L3[ithCache].homenode_write_accesses; + homenode_rtp_stats.writeAc.miss = XML->sys.L3[ithCache].homenode_write_misses; + homenode_rtp_stats.writeAc.hit = homenode_rtp_stats.writeAc.access - homenode_rtp_stats.writeAc.miss; + } + } + else if (cacheL==L1Directory) + { + unicache.caches->stats_t.readAc.access = XML->sys.L1Directory[ithCache].read_accesses; + unicache.caches->stats_t.readAc.miss = XML->sys.L1Directory[ithCache].read_misses; + unicache.caches->stats_t.readAc.hit = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss; + unicache.caches->stats_t.writeAc.access = XML->sys.L1Directory[ithCache].write_accesses; + unicache.caches->stats_t.writeAc.miss = XML->sys.L1Directory[ithCache].write_misses; + unicache.caches->stats_t.writeAc.hit = unicache.caches->stats_t.writeAc.access - unicache.caches->stats_t.writeAc.miss; + unicache.caches->rtp_stats = 
unicache.caches->stats_t; + } + else if (cacheL==L2Directory) + { + unicache.caches->stats_t.readAc.access = XML->sys.L2Directory[ithCache].read_accesses; + unicache.caches->stats_t.readAc.miss = XML->sys.L2Directory[ithCache].read_misses; + unicache.caches->stats_t.readAc.hit = unicache.caches->stats_t.readAc.access - unicache.caches->stats_t.readAc.miss; + unicache.caches->stats_t.writeAc.access = XML->sys.L2Directory[ithCache].write_accesses; + unicache.caches->stats_t.writeAc.miss = XML->sys.L2Directory[ithCache].write_misses; + unicache.caches->stats_t.writeAc.hit = unicache.caches->stats_t.writeAc.access - unicache.caches->stats_t.writeAc.miss; + unicache.caches->rtp_stats = unicache.caches->stats_t; + } + if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))) + { //Assuming write back and write-allocate cache + + unicache.missb->stats_t.readAc.access = unicache.caches->stats_t.writeAc.miss ; + unicache.missb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss; + unicache.missb->rtp_stats = unicache.missb->stats_t; + + unicache.ifb->stats_t.readAc.access = unicache.caches->stats_t.writeAc.miss; + unicache.ifb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss; + unicache.ifb->rtp_stats = unicache.ifb->stats_t; + + unicache.prefetchb->stats_t.readAc.access = unicache.caches->stats_t.writeAc.miss; + unicache.prefetchb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss; + unicache.prefetchb->rtp_stats = unicache.prefetchb->stats_t; + + unicache.wbb->stats_t.readAc.access = unicache.caches->stats_t.writeAc.miss; + unicache.wbb->stats_t.writeAc.access = unicache.caches->stats_t.writeAc.miss; + if (cachep.dir_ty==SBT) + { + unicache.missb->stats_t.readAc.access += homenode_rtp_stats.writeAc.miss; + unicache.missb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss; + unicache.missb->rtp_stats = unicache.missb->stats_t; + + unicache.missb->stats_t.readAc.access += 
homenode_rtp_stats.writeAc.miss; + unicache.missb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss; + unicache.missb->rtp_stats = unicache.missb->stats_t; + + unicache.ifb->stats_t.readAc.access += homenode_rtp_stats.writeAc.miss; + unicache.ifb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss; + unicache.ifb->rtp_stats = unicache.ifb->stats_t; + + unicache.prefetchb->stats_t.readAc.access += homenode_rtp_stats.writeAc.miss; + unicache.prefetchb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss; + unicache.prefetchb->rtp_stats = unicache.prefetchb->stats_t; + + unicache.wbb->stats_t.readAc.access += homenode_rtp_stats.writeAc.miss; + unicache.wbb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss; + } + unicache.wbb->rtp_stats = unicache.wbb->stats_t; + + } + + } + + unicache.power_t.reset(); + if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))) + { + unicache.power_t.readOp.dynamic += (unicache.caches->stats_t.readAc.hit*unicache.caches->local_result.power.readOp.dynamic+ + unicache.caches->stats_t.readAc.miss*unicache.caches->local_result.tag_array2->power.readOp.dynamic+ + unicache.caches->stats_t.writeAc.miss*unicache.caches->local_result.tag_array2->power.writeOp.dynamic+ + unicache.caches->stats_t.writeAc.access*unicache.caches->local_result.power.writeOp.dynamic);//write miss will also generate a write later + + if (cachep.dir_ty==SBT) + { + unicache.power_t.readOp.dynamic += homenode_stats_t.readAc.hit * (unicache.caches->local_result.data_array2->power.readOp.dynamic*dir_overhead + + unicache.caches->local_result.tag_array2->power.readOp.dynamic) + + homenode_stats_t.readAc.miss*unicache.caches->local_result.tag_array2->power.readOp.dynamic + + homenode_stats_t.writeAc.miss*unicache.caches->local_result.tag_array2->power.readOp.dynamic + + homenode_stats_t.writeAc.hit*(unicache.caches->local_result.data_array2->power.writeOp.dynamic*dir_overhead + + 
unicache.caches->local_result.tag_array2->power.readOp.dynamic+ + homenode_stats_t.writeAc.miss*unicache.caches->local_result.power.writeOp.dynamic);//write miss on dynamic home node will generate a replacement write on whole cache block + + + } + + unicache.power_t.readOp.dynamic += unicache.missb->stats_t.readAc.access*unicache.missb->local_result.power.searchOp.dynamic + + unicache.missb->stats_t.writeAc.access*unicache.missb->local_result.power.writeOp.dynamic;//each access to missb involves a CAM and a write + unicache.power_t.readOp.dynamic += unicache.ifb->stats_t.readAc.access*unicache.ifb->local_result.power.searchOp.dynamic + + unicache.ifb->stats_t.writeAc.access*unicache.ifb->local_result.power.writeOp.dynamic; + unicache.power_t.readOp.dynamic += unicache.prefetchb->stats_t.readAc.access*unicache.prefetchb->local_result.power.searchOp.dynamic + + unicache.prefetchb->stats_t.writeAc.access*unicache.prefetchb->local_result.power.writeOp.dynamic; + unicache.power_t.readOp.dynamic += unicache.wbb->stats_t.readAc.access*unicache.wbb->local_result.power.searchOp.dynamic + + unicache.wbb->stats_t.writeAc.access*unicache.wbb->local_result.power.writeOp.dynamic; + } + else + { + unicache.power_t.readOp.dynamic += (unicache.caches->stats_t.readAc.access*unicache.caches->local_result.power.searchOp.dynamic+ + unicache.caches->stats_t.writeAc.access*unicache.caches->local_result.power.writeOp.dynamic); + } + + if (is_tdp) + { + unicache.power = unicache.power_t + (unicache.caches->local_result.power)*pppm_lkg; + if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))) + { + unicache.power = unicache.power+ + (unicache.missb->local_result.power + + unicache.ifb->local_result.power + + unicache.prefetchb->local_result.power + + unicache.wbb->local_result.power)*pppm_lkg; + } + power = power + unicache.power; +// cout<<"unicache.caches->local_result.power.readOp.dynamic"<<unicache.caches->local_result.power.readOp.dynamic<<endl; 
+// cout<<"unicache.caches->local_result.power.writeOp.dynamic"<<unicache.caches->local_result.power.writeOp.dynamic<<endl; + } + else + { + unicache.rt_power = unicache.power_t + (unicache.caches->local_result.power)*pppm_lkg; + if (!((cachep.dir_ty==ST&& cacheL==L1Directory)||(cachep.dir_ty==ST&& cacheL==L2Directory))) + { + (unicache.rt_power = unicache.rt_power + + unicache.missb->local_result.power + + unicache.ifb->local_result.power + + unicache.prefetchb->local_result.power + + unicache.wbb->local_result.power)*pppm_lkg; + } + rt_power = rt_power + unicache.rt_power; + } +} + +void SharedCache::displayEnergy(uint32_t indent,bool is_tdp) +{ + string indent_str(indent, ' '); + string indent_str_next(indent+2, ' '); + bool long_channel = XML->sys.longer_channel_device; + + if (is_tdp) + { + cout << (XML->sys.Private_L2? indent_str:"")<< cachep.name << endl; + cout << indent_str << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; + cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic*cachep.clockRate << " W" << endl; + cout << indent_str << "Subthreshold Leakage = " + << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; + //cout << indent_str << "Subthreshold Leakage = " << power.readOp.longer_channel_leakage <<" W" << endl; + cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; + cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic/cachep.executionTime << " W" << endl; + cout <<endl; + } + else + { + } +} + +//void SharedCache::computeMaxPower() +//{ +// //Compute maximum power and runtime power. +// //When computing runtime power, McPAT gets or reasons out the statistics based on XML input. 
+// maxPower = 0.0; +// //llCache,itlb +// llCache.maxPower = 0.0; +// llCache.maxPower += (llCache.caches.l_ip.num_rw_ports*(0.67*llCache.caches.local_result.power.readOp.dynamic+0.33*llCache.caches.local_result.power.writeOp.dynamic) +// +llCache.caches.l_ip.num_rd_ports*llCache.caches.local_result.power.readOp.dynamic+llCache.caches.l_ip.num_wr_ports*llCache.caches.local_result.power.writeOp.dynamic +// +llCache.caches.l_ip.num_se_rd_ports*llCache.caches.local_result.power.readOp.dynamic)*clockRate; +// ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl; +// +// llCache.maxPower += llCache.missb.l_ip.num_search_ports*llCache.missb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl; +// +// llCache.maxPower += llCache.ifb.l_ip.num_search_ports*llCache.ifb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl; +// +// llCache.maxPower += llCache.prefetchb.l_ip.num_search_ports*llCache.prefetchb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl; +// +// llCache.maxPower += llCache.wbb.l_ip.num_search_ports*llCache.wbb.local_result.power.searchOp.dynamic*clockRate; +// //llCache.maxPower *= scktRatio; //TODO: this calculation should be self-contained +// ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl; +// +//// directory_power = (directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic) +//// +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic +//// +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate; +// +// L2Tot.power.readOp.dynamic = llCache.maxPower; +// L2Tot.power.readOp.leakage = llCache.caches.local_result.power.readOp.leakage + +// 
llCache.missb.local_result.power.readOp.leakage + +// llCache.ifb.local_result.power.readOp.leakage + +// llCache.prefetchb.local_result.power.readOp.leakage + +// llCache.wbb.local_result.power.readOp.leakage; +// +// L2Tot.area.set_area(llCache.area*1.1*1e-6);//placement and routing overhead +// +// if (XML->sys.number_of_dir_levels==1) +// { +// if (XML->sys.first_level_dir==0) +// { +// directory.maxPower = 0.0; +// directory.maxPower += (directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic) +// +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic +// +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.missb.l_ip.num_search_ports*directory.missb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.ifb.l_ip.num_search_ports*directory.ifb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.prefetchb.l_ip.num_search_ports*directory.prefetchb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.wbb.l_ip.num_search_ports*directory.wbb.local_result.power.searchOp.dynamic*clockRate; +// +// cc.power.readOp.dynamic = directory.maxPower*scktRatio*8;//8 is the memory controller counts +// cc.power.readOp.leakage = directory.caches.local_result.power.readOp.leakage + +// directory.missb.local_result.power.readOp.leakage + +// directory.ifb.local_result.power.readOp.leakage + +// 
directory.prefetchb.local_result.power.readOp.leakage + +// directory.wbb.local_result.power.readOp.leakage; +// +// cc.power.readOp.leakage *=8; +// +// cc.area.set_area(directory.area*8); +// cout<<"CC area="<<cc.area.get_area()*1e-6<<endl; +// cout<<"CC Power="<<cc.power.readOp.dynamic<<endl; +// ccTot.area.set_area(cc.area.get_area()*1e-6); +// ccTot.power = cc.power; +// cout<<"DC energy per access" << cc.power.readOp.dynamic/clockRate/8; +// } +// else if (XML->sys.first_level_dir==1) +// { +// inv_dir.maxPower = inv_dir.caches.local_result.power.searchOp.dynamic*clockRate*XML->sys.domain_size; +// cc.power.readOp.dynamic = inv_dir.maxPower*scktRatio*64/XML->sys.domain_size; +// cc.power.readOp.leakage = inv_dir.caches.local_result.power.readOp.leakage*inv_dir.caches.l_ip.nbanks*64/XML->sys.domain_size; +// +// cc.area.set_area(inv_dir.area*64/XML->sys.domain_size); +// cout<<"CC area="<<cc.area.get_area()*1e-6<<endl; +// cout<<"CC Power="<<cc.power.readOp.dynamic<<endl; +// ccTot.area.set_area(cc.area.get_area()*1e-6); +// cout<<"DC energy per access" << cc.power.readOp.dynamic/clockRate/8; +// ccTot.power = cc.power; +// } +// } +// +// else if (XML->sys.number_of_dir_levels==2) +// { +// +// directory.maxPower = 0.0; +// directory.maxPower += (directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic) +// +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic +// +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.missb.l_ip.num_search_ports*directory.missb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += 
directory.ifb.l_ip.num_search_ports*directory.ifb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.prefetchb.l_ip.num_search_ports*directory.prefetchb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory.maxPower=" <<directory.maxPower<<endl; +// +// directory.maxPower += directory.wbb.l_ip.num_search_ports*directory.wbb.local_result.power.searchOp.dynamic*clockRate; +// +// cc.power.readOp.dynamic = directory.maxPower*scktRatio*8;//8 is the memory controller counts +// cc.power.readOp.leakage = directory.caches.local_result.power.readOp.leakage + +// directory.missb.local_result.power.readOp.leakage + +// directory.ifb.local_result.power.readOp.leakage + +// directory.prefetchb.local_result.power.readOp.leakage + +// directory.wbb.local_result.power.readOp.leakage; +// cc.power.readOp.leakage *=8; +// cc.area.set_area(directory.area*8); +// +// if (XML->sys.first_level_dir==0) +// { +// directory1.maxPower = 0.0; +// directory1.maxPower += (directory1.caches.l_ip.num_rw_ports*(0.67*directory1.caches.local_result.power.readOp.dynamic+0.33*directory1.caches.local_result.power.writeOp.dynamic) +// +directory1.caches.l_ip.num_rd_ports*directory1.caches.local_result.power.readOp.dynamic+directory1.caches.l_ip.num_wr_ports*directory1.caches.local_result.power.writeOp.dynamic +// +directory1.caches.l_ip.num_se_rd_ports*directory1.caches.local_result.power.readOp.dynamic)*clockRate; +// ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl; +// +// directory1.maxPower += directory1.missb.l_ip.num_search_ports*directory1.missb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl; +// +// directory1.maxPower += directory1.ifb.l_ip.num_search_ports*directory1.ifb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl; +// +// 
directory1.maxPower += directory1.prefetchb.l_ip.num_search_ports*directory1.prefetchb.local_result.power.searchOp.dynamic*clockRate; +// ///cout<<"directory1.maxPower=" <<directory1.maxPower<<endl; +// +// directory1.maxPower += directory1.wbb.l_ip.num_search_ports*directory1.wbb.local_result.power.searchOp.dynamic*clockRate; +// +// cc1.power.readOp.dynamic = directory1.maxPower*scktRatio*64/XML->sys.domain_size; +// cc1.power.readOp.leakage = directory1.caches.local_result.power.readOp.leakage + +// directory1.missb.local_result.power.readOp.leakage + +// directory1.ifb.local_result.power.readOp.leakage + +// directory1.prefetchb.local_result.power.readOp.leakage + +// directory1.wbb.local_result.power.readOp.leakage; +// cc1.power.readOp.leakage *= 64/XML->sys.domain_size; +// cc1.area.set_area(directory1.area*64/XML->sys.domain_size); +// +// cout<<"CC area="<<(cc.area.get_area()+cc1.area.get_area())*1e-6<<endl; +// cout<<"CC Power="<<cc.power.readOp.dynamic + cc1.power.readOp.dynamic <<endl; +// ccTot.area.set_area((cc.area.get_area()+cc1.area.get_area())*1e-6); +// ccTot.power = cc.power + cc1.power; +// } +// else if (XML->sys.first_level_dir==1) +// { +// inv_dir.maxPower = inv_dir.caches.local_result.power.searchOp.dynamic*clockRate*XML->sys.domain_size; +// cc1.power.readOp.dynamic = inv_dir.maxPower*scktRatio*(64/XML->sys.domain_size); +// cc1.power.readOp.leakage = inv_dir.caches.local_result.power.readOp.leakage*inv_dir.caches.l_ip.nbanks*XML->sys.domain_size; +// +// cc1.area.set_area(inv_dir.area*64/XML->sys.domain_size); +// cout<<"CC area="<<(cc.area.get_area()+cc1.area.get_area())*1e-6<<endl; +// cout<<"CC Power="<<cc.power.readOp.dynamic + cc1.power.readOp.dynamic <<endl; +// ccTot.area.set_area((cc.area.get_area()+cc1.area.get_area())*1e-6); +// ccTot.power = cc.power + cc1.power; +// +// } +// else if (XML->sys.first_level_dir==2) +// { +// cout<<"CC area="<<cc.area.get_area()*1e-6<<endl; +// cout<<"CC Power="<<cc.power.readOp.dynamic<<endl; 
+// ccTot.area.set_area(cc.area.get_area()*1e-6); +// ccTot.power = cc.power; +// } +// } +// +//cout<<"L2cache size="<<L2Tot.area.get_area()*1e-6<<endl; +//cout<<"L2cache dynamic power="<<L2Tot.power.readOp.dynamic<<endl; +//cout<<"L2cache laeakge power="<<L2Tot.power.readOp.leakage<<endl; +// +// ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl; +// +// +// maxPower += llCache.maxPower; +// ///cout<<"maxpower=" <<maxPower<<endl; +// +//// maxPower += pipeLogicCache.power.readOp.dynamic*clockRate; +//// ///cout<<"pipeLogic.power="<<pipeLogicCache.power.readOp.dynamic*clockRate<<endl; +//// ///cout<<"maxpower=" <<maxPower<<endl; +//// +//// maxPower += pipeLogicDirectory.power.readOp.dynamic*clockRate; +//// ///cout<<"pipeLogic.power="<<pipeLogicDirectory.power.readOp.dynamic*clockRate<<endl; +//// ///cout<<"maxpower=" <<maxPower<<endl; +//// +//// //clock power +//// maxPower += clockNetwork.total_power.readOp.dynamic*clockRate; +//// ///cout<<"clockNetwork.total_power="<<clockNetwork.total_power.readOp.dynamic*clockRate<<endl; +//// ///cout<<"maxpower=" <<maxPower<<endl; +// +//} + +void SharedCache::set_cache_param() +{ + if (cacheL==L2) + { + cachep.name = "L2"; + cachep.clockRate = XML->sys.L2[ithCache].clockrate; + cachep.clockRate *= 1e6; + cachep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + interface_ip.data_arr_ram_cell_tech_type = XML->sys.L2[ithCache].device_type;//long channel device LSTP + interface_ip.data_arr_peri_global_tech_type = XML->sys.L2[ithCache].device_type; + interface_ip.tag_arr_ram_cell_tech_type = XML->sys.L2[ithCache].device_type; + interface_ip.tag_arr_peri_global_tech_type = XML->sys.L2[ithCache].device_type; + cachep.capacity = XML->sys.L2[ithCache].L2_config[0]; + cachep.blockW = XML->sys.L2[ithCache].L2_config[1]; + cachep.assoc = XML->sys.L2[ithCache].L2_config[2]; + cachep.nbanks = XML->sys.L2[ithCache].L2_config[3]; + cachep.throughput = XML->sys.L2[ithCache].L2_config[4]/cachep.clockRate; + 
cachep.latency = XML->sys.L2[ithCache].L2_config[5]/cachep.clockRate; + cachep.missb_size = XML->sys.L2[ithCache].buffer_sizes[0]; + cachep.fu_size = XML->sys.L2[ithCache].buffer_sizes[1]; + cachep.prefetchb_size= XML->sys.L2[ithCache].buffer_sizes[2]; + cachep.wbb_size = XML->sys.L2[ithCache].buffer_sizes[3]; + cachep.duty_cycle = XML->sys.L2[ithCache].duty_cycle; + if (!XML->sys.L2[ithCache].merged_dir) + { + cachep.dir_ty = NonDir; + } + else + { + cachep.dir_ty = SBT; + cachep.dir_duty_cycle = XML->sys.L2[ithCache].dir_duty_cycle; + } + } + else if (cacheL==L3) + { + cachep.name = "L3"; + cachep.clockRate = XML->sys.L3[ithCache].clockrate; + cachep.clockRate *= 1e6; + cachep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + interface_ip.data_arr_ram_cell_tech_type = XML->sys.L3[ithCache].device_type;//long channel device LSTP + interface_ip.data_arr_peri_global_tech_type = XML->sys.L3[ithCache].device_type; + interface_ip.tag_arr_ram_cell_tech_type = XML->sys.L3[ithCache].device_type; + interface_ip.tag_arr_peri_global_tech_type = XML->sys.L3[ithCache].device_type; + cachep.capacity = XML->sys.L3[ithCache].L3_config[0]; + cachep.blockW = XML->sys.L3[ithCache].L3_config[1]; + cachep.assoc = XML->sys.L3[ithCache].L3_config[2]; + cachep.nbanks = XML->sys.L3[ithCache].L3_config[3]; + cachep.throughput = XML->sys.L3[ithCache].L3_config[4]/cachep.clockRate; + cachep.latency = XML->sys.L3[ithCache].L3_config[5]/cachep.clockRate; + cachep.missb_size = XML->sys.L3[ithCache].buffer_sizes[0]; + cachep.fu_size = XML->sys.L3[ithCache].buffer_sizes[1]; + cachep.prefetchb_size= XML->sys.L3[ithCache].buffer_sizes[2]; + cachep.wbb_size = XML->sys.L3[ithCache].buffer_sizes[3]; + cachep.duty_cycle = XML->sys.L3[ithCache].duty_cycle; + if (!XML->sys.L2[ithCache].merged_dir) + { + cachep.dir_ty = NonDir; + } + else + { + cachep.dir_ty = SBT; + cachep.dir_duty_cycle = XML->sys.L2[ithCache].dir_duty_cycle; + } + } + else if (cacheL==L1Directory) + { + 
cachep.name = "First Level Directory"; + cachep.dir_ty = (enum Dir_type) XML->sys.L1Directory[ithCache].Directory_type; + cachep.clockRate = XML->sys.L1Directory[ithCache].clockrate; + cachep.clockRate *= 1e6; + cachep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + interface_ip.data_arr_ram_cell_tech_type = XML->sys.L1Directory[ithCache].device_type;//long channel device LSTP + interface_ip.data_arr_peri_global_tech_type = XML->sys.L1Directory[ithCache].device_type; + interface_ip.tag_arr_ram_cell_tech_type = XML->sys.L1Directory[ithCache].device_type; + interface_ip.tag_arr_peri_global_tech_type = XML->sys.L1Directory[ithCache].device_type; + cachep.capacity = XML->sys.L1Directory[ithCache].Dir_config[0]; + cachep.blockW = XML->sys.L1Directory[ithCache].Dir_config[1]; + cachep.assoc = XML->sys.L1Directory[ithCache].Dir_config[2]; + cachep.nbanks = XML->sys.L1Directory[ithCache].Dir_config[3]; + cachep.throughput = XML->sys.L1Directory[ithCache].Dir_config[4]/cachep.clockRate; + cachep.latency = XML->sys.L1Directory[ithCache].Dir_config[5]/cachep.clockRate; + cachep.missb_size = XML->sys.L1Directory[ithCache].buffer_sizes[0]; + cachep.fu_size = XML->sys.L1Directory[ithCache].buffer_sizes[1]; + cachep.prefetchb_size= XML->sys.L1Directory[ithCache].buffer_sizes[2]; + cachep.wbb_size = XML->sys.L1Directory[ithCache].buffer_sizes[3]; + cachep.duty_cycle = XML->sys.L1Directory[ithCache].duty_cycle; + } + else if (cacheL==L2Directory) + { + cachep.name = "Second Level Directory"; + cachep.dir_ty = (enum Dir_type) XML->sys.L2Directory[ithCache].Directory_type; + cachep.clockRate = XML->sys.L2Directory[ithCache].clockrate; + cachep.clockRate *= 1e6; + cachep.executionTime = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6); + interface_ip.data_arr_ram_cell_tech_type = XML->sys.L2Directory[ithCache].device_type;//long channel device LSTP + interface_ip.data_arr_peri_global_tech_type = XML->sys.L2Directory[ithCache].device_type; + 
interface_ip.tag_arr_ram_cell_tech_type = XML->sys.L2Directory[ithCache].device_type; + interface_ip.tag_arr_peri_global_tech_type = XML->sys.L2Directory[ithCache].device_type; + cachep.capacity = XML->sys.L2Directory[ithCache].Dir_config[0]; + cachep.blockW = XML->sys.L2Directory[ithCache].Dir_config[1]; + cachep.assoc = XML->sys.L2Directory[ithCache].Dir_config[2]; + cachep.nbanks = XML->sys.L2Directory[ithCache].Dir_config[3]; + cachep.throughput = XML->sys.L2Directory[ithCache].Dir_config[4]/cachep.clockRate; + cachep.latency = XML->sys.L2Directory[ithCache].Dir_config[5]/cachep.clockRate; + cachep.missb_size = XML->sys.L2Directory[ithCache].buffer_sizes[0]; + cachep.fu_size = XML->sys.L2Directory[ithCache].buffer_sizes[1]; + cachep.prefetchb_size= XML->sys.L2Directory[ithCache].buffer_sizes[2]; + cachep.wbb_size = XML->sys.L2Directory[ithCache].buffer_sizes[3]; + cachep.duty_cycle = XML->sys.L2Directory[ithCache].duty_cycle; + } + //cachep.cache_duty_cycle=cachep.dir_duty_cycle = 0.35; +} + diff --git a/ext/mcpat/sharedcache.h b/ext/mcpat/sharedcache.h new file mode 100644 index 000000000..923408482 --- /dev/null +++ b/ext/mcpat/sharedcache.h @@ -0,0 +1,89 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. 
 * All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
 *
 ***************************************************************************/

#ifndef SHAREDCACHE_H_
#define SHAREDCACHE_H_
#include <vector>

#include "XML_Parse.h"
#include "area.h"
#include "array.h"
#include "basic_components.h"
#include "logic.h"
#include "parameter.h"

// One shared cache or coherence-directory instance. Which kind it is
// (L2, L3, first-level directory, second-level directory) is selected by
// cacheL; set_cache_param() loads the matching XML entry.
class SharedCache :public Component{
  public:
    // Parsed XML input; set_cache_param() reads its sys member.
    ParseXML * XML;
    // Index of this instance inside the per-level XML arrays
    // (XML->sys.L2[], L3[], L1Directory[], L2Directory[]).
    int ithCache;
    // Array-model input; set_cache_param() writes its data/tag
    // technology-type fields from the XML device_type.
    InputParameter interface_ip;
    // Level modelled by this object; the constructor defaults it to L2.
    enum cache_level cacheL;
    DataCache unicache;//Shared cache
    // Static parameters (geometry, timing, buffer sizes, duty cycles,
    // directory type) filled in by set_cache_param().
    CacheDynParam cachep;
    // NOTE(review): presumably home-node (merged directory) access
    // statistics for the TDP case, the runtime case and a working copy -
    // confirm against computeEnergy() in sharedcache.cc.
    statsDef homenode_tdp_stats;
    statsDef homenode_rtp_stats;
    statsDef homenode_stats_t;
    // NOTE(review): meaning not visible in this header - confirm in
    // sharedcache.cc before relying on it.
    double dir_overhead;
    // cache_processor llCache,directory, directory1, inv_dir;

    //pipeline pipeLogicCache, pipeLogicDirectory;
    //clock_network clockNetwork;
    double scktRatio, executionTime;
    // Component L2Tot, cc, cc1, ccTot;

    // cacheL_ may be omitted, in which case the object models an L2.
    SharedCache(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_,enum cache_level cacheL_ =L2);
    // Fills cachep and the tech-type fields of interface_ip from XML.
    void set_cache_param();
    // is_tdp defaults to true.
    // NOTE(review): presumably selects peak (TDP) versus runtime
    // statistics - confirm in sharedcache.cc.
    void computeEnergy(bool is_tdp=true);
    // indent defaults to 0 and is_tdp to true.
    void displayEnergy(uint32_t indent = 0,bool is_tdp=true);
    // Empty inline destructor.
    ~SharedCache(){};
};

// NOTE(review): judging by the name this is a cache-coherence directory
// component; its implementation is not visible from this header.
class CCdir :public Component{
  public:
    ParseXML * XML;
    int ithCache;
    InputParameter interface_ip;
    DataCache dc;//Shared cache
    // Raw pointer member.
    // NOTE(review): ownership is not visible here; presumably released by
    // the out-of-line ~CCdir() - confirm.
    ArrayST * shadow_dir;
// cache_processor llCache,directory, directory1, inv_dir;

    //pipeline pipeLogicCache, pipeLogicDirectory;
    //clock_network clockNetwork;
    double scktRatio, clockRate, executionTime;
    Component L2Tot, cc, cc1, ccTot;

    CCdir(ParseXML *XML_interface, int ithCache_, InputParameter* interface_ip_);
    // is_tdp defaults to true.
    void computeEnergy(bool is_tdp=true);
    // indent defaults to 0 and is_tdp to true.
    void displayEnergy(uint32_t indent = 0,bool is_tdp=true);
    // Declared here, defined out of line.
    ~CCdir();
};

#endif /* SHAREDCACHE_H_ */
diff --git a/ext/mcpat/technology_xeon_core.cc b/ext/mcpat/technology_xeon_core.cc
new file mode 100644
index 000000000..4e60edc1b
--- /dev/null
+++ b/ext/mcpat/technology_xeon_core.cc
@@ -0,0 +1,2772 @@
/*****************************************************************************
 * McPAT
 * SOFTWARE LICENSE AGREEMENT
 * Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + +#include "basic_circuit.h" + +#include "parameter.h" + +double wire_resistance(double resistivity, double wire_width, double wire_thickness, + double barrier_thickness, double dishing_thickness, double alpha_scatter) +{ + double resistance; + resistance = alpha_scatter * resistivity /((wire_thickness - barrier_thickness - dishing_thickness)*(wire_width - 2 * barrier_thickness)); 
+ return(resistance); +} + +double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing, + double ild_thickness, double miller_value, double horiz_dielectric_constant, + double vert_dielectric_constant, double fringe_cap) +{ + double vertical_cap, sidewall_cap, total_cap; + vertical_cap = 2 * PERMITTIVITY_FREE_SPACE * vert_dielectric_constant * wire_width / ild_thickness; + sidewall_cap = 2 * PERMITTIVITY_FREE_SPACE * miller_value * horiz_dielectric_constant * wire_thickness / wire_spacing; + total_cap = vertical_cap + sidewall_cap + fringe_cap; + return(total_cap); +} + + +void init_tech_params(double technology, bool is_tag) +{ + int iter, tech, tech_lo, tech_hi; + double curr_alpha, curr_vpp; + double wire_width, wire_thickness, wire_spacing, + fringe_cap, pmos_to_nmos_sizing_r; +// double aspect_ratio,ild_thickness, miller_value = 1.5, horiz_dielectric_constant, vert_dielectric_constant; + double barrier_thickness, dishing_thickness, alpha_scatter; + double curr_vdd_dram_cell, curr_v_th_dram_access_transistor, curr_I_on_dram_cell, curr_c_dram_cell; + + uint32_t ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type; + uint32_t peri_global_tech_type = (is_tag) ? 
g_ip->tag_arr_peri_global_tech_type : g_ip->data_arr_peri_global_tech_type; + + technology = technology * 1000.0; // in the unit of nm + + // initialize parameters + g_tp.reset(); + double gmp_to_gmn_multiplier_periph_global = 0; + + double curr_Wmemcella_dram, curr_Wmemcellpmos_dram, curr_Wmemcellnmos_dram, + curr_area_cell_dram, curr_asp_ratio_cell_dram, curr_Wmemcella_sram, + curr_Wmemcellpmos_sram, curr_Wmemcellnmos_sram, curr_area_cell_sram, + curr_asp_ratio_cell_sram, curr_I_off_dram_cell_worst_case_length_temp; + double curr_Wmemcella_cam, curr_Wmemcellpmos_cam, curr_Wmemcellnmos_cam, curr_area_cell_cam,//Sheng: CAM data + curr_asp_ratio_cell_cam; + double SENSE_AMP_D, SENSE_AMP_P; // J + double area_cell_dram = 0; + double asp_ratio_cell_dram = 0; + double area_cell_sram = 0; + double asp_ratio_cell_sram = 0; + double area_cell_cam = 0; + double asp_ratio_cell_cam = 0; + double mobility_eff_periph_global = 0; + double Vdsat_periph_global = 0; + double nmos_effective_resistance_multiplier; + double width_dram_access_transistor; + + double curr_logic_scaling_co_eff = 0;//This is based on the reported numbers of Intel Merom 65nm, Penryn45nm and IBM cell 90/65/45 date + double curr_core_tx_density = 0;//this is density per um^2; 90, ...22nm based on Intel Penryn + double curr_chip_layout_overhead = 0; + double curr_macro_layout_overhead = 0; + double curr_sckt_co_eff = 0; + + if (technology < 91 && technology > 89) + { + tech_lo = 90; + tech_hi = 90; + } + else if (technology < 66 && technology > 64) + { + tech_lo = 65; + tech_hi = 65; + } + else if (technology < 46 && technology > 44) + { + tech_lo = 45; + tech_hi = 45; + } + else if (technology < 33 && technology > 31) + { + tech_lo = 32; + tech_hi = 32; + } + else if (technology < 23 && technology > 21) + { + tech_lo = 22; + tech_hi = 22; + if (ram_cell_tech_type == 3) + { + cout<<"current version does not support eDRAM technologies at 22nm"<<endl; + exit(0); + } + } +// else if (technology < 17 && 
technology > 15) +// { +// tech_lo = 16; +// tech_hi = 16; +// } + else if (technology < 90 && technology > 65) + { + tech_lo = 90; + tech_hi = 65; + } + else if (technology < 65 && technology > 45) + { + tech_lo = 65; + tech_hi = 45; + } + else if (technology < 45 && technology > 32) + { + tech_lo = 45; + tech_hi = 32; + } + else if (technology < 32 && technology > 22) + { + tech_lo = 32; + tech_hi = 22; + } +// else if (technology < 22 && technology > 16) +// { +// tech_lo = 22; +// tech_hi = 16; +// } + else + { + cout<<"Invalid technology nodes"<<endl; + exit(0); + } + + double vdd[NUMBER_TECH_FLAVORS]; + double Lphy[NUMBER_TECH_FLAVORS]; + double Lelec[NUMBER_TECH_FLAVORS]; + double t_ox[NUMBER_TECH_FLAVORS]; + double v_th[NUMBER_TECH_FLAVORS]; + double c_ox[NUMBER_TECH_FLAVORS]; + double mobility_eff[NUMBER_TECH_FLAVORS]; + double Vdsat[NUMBER_TECH_FLAVORS]; + double c_g_ideal[NUMBER_TECH_FLAVORS]; + double c_fringe[NUMBER_TECH_FLAVORS]; + double c_junc[NUMBER_TECH_FLAVORS]; + double I_on_n[NUMBER_TECH_FLAVORS]; + double I_on_p[NUMBER_TECH_FLAVORS]; + double Rnchannelon[NUMBER_TECH_FLAVORS]; + double Rpchannelon[NUMBER_TECH_FLAVORS]; + double n_to_p_eff_curr_drv_ratio[NUMBER_TECH_FLAVORS]; + double I_off_n[NUMBER_TECH_FLAVORS][101]; + double I_g_on_n[NUMBER_TECH_FLAVORS][101]; + //double I_off_p[NUMBER_TECH_FLAVORS][101]; + double gmp_to_gmn_multiplier[NUMBER_TECH_FLAVORS]; + //double curr_sckt_co_eff[NUMBER_TECH_FLAVORS]; + double long_channel_leakage_reduction[NUMBER_TECH_FLAVORS]; + + for (iter = 0; iter <= 1; ++iter) + { + // linear interpolation + if (iter == 0) + { + tech = tech_lo; + if (tech_lo == tech_hi) + { + curr_alpha = 1; + } + else + { + curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi); + } + } + else + { + tech = tech_hi; + if (tech_lo == tech_hi) + { + break; + } + else + { + curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi); + } + } + + if (tech == 90) + { + SENSE_AMP_D = .28e-9; // s + SENSE_AMP_P = 14.7e-15; // J + //90nm 
technology-node. Corresponds to year 2004 in ITRS + //ITRS HP device type + vdd[0] = 1.2; + Lphy[0] = 0.037;//Lphy is the physical gate-length. micron + Lelec[0] = 0.0266;//Lelec is the electrical gate-length. micron + t_ox[0] = 1.2e-3;//micron + v_th[0] = 0.23707;//V + c_ox[0] = 1.79e-14;//F/micron2 + mobility_eff[0] = 342.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 0.128; //V + c_g_ideal[0] = 6.64e-16;//F/micron + c_fringe[0] = 0.08e-15;//F/micron + c_junc[0] = 1e-15;//F/micron2 + I_on_n[0] = 1076.9e-6;//A/micron + I_on_p[0] = 712.6e-6;//A/micron + //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline + nmos_effective_resistance_multiplier = 1.54; + n_to_p_eff_curr_drv_ratio[0] = 2.45; + gmp_to_gmn_multiplier[0] = 1.22; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1; + I_off_n[0][0] = 3.24e-8;//A/micron + I_off_n[0][10] = 4.01e-8; + I_off_n[0][20] = 4.90e-8; + I_off_n[0][30] = 5.92e-8; + I_off_n[0][40] = 7.08e-8; + I_off_n[0][50] = 8.38e-8; + I_off_n[0][60] = 9.82e-8; + I_off_n[0][70] = 1.14e-7; + I_off_n[0][80] = 1.29e-7; + I_off_n[0][90] = 1.43e-7; + I_off_n[0][100] = 1.54e-7; + + I_g_on_n[0][0] = 1.65e-8;//A/micron + I_g_on_n[0][10] = 1.65e-8; + I_g_on_n[0][20] = 1.65e-8; + I_g_on_n[0][30] = 1.65e-8; + I_g_on_n[0][40] = 1.65e-8; + I_g_on_n[0][50] = 1.65e-8; + I_g_on_n[0][60] = 1.65e-8; + I_g_on_n[0][70] = 1.65e-8; + I_g_on_n[0][80] = 1.65e-8; + I_g_on_n[0][90] = 1.65e-8; + I_g_on_n[0][100] = 1.65e-8; + + //ITRS LSTP device type + vdd[1] = 1.3; + Lphy[1] = 0.075; + Lelec[1] = 0.0486; + t_ox[1] = 2.2e-3; + v_th[1] = 0.48203; + c_ox[1] = 1.22e-14; + mobility_eff[1] = 356.76 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 0.373; + c_g_ideal[1] = 9.15e-16; + c_fringe[1] = 0.08e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 
503.6e-6; + I_on_p[1] = 235.1e-6; + nmos_effective_resistance_multiplier = 1.92; + n_to_p_eff_curr_drv_ratio[1] = 2.44; + gmp_to_gmn_multiplier[1] =0.88; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1; + I_off_n[1][0] = 2.81e-12; + I_off_n[1][10] = 4.76e-12; + I_off_n[1][20] = 7.82e-12; + I_off_n[1][30] = 1.25e-11; + I_off_n[1][40] = 1.94e-11; + I_off_n[1][50] = 2.94e-11; + I_off_n[1][60] = 4.36e-11; + I_off_n[1][70] = 6.32e-11; + I_off_n[1][80] = 8.95e-11; + I_off_n[1][90] = 1.25e-10; + I_off_n[1][100] = 1.7e-10; + + I_g_on_n[1][0] = 3.87e-11;//A/micron + I_g_on_n[1][10] = 3.87e-11; + I_g_on_n[1][20] = 3.87e-11; + I_g_on_n[1][30] = 3.87e-11; + I_g_on_n[1][40] = 3.87e-11; + I_g_on_n[1][50] = 3.87e-11; + I_g_on_n[1][60] = 3.87e-11; + I_g_on_n[1][70] = 3.87e-11; + I_g_on_n[1][80] = 3.87e-11; + I_g_on_n[1][90] = 3.87e-11; + I_g_on_n[1][100] = 3.87e-11; + + //ITRS LOP device type + vdd[2] = 0.9; + Lphy[2] = 0.053; + Lelec[2] = 0.0354; + t_ox[2] = 1.5e-3; + v_th[2] = 0.30764; + c_ox[2] = 1.59e-14; + mobility_eff[2] = 460.39 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 0.113; + c_g_ideal[2] = 8.45e-16; + c_fringe[2] = 0.08e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 386.6e-6; + I_on_p[2] = 209.7e-6; + nmos_effective_resistance_multiplier = 1.77; + n_to_p_eff_curr_drv_ratio[2] = 2.54; + gmp_to_gmn_multiplier[2] = 0.98; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + long_channel_leakage_reduction[2] = 1; + I_off_n[2][0] = 2.14e-9; + I_off_n[2][10] = 2.9e-9; + I_off_n[2][20] = 3.87e-9; + I_off_n[2][30] = 5.07e-9; + I_off_n[2][40] = 6.54e-9; + I_off_n[2][50] = 8.27e-8; + I_off_n[2][60] = 1.02e-7; + I_off_n[2][70] = 1.20e-7; + I_off_n[2][80] = 1.36e-8; + I_off_n[2][90] = 1.52e-8; + I_off_n[2][100] = 1.73e-8; + + I_g_on_n[2][0] = 4.31e-8;//A/micron + 
I_g_on_n[2][10] = 4.31e-8; + I_g_on_n[2][20] = 4.31e-8; + I_g_on_n[2][30] = 4.31e-8; + I_g_on_n[2][40] = 4.31e-8; + I_g_on_n[2][50] = 4.31e-8; + I_g_on_n[2][60] = 4.31e-8; + I_g_on_n[2][70] = 4.31e-8; + I_g_on_n[2][80] = 4.31e-8; + I_g_on_n[2][90] = 4.31e-8; + I_g_on_n[2][100] = 4.31e-8; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.2; + Lphy[3] = 0.12; + Lelec[3] = 0.0756; + curr_v_th_dram_access_transistor = 0.4545; + width_dram_access_transistor = 0.14; + curr_I_on_dram_cell = 45e-6; + curr_I_off_dram_cell_worst_case_length_temp = 21.1e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 0.168; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.6; + t_ox[3] = 2.2e-3; + v_th[3] = 0.4545; + c_ox[3] = 1.22e-14; + mobility_eff[3] = 323.95 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.3; + c_g_ideal[3] = 1.47e-15; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 321.6e-6; + I_on_p[3] = 203.3e-6; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 1.95; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.42e-11; + I_off_n[3][10] = 2.25e-11; + I_off_n[3][20] = 3.46e-11; + I_off_n[3][30] = 5.18e-11; + I_off_n[3][40] = 7.58e-11; + I_off_n[3][50] = 1.08e-10; + I_off_n[3][60] = 1.51e-10; + I_off_n[3][70] = 2.02e-10; + I_off_n[3][80] = 2.57e-10; + I_off_n[3][90] = 3.14e-10; + I_off_n[3][100] = 3.85e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.6; + Lphy[3] = 0.09; + Lelec[3] = 0.0576; + curr_v_th_dram_access_transistor 
= 1; + width_dram_access_transistor = 0.09; + curr_I_on_dram_cell = 20e-6; + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.09*0.09; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 3.7; + t_ox[3] = 5.5e-3; + v_th[3] = 1.0; + c_ox[3] = 5.65e-15; + mobility_eff[3] = 302.2 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.32; + c_g_ideal[3] = 5.08e-16; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 1094.3e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.62; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 5.80e-15; + I_off_n[3][10] = 1.21e-14; + I_off_n[3][20] = 2.42e-14; + I_off_n[3][30] = 4.65e-14; + I_off_n[3][40] = 8.60e-14; + I_off_n[3][50] = 1.54e-13; + I_off_n[3][60] = 2.66e-13; + I_off_n[3][70] = 4.45e-13; + I_off_n[3][80] = 7.17e-13; + I_off_n[3][90] = 1.11e-12; + I_off_n[3][100] = 1.67e-12; + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360 + curr_asp_ratio_cell_cam = 2.92;//2.5 + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 1; + curr_core_tx_density = 1.25*0.7*0.7; + 
curr_sckt_co_eff = 1.1539; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + + + } + + if (tech == 65) + { //65nm technology-node. Corresponds to year 2007 in ITRS + //ITRS HP device type +// SENSE_AMP_D = .2e-9; // s +// SENSE_AMP_P = 5.7e-15; // J +// vdd[0] = 1.1; +// Lphy[0] = 0.025; +// Lelec[0] = 0.019; +// t_ox[0] = 1.1e-3; +// v_th[0] = .19491; +// c_ox[0] = 1.88e-14; +// mobility_eff[0] = 436.24 * (1e-2 * 1e6 * 1e-2 * 1e6); +// Vdsat[0] = 7.71e-2; +// c_g_ideal[0] = 4.69e-16; +// c_fringe[0] = 0.077e-15; +// c_junc[0] = 1e-15; +// I_on_n[0] = 1197.2e-6; +// I_on_p[0] = 870.8e-6; +// nmos_effective_resistance_multiplier = 1.50; +// n_to_p_eff_curr_drv_ratio[0] = 2.41; +// gmp_to_gmn_multiplier[0] = 1.38; +// Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0]; +// Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; +// long_channel_leakage_reduction[0] = 1/3.74; +// //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first +// //Ioff(Lgate normal)/Ioff(Lgate long)= 3.74. 
+// I_off_n[0][0] = 1.96e-7; +// I_off_n[0][10] = 2.29e-7; +// I_off_n[0][20] = 2.66e-7; +// I_off_n[0][30] = 3.05e-7; +// I_off_n[0][40] = 3.49e-7; +// I_off_n[0][50] = 3.95e-7; +// I_off_n[0][60] = 4.45e-7; +// I_off_n[0][70] = 4.97e-7; +// I_off_n[0][80] = 5.48e-7; +// I_off_n[0][90] = 5.94e-7; +// I_off_n[0][100] = 6.3e-7; +// I_g_on_n[0][0] = 4.09e-8;//A/micron +// I_g_on_n[0][10] = 4.09e-8; +// I_g_on_n[0][20] = 4.09e-8; +// I_g_on_n[0][30] = 4.09e-8; +// I_g_on_n[0][40] = 4.09e-8; +// I_g_on_n[0][50] = 4.09e-8; +// I_g_on_n[0][60] = 4.09e-8; +// I_g_on_n[0][70] = 4.09e-8; +// I_g_on_n[0][80] = 4.09e-8; +// I_g_on_n[0][90] = 4.09e-8; +// I_g_on_n[0][100] = 4.09e-8; + + SENSE_AMP_D = .2e-9; // s + SENSE_AMP_P = 5.7e-15; // J + vdd[0] = 1.25; + Lphy[0] = 0.025; + Lelec[0] = 0.019; + t_ox[0] = 1.1e-3; + v_th[0] = .12491; + c_ox[0] = 1.88e-14; + mobility_eff[0] = 409.31 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[0] = 9.08e-2; + c_g_ideal[0] = 4.72e-16; + c_fringe[0] = 0.08e-15; + c_junc[0] = 1e-15; + I_on_n[0] = 1486.4e-6; + I_on_p[0] = 1131.5e-6; + nmos_effective_resistance_multiplier = 1.57; + n_to_p_eff_curr_drv_ratio[0] = 2; + gmp_to_gmn_multiplier[0] = 1.38; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0]; + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; + long_channel_leakage_reduction[0] = 1.0/4.97; + //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first + //Ioff(Lgate normal)/Ioff(Lgate long)= 4.97@Vdd=1.25; (3.74@Vdd=1.1), however, Intel paper suggest the reduction factor is 3. 
+ I_off_n[0][0] = 8.62e-7; + I_off_n[0][10] = 9.08e-7; + I_off_n[0][20] = 9.55e-7; + I_off_n[0][30] = 1.00e-6; + I_off_n[0][40] = 1.05e-6; + I_off_n[0][50] = 1.09e-6; + I_off_n[0][60] = 1.14e-6; + I_off_n[0][70] = 1.18e-6; + I_off_n[0][80] = 1.23e-6; + I_off_n[0][90] = 1.27e-6; + I_off_n[0][100] = 1.31e-6; + + + I_g_on_n[0][0] = 7.02e-8;//A/micron + I_g_on_n[0][10] = 7.02e-8; + I_g_on_n[0][20] = 7.02e-8; + I_g_on_n[0][30] = 7.02e-8; + I_g_on_n[0][40] = 7.02e-8; + I_g_on_n[0][50] = 7.02e-8; + I_g_on_n[0][60] = 7.02e-8; + I_g_on_n[0][70] = 7.02e-8; + I_g_on_n[0][80] = 7.02e-8; + I_g_on_n[0][90] = 7.02e-8; + I_g_on_n[0][100] = 7.02e-8; + + //ITRS LSTP device type + vdd[1] = 1.2; + Lphy[1] = 0.045; + Lelec[1] = 0.0298; + t_ox[1] = 1.9e-3; + v_th[1] = 0.52354; + c_ox[1] = 1.36e-14; + mobility_eff[1] = 341.21 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 0.128; + c_g_ideal[1] = 6.14e-16; + c_fringe[1] = 0.08e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 519.2e-6; + I_on_p[1] = 266e-6; + nmos_effective_resistance_multiplier = 1.96; + n_to_p_eff_curr_drv_ratio[1] = 2.23; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1/2.82; + I_off_n[1][0] = 9.12e-12; + I_off_n[1][10] = 1.49e-11; + I_off_n[1][20] = 2.36e-11; + I_off_n[1][30] = 3.64e-11; + I_off_n[1][40] = 5.48e-11; + I_off_n[1][50] = 8.05e-11; + I_off_n[1][60] = 1.15e-10; + I_off_n[1][70] = 1.59e-10; + I_off_n[1][80] = 2.1e-10; + I_off_n[1][90] = 2.62e-10; + I_off_n[1][100] = 3.21e-10; + + I_g_on_n[1][0] = 1.09e-10;//A/micron + I_g_on_n[1][10] = 1.09e-10; + I_g_on_n[1][20] = 1.09e-10; + I_g_on_n[1][30] = 1.09e-10; + I_g_on_n[1][40] = 1.09e-10; + I_g_on_n[1][50] = 1.09e-10; + I_g_on_n[1][60] = 1.09e-10; + I_g_on_n[1][70] = 1.09e-10; + I_g_on_n[1][80] = 1.09e-10; + I_g_on_n[1][90] = 1.09e-10; + I_g_on_n[1][100] = 1.09e-10; + + //ITRS LOP device type + vdd[2] = 0.8; + 
Lphy[2] = 0.032; + Lelec[2] = 0.0216; + t_ox[2] = 1.2e-3; + v_th[2] = 0.28512; + c_ox[2] = 1.87e-14; + mobility_eff[2] = 495.19 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 0.292; + c_g_ideal[2] = 6e-16; + c_fringe[2] = 0.08e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 573.1e-6; + I_on_p[2] = 340.6e-6; + nmos_effective_resistance_multiplier = 1.82; + n_to_p_eff_curr_drv_ratio[2] = 2.28; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + long_channel_leakage_reduction[2] = 1/2.05; + I_off_n[2][0] = 4.9e-9; + I_off_n[2][10] = 6.49e-9; + I_off_n[2][20] = 8.45e-9; + I_off_n[2][30] = 1.08e-8; + I_off_n[2][40] = 1.37e-8; + I_off_n[2][50] = 1.71e-8; + I_off_n[2][60] = 2.09e-8; + I_off_n[2][70] = 2.48e-8; + I_off_n[2][80] = 2.84e-8; + I_off_n[2][90] = 3.13e-8; + I_off_n[2][100] = 3.42e-8; + + I_g_on_n[2][0] = 9.61e-9;//A/micron + I_g_on_n[2][10] = 9.61e-9; + I_g_on_n[2][20] = 9.61e-9; + I_g_on_n[2][30] = 9.61e-9; + I_g_on_n[2][40] = 9.61e-9; + I_g_on_n[2][50] = 9.61e-9; + I_g_on_n[2][60] = 9.61e-9; + I_g_on_n[2][70] = 9.61e-9; + I_g_on_n[2][80] = 9.61e-9; + I_g_on_n[2][90] = 9.61e-9; + I_g_on_n[2][100] = 9.61e-9; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.2; + Lphy[3] = 0.12; + Lelec[3] = 0.0756; + curr_v_th_dram_access_transistor = 0.43806; + width_dram_access_transistor = 0.09; + curr_I_on_dram_cell = 36e-6; + curr_I_off_dram_cell_worst_case_length_temp = 19.6e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 0.11; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.6; + t_ox[3] = 2.2e-3; + v_th[3] = 0.43806; + c_ox[3] = 1.22e-14; + mobility_eff[3] = 328.32 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.43806; + 
c_g_ideal[3] = 1.46e-15; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15 ; + I_on_n[3] = 399.8e-6; + I_on_p[3] = 243.4e-6; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 2.23e-11; + I_off_n[3][10] = 3.46e-11; + I_off_n[3][20] = 5.24e-11; + I_off_n[3][30] = 7.75e-11; + I_off_n[3][40] = 1.12e-10; + I_off_n[3][50] = 1.58e-10; + I_off_n[3][60] = 2.18e-10; + I_off_n[3][70] = 2.88e-10; + I_off_n[3][80] = 3.63e-10; + I_off_n[3][90] = 4.41e-10; + I_off_n[3][100] = 5.36e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.3; + Lphy[3] = 0.065; + Lelec[3] = 0.0426; + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.065; + curr_I_on_dram_cell = 20e-6; + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.065*0.065; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 3.3; + t_ox[3] = 5e-3; + v_th[3] = 1.0; + c_ox[3] = 6.16e-15; + mobility_eff[3] = 303.44 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.385; + c_g_ideal[3] = 4e-16; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15 ; + I_on_n[3] = 1031e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.69; + n_to_p_eff_curr_drv_ratio[3] = 2.39; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.80e-14; + I_off_n[3][10] = 3.64e-14; + 
I_off_n[3][20] = 7.03e-14; + I_off_n[3][30] = 1.31e-13; + I_off_n[3][40] = 2.35e-13; + I_off_n[3][50] = 4.09e-13; + I_off_n[3][60] = 6.89e-13; + I_off_n[3][70] = 1.13e-12; + I_off_n[3][80] = 1.78e-12; + I_off_n[3][90] = 2.71e-12; + I_off_n[3][100] = 3.99e-12; + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7; + curr_core_tx_density = 1.25*0.7; + curr_sckt_co_eff = 1.1359; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if (tech == 45) + { //45nm technology-node. Corresponds to year 2010 in ITRS + //ITRS HP device type + SENSE_AMP_D = .04e-9; // s + SENSE_AMP_P = 2.7e-15; // J + vdd[0] = 1.0; + Lphy[0] = 0.018; + Lelec[0] = 0.01345; + t_ox[0] = 0.65e-3; + v_th[0] = .18035; + c_ox[0] = 3.77e-14; + mobility_eff[0] = 266.68 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[0] = 9.38E-2; + c_g_ideal[0] = 6.78e-16; + c_fringe[0] = 0.05e-15; + c_junc[0] = 1e-15; + I_on_n[0] = 2046.6e-6; + //There are certain problems with the ITRS PMOS numbers in MASTAR for 45nm. 
So we are using 65nm values of + //n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier for 45nm + I_on_p[0] = I_on_n[0] / 2;//This value is fixed arbitrarily but I_on_p is not being used in CACTI + nmos_effective_resistance_multiplier = 1.51; + n_to_p_eff_curr_drv_ratio[0] = 2.41; + gmp_to_gmn_multiplier[0] = 1.38; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0]; + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; + long_channel_leakage_reduction[0] = 1/3.546;//Using MASTAR, @380K, increase Lgate until Ion reduces to 90%, Ioff(Lgate normal)/Ioff(Lgate long)= 3.74 + I_off_n[0][0] = 2.8e-7; + I_off_n[0][10] = 3.28e-7; + I_off_n[0][20] = 3.81e-7; + I_off_n[0][30] = 4.39e-7; + I_off_n[0][40] = 5.02e-7; + I_off_n[0][50] = 5.69e-7; + I_off_n[0][60] = 6.42e-7; + I_off_n[0][70] = 7.2e-7; + I_off_n[0][80] = 8.03e-7; + I_off_n[0][90] = 8.91e-7; + I_off_n[0][100] = 9.84e-7; + + I_g_on_n[0][0] = 3.59e-8;//A/micron + I_g_on_n[0][10] = 3.59e-8; + I_g_on_n[0][20] = 3.59e-8; + I_g_on_n[0][30] = 3.59e-8; + I_g_on_n[0][40] = 3.59e-8; + I_g_on_n[0][50] = 3.59e-8; + I_g_on_n[0][60] = 3.59e-8; + I_g_on_n[0][70] = 3.59e-8; + I_g_on_n[0][80] = 3.59e-8; + I_g_on_n[0][90] = 3.59e-8; + I_g_on_n[0][100] = 3.59e-8; + + //ITRS LSTP device type + vdd[1] = 1.1; + Lphy[1] = 0.028; + Lelec[1] = 0.0212; + t_ox[1] = 1.4e-3; + v_th[1] = 0.50245; + c_ox[1] = 2.01e-14; + mobility_eff[1] = 363.96 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 9.12e-2; + c_g_ideal[1] = 5.18e-16; + c_fringe[1] = 0.08e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 666.2e-6; + I_on_p[1] = I_on_n[1] / 2; + nmos_effective_resistance_multiplier = 1.99; + n_to_p_eff_curr_drv_ratio[1] = 2.23; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1/2.08; + I_off_n[1][0] = 1.01e-11; + I_off_n[1][10] = 1.65e-11; + I_off_n[1][20] = 2.62e-11; 
+ I_off_n[1][30] = 4.06e-11; + I_off_n[1][40] = 6.12e-11; + I_off_n[1][50] = 9.02e-11; + I_off_n[1][60] = 1.3e-10; + I_off_n[1][70] = 1.83e-10; + I_off_n[1][80] = 2.51e-10; + I_off_n[1][90] = 3.29e-10; + I_off_n[1][100] = 4.1e-10; + + I_g_on_n[1][0] = 9.47e-12;//A/micron + I_g_on_n[1][10] = 9.47e-12; + I_g_on_n[1][20] = 9.47e-12; + I_g_on_n[1][30] = 9.47e-12; + I_g_on_n[1][40] = 9.47e-12; + I_g_on_n[1][50] = 9.47e-12; + I_g_on_n[1][60] = 9.47e-12; + I_g_on_n[1][70] = 9.47e-12; + I_g_on_n[1][80] = 9.47e-12; + I_g_on_n[1][90] = 9.47e-12; + I_g_on_n[1][100] = 9.47e-12; + + //ITRS LOP device type + vdd[2] = 0.7; + Lphy[2] = 0.022; + Lelec[2] = 0.016; + t_ox[2] = 0.9e-3; + v_th[2] = 0.22599; + c_ox[2] = 2.82e-14;//F/micron2 + mobility_eff[2] = 508.9 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 5.71e-2; + c_g_ideal[2] = 6.2e-16; + c_fringe[2] = 0.073e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 748.9e-6; + I_on_p[2] = I_on_n[2] / 2; + nmos_effective_resistance_multiplier = 1.76; + n_to_p_eff_curr_drv_ratio[2] = 2.28; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + long_channel_leakage_reduction[2] = 1/1.92; + I_off_n[2][0] = 4.03e-9; + I_off_n[2][10] = 5.02e-9; + I_off_n[2][20] = 6.18e-9; + I_off_n[2][30] = 7.51e-9; + I_off_n[2][40] = 9.04e-9; + I_off_n[2][50] = 1.08e-8; + I_off_n[2][60] = 1.27e-8; + I_off_n[2][70] = 1.47e-8; + I_off_n[2][80] = 1.66e-8; + I_off_n[2][90] = 1.84e-8; + I_off_n[2][100] = 2.03e-8; + + I_g_on_n[2][0] = 3.24e-8;//A/micron + I_g_on_n[2][10] = 4.01e-8; + I_g_on_n[2][20] = 4.90e-8; + I_g_on_n[2][30] = 5.92e-8; + I_g_on_n[2][40] = 7.08e-8; + I_g_on_n[2][50] = 8.38e-8; + I_g_on_n[2][60] = 9.82e-8; + I_g_on_n[2][70] = 1.14e-7; + I_g_on_n[2][80] = 1.29e-7; + I_g_on_n[2][90] = 1.43e-7; + I_g_on_n[2][100] = 1.54e-7; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + 
curr_vdd_dram_cell = 1.1; + Lphy[3] = 0.078; + Lelec[3] = 0.0504;// Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors. + curr_v_th_dram_access_transistor = 0.44559; + width_dram_access_transistor = 0.079; + curr_I_on_dram_cell = 36e-6;//A + curr_I_off_dram_cell_worst_case_length_temp = 19.5e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.5; + t_ox[3] = 2.1e-3; + v_th[3] = 0.44559; + c_ox[3] = 1.41e-14; + mobility_eff[3] = 426.30 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.181; + c_g_ideal[3] = 1.10e-15; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 456e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 2.54e-11; + I_off_n[3][10] = 3.94e-11; + I_off_n[3][20] = 5.95e-11; + I_off_n[3][30] = 8.79e-11; + I_off_n[3][40] = 1.27e-10; + I_off_n[3][50] = 1.79e-10; + I_off_n[3][60] = 2.47e-10; + I_off_n[3][70] = 3.31e-10; + I_off_n[3][80] = 4.26e-10; + I_off_n[3][90] = 5.27e-10; + I_off_n[3][100] = 6.46e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.1; + Lphy[3] = 0.045; + Lelec[3] = 0.0298; + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.045; + curr_I_on_dram_cell = 20e-6;//A + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + 
curr_area_cell_dram = 6*0.045*0.045; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 2.7; + t_ox[3] = 4e-3; + v_th[3] = 1.0; + c_ox[3] = 7.98e-15; + mobility_eff[3] = 368.58 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.147; + c_g_ideal[3] = 3.59e-16; + c_fringe[3] = 0.08e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 999.4e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.69; + n_to_p_eff_curr_drv_ratio[3] = 1.95; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.31e-14; + I_off_n[3][10] = 2.68e-14; + I_off_n[3][20] = 5.25e-14; + I_off_n[3][30] = 9.88e-14; + I_off_n[3][40] = 1.79e-13; + I_off_n[3][50] = 3.15e-13; + I_off_n[3][60] = 5.36e-13; + I_off_n[3][70] = 8.86e-13; + I_off_n[3][80] = 1.42e-12; + I_off_n[3][90] = 2.20e-12; + I_off_n[3][100] = 3.29e-12; + } + + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7; + curr_core_tx_density = 1.25; + curr_sckt_co_eff = 1.1387; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if (tech == 32) + { + SENSE_AMP_D = .03e-9; // s + 
SENSE_AMP_P = 2.16e-15; // J + //For 2013, MPU/ASIC stagger-contacted M1 half-pitch is 32 nm (so this is 32 nm + //technology i.e. FEATURESIZE = 0.032). Using the SOI process numbers for + //HP and LSTP. + vdd[0] = 0.9; + Lphy[0] = 0.013; + Lelec[0] = 0.01013; + t_ox[0] = 0.5e-3; + v_th[0] = 0.21835; + c_ox[0] = 4.11e-14; + mobility_eff[0] = 361.84 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[0] = 5.09E-2; + c_g_ideal[0] = 5.34e-16; + c_fringe[0] = 0.04e-15; + c_junc[0] = 1e-15; + I_on_n[0] = 2211.7e-6; + I_on_p[0] = I_on_n[0] / 2; + nmos_effective_resistance_multiplier = 1.49; + n_to_p_eff_curr_drv_ratio[0] = 2.41; + gmp_to_gmn_multiplier[0] = 1.38; + Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1/3.706; + //Using MASTAR, @300K (380K does not work in MASTAR), increase Lgate until Ion reduces to 95% or Lgate increase by 5% (DG device can only increase by 5%), + //whichever comes first + I_off_n[0][0] = 1.52e-7; + I_off_n[0][10] = 1.55e-7; + I_off_n[0][20] = 1.59e-7; + I_off_n[0][30] = 1.68e-7; + I_off_n[0][40] = 1.90e-7; + I_off_n[0][50] = 2.69e-7; + I_off_n[0][60] = 5.32e-7; + I_off_n[0][70] = 1.02e-6; + I_off_n[0][80] = 1.62e-6; + I_off_n[0][90] = 2.73e-6; + I_off_n[0][100] = 6.1e-6; + + I_g_on_n[0][0] = 6.55e-8;//A/micron + I_g_on_n[0][10] = 6.55e-8; + I_g_on_n[0][20] = 6.55e-8; + I_g_on_n[0][30] = 6.55e-8; + I_g_on_n[0][40] = 6.55e-8; + I_g_on_n[0][50] = 6.55e-8; + I_g_on_n[0][60] = 6.55e-8; + I_g_on_n[0][70] = 6.55e-8; + I_g_on_n[0][80] = 6.55e-8; + I_g_on_n[0][90] = 6.55e-8; + I_g_on_n[0][100] = 6.55e-8; + +// 32 DG +// I_g_on_n[0][0] = 2.71e-9;//A/micron +// I_g_on_n[0][10] = 2.71e-9; +// I_g_on_n[0][20] = 2.71e-9; +// I_g_on_n[0][30] = 2.71e-9; +// I_g_on_n[0][40] = 2.71e-9; +// I_g_on_n[0][50] = 2.71e-9; +// I_g_on_n[0][60] = 2.71e-9; +// I_g_on_n[0][70] = 2.71e-9; +// I_g_on_n[0][80] = 2.71e-9; +// 
I_g_on_n[0][90] = 2.71e-9; +// I_g_on_n[0][100] = 2.71e-9; + + //LSTP device type + vdd[1] = 1; + Lphy[1] = 0.020; + Lelec[1] = 0.0173; + t_ox[1] = 1.2e-3; + v_th[1] = 0.513; + c_ox[1] = 2.29e-14; + mobility_eff[1] = 347.46 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[1] = 8.64e-2; + c_g_ideal[1] = 4.58e-16; + c_fringe[1] = 0.053e-15; + c_junc[1] = 1e-15; + I_on_n[1] = 683.6e-6; + I_on_p[1] = I_on_n[1] / 2; + nmos_effective_resistance_multiplier = 1.99; + n_to_p_eff_curr_drv_ratio[1] = 2.23; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1]; + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; + long_channel_leakage_reduction[1] = 1/1.93; + I_off_n[1][0] = 2.06e-11; + I_off_n[1][10] = 3.30e-11; + I_off_n[1][20] = 5.15e-11; + I_off_n[1][30] = 7.83e-11; + I_off_n[1][40] = 1.16e-10; + I_off_n[1][50] = 1.69e-10; + I_off_n[1][60] = 2.40e-10; + I_off_n[1][70] = 3.34e-10; + I_off_n[1][80] = 4.54e-10; + I_off_n[1][90] = 5.96e-10; + I_off_n[1][100] = 7.44e-10; + + I_g_on_n[1][0] = 3.73e-11;//A/micron + I_g_on_n[1][10] = 3.73e-11; + I_g_on_n[1][20] = 3.73e-11; + I_g_on_n[1][30] = 3.73e-11; + I_g_on_n[1][40] = 3.73e-11; + I_g_on_n[1][50] = 3.73e-11; + I_g_on_n[1][60] = 3.73e-11; + I_g_on_n[1][70] = 3.73e-11; + I_g_on_n[1][80] = 3.73e-11; + I_g_on_n[1][90] = 3.73e-11; + I_g_on_n[1][100] = 3.73e-11; + + + //LOP device type + vdd[2] = 0.6; + Lphy[2] = 0.016; + Lelec[2] = 0.01232; + t_ox[2] = 0.9e-3; + v_th[2] = 0.24227; + c_ox[2] = 2.84e-14; + mobility_eff[2] = 513.52 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[2] = 4.64e-2; + c_g_ideal[2] = 4.54e-16; + c_fringe[2] = 0.057e-15; + c_junc[2] = 1e-15; + I_on_n[2] = 827.8e-6; + I_on_p[2] = I_on_n[2] / 2; + nmos_effective_resistance_multiplier = 1.73; + n_to_p_eff_curr_drv_ratio[2] = 2.28; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2]; + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; + 
long_channel_leakage_reduction[2] = 1/1.89; + I_off_n[2][0] = 5.94e-8; + I_off_n[2][10] = 7.23e-8; + I_off_n[2][20] = 8.7e-8; + I_off_n[2][30] = 1.04e-7; + I_off_n[2][40] = 1.22e-7; + I_off_n[2][50] = 1.43e-7; + I_off_n[2][60] = 1.65e-7; + I_off_n[2][70] = 1.90e-7; + I_off_n[2][80] = 2.15e-7; + I_off_n[2][90] = 2.39e-7; + I_off_n[2][100] = 2.63e-7; + + I_g_on_n[2][0] = 2.93e-9;//A/micron + I_g_on_n[2][10] = 2.93e-9; + I_g_on_n[2][20] = 2.93e-9; + I_g_on_n[2][30] = 2.93e-9; + I_g_on_n[2][40] = 2.93e-9; + I_g_on_n[2][50] = 2.93e-9; + I_g_on_n[2][60] = 2.93e-9; + I_g_on_n[2][70] = 2.93e-9; + I_g_on_n[2][80] = 2.93e-9; + I_g_on_n[2][90] = 2.93e-9; + I_g_on_n[2][100] = 2.93e-9; + + if (ram_cell_tech_type == lp_dram) + { + //LP-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.0; + Lphy[3] = 0.056; + Lelec[3] = 0.0419;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors. + curr_v_th_dram_access_transistor = 0.44129; + width_dram_access_transistor = 0.056; + curr_I_on_dram_cell = 36e-6; + curr_I_off_dram_cell_worst_case_length_temp = 18.9e-12; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0; + curr_asp_ratio_cell_dram = 1.46; + curr_c_dram_cell = 20e-15; + + //LP-DRAM wordline transistor parameters + curr_vpp = 1.5; + t_ox[3] = 2e-3; + v_th[3] = 0.44467; + c_ox[3] = 1.48e-14; + mobility_eff[3] = 408.12 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.174; + c_g_ideal[3] = 7.45e-16; + c_fringe[3] = 0.053e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 1055.4e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.65; + n_to_p_eff_curr_drv_ratio[3] = 2.05; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + 
I_off_n[3][0] = 3.57e-11; + I_off_n[3][10] = 5.51e-11; + I_off_n[3][20] = 8.27e-11; + I_off_n[3][30] = 1.21e-10; + I_off_n[3][40] = 1.74e-10; + I_off_n[3][50] = 2.45e-10; + I_off_n[3][60] = 3.38e-10; + I_off_n[3][70] = 4.53e-10; + I_off_n[3][80] = 5.87e-10; + I_off_n[3][90] = 7.29e-10; + I_off_n[3][100] = 8.87e-10; + } + else if (ram_cell_tech_type == comm_dram) + { + //COMM-DRAM cell access transistor technology parameters + curr_vdd_dram_cell = 1.0; + Lphy[3] = 0.032; + Lelec[3] = 0.0205;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors. + curr_v_th_dram_access_transistor = 1; + width_dram_access_transistor = 0.032; + curr_I_on_dram_cell = 20e-6; + curr_I_off_dram_cell_worst_case_length_temp = 1e-15; + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.032*0.032; + curr_asp_ratio_cell_dram = 1.5; + curr_c_dram_cell = 30e-15; + + //COMM-DRAM wordline transistor parameters + curr_vpp = 2.6; + t_ox[3] = 4e-3; + v_th[3] = 1.0; + c_ox[3] = 7.99e-15; + mobility_eff[3] = 380.76 * (1e-2 * 1e6 * 1e-2 * 1e6); + Vdsat[3] = 0.129; + c_g_ideal[3] = 2.56e-16; + c_fringe[3] = 0.053e-15; + c_junc[3] = 1e-15; + I_on_n[3] = 1024.5e-6; + I_on_p[3] = I_on_n[3] / 2; + nmos_effective_resistance_multiplier = 1.69; + n_to_p_eff_curr_drv_ratio[3] = 1.95; + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3]; + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 3.63e-14; + I_off_n[3][10] = 7.18e-14; + I_off_n[3][20] = 1.36e-13; + I_off_n[3][30] = 2.49e-13; + I_off_n[3][40] = 4.41e-13; + I_off_n[3][50] = 7.55e-13; + I_off_n[3][60] = 1.26e-12; + I_off_n[3][70] = 2.03e-12; + I_off_n[3][80] = 3.19e-12; + I_off_n[3][90] = 4.87e-12; + I_off_n[3][100] = 7.16e-12; + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + 
curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7*0.7; + curr_core_tx_density = 1.25/0.7; + curr_sckt_co_eff = 1.1111; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if(tech == 22){ + //For 2016, MPU/ASIC stagger-contacted M1 half-pitch is 22 nm (so this is 22 nm + //technology i.e. FEATURESIZE = 0.022). Using the DG process numbers for HP. + //22 nm HP + vdd[0] = 0.8; + Lphy[0] = 0.009;//Lphy is the physical gate-length. + Lelec[0] = 0.00468;//Lelec is the electrical gate-length. + t_ox[0] = 0.55e-3;//micron + v_th[0] = 0.1395;//V + c_ox[0] = 3.63e-14;//F/micron2 + mobility_eff[0] = 426.07 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 2.33e-2; //V/micron + c_g_ideal[0] = 3.27e-16;//F/micron + c_fringe[0] = 0.06e-15;//F/micron + c_junc[0] = 0;//F/micron2 + I_on_n[0] = 2626.4e-6;//A/micron + I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.45; + n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in + //"Dynamic" tab of Device workspace. + gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value. 
+ Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1/3.274; + I_off_n[0][0] = 1.52e-7/1.5*1.2;//From 22nm, leakage current are directly from ITRS report rather than MASTAR, since MASTAR has serious bugs there. + I_off_n[0][10] = 1.55e-7/1.5*1.2; + I_off_n[0][20] = 1.59e-7/1.5*1.2; + I_off_n[0][30] = 1.68e-7/1.5*1.2; + I_off_n[0][40] = 1.90e-7/1.5*1.2; + I_off_n[0][50] = 2.69e-7/1.5*1.2; + I_off_n[0][60] = 5.32e-7/1.5*1.2; + I_off_n[0][70] = 1.02e-6/1.5*1.2; + I_off_n[0][80] = 1.62e-6/1.5*1.2; + I_off_n[0][90] = 2.73e-6/1.5*1.2; + I_off_n[0][100] = 6.1e-6/1.5*1.2; + //for 22nm DG HP + I_g_on_n[0][0] = 1.81e-9;//A/micron + I_g_on_n[0][10] = 1.81e-9; + I_g_on_n[0][20] = 1.81e-9; + I_g_on_n[0][30] = 1.81e-9; + I_g_on_n[0][40] = 1.81e-9; + I_g_on_n[0][50] = 1.81e-9; + I_g_on_n[0][60] = 1.81e-9; + I_g_on_n[0][70] = 1.81e-9; + I_g_on_n[0][80] = 1.81e-9; + I_g_on_n[0][90] = 1.81e-9; + I_g_on_n[0][100] = 1.81e-9; + + //22 nm LSTP DG + vdd[1] = 0.8; + Lphy[1] = 0.014; + Lelec[1] = 0.008;//Lelec is the electrical gate-length. 
+ t_ox[1] = 1.1e-3;//micron + v_th[1] = 0.40126;//V + c_ox[1] = 2.30e-14;//F/micron2 + mobility_eff[1] = 738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[1] = 6.64e-2; //V/micron + c_g_ideal[1] = 3.22e-16;//F/micron + c_fringe[1] = 0.08e-15; + c_junc[1] = 0;//F/micron2 + I_on_n[1] = 727.6e-6;//A/micron + I_on_p[1] = I_on_n[1] / 2; + nmos_effective_resistance_multiplier = 1.99; + n_to_p_eff_curr_drv_ratio[1] = 2; + gmp_to_gmn_multiplier[1] = 0.99; + Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron + Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron + long_channel_leakage_reduction[1] = 1/1.89; + I_off_n[1][0] = 2.43e-11; + I_off_n[1][10] = 4.85e-11; + I_off_n[1][20] = 9.68e-11; + I_off_n[1][30] = 1.94e-10; + I_off_n[1][40] = 3.87e-10; + I_off_n[1][50] = 7.73e-10; + I_off_n[1][60] = 3.55e-10; + I_off_n[1][70] = 3.09e-9; + I_off_n[1][80] = 6.19e-9; + I_off_n[1][90] = 1.24e-8; + I_off_n[1][100]= 2.48e-8; + + I_g_on_n[1][0] = 4.51e-10;//A/micron + I_g_on_n[1][10] = 4.51e-10; + I_g_on_n[1][20] = 4.51e-10; + I_g_on_n[1][30] = 4.51e-10; + I_g_on_n[1][40] = 4.51e-10; + I_g_on_n[1][50] = 4.51e-10; + I_g_on_n[1][60] = 4.51e-10; + I_g_on_n[1][70] = 4.51e-10; + I_g_on_n[1][80] = 4.51e-10; + I_g_on_n[1][90] = 4.51e-10; + I_g_on_n[1][100] = 4.51e-10; + + //22 nm LOP + vdd[2] = 0.6; + Lphy[2] = 0.011; + Lelec[2] = 0.00604;//Lelec is the electrical gate-length. 
+ t_ox[2] = 0.8e-3;//micron + v_th[2] = 0.2315;//V + c_ox[2] = 2.87e-14;//F/micron2 + mobility_eff[2] = 698.37 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[2] = 1.81e-2; //V/micron + c_g_ideal[2] = 3.16e-16;//F/micron + c_fringe[2] = 0.08e-15; + c_junc[2] = 0;//F/micron2 This is Cj0 not Cjunc in MASTAR results->Dynamic Tab + I_on_n[2] = 916.1e-6;//A/micron + I_on_p[2] = I_on_n[2] / 2; + nmos_effective_resistance_multiplier = 1.73; + n_to_p_eff_curr_drv_ratio[2] = 2; + gmp_to_gmn_multiplier[2] = 1.11; + Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];//ohm-micron + Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];//ohm-micron + long_channel_leakage_reduction[2] = 1/2.38; + + I_off_n[2][0] = 1.31e-8; + I_off_n[2][10] = 2.60e-8; + I_off_n[2][20] = 5.14e-8; + I_off_n[2][30] = 1.02e-7; + I_off_n[2][40] = 2.02e-7; + I_off_n[2][50] = 3.99e-7; + I_off_n[2][60] = 7.91e-7; + I_off_n[2][70] = 1.09e-6; + I_off_n[2][80] = 2.09e-6; + I_off_n[2][90] = 4.04e-6; + I_off_n[2][100]= 4.48e-6; + + I_g_on_n[2][0] = 2.74e-9;//A/micron + I_g_on_n[2][10] = 2.74e-9; + I_g_on_n[2][20] = 2.74e-9; + I_g_on_n[2][30] = 2.74e-9; + I_g_on_n[2][40] = 2.74e-9; + I_g_on_n[2][50] = 2.74e-9; + I_g_on_n[2][60] = 2.74e-9; + I_g_on_n[2][70] = 2.74e-9; + I_g_on_n[2][80] = 2.74e-9; + I_g_on_n[2][90] = 2.74e-9; + I_g_on_n[2][100] = 2.74e-9; + + + + if (ram_cell_tech_type == 3) + {} + else if (ram_cell_tech_type == 4) + { + //22 nm commodity DRAM cell access transistor technology parameters. + //parameters + curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In + //2005 ITRS, the value was about twice the value in 2007 ITRS + Lphy[3] = 0.022;//micron + Lelec[3] = 0.0181;//micron. + curr_v_th_dram_access_transistor = 1;//V + width_dram_access_transistor = 0.022;//micron + curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always + //kept constant. 
In reality this could perhaps be lower + curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.022*0.022;//micron2. + curr_asp_ratio_cell_dram = 0.667; + curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus + //kept constant. + + //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR. + curr_vpp = 2.3;//vpp. V + t_ox[3] = 3.5e-3;//micron + v_th[3] = 1.0;//V + c_ox[3] = 9.06e-15;//F/micron2 + mobility_eff[3] = 367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs + Vdsat[3] = 0.0972; //V/micron + c_g_ideal[3] = 1.99e-16;//F/micron + c_fringe[3] = 0.053e-15;//F/micron + c_junc[3] = 1e-15;//F/micron2 + I_on_n[3] = 910.5e-6;//A/micron + I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm. + // + n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];//ohm-micron + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.1e-13; //A/micron + I_off_n[3][10] = 2.11e-13; + I_off_n[3][20] = 3.88e-13; + I_off_n[3][30] = 6.9e-13; + I_off_n[3][40] = 1.19e-12; + I_off_n[3][50] = 1.98e-12; + I_off_n[3][60] = 3.22e-12; + I_off_n[3][70] = 5.09e-12; + I_off_n[3][80] = 7.85e-12; + I_off_n[3][90] = 1.18e-11; + I_off_n[3][100] = 1.72e-11; + + } + else + { + //some error handler + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam 
= 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7; + curr_core_tx_density = 1.25/0.7/0.7; + curr_sckt_co_eff = 1.1296; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + if(tech == 16){ + //For 2019, MPU/ASIC stagger-contacted M1 half-pitch is 16 nm (so this is 16 nm + //technology i.e. FEATURESIZE = 0.016). Using the DG process numbers for HP. + //16 nm HP + vdd[0] = 0.7; + Lphy[0] = 0.006;//Lphy is the physical gate-length. + Lelec[0] = 0.00315;//Lelec is the electrical gate-length. + t_ox[0] = 0.5e-3;//micron + v_th[0] = 0.1489;//V + c_ox[0] = 3.83e-14;//F/micron2 Cox_elec in MASTAR + mobility_eff[0] = 476.15 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs + Vdsat[0] = 1.42e-2; //V/micron calculated in spreadsheet + c_g_ideal[0] = 2.30e-16;//F/micron + c_fringe[0] = 0.06e-15;//F/micron MASTAR inputdynamic/3 + c_junc[0] = 0;//F/micron2 MASTAR result dynamic + I_on_n[0] = 2768.4e-6;//A/micron + I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.48;//nmos_effective_resistance_multiplier is the ratio of Ieff to Idsat where Ieff is the effective NMOS current and Idsat is the saturation current. + n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in + //"Dynamic" tab of Device workspace. + gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value. 
+ Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron + Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron + long_channel_leakage_reduction[0] = 1/2.655; + I_off_n[0][0] = 1.52e-7/1.5*1.2*1.07; + I_off_n[0][10] = 1.55e-7/1.5*1.2*1.07; + I_off_n[0][20] = 1.59e-7/1.5*1.2*1.07; + I_off_n[0][30] = 1.68e-7/1.5*1.2*1.07; + I_off_n[0][40] = 1.90e-7/1.5*1.2*1.07; + I_off_n[0][50] = 2.69e-7/1.5*1.2*1.07; + I_off_n[0][60] = 5.32e-7/1.5*1.2*1.07; + I_off_n[0][70] = 1.02e-6/1.5*1.2*1.07; + I_off_n[0][80] = 1.62e-6/1.5*1.2*1.07; + I_off_n[0][90] = 2.73e-6/1.5*1.2*1.07; + I_off_n[0][100] = 6.1e-6/1.5*1.2*1.07; + //for 16nm DG HP + I_g_on_n[0][0] = 1.07e-9;//A/micron + I_g_on_n[0][10] = 1.07e-9; + I_g_on_n[0][20] = 1.07e-9; + I_g_on_n[0][30] = 1.07e-9; + I_g_on_n[0][40] = 1.07e-9; + I_g_on_n[0][50] = 1.07e-9; + I_g_on_n[0][60] = 1.07e-9; + I_g_on_n[0][70] = 1.07e-9; + I_g_on_n[0][80] = 1.07e-9; + I_g_on_n[0][90] = 1.07e-9; + I_g_on_n[0][100] = 1.07e-9; + +// //16 nm LSTP DG +// vdd[1] = 0.8; +// Lphy[1] = 0.014; +// Lelec[1] = 0.008;//Lelec is the electrical gate-length. 
+// t_ox[1] = 1.1e-3;//micron +// v_th[1] = 0.40126;//V +// c_ox[1] = 2.30e-14;//F/micron2 +// mobility_eff[1] = 738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs +// Vdsat[1] = 6.64e-2; //V/micron +// c_g_ideal[1] = 3.22e-16;//F/micron +// c_fringe[1] = 0.008e-15; +// c_junc[1] = 0;//F/micron2 +// I_on_n[1] = 727.6e-6;//A/micron +// I_on_p[1] = I_on_n[1] / 2; +// nmos_effective_resistance_multiplier = 1.99; +// n_to_p_eff_curr_drv_ratio[1] = 2; +// gmp_to_gmn_multiplier[1] = 0.99; +// Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron +// Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron +// I_off_n[1][0] = 2.43e-11; +// I_off_n[1][10] = 4.85e-11; +// I_off_n[1][20] = 9.68e-11; +// I_off_n[1][30] = 1.94e-10; +// I_off_n[1][40] = 3.87e-10; +// I_off_n[1][50] = 7.73e-10; +// I_off_n[1][60] = 3.55e-10; +// I_off_n[1][70] = 3.09e-9; +// I_off_n[1][80] = 6.19e-9; +// I_off_n[1][90] = 1.24e-8; +// I_off_n[1][100]= 2.48e-8; +// +// // for 22nm LSTP HP +// I_g_on_n[1][0] = 4.51e-10;//A/micron +// I_g_on_n[1][10] = 4.51e-10; +// I_g_on_n[1][20] = 4.51e-10; +// I_g_on_n[1][30] = 4.51e-10; +// I_g_on_n[1][40] = 4.51e-10; +// I_g_on_n[1][50] = 4.51e-10; +// I_g_on_n[1][60] = 4.51e-10; +// I_g_on_n[1][70] = 4.51e-10; +// I_g_on_n[1][80] = 4.51e-10; +// I_g_on_n[1][90] = 4.51e-10; +// I_g_on_n[1][100] = 4.51e-10; + + + if (ram_cell_tech_type == 3) + {} + else if (ram_cell_tech_type == 4) + { + //22 nm commodity DRAM cell access transistor technology parameters. + //parameters + curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In + //2005 ITRS, the value was about twice the value in 2007 ITRS + Lphy[3] = 0.022;//micron + Lelec[3] = 0.0181;//micron. + curr_v_th_dram_access_transistor = 1;//V + width_dram_access_transistor = 0.022;//micron + curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always + //kept constant. 
In reality this could perhaps be lower + curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A + curr_Wmemcella_dram = width_dram_access_transistor; + curr_Wmemcellpmos_dram = 0; + curr_Wmemcellnmos_dram = 0; + curr_area_cell_dram = 6*0.022*0.022;//micron2. + curr_asp_ratio_cell_dram = 0.667; + curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus + //kept constant. + + //22 nm commodity DRAM wordline transistor parameters obtained using MASTAR. + curr_vpp = 2.3;//vpp. V + t_ox[3] = 3.5e-3;//micron + v_th[3] = 1.0;//V + c_ox[3] = 9.06e-15;//F/micron2 + mobility_eff[3] = 367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs + Vdsat[3] = 0.0972; //V/micron + c_g_ideal[3] = 1.99e-16;//F/micron + c_fringe[3] = 0.053e-15;//F/micron + c_junc[3] = 1e-15;//F/micron2 + I_on_n[3] = 910.5e-6;//A/micron + I_on_p[3] = I_on_n[3] / 2;//This value for I_on_p is not really used. + nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm. + // + n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm + gmp_to_gmn_multiplier[3] = 0.90; + Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];//ohm-micron + Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron + long_channel_leakage_reduction[3] = 1; + I_off_n[3][0] = 1.1e-13; //A/micron + I_off_n[3][10] = 2.11e-13; + I_off_n[3][20] = 3.88e-13; + I_off_n[3][30] = 6.9e-13; + I_off_n[3][40] = 1.19e-12; + I_off_n[3][50] = 1.98e-12; + I_off_n[3][60] = 3.22e-12; + I_off_n[3][70] = 5.09e-12; + I_off_n[3][80] = 7.85e-12; + I_off_n[3][90] = 1.18e-11; + I_off_n[3][100] = 1.72e-11; + + } + else + { + //some error handler + } + + //SRAM cell properties + curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um; + curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_sram = 1.46; + //CAM cell properties //TODO: data need to be revisited + curr_Wmemcella_cam 
= 1.31 * g_ip->F_sz_um; + curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um; + curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um; + curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; + curr_asp_ratio_cell_cam = 2.92; + //Empirical undifferetiated core/FU coefficient + curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7*0.7; + curr_core_tx_density = 1.25/0.7/0.7/0.7; + curr_sckt_co_eff = 1.1296; + curr_chip_layout_overhead = 1.2;//die measurement results based on Niagara 1 and 2 + curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb + } + + + g_tp.peri_global.Vdd += curr_alpha * vdd[peri_global_tech_type]; + g_tp.peri_global.t_ox += curr_alpha * t_ox[peri_global_tech_type]; + g_tp.peri_global.Vth += curr_alpha * v_th[peri_global_tech_type]; + g_tp.peri_global.C_ox += curr_alpha * c_ox[peri_global_tech_type]; + g_tp.peri_global.C_g_ideal += curr_alpha * c_g_ideal[peri_global_tech_type]; + g_tp.peri_global.C_fringe += curr_alpha * c_fringe[peri_global_tech_type]; + g_tp.peri_global.C_junc += curr_alpha * c_junc[peri_global_tech_type]; + g_tp.peri_global.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.peri_global.l_phy += curr_alpha * Lphy[peri_global_tech_type]; + g_tp.peri_global.l_elec += curr_alpha * Lelec[peri_global_tech_type]; + g_tp.peri_global.I_on_n += curr_alpha * I_on_n[peri_global_tech_type]; + g_tp.peri_global.R_nch_on += curr_alpha * Rnchannelon[peri_global_tech_type]; + g_tp.peri_global.R_pch_on += curr_alpha * Rpchannelon[peri_global_tech_type]; + g_tp.peri_global.n_to_p_eff_curr_drv_ratio + += curr_alpha * n_to_p_eff_curr_drv_ratio[peri_global_tech_type]; + g_tp.peri_global.long_channel_leakage_reduction + += curr_alpha * long_channel_leakage_reduction[peri_global_tech_type]; + g_tp.peri_global.I_off_n += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300]; + g_tp.peri_global.I_off_p += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300]; + g_tp.peri_global.I_g_on_n += curr_alpha * 
I_g_on_n[peri_global_tech_type][g_ip->temp - 300]; + g_tp.peri_global.I_g_on_p += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300]; + gmp_to_gmn_multiplier_periph_global += curr_alpha * gmp_to_gmn_multiplier[peri_global_tech_type]; + + g_tp.sram_cell.Vdd += curr_alpha * vdd[ram_cell_tech_type]; + g_tp.sram_cell.l_phy += curr_alpha * Lphy[ram_cell_tech_type]; + g_tp.sram_cell.l_elec += curr_alpha * Lelec[ram_cell_tech_type]; + g_tp.sram_cell.t_ox += curr_alpha * t_ox[ram_cell_tech_type]; + g_tp.sram_cell.Vth += curr_alpha * v_th[ram_cell_tech_type]; + g_tp.sram_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type]; + g_tp.sram_cell.C_fringe += curr_alpha * c_fringe[ram_cell_tech_type]; + g_tp.sram_cell.C_junc += curr_alpha * c_junc[ram_cell_tech_type]; + g_tp.sram_cell.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.sram_cell.I_on_n += curr_alpha * I_on_n[ram_cell_tech_type]; + g_tp.sram_cell.R_nch_on += curr_alpha * Rnchannelon[ram_cell_tech_type]; + g_tp.sram_cell.R_pch_on += curr_alpha * Rpchannelon[ram_cell_tech_type]; + g_tp.sram_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type]; + g_tp.sram_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type]; + g_tp.sram_cell.I_off_n += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.sram_cell.I_off_p += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.sram_cell.I_g_on_n += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.sram_cell.I_g_on_p += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + + g_tp.dram_cell_Vdd += curr_alpha * curr_vdd_dram_cell; + g_tp.dram_acc.Vth += curr_alpha * curr_v_th_dram_access_transistor; + g_tp.dram_acc.l_phy += curr_alpha * Lphy[dram_cell_tech_flavor]; + g_tp.dram_acc.l_elec += curr_alpha * Lelec[dram_cell_tech_flavor]; + g_tp.dram_acc.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor]; + 
g_tp.dram_acc.C_fringe += curr_alpha * c_fringe[dram_cell_tech_flavor]; + g_tp.dram_acc.C_junc += curr_alpha * c_junc[dram_cell_tech_flavor]; + g_tp.dram_acc.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.dram_cell_I_on += curr_alpha * curr_I_on_dram_cell; + g_tp.dram_cell_I_off_worst_case_len_temp += curr_alpha * curr_I_off_dram_cell_worst_case_length_temp; + g_tp.dram_acc.I_on_n += curr_alpha * I_on_n[dram_cell_tech_flavor]; + g_tp.dram_cell_C += curr_alpha * curr_c_dram_cell; + g_tp.vpp += curr_alpha * curr_vpp; + g_tp.dram_wl.l_phy += curr_alpha * Lphy[dram_cell_tech_flavor]; + g_tp.dram_wl.l_elec += curr_alpha * Lelec[dram_cell_tech_flavor]; + g_tp.dram_wl.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor]; + g_tp.dram_wl.C_fringe += curr_alpha * c_fringe[dram_cell_tech_flavor]; + g_tp.dram_wl.C_junc += curr_alpha * c_junc[dram_cell_tech_flavor]; + g_tp.dram_wl.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.dram_wl.I_on_n += curr_alpha * I_on_n[dram_cell_tech_flavor]; + g_tp.dram_wl.R_nch_on += curr_alpha * Rnchannelon[dram_cell_tech_flavor]; + g_tp.dram_wl.R_pch_on += curr_alpha * Rpchannelon[dram_cell_tech_flavor]; + g_tp.dram_wl.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[dram_cell_tech_flavor]; + g_tp.dram_wl.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[dram_cell_tech_flavor]; + g_tp.dram_wl.I_off_n += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300]; + g_tp.dram_wl.I_off_p += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300]; + + g_tp.cam_cell.Vdd += curr_alpha * vdd[ram_cell_tech_type]; + g_tp.cam_cell.l_phy += curr_alpha * Lphy[ram_cell_tech_type]; + g_tp.cam_cell.l_elec += curr_alpha * Lelec[ram_cell_tech_type]; + g_tp.cam_cell.t_ox += curr_alpha * t_ox[ram_cell_tech_type]; + g_tp.cam_cell.Vth += curr_alpha * v_th[ram_cell_tech_type]; + g_tp.cam_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type]; + g_tp.cam_cell.C_fringe += curr_alpha * 
c_fringe[ram_cell_tech_type]; + g_tp.cam_cell.C_junc += curr_alpha * c_junc[ram_cell_tech_type]; + g_tp.cam_cell.C_junc_sidewall = 0.25e-15; // F/micron + g_tp.cam_cell.I_on_n += curr_alpha * I_on_n[ram_cell_tech_type]; + g_tp.cam_cell.R_nch_on += curr_alpha * Rnchannelon[ram_cell_tech_type]; + g_tp.cam_cell.R_pch_on += curr_alpha * Rpchannelon[ram_cell_tech_type]; + g_tp.cam_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type]; + g_tp.cam_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type]; + g_tp.cam_cell.I_off_n += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.cam_cell.I_off_p += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.cam_cell.I_g_on_n += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + g_tp.cam_cell.I_g_on_p += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300]; + + g_tp.dram.cell_a_w += curr_alpha * curr_Wmemcella_dram; + g_tp.dram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_dram; + g_tp.dram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_dram; + area_cell_dram += curr_alpha * curr_area_cell_dram; + asp_ratio_cell_dram += curr_alpha * curr_asp_ratio_cell_dram; + + g_tp.sram.cell_a_w += curr_alpha * curr_Wmemcella_sram; + g_tp.sram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_sram; + g_tp.sram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_sram; + area_cell_sram += curr_alpha * curr_area_cell_sram; + asp_ratio_cell_sram += curr_alpha * curr_asp_ratio_cell_sram; + + g_tp.cam.cell_a_w += curr_alpha * curr_Wmemcella_cam;//sheng + g_tp.cam.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_cam; + g_tp.cam.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_cam; + area_cell_cam += curr_alpha * curr_area_cell_cam; + asp_ratio_cell_cam += curr_alpha * curr_asp_ratio_cell_cam; + + //Sense amplifier latch Gm calculation + mobility_eff_periph_global += curr_alpha * mobility_eff[peri_global_tech_type]; + 
Vdsat_periph_global += curr_alpha * Vdsat[peri_global_tech_type]; + + //Empirical undifferetiated core/FU coefficient + g_tp.scaling_factor.logic_scaling_co_eff += curr_alpha * curr_logic_scaling_co_eff; + g_tp.scaling_factor.core_tx_density += curr_alpha * curr_core_tx_density; + g_tp.chip_layout_overhead += curr_alpha * curr_chip_layout_overhead; + g_tp.macro_layout_overhead += curr_alpha * curr_macro_layout_overhead; + g_tp.sckt_co_eff += curr_alpha * curr_sckt_co_eff; + } + + + //Currently we are not modeling the resistance/capacitance of poly anywhere. + //Continuous function (or date have been processed) does not need linear interpolation + g_tp.w_comp_inv_p1 = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + g_tp.w_comp_inv_n1 = 7.5 * g_ip->F_sz_um;//this was 6 micron for the 0.8 micron process + g_tp.w_comp_inv_p2 = 25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process + g_tp.w_comp_inv_n2 = 15 * g_ip->F_sz_um;//this was 12 micron for the 0.8 micron process + g_tp.w_comp_inv_p3 = 50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process + g_tp.w_comp_inv_n3 = 30 * g_ip->F_sz_um;//this was 24 micron for the 0.8 micron process + g_tp.w_eval_inv_p = 100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process + g_tp.w_eval_inv_n = 50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process + g_tp.w_comp_n = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process + g_tp.w_comp_p = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process + + g_tp.MIN_GAP_BET_P_AND_N_DIFFS = 5 * g_ip->F_sz_um; + g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS = 1.5 * g_ip->F_sz_um; + g_tp.HPOWERRAIL = 2 * g_ip->F_sz_um; + g_tp.cell_h_def = 50 * g_ip->F_sz_um; + g_tp.w_poly_contact = g_ip->F_sz_um; + g_tp.spacing_poly_to_contact = g_ip->F_sz_um; + g_tp.spacing_poly_to_poly = 1.5 * g_ip->F_sz_um; + g_tp.ram_wl_stitching_overhead_ = 7.5 * g_ip->F_sz_um; + + g_tp.min_w_nmos_ = 3 * g_ip->F_sz_um / 2; + 
g_tp.max_w_nmos_ = 100 * g_ip->F_sz_um; + g_tp.w_iso = 12.5*g_ip->F_sz_um;//was 10 micron for the 0.8 micron process + g_tp.w_sense_n = 3.75*g_ip->F_sz_um; // sense amplifier N-trans; was 3 micron for the 0.8 micron process + g_tp.w_sense_p = 7.5*g_ip->F_sz_um; // sense amplifier P-trans; was 6 micron for the 0.8 micron process + g_tp.w_sense_en = 5*g_ip->F_sz_um; // Sense enable transistor of the sense amplifier; was 4 micron for the 0.8 micron process + g_tp.w_nmos_b_mux = 6 * g_tp.min_w_nmos_; + g_tp.w_nmos_sa_mux = 6 * g_tp.min_w_nmos_; + + if (ram_cell_tech_type == comm_dram) + { + g_tp.max_w_nmos_dec = 8 * g_ip->F_sz_um; + g_tp.h_dec = 8; // in the unit of memory cell height + } + else + { + g_tp.max_w_nmos_dec = g_tp.max_w_nmos_; + g_tp.h_dec = 4; // in the unit of memory cell height + } + + g_tp.peri_global.C_overlap = 0.2 * g_tp.peri_global.C_g_ideal; + g_tp.sram_cell.C_overlap = 0.2 * g_tp.sram_cell.C_g_ideal; + g_tp.cam_cell.C_overlap = 0.2 * g_tp.cam_cell.C_g_ideal; + + g_tp.dram_acc.C_overlap = 0.2 * g_tp.dram_acc.C_g_ideal; + g_tp.dram_acc.R_nch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_n; + //g_tp.dram_acc.R_pch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_p; + + g_tp.dram_wl.C_overlap = 0.2 * g_tp.dram_wl.C_g_ideal; + + double gmn_sense_amp_latch = (mobility_eff_periph_global / 2) * g_tp.peri_global.C_ox * (g_tp.w_sense_n / g_tp.peri_global.l_elec) * Vdsat_periph_global; + double gmp_sense_amp_latch = gmp_to_gmn_multiplier_periph_global * gmn_sense_amp_latch; + g_tp.gm_sense_amp_latch = gmn_sense_amp_latch + gmp_sense_amp_latch; + + g_tp.dram.b_w = sqrt(area_cell_dram / (asp_ratio_cell_dram)); + g_tp.dram.b_h = asp_ratio_cell_dram * g_tp.dram.b_w; + g_tp.sram.b_w = sqrt(area_cell_sram / (asp_ratio_cell_sram)); + g_tp.sram.b_h = asp_ratio_cell_sram * g_tp.sram.b_w; + g_tp.cam.b_w = sqrt(area_cell_cam / (asp_ratio_cell_cam));//Sheng + g_tp.cam.b_h = asp_ratio_cell_cam * g_tp.cam.b_w; + + g_tp.dram.Vbitpre = g_tp.dram_cell_Vdd; + g_tp.sram.Vbitpre = 
vdd[ram_cell_tech_type]; + g_tp.cam.Vbitpre = vdd[ram_cell_tech_type];//Sheng + pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + g_tp.w_pmos_bl_precharge = 6 * pmos_to_nmos_sizing_r * g_tp.min_w_nmos_; + g_tp.w_pmos_bl_eq = pmos_to_nmos_sizing_r * g_tp.min_w_nmos_; + + + double wire_pitch [NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + wire_r_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + wire_c_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + horiz_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + vert_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + aspect_ratio[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + miller_value[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES], + ild_thickness[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES]; + + for (iter=0; iter<=1; ++iter) + { + // linear interpolation + if (iter == 0) + { + tech = tech_lo; + if (tech_lo == tech_hi) + { + curr_alpha = 1; + } + else + { + curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi); + } + } + else + { + tech = tech_hi; + if (tech_lo == tech_hi) + { + break; + } + else + { + curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi); + } + } + + if (tech == 90) + { + //Aggressive projections + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron + aspect_ratio[0][0] = 2.4; + wire_width = wire_pitch[0][0] / 2; //micron + wire_thickness = aspect_ratio[0][0] * wire_width;//micron + wire_spacing = wire_pitch[0][0] - wire_width;//micron + barrier_thickness = 0.01;//micron + dishing_thickness = 0;//micron + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron + ild_thickness[0][0] = 0.48;//micron + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 2.709; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; 
//F/micron + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], + vert_dielectric_constant[0][0], + fringe_cap);//F/micron. + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 2.4; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.48;//micron + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 2.709; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], + vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 2.7; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.96; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 2.709; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.008; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = 
wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.48; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 3.038; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], + vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.48; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 3.038; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], + vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 1.1; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 3.038; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM 
wordline/bitline + wire_pitch[1][3] = 2 * 0.09; + wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.09); + wire_r_per_micron[1][3] = 12 / 0.09; + } + else if (tech == 65) + { + //Aggressive projections + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[0][0] = 2.7; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.405; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 2.303; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] , + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 2.7; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.405; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 2.303; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], + vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 2.8; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, 
wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.81; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 2.303; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.006; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.405; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.734; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.405; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.734; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * 
g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.77; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.734; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.065; + wire_c_per_micron[1][3] = 52.5e-15 / (256 * 2 * 0.065); + wire_r_per_micron[1][3] = 12 / 0.065; + } + else if (tech == 45) + { + //Aggressive projections. + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.315; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.958; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] , + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 3.0; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + 
wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.315; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.958; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.63; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.958; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.004; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.315; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.46; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], 
vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.315; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.46; + vert_dielectric_constant[1][1] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.55; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.46; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.045; + wire_c_per_micron[1][3] = 37.5e-15 / (256 * 2 * 0.045); + wire_r_per_micron[1][3] = 12 / 0.045; + } + else if (tech == 32) + { + //Aggressive projections. 
+ wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.21; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.664; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0], + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 3.0; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.21; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.664; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um; + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.42; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.664; + 
vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.003; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.21; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.214; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + aspect_ratio[1][1] = 2.0; + wire_width = wire_pitch[1][1] / 2; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.21; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.214; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + 
dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.385; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.214; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.032;//micron + wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.032);//F/micron + wire_r_per_micron[1][3] = 12 / 0.032;//ohm/micron + } + else if (tech == 22) + { + //Aggressive projections. + wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.15; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.414; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0], + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global + wire_width = wire_pitch[0][1] / 2; + aspect_ratio[0][1] = 3.0; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + 
ild_thickness[0][1] = 0.15; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.414; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.3; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.414; + vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + +// //************************* +// wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][4] - wire_width; +// wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][5] - wire_width; +// wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, 
barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][6] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][6] - wire_width; +// wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); + //************************* + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.003; + dishing_thickness = 0; + alpha_scatter = 1.05; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.15; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 2.104; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, 
wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.15; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 2.104; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][2] = 0.275; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 2.104; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.022;//micron + wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.022);//F/micron + wire_r_per_micron[1][3] = 12 / 0.022;//ohm/micron + + //****************** +// wire_pitch[1][4] = 16 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][4] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, 
vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][5] = 24 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][5] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][6] = 32 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][6] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][6] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); + } + + else if (tech == 16) + { + //Aggressive projections. 
+ wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local + aspect_ratio[0][0] = 3.0; + wire_width = wire_pitch[0][0] / 2; + wire_thickness = aspect_ratio[0][0] * wire_width; + wire_spacing = wire_pitch[0][0] - wire_width; + barrier_thickness = 0; + dishing_thickness = 0; + alpha_scatter = 1; + wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][0] = 0.108; + miller_value[0][0] = 1.5; + horiz_dielectric_constant[0][0] = 1.202; + vert_dielectric_constant[0][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0], + fringe_cap); + + wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global + aspect_ratio[0][1] = 3.0; + wire_width = wire_pitch[0][1] / 2; + wire_thickness = aspect_ratio[0][1] * wire_width; + wire_spacing = wire_pitch[0][1] - wire_width; + wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][1] = 0.108; + miller_value[0][1] = 1.5; + horiz_dielectric_constant[0][1] = 1.202; + vert_dielectric_constant[0][1] = 3.9; + wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1], + fringe_cap); + + wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global + aspect_ratio[0][2] = 3.0; + wire_width = wire_pitch[0][2] / 2; + wire_thickness = aspect_ratio[0][2] * wire_width; + wire_spacing = wire_pitch[0][2] - wire_width; + wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[0][2] = 0.216; + miller_value[0][2] = 1.5; + horiz_dielectric_constant[0][2] = 1.202; 
+ vert_dielectric_constant[0][2] = 3.9; + wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2], + fringe_cap); + +// //************************* +// wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][4] - wire_width; +// wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][5] - wire_width; +// wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global +// aspect_ratio = 3.0; +// wire_width = wire_pitch[0][6] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[0][6] - wire_width; +// wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.3; +// wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, 
vert_dielectric_constant, +// fringe_cap); + //************************* + + //Conservative projections + wire_pitch[1][0] = 2.5 * g_ip->F_sz_um; + aspect_ratio[1][0] = 2.0; + wire_width = wire_pitch[1][0] / 2; + wire_thickness = aspect_ratio[1][0] * wire_width; + wire_spacing = wire_pitch[1][0] - wire_width; + barrier_thickness = 0.002; + dishing_thickness = 0; + alpha_scatter = 1.05; + wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][0] = 0.108; + miller_value[1][0] = 1.5; + horiz_dielectric_constant[1][0] = 1.998; + vert_dielectric_constant[1][0] = 3.9; + fringe_cap = 0.115e-15; + wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0], + fringe_cap); + + wire_pitch[1][1] = 4 * g_ip->F_sz_um; + wire_width = wire_pitch[1][1] / 2; + aspect_ratio[1][1] = 2.0; + wire_thickness = aspect_ratio[1][1] * wire_width; + wire_spacing = wire_pitch[1][1] - wire_width; + wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); + ild_thickness[1][1] = 0.108; + miller_value[1][1] = 1.5; + horiz_dielectric_constant[1][1] = 1.998; + vert_dielectric_constant[1][1] = 3.9; + wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1], + fringe_cap); + + wire_pitch[1][2] = 8 * g_ip->F_sz_um; + aspect_ratio[1][2] = 2.2; + wire_width = wire_pitch[1][2] / 2; + wire_thickness = aspect_ratio[1][2] * wire_width; + wire_spacing = wire_pitch[1][2] - wire_width; + dishing_thickness = 0.1 * wire_thickness; + wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width, + wire_thickness, barrier_thickness, dishing_thickness, 
alpha_scatter); + ild_thickness[1][2] = 0.198; + miller_value[1][2] = 1.5; + horiz_dielectric_constant[1][2] = 1.998; + vert_dielectric_constant[1][2] = 3.9; + wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing, + ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2], + fringe_cap); + //Nominal projections for commodity DRAM wordline/bitline + wire_pitch[1][3] = 2 * 0.016;//micron + wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.016);//F/micron + wire_r_per_micron[1][3] = 12 / 0.016;//ohm/micron + + //****************** +// wire_pitch[1][4] = 16 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][4] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][4] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][5] = 24 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][5] / 2; +// wire_thickness = aspect_ratio * wire_width; +// wire_spacing = wire_pitch[1][5] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); +// +// wire_pitch[1][6] = 32 * g_ip.F_sz_um; +// aspect_ratio = 2.2; +// wire_width = wire_pitch[1][6] / 2; +// wire_thickness = aspect_ratio * 
wire_width; +// wire_spacing = wire_pitch[1][6] - wire_width; +// dishing_thickness = 0.1 * wire_thickness; +// wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width, +// wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter); +// ild_thickness = 0.275; +// wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing, +// ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant, +// fringe_cap); + } + g_tp.wire_local.pitch += curr_alpha * wire_pitch[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.R_per_um += curr_alpha * wire_r_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.C_per_um += curr_alpha * wire_c_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.aspect_ratio += curr_alpha * aspect_ratio[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.miller_value += curr_alpha * miller_value[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + g_tp.wire_local.vert_dielectric_constant += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0]; + + g_tp.wire_inside_mat.pitch += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.R_per_um += curr_alpha* wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.C_per_um += curr_alpha* wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.aspect_ratio += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.ild_thickness += curr_alpha * 
ild_thickness[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.miller_value += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + g_tp.wire_inside_mat.vert_dielectric_constant += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_is_mat_type]; + + g_tp.wire_outside_mat.pitch += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.R_per_um += curr_alpha*wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.C_per_um += curr_alpha*wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.aspect_ratio += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.miller_value += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + g_tp.wire_outside_mat.vert_dielectric_constant += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_os_mat_type]; + + g_tp.unit_len_wire_del = g_tp.wire_inside_mat.R_per_um * g_tp.wire_inside_mat.C_per_um / 2; + + g_tp.sense_delay += curr_alpha *SENSE_AMP_D; + g_tp.sense_dy_power += curr_alpha *SENSE_AMP_P; +// g_tp.horiz_dielectric_constant += horiz_dielectric_constant; +// g_tp.vert_dielectric_constant += vert_dielectric_constant; +// g_tp.aspect_ratio += aspect_ratio; +// g_tp.miller_value += miller_value; +// g_tp.ild_thickness += ild_thickness; + + } + g_tp.fringe_cap = fringe_cap; + + double rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1); + double p_to_n_sizing_r = pmos_to_nmos_sz_ratio(); + double c_load = gate_C(g_tp.min_w_nmos_ * (1 + 
p_to_n_sizing_r), 0.0); + double tf = rd * c_load; + g_tp.kinv = horowitz(0, tf, 0.5, 0.5, RISE); + double KLOAD = 1; + c_load = KLOAD * (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(g_tp.min_w_nmos_ * p_to_n_sizing_r, PCH, 1, 1, g_tp.cell_h_def) + + gate_C(g_tp.min_w_nmos_ * 4 * (1 + p_to_n_sizing_r), 0.0)); + tf = rd * c_load; + g_tp.FO4 = horowitz(0, tf, 0.5, 0.5, RISE); +} + diff --git a/ext/mcpat/version.h b/ext/mcpat/version.h new file mode 100644 index 000000000..76d8c7508 --- /dev/null +++ b/ext/mcpat/version.h @@ -0,0 +1,40 @@ +/***************************************************************************** + * McPAT + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + +#ifndef VERSION_H_ /* include guard: the macros below are defined once per translation unit */ +#define VERSION_H_ + +#define VER_MAJOR 0 /* beta release */ +#define VER_MINOR 8 /* minor version number; presumably reported together with VER_MAJOR as "0.8" -- confirm at the use site */ + +#define VER_UPDATE "Aug, 2010" /* string literal naming the month/year of this update (NOTE(review): assumed to be the release date -- confirm) */ + +#endif /* VERSION_H_ */ diff --git a/ext/mcpat/xmlParser.cc b/ext/mcpat/xmlParser.cc new file mode 100644 index 000000000..5ac45edae --- /dev/null +++ b/ext/mcpat/xmlParser.cc @@ -0,0 +1,2891 @@ +/** + **************************************************************************** + * <P> XML.c - implementation file for basic XML parser written in ANSI C++ + * for portability. It works by using recursion and a node tree for breaking + * down the elements of an XML document. </P> + * + * @version V2.41 + * @author Frank Vanden Berghen + * + * NOTE: + * + * If you add "#define STRICT_PARSING", on the first line of this file + * the parser will see the following XML-stream: + * <a><b>some text</b><b>other text </a> + * as an error.
Otherwise, this string will be equivalent to: + * <a><b>some text</b><b>other text</b></a> + * + * NOTE: + * + * If you add "#define APPROXIMATE_PARSING" on the first line of this file + * the parser will see the following XML-stream: + * <data name="n1"> + * <data name="n2"> + * <data name="n3" /> + * as equivalent to the following XML-stream: + * <data name="n1" /> + * <data name="n2" /> + * <data name="n3" /> + * This can be useful for badly-formed XML-streams but prevents the use + * of the following XML-stream (problem is: tags at contiguous levels + * have the same names): + * <data name="n1"> + * <data name="n2"> + * <data name="n3" /> + * </data> + * </data> + * + * NOTE: + * + * If you add "#define _XMLPARSER_NO_MESSAGEBOX_" on the first line of this file + * the "openFileHelper" function will always display error messages inside the + * console instead of inside a message-box-window. Message-box-windows are + * available on windows 9x/NT/2000/XP/Vista only. + * + * The following license terms for the "XMLParser library from Business-Insight" apply to projects + * that are in some way related to + * the "mcpat project", including applications + * using "mcpat project" and tools developed + * for enhancing "mcpat project". All other projects + * (not related to "mcpat project") have to use the "XMLParser library from Business-Insight" + * code under the Aladdin Free Public License (AFPL) + * See the file "AFPL-license.txt" for more informations about the AFPL license. + * (see http://www.artifex.com/downloads/doc/Public.htm for detailed AFPL terms) + * + * Redistribution and use of the "XMLParser library from Business-Insight" in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Frank Vanden Berghen nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Business-Insight ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Business-Insight BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Copyright (c) 2002, Business-Insight + * <a href="http://www.Business-Insight.com">Business-Insight</a> + * All rights reserved. 
+ * + **************************************************************************** + */ +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#include "xmlParser.h" +#ifdef _XMLWINDOWS +//#ifdef _DEBUG +//#define _CRTDBG_MAP_ALLOC +//#include <crtdbg.h> +//#endif +#define WIN32_LEAN_AND_MEAN +#include <Windows.h> // to have IsTextUnicode, MultiByteToWideChar, WideCharToMultiByte to handle unicode files + // to have "MessageBoxA" to display error messages for openFilHelper +#endif + +#include <memory.h> + +#include <cassert> +#include <cstdio> +#include <cstdlib> +#include <cstring> + +XMLCSTR XMLNode::getVersion() { return _CXML("v2.39"); } +void freeXMLString(XMLSTR t){if(t)free(t);} + +static XMLNode::XMLCharEncoding characterEncoding=XMLNode::char_encoding_UTF8; +static char guessWideCharChars=1, dropWhiteSpace=1, removeCommentsInMiddleOfText=1; + +inline int mmin( const int t1, const int t2 ) { return t1 < t2 ? t1 : t2; } + +// You can modify the initialization of the variable "XMLClearTags" below +// to change the clearTags that are currently recognized by the library. +// The number on the second columns is the length of the string inside the +// first column. The "<!DOCTYPE" declaration must be the second in the list. +// The "<!--" declaration must be the third in the list. +typedef struct { XMLCSTR lpszOpen; int openTagLen; XMLCSTR lpszClose;} ALLXMLClearTag; +static ALLXMLClearTag XMLClearTags[] = +{ + { _CXML("<![CDATA["),9, _CXML("]]>") }, + { _CXML("<!DOCTYPE"),9, _CXML(">") }, + { _CXML("<!--") ,4, _CXML("-->") }, + { _CXML("<PRE>") ,5, _CXML("</PRE>") }, +// { _CXML("<Script>") ,8, _CXML("</Script>")}, + { NULL ,0, NULL } +}; + +// You can modify the initialization of the variable "XMLEntities" below +// to change the character entities that are currently recognized by the library. +// The number on the second columns is the length of the string inside the +// first column. Additionally, the syntaxes " " and " " are recognized. 
+typedef struct { XMLCSTR s; int l; XMLCHAR c;} XMLCharacterEntity; +static XMLCharacterEntity XMLEntities[] = +{ + { _CXML("&" ), 5, _CXML('&' )}, + { _CXML("<" ), 4, _CXML('<' )}, + { _CXML(">" ), 4, _CXML('>' )}, + { _CXML("""), 6, _CXML('\"')}, + { _CXML("'"), 6, _CXML('\'')}, + { NULL , 0, '\0' } +}; + +// When rendering the XMLNode to a string (using the "createXMLString" function), +// you can ask for a beautiful formatting. This formatting is using the +// following indentation character: +#define INDENTCHAR _CXML('\t') + +// The following function parses the XML errors into a user friendly string. +// You can edit this to change the output language of the library to something else. +XMLCSTR XMLNode::getError(XMLError xerror) +{ + switch (xerror) + { + case eXMLErrorNone: return _CXML("No error"); + case eXMLErrorMissingEndTag: return _CXML("Warning: Unmatched end tag"); + case eXMLErrorNoXMLTagFound: return _CXML("Warning: No XML tag found"); + case eXMLErrorEmpty: return _CXML("Error: No XML data"); + case eXMLErrorMissingTagName: return _CXML("Error: Missing start tag name"); + case eXMLErrorMissingEndTagName: return _CXML("Error: Missing end tag name"); + case eXMLErrorUnmatchedEndTag: return _CXML("Error: Unmatched end tag"); + case eXMLErrorUnmatchedEndClearTag: return _CXML("Error: Unmatched clear tag end"); + case eXMLErrorUnexpectedToken: return _CXML("Error: Unexpected token found"); + case eXMLErrorNoElements: return _CXML("Error: No elements found"); + case eXMLErrorFileNotFound: return _CXML("Error: File not found"); + case eXMLErrorFirstTagNotFound: return _CXML("Error: First Tag not found"); + case eXMLErrorUnknownCharacterEntity:return _CXML("Error: Unknown character entity"); + case eXMLErrorCharacterCodeAbove255: return _CXML("Error: Character code above 255 is forbidden in MultiByte char mode."); + case eXMLErrorCharConversionError: return _CXML("Error: unable to convert between WideChar and MultiByte chars"); + case 
eXMLErrorCannotOpenWriteFile: return _CXML("Error: unable to open file for writing"); + case eXMLErrorCannotWriteFile: return _CXML("Error: cannot write into file"); + + case eXMLErrorBase64DataSizeIsNotMultipleOf4: return _CXML("Warning: Base64-string length is not a multiple of 4"); + case eXMLErrorBase64DecodeTruncatedData: return _CXML("Warning: Base64-string is truncated"); + case eXMLErrorBase64DecodeIllegalCharacter: return _CXML("Error: Base64-string contains an illegal character"); + case eXMLErrorBase64DecodeBufferTooSmall: return _CXML("Error: Base64 decode output buffer is too small"); + }; + return _CXML("Unknown"); +} + +///////////////////////////////////////////////////////////////////////// +// Here start the abstraction layer to be OS-independent // +///////////////////////////////////////////////////////////////////////// + +// Here is an abstraction layer to access some common string manipulation functions. +// The abstraction layer is currently working for gcc, Microsoft Visual Studio 6.0, +// Microsoft Visual Studio .NET, CC (sun compiler) and Borland C++. +// If you plan to "port" the library to a new system/compiler, all you have to do is +// to edit the following lines. +#ifdef XML_NO_WIDE_CHAR +char myIsTextWideChar(const void *b, int len) { return FALSE; } +#else + #if defined (UNDER_CE) || !defined(_XMLWINDOWS) + char myIsTextWideChar(const void *b, int len) // inspired by the Wine API: RtlIsTextUnicode + { +#ifdef sun + // for SPARC processors: wchar_t* buffers must always be alligned, otherwise it's a char* buffer. 
+ if ((((unsigned long)b)%sizeof(wchar_t))!=0) return FALSE; +#endif + const wchar_t *s=(const wchar_t*)b; + + // buffer too small: + if (len<(int)sizeof(wchar_t)) return FALSE; + + // odd length test + if (len&1) return FALSE; + + /* only checks the first 256 characters */ + len=mmin(256,len/sizeof(wchar_t)); + + // Check for the special byte order: + if (*((unsigned short*)s) == 0xFFFE) return TRUE; // IS_TEXT_UNICODE_REVERSE_SIGNATURE; + if (*((unsigned short*)s) == 0xFEFF) return TRUE; // IS_TEXT_UNICODE_SIGNATURE + + // checks for ASCII characters in the UNICODE stream + int i,stats=0; + for (i=0; i<len; i++) if (s[i]<=(unsigned short)255) stats++; + if (stats>len/2) return TRUE; + + // Check for UNICODE NULL chars + for (i=0; i<len; i++) if (!s[i]) return TRUE; + + return FALSE; + } + #else + char myIsTextWideChar(const void *b,int l) { return (char)IsTextUnicode((CONST LPVOID)b,l,NULL); }; + #endif +#endif + +#ifdef _XMLWINDOWS +// for Microsoft Visual Studio 6.0 and Microsoft Visual Studio .NET and Borland C++ Builder 6.0 + #ifdef _XMLWIDECHAR + wchar_t *myMultiByteToWideChar(const char *s, XMLNode::XMLCharEncoding ce) + { + int i; + if (ce==XMLNode::char_encoding_UTF8) i=(int)MultiByteToWideChar(CP_UTF8,0 ,s,-1,NULL,0); + else i=(int)MultiByteToWideChar(CP_ACP ,MB_PRECOMPOSED,s,-1,NULL,0); + if (i<0) return NULL; + wchar_t *d=(wchar_t *)malloc((i+1)*sizeof(XMLCHAR)); + if (ce==XMLNode::char_encoding_UTF8) i=(int)MultiByteToWideChar(CP_UTF8,0 ,s,-1,d,i); + else i=(int)MultiByteToWideChar(CP_ACP ,MB_PRECOMPOSED,s,-1,d,i); + d[i]=0; + return d; + } + static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) { return _wfopen(filename,mode); } + static inline int xstrlen(XMLCSTR c) { return (int)wcslen(c); } + static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return _wcsnicmp(c1,c2,l);} + static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return wcsncmp(c1,c2,l);} + static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return _wcsicmp(c1,c2); 
} + static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)wcsstr(c1,c2); } + static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)wcscpy(c1,c2); } + #else + char *myWideCharToMultiByte(const wchar_t *s) + { + UINT codePage=CP_ACP; if (characterEncoding==XMLNode::char_encoding_UTF8) codePage=CP_UTF8; + int i=(int)WideCharToMultiByte(codePage, // code page + 0, // performance and mapping flags + s, // wide-character string + -1, // number of chars in string + NULL, // buffer for new string + 0, // size of buffer + NULL, // default for unmappable chars + NULL // set when default char used + ); + if (i<0) return NULL; + char *d=(char*)malloc(i+1); + WideCharToMultiByte(codePage, // code page + 0, // performance and mapping flags + s, // wide-character string + -1, // number of chars in string + d, // buffer for new string + i, // size of buffer + NULL, // default for unmappable chars + NULL // set when default char used + ); + d[i]=0; + return d; + } + static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) { return fopen(filename,mode); } + static inline int xstrlen(XMLCSTR c) { return (int)strlen(c); } + #ifdef __BORLANDC__ + static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return strnicmp(c1,c2,l);} + static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return stricmp(c1,c2); } + #else + static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return _strnicmp(c1,c2,l);} + static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return _stricmp(c1,c2); } + #endif + static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return strncmp(c1,c2,l);} + static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)strstr(c1,c2); } + static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)strcpy(c1,c2); } + #endif +#else +// for gcc and CC + #ifdef XML_NO_WIDE_CHAR + char *myWideCharToMultiByte(const wchar_t *s) { return NULL; } + #else + char *myWideCharToMultiByte(const wchar_t *s) + { + const wchar_t 
*ss=s; + int i=(int)wcsrtombs(NULL,&ss,0,NULL); + if (i<0) return NULL; + char *d=(char *)malloc(i+1); + wcsrtombs(d,&s,i,NULL); + d[i]=0; + return d; + } + #endif + #ifdef _XMLWIDECHAR + wchar_t *myMultiByteToWideChar(const char *s, XMLNode::XMLCharEncoding ce) + { + const char *ss=s; + int i=(int)mbsrtowcs(NULL,&ss,0,NULL); + if (i<0) return NULL; + wchar_t *d=(wchar_t *)malloc((i+1)*sizeof(wchar_t)); + mbsrtowcs(d,&s,i,NULL); + d[i]=0; + return d; + } + int xstrlen(XMLCSTR c) { return wcslen(c); } + #ifdef sun + // for CC + #include <widec.h> + static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return wsncasecmp(c1,c2,l);} + static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return wsncmp(c1,c2,l);} + static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return wscasecmp(c1,c2); } + #else + // for gcc + static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return wcsncasecmp(c1,c2,l);} + static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return wcsncmp(c1,c2,l);} + static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return wcscasecmp(c1,c2); } + #endif + static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)wcsstr(c1,c2); } + static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)wcscpy(c1,c2); } + static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) + { + char *filenameAscii=myWideCharToMultiByte(filename); + FILE *f; + if (mode[0]==_CXML('r')) f=fopen(filenameAscii,"rb"); + else f=fopen(filenameAscii,"wb"); + free(filenameAscii); + return f; + } + #else + static inline FILE *xfopen(XMLCSTR filename,XMLCSTR mode) { return fopen(filename,mode); } + static inline int xstrlen(XMLCSTR c) { return strlen(c); } + static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { return strncasecmp(c1,c2,l);} + static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { return strncmp(c1,c2,l);} + static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return strcasecmp(c1,c2); } + static inline XMLSTR 
xstrstr(XMLCSTR c1, XMLCSTR c2) { return (XMLSTR)strstr(c1,c2); } + static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { return (XMLSTR)strcpy(c1,c2); } + #endif + static inline int _strnicmp(const char *c1,const char *c2, int l) { return strncasecmp(c1,c2,l);} +#endif + + +/////////////////////////////////////////////////////////////////////////////// +// the "xmltoc,xmltob,xmltoi,xmltol,xmltof,xmltoa" functions // +/////////////////////////////////////////////////////////////////////////////// +// These 6 functions are not used inside the XMLparser. +// There are only here as "convenience" functions for the user. +// If you don't need them, you can delete them without any trouble. +#ifdef _XMLWIDECHAR + #ifdef _XMLWINDOWS + // for Microsoft Visual Studio 6.0 and Microsoft Visual Studio .NET and Borland C++ Builder 6.0 + char xmltob(XMLCSTR t,int v){ if (t&&(*t)) return (char)_wtoi(t); return v; } + int xmltoi(XMLCSTR t,int v){ if (t&&(*t)) return _wtoi(t); return v; } + long xmltol(XMLCSTR t,long v){ if (t&&(*t)) return _wtol(t); return v; } + double xmltof(XMLCSTR t,double v){ if (t&&(*t)) wscanf(t, "%f", &v); /*v=_wtof(t);*/ return v; } + #else + #ifdef sun + // for CC + #include <widec.h> + char xmltob(XMLCSTR t,int v){ if (t) return (char)wstol(t,NULL,10); return v; } + int xmltoi(XMLCSTR t,int v){ if (t) return (int)wstol(t,NULL,10); return v; } + long xmltol(XMLCSTR t,long v){ if (t) return wstol(t,NULL,10); return v; } + #else + // for gcc + char xmltob(XMLCSTR t,int v){ if (t) return (char)wcstol(t,NULL,10); return v; } + int xmltoi(XMLCSTR t,int v){ if (t) return (int)wcstol(t,NULL,10); return v; } + long xmltol(XMLCSTR t,long v){ if (t) return wcstol(t,NULL,10); return v; } + #endif + double xmltof(XMLCSTR t,double v){ if (t&&(*t)) wscanf(t, "%f", &v); /*v=_wtof(t);*/ return v; } + #endif +#else + char xmltob(XMLCSTR t,char v){ if (t&&(*t)) return (char)atoi(t); return v; } + int xmltoi(XMLCSTR t,int v){ if (t&&(*t)) return atoi(t); return v; } + long 
xmltol(XMLCSTR t,long v){ if (t&&(*t)) return atol(t); return v; } + double xmltof(XMLCSTR t,double v){ if (t&&(*t)) return atof(t); return v; } +#endif +XMLCSTR xmltoa(XMLCSTR t,XMLCSTR v){ if (t) return t; return v; } +XMLCHAR xmltoc(XMLCSTR t,XMLCHAR v){ if (t&&(*t)) return *t; return v; } + +///////////////////////////////////////////////////////////////////////// +// the "openFileHelper" function // +///////////////////////////////////////////////////////////////////////// + +// Since each application has its own way to report and deal with errors, you should modify & rewrite +// the following "openFileHelper" function to get an "error reporting mechanism" tailored to your needs. +XMLNode XMLNode::openFileHelper(XMLCSTR filename, XMLCSTR tag) +{ + // guess the value of the global parameter "characterEncoding" + // (the guess is based on the first 200 bytes of the file). + FILE *f=xfopen(filename,_CXML("rb")); + if (f) + { + char bb[205]; + int l=(int)fread(bb,1,200,f); + setGlobalOptions(guessCharEncoding(bb,l),guessWideCharChars,dropWhiteSpace,removeCommentsInMiddleOfText); + fclose(f); + } + + // parse the file + XMLResults pResults; + XMLNode xnode=XMLNode::parseFile(filename,tag,&pResults); + + // display error message (if any) + if (pResults.error != eXMLErrorNone) + { + // create message + char message[2000],*s1=(char*)"",*s3=(char*)""; XMLCSTR s2=_CXML(""); + if (pResults.error==eXMLErrorFirstTagNotFound) { s1=(char*)"First Tag should be '"; s2=tag; s3=(char*)"'.\n"; } + sprintf(message, +#ifdef _XMLWIDECHAR + "XML Parsing error inside file '%S'.\n%S\nAt line %i, column %i.\n%s%S%s" +#else + "XML Parsing error inside file '%s'.\n%s\nAt line %i, column %i.\n%s%s%s" +#endif + ,filename,XMLNode::getError(pResults.error),pResults.nLine,pResults.nColumn,s1,s2,s3); + + // display message +#if defined(_XMLWINDOWS) && !defined(UNDER_CE) && !defined(_XMLPARSER_NO_MESSAGEBOX_) + MessageBoxA(NULL,message,"XML Parsing error",MB_OK|MB_ICONERROR|MB_TOPMOST); +#else + 
printf("%s",message); +#endif + exit(255); + } + return xnode; +} + +///////////////////////////////////////////////////////////////////////// +// Here start the core implementation of the XMLParser library // +///////////////////////////////////////////////////////////////////////// + +// You should normally not change anything below this point. + +#ifndef _XMLWIDECHAR +// If "characterEncoding=ascii" then we assume that all characters have the same length of 1 byte. +// If "characterEncoding=UTF8" then the characters have different lengths (from 1 byte to 4 bytes). +// If "characterEncoding=ShiftJIS" then the characters have different lengths (from 1 byte to 2 bytes). +// This table is used as lookup-table to know the length of a character (in byte) based on the +// content of the first byte of the character. +// (note: if you modify this, you must always have XML_utf8ByteTable[0]=0 ). +static const char XML_utf8ByteTable[256] = +{ + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70 End of ASCII range + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x80 0x80 to 0xc1 invalid + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x90 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xa0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xb0 + 1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0 0xc2 to 0xdf 2 byte + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,// 0xe0 0xe0 to 0xef 3 byte + 4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid +}; +static const char XML_legacyByteTable[256] = +{ + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; +static const char XML_sjisByteTable[256] = +{ + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70 + 1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x80 0x81 to 0x9F 2 bytes + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x90 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xa0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xb0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xc0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0xd0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xe0 0xe0 to 0xef 2 bytes + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // 0xf0 +}; +static const char XML_gb2312ByteTable[256] = +{ +// 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x80 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x90 + 1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xa0 0xa1 to 0xf7 2 bytes + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xb0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xe0 + 
2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1 // 0xf0 +}; +static const char XML_gbk_big5_ByteTable[256] = +{ + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x00 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x10 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x20 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x30 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x40 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x50 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x60 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,// 0x70 + 1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x80 0x81 to 0xfe 2 bytes + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0x90 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xa0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xb0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xc0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xd0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,// 0xe0 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1 // 0xf0 +}; +static const char *XML_ByteTable=(const char *)XML_utf8ByteTable; // the default is "characterEncoding=XMLNode::encoding_UTF8" +#endif + + +XMLNode XMLNode::emptyXMLNode; +XMLClear XMLNode::emptyXMLClear={ NULL, NULL, NULL}; +XMLAttribute XMLNode::emptyXMLAttribute={ NULL, NULL}; + +// Enumeration used to decipher what type a token is +typedef enum XMLTokenTypeTag +{ + eTokenText = 0, + eTokenQuotedText, + eTokenTagStart, /* "<" */ + eTokenTagEnd, /* "</" */ + eTokenCloseTag, /* ">" */ + eTokenEquals, /* "=" */ + eTokenDeclaration, /* "<?" 
*/ + eTokenShortHandClose, /* "/>" */ + eTokenClear, + eTokenError +} XMLTokenType; + +// Main structure used for parsing XML +typedef struct XML +{ + XMLCSTR lpXML; + XMLCSTR lpszText; + int nIndex,nIndexMissigEndTag; + enum XMLError error; + XMLCSTR lpEndTag; + int cbEndTag; + XMLCSTR lpNewElement; + int cbNewElement; + int nFirst; +} XML; + +typedef struct +{ + ALLXMLClearTag *pClr; + XMLCSTR pStr; +} NextToken; + +// Enumeration used when parsing attributes +typedef enum Attrib +{ + eAttribName = 0, + eAttribEquals, + eAttribValue +} Attrib; + +// Enumeration used when parsing elements to dictate whether we are currently +// inside a tag +typedef enum Status +{ + eInsideTag = 0, + eOutsideTag +} Status; + +XMLError XMLNode::writeToFile(XMLCSTR filename, const char *encoding, char nFormat) const +{ + if (!d) return eXMLErrorNone; + FILE *f=xfopen(filename,_CXML("wb")); + if (!f) return eXMLErrorCannotOpenWriteFile; +#ifdef _XMLWIDECHAR + unsigned char h[2]={ 0xFF, 0xFE }; + if (!fwrite(h,2,1,f)) return eXMLErrorCannotWriteFile; + if ((!isDeclaration())&&((d->lpszName)||(!getChildNode().isDeclaration()))) + { + if (!fwrite(L"<?xml version=\"1.0\" encoding=\"utf-16\"?>\n",sizeof(wchar_t)*40,1,f)) + return eXMLErrorCannotWriteFile; + } +#else + if ((!isDeclaration())&&((d->lpszName)||(!getChildNode().isDeclaration()))) + { + if (characterEncoding==char_encoding_UTF8) + { + // header so that windows recognize the file as UTF-8: + unsigned char h[3]={0xEF,0xBB,0xBF}; if (!fwrite(h,3,1,f)) return eXMLErrorCannotWriteFile; + encoding="utf-8"; + } else if (characterEncoding==char_encoding_ShiftJIS) encoding="SHIFT-JIS"; + + if (!encoding) encoding="ISO-8859-1"; + if (fprintf(f,"<?xml version=\"1.0\" encoding=\"%s\"?>\n",encoding)<0) return eXMLErrorCannotWriteFile; + } else + { + if (characterEncoding==char_encoding_UTF8) + { + unsigned char h[3]={0xEF,0xBB,0xBF}; if (!fwrite(h,3,1,f)) return eXMLErrorCannotWriteFile; + } + } +#endif + int i; + XMLSTR 
t=createXMLString(nFormat,&i); + if (!fwrite(t,sizeof(XMLCHAR)*i,1,f)) return eXMLErrorCannotWriteFile; + if (fclose(f)!=0) return eXMLErrorCannotWriteFile; + free(t); + return eXMLErrorNone; +} + +// Duplicate a given string. +XMLSTR stringDup(XMLCSTR lpszData, int cbData) +{ + if (lpszData==NULL) return NULL; + + XMLSTR lpszNew; + if (cbData==-1) cbData=(int)xstrlen(lpszData); + lpszNew = (XMLSTR)malloc((cbData+1) * sizeof(XMLCHAR)); + if (lpszNew) + { + memcpy(lpszNew, lpszData, (cbData) * sizeof(XMLCHAR)); + lpszNew[cbData] = (XMLCHAR)NULL; + } + return lpszNew; +} + +XMLSTR ToXMLStringTool::toXMLUnSafe(XMLSTR dest,XMLCSTR source) +{ + XMLSTR dd=dest; + XMLCHAR ch; + XMLCharacterEntity *entity; + while ((ch=*source)) + { + entity=XMLEntities; + do + { + if (ch==entity->c) {xstrcpy(dest,entity->s); dest+=entity->l; source++; goto out_of_loop1; } + entity++; + } while(entity->s); +#ifdef _XMLWIDECHAR + *(dest++)=*(source++); +#else + switch(XML_ByteTable[(unsigned char)ch]) + { + case 4: *(dest++)=*(source++); + case 3: *(dest++)=*(source++); + case 2: *(dest++)=*(source++); + case 1: *(dest++)=*(source++); + } +#endif +out_of_loop1: + ; + } + *dest=0; + return dd; +} + +// private (used while rendering): +int ToXMLStringTool::lengthXMLString(XMLCSTR source) +{ + int r=0; + XMLCharacterEntity *entity; + XMLCHAR ch; + while ((ch=*source)) + { + entity=XMLEntities; + do + { + if (ch==entity->c) { r+=entity->l; source++; goto out_of_loop1; } + entity++; + } while(entity->s); +#ifdef _XMLWIDECHAR + r++; source++; +#else + ch=XML_ByteTable[(unsigned char)ch]; r+=ch; source+=ch; +#endif +out_of_loop1: + ; + } + return r; +} + +ToXMLStringTool::~ToXMLStringTool(){ freeBuffer(); } +void ToXMLStringTool::freeBuffer(){ if (buf) free(buf); buf=NULL; buflen=0; } +XMLSTR ToXMLStringTool::toXML(XMLCSTR source) +{ + int l=lengthXMLString(source)+1; + if (l>buflen) { buflen=l; buf=(XMLSTR)realloc(buf,l*sizeof(XMLCHAR)); } + return toXMLUnSafe(buf,source); +} + +// private: 
+XMLSTR fromXMLString(XMLCSTR s, int lo, XML *pXML) +{ + // This function is the opposite of the function "toXMLString". It decodes the escape + // sequences &, ", ', <, > and replace them by the characters + // &,",',<,>. This function is used internally by the XML Parser. All the calls to + // the XML library will always gives you back "decoded" strings. + // + // in: string (s) and length (lo) of string + // out: new allocated string converted from xml + if (!s) return NULL; + + int ll=0,j; + XMLSTR d; + XMLCSTR ss=s; + XMLCharacterEntity *entity; + while ((lo>0)&&(*s)) + { + if (*s==_CXML('&')) + { + if ((lo>2)&&(s[1]==_CXML('#'))) + { + s+=2; lo-=2; + if ((*s==_CXML('X'))||(*s==_CXML('x'))) { s++; lo--; } + while ((*s)&&(*s!=_CXML(';'))&&((lo--)>0)) s++; + if (*s!=_CXML(';')) + { + pXML->error=eXMLErrorUnknownCharacterEntity; + return NULL; + } + s++; lo--; + } else + { + entity=XMLEntities; + do + { + if ((lo>=entity->l)&&(xstrnicmp(s,entity->s,entity->l)==0)) { s+=entity->l; lo-=entity->l; break; } + entity++; + } while(entity->s); + if (!entity->s) + { + pXML->error=eXMLErrorUnknownCharacterEntity; + return NULL; + } + } + } else + { +#ifdef _XMLWIDECHAR + s++; lo--; +#else + j=XML_ByteTable[(unsigned char)*s]; s+=j; lo-=j; ll+=j-1; +#endif + } + ll++; + } + + d=(XMLSTR)malloc((ll+1)*sizeof(XMLCHAR)); + s=d; + while (ll-->0) + { + if (*ss==_CXML('&')) + { + if (ss[1]==_CXML('#')) + { + ss+=2; j=0; + if ((*ss==_CXML('X'))||(*ss==_CXML('x'))) + { + ss++; + while (*ss!=_CXML(';')) + { + if ((*ss>=_CXML('0'))&&(*ss<=_CXML('9'))) j=(j<<4)+*ss-_CXML('0'); + else if ((*ss>=_CXML('A'))&&(*ss<=_CXML('F'))) j=(j<<4)+*ss-_CXML('A')+10; + else if ((*ss>=_CXML('a'))&&(*ss<=_CXML('f'))) j=(j<<4)+*ss-_CXML('a')+10; + else { free((void*)s); pXML->error=eXMLErrorUnknownCharacterEntity;return NULL;} + ss++; + } + } else + { + while (*ss!=_CXML(';')) + { + if ((*ss>=_CXML('0'))&&(*ss<=_CXML('9'))) j=(j*10)+*ss-_CXML('0'); + else { free((void*)s); 
pXML->error=eXMLErrorUnknownCharacterEntity;return NULL;} + ss++; + } + } +#ifndef _XMLWIDECHAR + if (j>255) { free((void*)s); pXML->error=eXMLErrorCharacterCodeAbove255;return NULL;} +#endif + (*d++)=(XMLCHAR)j; ss++; + } else + { + entity=XMLEntities; + do + { + if (xstrnicmp(ss,entity->s,entity->l)==0) { *(d++)=entity->c; ss+=entity->l; break; } + entity++; + } while(entity->s); + } + } else + { +#ifdef _XMLWIDECHAR + *(d++)=*(ss++); +#else + switch(XML_ByteTable[(unsigned char)*ss]) + { + case 4: *(d++)=*(ss++); ll--; + case 3: *(d++)=*(ss++); ll--; + case 2: *(d++)=*(ss++); ll--; + case 1: *(d++)=*(ss++); + } +#endif + } + } + *d=0; + return (XMLSTR)s; +} + +#define XML_isSPACECHAR(ch) ((ch==_CXML('\n'))||(ch==_CXML(' '))||(ch== _CXML('\t'))||(ch==_CXML('\r'))) + +// private: +char myTagCompare(XMLCSTR cclose, XMLCSTR copen) +// !!!! WARNING strange convention&: +// return 0 if equals +// return 1 if different +{ + if (!cclose) return 1; + int l=(int)xstrlen(cclose); + if (xstrnicmp(cclose, copen, l)!=0) return 1; + const XMLCHAR c=copen[l]; + if (XML_isSPACECHAR(c)|| + (c==_CXML('/' ))|| + (c==_CXML('<' ))|| + (c==_CXML('>' ))|| + (c==_CXML('=' ))) return 0; + return 1; +} + +// Obtain the next character from the string. +static inline XMLCHAR getNextChar(XML *pXML) +{ + XMLCHAR ch = pXML->lpXML[pXML->nIndex]; +#ifdef _XMLWIDECHAR + if (ch!=0) pXML->nIndex++; +#else + pXML->nIndex+=XML_ByteTable[(unsigned char)ch]; +#endif + return ch; +} + +// Find the next token in a string. +// pcbToken contains the number of characters that have been read. 
+static NextToken GetNextToken(XML *pXML, int *pcbToken, enum XMLTokenTypeTag *pType) +{ + NextToken result; + XMLCHAR ch; + XMLCHAR chTemp; + int indexStart,nFoundMatch,nIsText=FALSE; + result.pClr=NULL; // prevent warning + + // Find next non-white space character + do { indexStart=pXML->nIndex; ch=getNextChar(pXML); } while XML_isSPACECHAR(ch); + + if (ch) + { + // Cache the current string pointer + result.pStr = &pXML->lpXML[indexStart]; + + // First check whether the token is in the clear tag list (meaning it + // does not need formatting). + ALLXMLClearTag *ctag=XMLClearTags; + do + { + if (xstrncmp(ctag->lpszOpen, result.pStr, ctag->openTagLen)==0) + { + result.pClr=ctag; + pXML->nIndex+=ctag->openTagLen-1; + *pType=eTokenClear; + return result; + } + ctag++; + } while(ctag->lpszOpen); + + // If we didn't find a clear tag then check for standard tokens + switch(ch) + { + // Check for quotes + case _CXML('\''): + case _CXML('\"'): + // Type of token + *pType = eTokenQuotedText; + chTemp = ch; + + // Set the size + nFoundMatch = FALSE; + + // Search through the string to find a matching quote + while((ch = getNextChar(pXML))) + { + if (ch==chTemp) { nFoundMatch = TRUE; break; } + if (ch==_CXML('<')) break; + } + + // If we failed to find a matching quote + if (nFoundMatch == FALSE) + { + pXML->nIndex=indexStart+1; + nIsText=TRUE; + break; + } + +// 4.02.2002 +// if (FindNonWhiteSpace(pXML)) pXML->nIndex--; + + break; + + // Equals (used with attribute values) + case _CXML('='): + *pType = eTokenEquals; + break; + + // Close tag + case _CXML('>'): + *pType = eTokenCloseTag; + break; + + // Check for tag start and tag end + case _CXML('<'): + + // Peek at the next character to see if we have an end tag '</', + // or an xml declaration '<?' + chTemp = pXML->lpXML[pXML->nIndex]; + + // If we have a tag end... 
+ if (chTemp == _CXML('/')) + { + // Set the type and ensure we point at the next character + getNextChar(pXML); + *pType = eTokenTagEnd; + } + + // If we have an XML declaration tag + else if (chTemp == _CXML('?')) + { + + // Set the type and ensure we point at the next character + getNextChar(pXML); + *pType = eTokenDeclaration; + } + + // Otherwise we must have a start tag + else + { + *pType = eTokenTagStart; + } + break; + + // Check to see if we have a short hand type end tag ('/>'). + case _CXML('/'): + + // Peek at the next character to see if we have a short end tag '/>' + chTemp = pXML->lpXML[pXML->nIndex]; + + // If we have a short hand end tag... + if (chTemp == _CXML('>')) + { + // Set the type and ensure we point at the next character + getNextChar(pXML); + *pType = eTokenShortHandClose; + break; + } + + // If we haven't found a short hand closing tag then drop into the + // text process + + // Other characters + default: + nIsText = TRUE; + } + + // If this is a TEXT node + if (nIsText) + { + // Indicate we are dealing with text + *pType = eTokenText; + while((ch = getNextChar(pXML))) + { + if XML_isSPACECHAR(ch) + { + indexStart++; break; + + } else if (ch==_CXML('/')) + { + // If we find a slash then this maybe text or a short hand end tag + // Peek at the next character to see it we have short hand end tag + ch=pXML->lpXML[pXML->nIndex]; + // If we found a short hand end tag then we need to exit the loop + if (ch==_CXML('>')) { pXML->nIndex--; break; } + + } else if ((ch==_CXML('<'))||(ch==_CXML('>'))||(ch==_CXML('='))) + { + pXML->nIndex--; break; + } + } + } + *pcbToken = pXML->nIndex-indexStart; + } else + { + // If we failed to obtain a valid character + *pcbToken = 0; + *pType = eTokenError; + result.pStr=NULL; + } + + return result; +} + +XMLCSTR XMLNode::updateName_WOSD(XMLSTR lpszName) +{ + if (!d) { free(lpszName); return NULL; } + if (d->lpszName&&(lpszName!=d->lpszName)) free((void*)d->lpszName); + d->lpszName=lpszName; + return 
lpszName; +} + +// private: +XMLNode::XMLNode(struct XMLNodeDataTag *p){ d=p; (p->ref_count)++; } +XMLNode::XMLNode(XMLNodeData *pParent, XMLSTR lpszName, char isDeclaration) +{ + d=(XMLNodeData*)malloc(sizeof(XMLNodeData)); + d->ref_count=1; + + d->lpszName=NULL; + d->nChild= 0; + d->nText = 0; + d->nClear = 0; + d->nAttribute = 0; + + d->isDeclaration = isDeclaration; + + d->pParent = pParent; + d->pChild= NULL; + d->pText= NULL; + d->pClear= NULL; + d->pAttribute= NULL; + d->pOrder= NULL; + + updateName_WOSD(lpszName); +} + +XMLNode XMLNode::createXMLTopNode_WOSD(XMLSTR lpszName, char isDeclaration) { return XMLNode(NULL,lpszName,isDeclaration); } +XMLNode XMLNode::createXMLTopNode(XMLCSTR lpszName, char isDeclaration) { return XMLNode(NULL,stringDup(lpszName),isDeclaration); } + +#define MEMORYINCREASE 50 + +static inline void myFree(void *p) { if (p) free(p); } +static inline void *myRealloc(void *p, int newsize, int memInc, int sizeofElem) +{ + if (p==NULL) { if (memInc) return malloc(memInc*sizeofElem); return malloc(sizeofElem); } + if ((memInc==0)||((newsize%memInc)==0)) p=realloc(p,(newsize+memInc)*sizeofElem); +// if (!p) +// { +// printf("XMLParser Error: Not enough memory! Aborting...\n"); exit(220); +// } + return p; +} + +// private: +XMLElementPosition XMLNode::findPosition(XMLNodeData *d, int index, XMLElementType xxtype) +{ + if (index<0) return -1; + int i=0,j=(int)((index<<2)+xxtype),*o=d->pOrder; while (o[i]!=j) i++; return i; +} + +// private: +// update "order" information when deleting a content of a XMLNode +int XMLNode::removeOrderElement(XMLNodeData *d, XMLElementType t, int index) +{ + int n=d->nChild+d->nText+d->nClear, *o=d->pOrder,i=findPosition(d,index,t); + memmove(o+i, o+i+1, (n-i)*sizeof(int)); + for (;i<n;i++) + if ((o[i]&3)==(int)t) o[i]-=4; + // We should normally do: + // d->pOrder=(int)realloc(d->pOrder,n*sizeof(int)); + // but we skip reallocation because it's too time consuming. 
+ // Anyway, at the end, it will be free'd completely at once. + return i; +} + +void *XMLNode::addToOrder(int memoryIncrease,int *_pos, int nc, void *p, int size, XMLElementType xtype) +{ + // in: *_pos is the position inside d->pOrder ("-1" means "EndOf") + // out: *_pos is the index inside p + p=myRealloc(p,(nc+1),memoryIncrease,size); + int n=d->nChild+d->nText+d->nClear; + d->pOrder=(int*)myRealloc(d->pOrder,n+1,memoryIncrease*3,sizeof(int)); + int pos=*_pos,*o=d->pOrder; + + if ((pos<0)||(pos>=n)) { *_pos=nc; o[n]=(int)((nc<<2)+xtype); return p; } + + int i=pos; + memmove(o+i+1, o+i, (n-i)*sizeof(int)); + + while ((pos<n)&&((o[pos]&3)!=(int)xtype)) pos++; + if (pos==n) { *_pos=nc; o[n]=(int)((nc<<2)+xtype); return p; } + + o[i]=o[pos]; + for (i=pos+1;i<=n;i++) if ((o[i]&3)==(int)xtype) o[i]+=4; + + *_pos=pos=o[pos]>>2; + memmove(((char*)p)+(pos+1)*size,((char*)p)+pos*size,(nc-pos)*size); + + return p; +} + +// Add a child node to the given element. +XMLNode XMLNode::addChild_priv(int memoryIncrease, XMLSTR lpszName, char isDeclaration, int pos) +{ + if (!lpszName) return emptyXMLNode; + d->pChild=(XMLNode*)addToOrder(memoryIncrease,&pos,d->nChild,d->pChild,sizeof(XMLNode),eNodeChild); + d->pChild[pos].d=NULL; + d->pChild[pos]=XMLNode(d,lpszName,isDeclaration); + d->nChild++; + return d->pChild[pos]; +} + +// Add an attribute to an element. +XMLAttribute *XMLNode::addAttribute_priv(int memoryIncrease,XMLSTR lpszName, XMLSTR lpszValuev) +{ + if (!lpszName) return &emptyXMLAttribute; + if (!d) { myFree(lpszName); myFree(lpszValuev); return &emptyXMLAttribute; } + int nc=d->nAttribute; + d->pAttribute=(XMLAttribute*)myRealloc(d->pAttribute,(nc+1),memoryIncrease,sizeof(XMLAttribute)); + XMLAttribute *pAttr=d->pAttribute+nc; + pAttr->lpszName = lpszName; + pAttr->lpszValue = lpszValuev; + d->nAttribute++; + return pAttr; +} + +// Add text to the element. 
+XMLCSTR XMLNode::addText_priv(int memoryIncrease, XMLSTR lpszValue, int pos) +{ + if (!lpszValue) return NULL; + if (!d) { myFree(lpszValue); return NULL; } + d->pText=(XMLCSTR*)addToOrder(memoryIncrease,&pos,d->nText,d->pText,sizeof(XMLSTR),eNodeText); + d->pText[pos]=lpszValue; + d->nText++; + return lpszValue; +} + +// Add clear (unformatted) text to the element. +XMLClear *XMLNode::addClear_priv(int memoryIncrease, XMLSTR lpszValue, XMLCSTR lpszOpen, XMLCSTR lpszClose, int pos) +{ + if (!lpszValue) return &emptyXMLClear; + if (!d) { myFree(lpszValue); return &emptyXMLClear; } + d->pClear=(XMLClear *)addToOrder(memoryIncrease,&pos,d->nClear,d->pClear,sizeof(XMLClear),eNodeClear); + XMLClear *pNewClear=d->pClear+pos; + pNewClear->lpszValue = lpszValue; + if (!lpszOpen) lpszOpen=XMLClearTags->lpszOpen; + if (!lpszClose) lpszClose=XMLClearTags->lpszClose; + pNewClear->lpszOpenTag = lpszOpen; + pNewClear->lpszCloseTag = lpszClose; + d->nClear++; + return pNewClear; +} + +// private: +// Parse a clear (unformatted) type node. 
+char XMLNode::parseClearTag(void *px, void *_pClear) +{ + XML *pXML=(XML *)px; + ALLXMLClearTag pClear=*((ALLXMLClearTag*)_pClear); + int cbTemp=0; + XMLCSTR lpszTemp=NULL; + XMLCSTR lpXML=&pXML->lpXML[pXML->nIndex]; + static XMLCSTR docTypeEnd=_CXML("]>"); + + // Find the closing tag + // Seems the <!DOCTYPE need a better treatment so lets handle it + if (pClear.lpszOpen==XMLClearTags[1].lpszOpen) + { + XMLCSTR pCh=lpXML; + while (*pCh) + { + if (*pCh==_CXML('<')) { pClear.lpszClose=docTypeEnd; lpszTemp=xstrstr(lpXML,docTypeEnd); break; } + else if (*pCh==_CXML('>')) { lpszTemp=pCh; break; } +#ifdef _XMLWIDECHAR + pCh++; +#else + pCh+=XML_ByteTable[(unsigned char)(*pCh)]; +#endif + } + } else lpszTemp=xstrstr(lpXML, pClear.lpszClose); + + if (lpszTemp) + { + // Cache the size and increment the index + cbTemp = (int)(lpszTemp - lpXML); + + pXML->nIndex += cbTemp+(int)xstrlen(pClear.lpszClose); + + // Add the clear node to the current element + addClear_priv(MEMORYINCREASE,stringDup(lpXML,cbTemp), pClear.lpszOpen, pClear.lpszClose,-1); + return 0; + } + + // If we failed to find the end tag + pXML->error = eXMLErrorUnmatchedEndClearTag; + return 1; +} + +void XMLNode::exactMemory(XMLNodeData *d) +{ + if (d->pOrder) d->pOrder=(int*)realloc(d->pOrder,(d->nChild+d->nText+d->nClear)*sizeof(int)); + if (d->pChild) d->pChild=(XMLNode*)realloc(d->pChild,d->nChild*sizeof(XMLNode)); + if (d->pAttribute) d->pAttribute=(XMLAttribute*)realloc(d->pAttribute,d->nAttribute*sizeof(XMLAttribute)); + if (d->pText) d->pText=(XMLCSTR*)realloc(d->pText,d->nText*sizeof(XMLSTR)); + if (d->pClear) d->pClear=(XMLClear *)realloc(d->pClear,d->nClear*sizeof(XMLClear)); +} + +char XMLNode::maybeAddTxT(void *pa, XMLCSTR tokenPStr) +{ + XML *pXML=(XML *)pa; + XMLCSTR lpszText=pXML->lpszText; + if (!lpszText) return 0; + if (dropWhiteSpace) while (XML_isSPACECHAR(*lpszText)&&(lpszText!=tokenPStr)) lpszText++; + int cbText = (int)(tokenPStr - lpszText); + if (!cbText) { pXML->lpszText=NULL; return 
0; } + if (dropWhiteSpace) { cbText--; while ((cbText)&&XML_isSPACECHAR(lpszText[cbText])) cbText--; cbText++; } + if (!cbText) { pXML->lpszText=NULL; return 0; } + XMLSTR lpt=fromXMLString(lpszText,cbText,pXML); + if (!lpt) return 1; + pXML->lpszText=NULL; + if (removeCommentsInMiddleOfText && d->nText && d->nClear) + { + // if the previous insertion was a comment (<!-- -->) AND + // if the previous previous insertion was a text then, delete the comment and append the text + int n=d->nChild+d->nText+d->nClear-1,*o=d->pOrder; + if (((o[n]&3)==eNodeClear)&&((o[n-1]&3)==eNodeText)) + { + int i=o[n]>>2; + if (d->pClear[i].lpszOpenTag==XMLClearTags[2].lpszOpen) + { + deleteClear(i); + i=o[n-1]>>2; + n=xstrlen(d->pText[i]); + int n2=xstrlen(lpt)+1; + d->pText[i]=(XMLSTR)realloc((void*)d->pText[i],(n+n2)*sizeof(XMLCHAR)); + if (!d->pText[i]) return 1; + memcpy((void*)(d->pText[i]+n),lpt,n2*sizeof(XMLCHAR)); + free(lpt); + return 0; + } + } + } + addText_priv(MEMORYINCREASE,lpt,-1); + return 0; +} +// private: +// Recursively parse an XML element. +int XMLNode::ParseXMLElement(void *pa) +{ + XML *pXML=(XML *)pa; + int cbToken; + enum XMLTokenTypeTag xtype; + NextToken token; + XMLCSTR lpszTemp=NULL; + int cbTemp=0; + char nDeclaration; + XMLNode pNew; + enum Status status; // inside or outside a tag + enum Attrib attrib = eAttribName; + + assert(pXML); + + // If this is the first call to the function + if (pXML->nFirst) + { + // Assume we are outside of a tag definition + pXML->nFirst = FALSE; + status = eOutsideTag; + } else + { + // If this is not the first call then we should only be called when inside a tag. 
+ status = eInsideTag; + } + + // Iterate through the tokens in the document + for(;;) + { + // Obtain the next token + token = GetNextToken(pXML, &cbToken, &xtype); + + if (xtype != eTokenError) + { + // Check the current status + switch(status) + { + + // If we are outside of a tag definition + case eOutsideTag: + + // Check what type of token we obtained + switch(xtype) + { + // If we have found text or quoted text + case eTokenText: + case eTokenCloseTag: /* '>' */ + case eTokenShortHandClose: /* '/>' */ + case eTokenQuotedText: + case eTokenEquals: + break; + + // If we found a start tag '<' and declarations '<?' + case eTokenTagStart: + case eTokenDeclaration: + + // Cache whether this new element is a declaration or not + nDeclaration = (xtype == eTokenDeclaration); + + // If we have node text then add this to the element + if (maybeAddTxT(pXML,token.pStr)) return FALSE; + + // Find the name of the tag + token = GetNextToken(pXML, &cbToken, &xtype); + + // Return an error if we couldn't obtain the next token or + // it wasnt text + if (xtype != eTokenText) + { + pXML->error = eXMLErrorMissingTagName; + return FALSE; + } + + // If we found a new element which is the same as this + // element then we need to pass this back to the caller.. + +#ifdef APPROXIMATE_PARSING + if (d->lpszName && + myTagCompare(d->lpszName, token.pStr) == 0) + { + // Indicate to the caller that it needs to create a + // new element. + pXML->lpNewElement = token.pStr; + pXML->cbNewElement = cbToken; + return TRUE; + } else +#endif + { + // If the name of the new element differs from the name of + // the current element we need to add the new element to + // the current one and recurse + pNew = addChild_priv(MEMORYINCREASE,stringDup(token.pStr,cbToken), nDeclaration,-1); + + while (!pNew.isEmpty()) + { + // Callself to process the new node. If we return + // FALSE this means we dont have any more + // processing to do... 
+ + if (!pNew.ParseXMLElement(pXML)) return FALSE; + else + { + // If the call to recurse this function + // evented in a end tag specified in XML then + // we need to unwind the calls to this + // function until we find the appropriate node + // (the element name and end tag name must + // match) + if (pXML->cbEndTag) + { + // If we are back at the root node then we + // have an unmatched end tag + if (!d->lpszName) + { + pXML->error=eXMLErrorUnmatchedEndTag; + return FALSE; + } + + // If the end tag matches the name of this + // element then we only need to unwind + // once more... + + if (myTagCompare(d->lpszName, pXML->lpEndTag)==0) + { + pXML->cbEndTag = 0; + } + + return TRUE; + } else + if (pXML->cbNewElement) + { + // If the call indicated a new element is to + // be created on THIS element. + + // If the name of this element matches the + // name of the element we need to create + // then we need to return to the caller + // and let it process the element. + + if (myTagCompare(d->lpszName, pXML->lpNewElement)==0) + { + return TRUE; + } + + // Add the new element and recurse + pNew = addChild_priv(MEMORYINCREASE,stringDup(pXML->lpNewElement,pXML->cbNewElement),0,-1); + pXML->cbNewElement = 0; + } + else + { + // If we didn't have a new element to create + pNew = emptyXMLNode; + + } + } + } + } + break; + + // If we found an end tag + case eTokenTagEnd: + + // If we have node text then add this to the element + if (maybeAddTxT(pXML,token.pStr)) return FALSE; + + // Find the name of the end tag + token = GetNextToken(pXML, &cbTemp, &xtype); + + // The end tag should be text + if (xtype != eTokenText) + { + pXML->error = eXMLErrorMissingEndTagName; + return FALSE; + } + lpszTemp = token.pStr; + + // After the end tag we should find a closing tag + token = GetNextToken(pXML, &cbToken, &xtype); + if (xtype != eTokenCloseTag) + { + pXML->error = eXMLErrorMissingEndTagName; + return FALSE; + } + pXML->lpszText=pXML->lpXML+pXML->nIndex; + + // We need to return to 
the previous caller. If the name + // of the tag cannot be found we need to keep returning to + // caller until we find a match + if (myTagCompare(d->lpszName, lpszTemp) != 0) +#ifdef STRICT_PARSING + { + pXML->error=eXMLErrorUnmatchedEndTag; + pXML->nIndexMissigEndTag=pXML->nIndex; + return FALSE; + } +#else + { + pXML->error=eXMLErrorMissingEndTag; + pXML->nIndexMissigEndTag=pXML->nIndex; + pXML->lpEndTag = lpszTemp; + pXML->cbEndTag = cbTemp; + } +#endif + + // Return to the caller + exactMemory(d); + return TRUE; + + // If we found a clear (unformatted) token + case eTokenClear: + // If we have node text then add this to the element + if (maybeAddTxT(pXML,token.pStr)) return FALSE; + if (parseClearTag(pXML, token.pClr)) return FALSE; + pXML->lpszText=pXML->lpXML+pXML->nIndex; + break; + + default: + break; + } + break; + + // If we are inside a tag definition we need to search for attributes + case eInsideTag: + + // Check what part of the attribute (name, equals, value) we + // are looking for. + switch(attrib) + { + // If we are looking for a new attribute + case eAttribName: + + // Check what the current token type is + switch(xtype) + { + // If the current type is text... + // Eg. 'attribute' + case eTokenText: + // Cache the token then indicate that we are next to + // look for the equals + lpszTemp = token.pStr; + cbTemp = cbToken; + attrib = eAttribEquals; + break; + + // If we found a closing tag... + // Eg. '>' + case eTokenCloseTag: + // We are now outside the tag + status = eOutsideTag; + pXML->lpszText=pXML->lpXML+pXML->nIndex; + break; + + // If we found a short hand '/>' closing tag then we can + // return to the caller + case eTokenShortHandClose: + exactMemory(d); + pXML->lpszText=pXML->lpXML+pXML->nIndex; + return TRUE; + + // Errors... + case eTokenQuotedText: /* '"SomeText"' */ + case eTokenTagStart: /* '<' */ + case eTokenTagEnd: /* '</' */ + case eTokenEquals: /* '=' */ + case eTokenDeclaration: /* '<?' 
*/ + case eTokenClear: + pXML->error = eXMLErrorUnexpectedToken; + return FALSE; + default: break; + } + break; + + // If we are looking for an equals + case eAttribEquals: + // Check what the current token type is + switch(xtype) + { + // If the current type is text... + // Eg. 'Attribute AnotherAttribute' + case eTokenText: + // Add the unvalued attribute to the list + addAttribute_priv(MEMORYINCREASE,stringDup(lpszTemp,cbTemp), NULL); + // Cache the token then indicate. We are next to + // look for the equals attribute + lpszTemp = token.pStr; + cbTemp = cbToken; + break; + + // If we found a closing tag 'Attribute >' or a short hand + // closing tag 'Attribute />' + case eTokenShortHandClose: + case eTokenCloseTag: + // If we are a declaration element '<?' then we need + // to remove extra closing '?' if it exists + pXML->lpszText=pXML->lpXML+pXML->nIndex; + + if (d->isDeclaration && + (lpszTemp[cbTemp-1]) == _CXML('?')) + { + cbTemp--; + if (d->pParent && d->pParent->pParent) xtype = eTokenShortHandClose; + } + + if (cbTemp) + { + // Add the unvalued attribute to the list + addAttribute_priv(MEMORYINCREASE,stringDup(lpszTemp,cbTemp), NULL); + } + + // If this is the end of the tag then return to the caller + if (xtype == eTokenShortHandClose) + { + exactMemory(d); + return TRUE; + } + + // We are now outside the tag + status = eOutsideTag; + break; + + // If we found the equals token... + // Eg. 'Attribute =' + case eTokenEquals: + // Indicate that we next need to search for the value + // for the attribute + attrib = eAttribValue; + break; + + // Errors... + case eTokenQuotedText: /* 'Attribute "InvalidAttr"'*/ + case eTokenTagStart: /* 'Attribute <' */ + case eTokenTagEnd: /* 'Attribute </' */ + case eTokenDeclaration: /* 'Attribute <?' 
*/ + case eTokenClear: + pXML->error = eXMLErrorUnexpectedToken; + return FALSE; + default: break; + } + break; + + // If we are looking for an attribute value + case eAttribValue: + // Check what the current token type is + switch(xtype) + { + // If the current type is text or quoted text... + // Eg. 'Attribute = "Value"' or 'Attribute = Value' or + // 'Attribute = 'Value''. + case eTokenText: + case eTokenQuotedText: + // If we are a declaration element '<?' then we need + // to remove extra closing '?' if it exists + if (d->isDeclaration && + (token.pStr[cbToken-1]) == _CXML('?')) + { + cbToken--; + } + + if (cbTemp) + { + // Add the valued attribute to the list + if (xtype==eTokenQuotedText) { token.pStr++; cbToken-=2; } + XMLSTR attrVal=(XMLSTR)token.pStr; + if (attrVal) + { + attrVal=fromXMLString(attrVal,cbToken,pXML); + if (!attrVal) return FALSE; + } + addAttribute_priv(MEMORYINCREASE,stringDup(lpszTemp,cbTemp),attrVal); + } + + // Indicate we are searching for a new attribute + attrib = eAttribName; + break; + + // Errors... + case eTokenTagStart: /* 'Attr = <' */ + case eTokenTagEnd: /* 'Attr = </' */ + case eTokenCloseTag: /* 'Attr = >' */ + case eTokenShortHandClose: /* "Attr = />" */ + case eTokenEquals: /* 'Attr = =' */ + case eTokenDeclaration: /* 'Attr = <?' */ + case eTokenClear: + pXML->error = eXMLErrorUnexpectedToken; + return FALSE; + break; + default: break; + } + } + } + } + // If we failed to obtain the next token + else + { + if ((!d->isDeclaration)&&(d->pParent)) + { +#ifdef STRICT_PARSING + pXML->error=eXMLErrorUnmatchedEndTag; +#else + pXML->error=eXMLErrorMissingEndTag; +#endif + pXML->nIndexMissigEndTag=pXML->nIndex; + } + maybeAddTxT(pXML,pXML->lpXML+pXML->nIndex); + return FALSE; + } + } +} + +// Count the number of lines and columns in an XML string. 
+static void CountLinesAndColumns(XMLCSTR lpXML, int nUpto, XMLResults *pResults) +{ + XMLCHAR ch; + assert(lpXML); + assert(pResults); + + struct XML xml={ lpXML,lpXML, 0, 0, eXMLErrorNone, NULL, 0, NULL, 0, TRUE }; + + pResults->nLine = 1; + pResults->nColumn = 1; + while (xml.nIndex<nUpto) + { + ch = getNextChar(&xml); + if (ch != _CXML('\n')) pResults->nColumn++; + else + { + pResults->nLine++; + pResults->nColumn=1; + } + } +} + +// Parse XML and return the root element. +XMLNode XMLNode::parseString(XMLCSTR lpszXML, XMLCSTR tag, XMLResults *pResults) +{ + if (!lpszXML) + { + if (pResults) + { + pResults->error=eXMLErrorNoElements; + pResults->nLine=0; + pResults->nColumn=0; + } + return emptyXMLNode; + } + + XMLNode xnode(NULL,NULL,FALSE); + struct XML xml={ lpszXML, lpszXML, 0, 0, eXMLErrorNone, NULL, 0, NULL, 0, TRUE }; + + // Create header element + xnode.ParseXMLElement(&xml); + enum XMLError error = xml.error; + if (!xnode.nChildNode()) error=eXMLErrorNoXMLTagFound; + if ((xnode.nChildNode()==1)&&(xnode.nElement()==1)) xnode=xnode.getChildNode(); // skip the empty node + + // If no error occurred + if ((error==eXMLErrorNone)||(error==eXMLErrorMissingEndTag)||(error==eXMLErrorNoXMLTagFound)) + { + XMLCSTR name=xnode.getName(); + if (tag&&(*tag)&&((!name)||(xstricmp(name,tag)))) + { + xnode=xnode.getChildNode(tag); + if (xnode.isEmpty()) + { + if (pResults) + { + pResults->error=eXMLErrorFirstTagNotFound; + pResults->nLine=0; + pResults->nColumn=0; + } + return emptyXMLNode; + } + } + } else + { + // Cleanup: this will destroy all the nodes + xnode = emptyXMLNode; + } + + + // If we have been given somewhere to place results + if (pResults) + { + pResults->error = error; + + // If we have an error + if (error!=eXMLErrorNone) + { + if (error==eXMLErrorMissingEndTag) xml.nIndex=xml.nIndexMissigEndTag; + // Find which line and column it starts on. 
+ CountLinesAndColumns(xml.lpXML, xml.nIndex, pResults); + } + } + return xnode; +} + +XMLNode XMLNode::parseFile(XMLCSTR filename, XMLCSTR tag, XMLResults *pResults) +{ + if (pResults) { pResults->nLine=0; pResults->nColumn=0; } + FILE *f=xfopen(filename,_CXML("rb")); + if (f==NULL) { if (pResults) pResults->error=eXMLErrorFileNotFound; return emptyXMLNode; } + fseek(f,0,SEEK_END); + int l=ftell(f),headerSz=0; + if (!l) { if (pResults) pResults->error=eXMLErrorEmpty; fclose(f); return emptyXMLNode; } + fseek(f,0,SEEK_SET); + unsigned char *buf=(unsigned char*)malloc(l+4); + l=fread(buf,1,l,f); + fclose(f); + buf[l]=0;buf[l+1]=0;buf[l+2]=0;buf[l+3]=0; +#ifdef _XMLWIDECHAR + if (guessWideCharChars) + { + if (!myIsTextWideChar(buf,l)) + { + XMLNode::XMLCharEncoding ce=XMLNode::char_encoding_legacy; + if ((buf[0]==0xef)&&(buf[1]==0xbb)&&(buf[2]==0xbf)) { headerSz=3; ce=XMLNode::char_encoding_UTF8; } + XMLSTR b2=myMultiByteToWideChar((const char*)(buf+headerSz),ce); + free(buf); buf=(unsigned char*)b2; headerSz=0; + } else + { + if ((buf[0]==0xef)&&(buf[1]==0xff)) headerSz=2; + if ((buf[0]==0xff)&&(buf[1]==0xfe)) headerSz=2; + } + } +#else + if (guessWideCharChars) + { + if (myIsTextWideChar(buf,l)) + { + if ((buf[0]==0xef)&&(buf[1]==0xff)) headerSz=2; + if ((buf[0]==0xff)&&(buf[1]==0xfe)) headerSz=2; + char *b2=myWideCharToMultiByte((const wchar_t*)(buf+headerSz)); + free(buf); buf=(unsigned char*)b2; headerSz=0; + } else + { + if ((buf[0]==0xef)&&(buf[1]==0xbb)&&(buf[2]==0xbf)) headerSz=3; + } + } +#endif + + if (!buf) { if (pResults) pResults->error=eXMLErrorCharConversionError; return emptyXMLNode; } + XMLNode x=parseString((XMLSTR)(buf+headerSz),tag,pResults); + free(buf); + return x; +} + +static inline void charmemset(XMLSTR dest,XMLCHAR c,int l) { while (l--) *(dest++)=c; } +// private: +// Creates an user friendly XML string from a given element with +// appropriate white space and carriage returns. 
+// +// This recurses through all subnodes then adds contents of the nodes to the +// string. +int XMLNode::CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker, int nFormat) +{ + int nResult = 0; + int cb=nFormat<0?0:nFormat; + int cbElement; + int nChildFormat=-1; + int nElementI=pEntry->nChild+pEntry->nText+pEntry->nClear; + int i,j; + if ((nFormat>=0)&&(nElementI==1)&&(pEntry->nText==1)&&(!pEntry->isDeclaration)) nFormat=-2; + + assert(pEntry); + +#define LENSTR(lpsz) (lpsz ? xstrlen(lpsz) : 0) + + // If the element has no name then assume this is the head node. + cbElement = (int)LENSTR(pEntry->lpszName); + + if (cbElement) + { + // "<elementname " + if (lpszMarker) + { + if (cb) charmemset(lpszMarker, INDENTCHAR, cb); + nResult = cb; + lpszMarker[nResult++]=_CXML('<'); + if (pEntry->isDeclaration) lpszMarker[nResult++]=_CXML('?'); + xstrcpy(&lpszMarker[nResult], pEntry->lpszName); + nResult+=cbElement; + lpszMarker[nResult++]=_CXML(' '); + + } else + { + nResult+=cbElement+2+cb; + if (pEntry->isDeclaration) nResult++; + } + + // Enumerate attributes and add them to the string + XMLAttribute *pAttr=pEntry->pAttribute; + for (i=0; i<pEntry->nAttribute; i++) + { + // "Attrib + cb = (int)LENSTR(pAttr->lpszName); + if (cb) + { + if (lpszMarker) xstrcpy(&lpszMarker[nResult], pAttr->lpszName); + nResult += cb; + // "Attrib=Value " + if (pAttr->lpszValue) + { + cb=(int)ToXMLStringTool::lengthXMLString(pAttr->lpszValue); + if (lpszMarker) + { + lpszMarker[nResult]=_CXML('='); + lpszMarker[nResult+1]=_CXML('"'); + if (cb) ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult+2],pAttr->lpszValue); + lpszMarker[nResult+cb+2]=_CXML('"'); + } + nResult+=cb+3; + } + if (lpszMarker) lpszMarker[nResult] = _CXML(' '); + nResult++; + } + pAttr++; + } + + if (pEntry->isDeclaration) + { + if (lpszMarker) + { + lpszMarker[nResult-1]=_CXML('?'); + lpszMarker[nResult]=_CXML('>'); + } + nResult++; + if (nFormat!=-1) + { + if (lpszMarker) lpszMarker[nResult]=_CXML('\n'); + nResult++; + 
} + } else + // If there are child nodes we need to terminate the start tag + if (nElementI) + { + if (lpszMarker) lpszMarker[nResult-1]=_CXML('>'); + if (nFormat>=0) + { + if (lpszMarker) lpszMarker[nResult]=_CXML('\n'); + nResult++; + } + } else nResult--; + } + + // Calculate the child format for when we recurse. This is used to + // determine the number of spaces used for prefixes. + if (nFormat!=-1) + { + if (cbElement&&(!pEntry->isDeclaration)) nChildFormat=nFormat+1; + else nChildFormat=nFormat; + } + + // Enumerate through remaining children + for (i=0; i<nElementI; i++) + { + j=pEntry->pOrder[i]; + switch((XMLElementType)(j&3)) + { + // Text nodes + case eNodeText: + { + // "Text" + XMLCSTR pChild=pEntry->pText[j>>2]; + cb = (int)ToXMLStringTool::lengthXMLString(pChild); + if (cb) + { + if (nFormat>=0) + { + if (lpszMarker) + { + charmemset(&lpszMarker[nResult],INDENTCHAR,nFormat+1); + ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult+nFormat+1],pChild); + lpszMarker[nResult+nFormat+1+cb]=_CXML('\n'); + } + nResult+=cb+nFormat+2; + } else + { + if (lpszMarker) ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult], pChild); + nResult += cb; + } + } + break; + } + + // Clear type nodes + case eNodeClear: + { + XMLClear *pChild=pEntry->pClear+(j>>2); + // "OpenTag" + cb = (int)LENSTR(pChild->lpszOpenTag); + if (cb) + { + if (nFormat!=-1) + { + if (lpszMarker) + { + charmemset(&lpszMarker[nResult], INDENTCHAR, nFormat+1); + xstrcpy(&lpszMarker[nResult+nFormat+1], pChild->lpszOpenTag); + } + nResult+=cb+nFormat+1; + } + else + { + if (lpszMarker)xstrcpy(&lpszMarker[nResult], pChild->lpszOpenTag); + nResult += cb; + } + } + + // "OpenTag Value" + cb = (int)LENSTR(pChild->lpszValue); + if (cb) + { + if (lpszMarker) xstrcpy(&lpszMarker[nResult], pChild->lpszValue); + nResult += cb; + } + + // "OpenTag Value CloseTag" + cb = (int)LENSTR(pChild->lpszCloseTag); + if (cb) + { + if (lpszMarker) xstrcpy(&lpszMarker[nResult], pChild->lpszCloseTag); + nResult += cb; + } + + if 
(nFormat!=-1) + { + if (lpszMarker) lpszMarker[nResult] = _CXML('\n'); + nResult++; + } + break; + } + + // Element nodes + case eNodeChild: + { + // Recursively add child nodes + nResult += CreateXMLStringR(pEntry->pChild[j>>2].d, lpszMarker ? lpszMarker + nResult : 0, nChildFormat); + break; + } + default: break; + } + } + + if ((cbElement)&&(!pEntry->isDeclaration)) + { + // If we have child entries we need to use long XML notation for + // closing the element - "<elementname>blah blah blah</elementname>" + if (nElementI) + { + // "</elementname>\0" + if (lpszMarker) + { + if (nFormat >=0) + { + charmemset(&lpszMarker[nResult], INDENTCHAR,nFormat); + nResult+=nFormat; + } + + lpszMarker[nResult]=_CXML('<'); lpszMarker[nResult+1]=_CXML('/'); + nResult += 2; + xstrcpy(&lpszMarker[nResult], pEntry->lpszName); + nResult += cbElement; + + lpszMarker[nResult]=_CXML('>'); + if (nFormat == -1) nResult++; + else + { + lpszMarker[nResult+1]=_CXML('\n'); + nResult+=2; + } + } else + { + if (nFormat>=0) nResult+=cbElement+4+nFormat; + else if (nFormat==-1) nResult+=cbElement+3; + else nResult+=cbElement+4; + } + } else + { + // If there are no children we can use shorthand XML notation - + // "<elementname/>" + // "/>\0" + if (lpszMarker) + { + lpszMarker[nResult]=_CXML('/'); lpszMarker[nResult+1]=_CXML('>'); + if (nFormat != -1) lpszMarker[nResult+2]=_CXML('\n'); + } + nResult += nFormat == -1 ? 2 : 3; + } + } + + return nResult; +} + +#undef LENSTR + +// Create an XML string +// @param int nFormat - 0 if no formatting is required +// otherwise nonzero for formatted text +// with carriage returns and indentation. +// @param int *pnSize - [out] pointer to the size of the +// returned string not including the +// NULL terminator. +// @return XMLSTR - Allocated XML string, you must free +// this with free(). 
+XMLSTR XMLNode::createXMLString(int nFormat, int *pnSize) const +{ + if (!d) { if (pnSize) *pnSize=0; return NULL; } + + XMLSTR lpszResult = NULL; + int cbStr; + + // Recursively Calculate the size of the XML string + if (!dropWhiteSpace) nFormat=0; + nFormat = nFormat ? 0 : -1; + cbStr = CreateXMLStringR(d, 0, nFormat); + // Alllocate memory for the XML string + the NULL terminator and + // create the recursively XML string. + lpszResult=(XMLSTR)malloc((cbStr+1)*sizeof(XMLCHAR)); + CreateXMLStringR(d, lpszResult, nFormat); + lpszResult[cbStr]=_CXML('\0'); + if (pnSize) *pnSize = cbStr; + return lpszResult; +} + +int XMLNode::detachFromParent(XMLNodeData *d) +{ + XMLNode *pa=d->pParent->pChild; + int i=0; + while (((void*)(pa[i].d))!=((void*)d)) i++; + d->pParent->nChild--; + if (d->pParent->nChild) memmove(pa+i,pa+i+1,(d->pParent->nChild-i)*sizeof(XMLNode)); + else { free(pa); d->pParent->pChild=NULL; } + return removeOrderElement(d->pParent,eNodeChild,i); +} + +XMLNode::~XMLNode() +{ + if (!d) return; + d->ref_count--; + emptyTheNode(0); +} +void XMLNode::deleteNodeContent() +{ + if (!d) return; + if (d->pParent) { detachFromParent(d); d->pParent=NULL; d->ref_count--; } + emptyTheNode(1); +} +void XMLNode::emptyTheNode(char force) +{ + XMLNodeData *dd=d; // warning: must stay this way! 
+ if ((dd->ref_count==0)||force) + { + if (d->pParent) detachFromParent(d); + int i; + XMLNode *pc; + for(i=0; i<dd->nChild; i++) + { + pc=dd->pChild+i; + pc->d->pParent=NULL; + pc->d->ref_count--; + pc->emptyTheNode(force); + } + myFree(dd->pChild); + for(i=0; i<dd->nText; i++) free((void*)dd->pText[i]); + myFree(dd->pText); + for(i=0; i<dd->nClear; i++) free((void*)dd->pClear[i].lpszValue); + myFree(dd->pClear); + for(i=0; i<dd->nAttribute; i++) + { + free((void*)dd->pAttribute[i].lpszName); + if (dd->pAttribute[i].lpszValue) free((void*)dd->pAttribute[i].lpszValue); + } + myFree(dd->pAttribute); + myFree(dd->pOrder); + myFree((void*)dd->lpszName); + dd->nChild=0; dd->nText=0; dd->nClear=0; dd->nAttribute=0; + dd->pChild=NULL; dd->pText=NULL; dd->pClear=NULL; dd->pAttribute=NULL; + dd->pOrder=NULL; dd->lpszName=NULL; dd->pParent=NULL; + } + if (dd->ref_count==0) + { + free(dd); + d=NULL; + } +} + +XMLNode& XMLNode::operator=( const XMLNode& A ) +{ + // shallow copy + if (this != &A) + { + if (d) { d->ref_count--; emptyTheNode(0); } + d=A.d; + if (d) (d->ref_count) ++ ; + } + return *this; +} + +XMLNode::XMLNode(const XMLNode &A) +{ + // shallow copy + d=A.d; + if (d) (d->ref_count)++ ; +} + +XMLNode XMLNode::deepCopy() const +{ + if (!d) return XMLNode::emptyXMLNode; + XMLNode x(NULL,stringDup(d->lpszName),d->isDeclaration); + XMLNodeData *p=x.d; + int n=d->nAttribute; + if (n) + { + p->nAttribute=n; p->pAttribute=(XMLAttribute*)malloc(n*sizeof(XMLAttribute)); + while (n--) + { + p->pAttribute[n].lpszName=stringDup(d->pAttribute[n].lpszName); + p->pAttribute[n].lpszValue=stringDup(d->pAttribute[n].lpszValue); + } + } + if (d->pOrder) + { + n=(d->nChild+d->nText+d->nClear)*sizeof(int); p->pOrder=(int*)malloc(n); memcpy(p->pOrder,d->pOrder,n); + } + n=d->nText; + if (n) + { + p->nText=n; p->pText=(XMLCSTR*)malloc(n*sizeof(XMLCSTR)); + while(n--) p->pText[n]=stringDup(d->pText[n]); + } + n=d->nClear; + if (n) + { + p->nClear=n; 
p->pClear=(XMLClear*)malloc(n*sizeof(XMLClear)); + while (n--) + { + p->pClear[n].lpszCloseTag=d->pClear[n].lpszCloseTag; + p->pClear[n].lpszOpenTag=d->pClear[n].lpszOpenTag; + p->pClear[n].lpszValue=stringDup(d->pClear[n].lpszValue); + } + } + n=d->nChild; + if (n) + { + p->nChild=n; p->pChild=(XMLNode*)malloc(n*sizeof(XMLNode)); + while (n--) + { + p->pChild[n].d=NULL; + p->pChild[n]=d->pChild[n].deepCopy(); + p->pChild[n].d->pParent=p; + } + } + return x; +} + +XMLNode XMLNode::addChild(XMLNode childNode, int pos) +{ + XMLNodeData *dc=childNode.d; + if ((!dc)||(!d)) return childNode; + if (!dc->lpszName) + { + // this is a root node: todo: correct fix + int j=pos; + while (dc->nChild) + { + addChild(dc->pChild[0],j); + if (pos>=0) j++; + } + return childNode; + } + if (dc->pParent) { if ((detachFromParent(dc)<=pos)&&(dc->pParent==d)) pos--; } else dc->ref_count++; + dc->pParent=d; +// int nc=d->nChild; +// d->pChild=(XMLNode*)myRealloc(d->pChild,(nc+1),memoryIncrease,sizeof(XMLNode)); + d->pChild=(XMLNode*)addToOrder(0,&pos,d->nChild,d->pChild,sizeof(XMLNode),eNodeChild); + d->pChild[pos].d=dc; + d->nChild++; + return childNode; +} + +void XMLNode::deleteAttribute(int i) +{ + if ((!d)||(i<0)||(i>=d->nAttribute)) return; + d->nAttribute--; + XMLAttribute *p=d->pAttribute+i; + free((void*)p->lpszName); + if (p->lpszValue) free((void*)p->lpszValue); + if (d->nAttribute) memmove(p,p+1,(d->nAttribute-i)*sizeof(XMLAttribute)); else { free(p); d->pAttribute=NULL; } +} + +void XMLNode::deleteAttribute(XMLAttribute *a){ if (a) deleteAttribute(a->lpszName); } +void XMLNode::deleteAttribute(XMLCSTR lpszName) +{ + int j=0; + getAttribute(lpszName,&j); + if (j) deleteAttribute(j-1); +} + +XMLAttribute *XMLNode::updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName,int i) +{ + if (!d) { if (lpszNewValue) free(lpszNewValue); if (lpszNewName) free(lpszNewName); return NULL; } + if (i>=d->nAttribute) + { + if (lpszNewName) return 
addAttribute_WOSD(lpszNewName,lpszNewValue); + return NULL; + } + XMLAttribute *p=d->pAttribute+i; + if (p->lpszValue&&p->lpszValue!=lpszNewValue) free((void*)p->lpszValue); + p->lpszValue=lpszNewValue; + if (lpszNewName&&p->lpszName!=lpszNewName) { free((void*)p->lpszName); p->lpszName=lpszNewName; }; + return p; +} + +XMLAttribute *XMLNode::updateAttribute_WOSD(XMLAttribute *newAttribute, XMLAttribute *oldAttribute) +{ + if (oldAttribute) return updateAttribute_WOSD((XMLSTR)newAttribute->lpszValue,(XMLSTR)newAttribute->lpszName,oldAttribute->lpszName); + return addAttribute_WOSD((XMLSTR)newAttribute->lpszName,(XMLSTR)newAttribute->lpszValue); +} + +XMLAttribute *XMLNode::updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName,XMLCSTR lpszOldName) +{ + int j=0; + getAttribute(lpszOldName,&j); + if (j) return updateAttribute_WOSD(lpszNewValue,lpszNewName,j-1); + else + { + if (lpszNewName) return addAttribute_WOSD(lpszNewName,lpszNewValue); + else return addAttribute_WOSD(stringDup(lpszOldName),lpszNewValue); + } +} + +int XMLNode::indexText(XMLCSTR lpszValue) const +{ + if (!d) return -1; + int i,l=d->nText; + if (!lpszValue) { if (l) return 0; return -1; } + XMLCSTR *p=d->pText; + for (i=0; i<l; i++) if (lpszValue==p[i]) return i; + return -1; +} + +void XMLNode::deleteText(int i) +{ + if ((!d)||(i<0)||(i>=d->nText)) return; + d->nText--; + XMLCSTR *p=d->pText+i; + free((void*)*p); + if (d->nText) memmove(p,p+1,(d->nText-i)*sizeof(XMLCSTR)); else { free(p); d->pText=NULL; } + removeOrderElement(d,eNodeText,i); +} + +void XMLNode::deleteText(XMLCSTR lpszValue) { deleteText(indexText(lpszValue)); } + +XMLCSTR XMLNode::updateText_WOSD(XMLSTR lpszNewValue, int i) +{ + if (!d) { if (lpszNewValue) free(lpszNewValue); return NULL; } + if (i>=d->nText) return addText_WOSD(lpszNewValue); + XMLCSTR *p=d->pText+i; + if (*p!=lpszNewValue) { free((void*)*p); *p=lpszNewValue; } + return lpszNewValue; +} + +XMLCSTR XMLNode::updateText_WOSD(XMLSTR lpszNewValue, XMLCSTR 
lpszOldValue) +{ + if (!d) { if (lpszNewValue) free(lpszNewValue); return NULL; } + int i=indexText(lpszOldValue); + if (i>=0) return updateText_WOSD(lpszNewValue,i); + return addText_WOSD(lpszNewValue); +} + +void XMLNode::deleteClear(int i) +{ + if ((!d)||(i<0)||(i>=d->nClear)) return; + d->nClear--; + XMLClear *p=d->pClear+i; + free((void*)p->lpszValue); + if (d->nClear) memmove(p,p+1,(d->nClear-i)*sizeof(XMLClear)); else { free(p); d->pClear=NULL; } + removeOrderElement(d,eNodeClear,i); +} + +int XMLNode::indexClear(XMLCSTR lpszValue) const +{ + if (!d) return -1; + int i,l=d->nClear; + if (!lpszValue) { if (l) return 0; return -1; } + XMLClear *p=d->pClear; + for (i=0; i<l; i++) if (lpszValue==p[i].lpszValue) return i; + return -1; +} + +void XMLNode::deleteClear(XMLCSTR lpszValue) { deleteClear(indexClear(lpszValue)); } +void XMLNode::deleteClear(XMLClear *a) { if (a) deleteClear(a->lpszValue); } + +XMLClear *XMLNode::updateClear_WOSD(XMLSTR lpszNewContent, int i) +{ + if (!d) { if (lpszNewContent) free(lpszNewContent); return NULL; } + if (i>=d->nClear) return addClear_WOSD(lpszNewContent); + XMLClear *p=d->pClear+i; + if (lpszNewContent!=p->lpszValue) { free((void*)p->lpszValue); p->lpszValue=lpszNewContent; } + return p; +} + +XMLClear *XMLNode::updateClear_WOSD(XMLSTR lpszNewContent, XMLCSTR lpszOldValue) +{ + if (!d) { if (lpszNewContent) free(lpszNewContent); return NULL; } + int i=indexClear(lpszOldValue); + if (i>=0) return updateClear_WOSD(lpszNewContent,i); + return addClear_WOSD(lpszNewContent); +} + +XMLClear *XMLNode::updateClear_WOSD(XMLClear *newP,XMLClear *oldP) +{ + if (oldP) return updateClear_WOSD((XMLSTR)newP->lpszValue,(XMLSTR)oldP->lpszValue); + return NULL; +} + +int XMLNode::nChildNode(XMLCSTR name) const +{ + if (!d) return 0; + int i,j=0,n=d->nChild; + XMLNode *pc=d->pChild; + for (i=0; i<n; i++) + { + if (xstricmp(pc->d->lpszName, name)==0) j++; + pc++; + } + return j; +} + +XMLNode XMLNode::getChildNode(XMLCSTR name, int *j) const 
+{ + if (!d) return emptyXMLNode; + int i=0,n=d->nChild; + if (j) i=*j; + XMLNode *pc=d->pChild+i; + for (; i<n; i++) + { + if (!xstricmp(pc->d->lpszName, name)) + { + if (j) *j=i+1; + return *pc; + } + pc++; + } + return emptyXMLNode; +} + +XMLNode XMLNode::getChildNode(XMLCSTR name, int j) const +{ + if (!d) return emptyXMLNode; + if (j>=0) + { + int i=0; + while (j-->0) getChildNode(name,&i); + return getChildNode(name,&i); + } + int i=d->nChild; + while (i--) if (!xstricmp(name,d->pChild[i].d->lpszName)) break; + if (i<0) return emptyXMLNode; + return getChildNode(i); +} + +XMLNode XMLNode::getChildNodeByPath(XMLCSTR _path, char createMissing, XMLCHAR sep) +{ + XMLSTR path=stringDup(_path); + XMLNode x=getChildNodeByPathNonConst(path,createMissing,sep); + if (path) free(path); + return x; +} + +XMLNode XMLNode::getChildNodeByPathNonConst(XMLSTR path, char createIfMissing, XMLCHAR sep) +{ + if ((!path)||(!(*path))) return *this; + XMLNode xn,xbase=*this; + XMLCHAR *tend1,sepString[2]; sepString[0]=sep; sepString[1]=0; + tend1=xstrstr(path,sepString); + while(tend1) + { + *tend1=0; + xn=xbase.getChildNode(path); + if (xn.isEmpty()) + { + if (createIfMissing) xn=xbase.addChild(path); + else { *tend1=sep; return XMLNode::emptyXMLNode; } + } + *tend1=sep; + xbase=xn; + path=tend1+1; + tend1=xstrstr(path,sepString); + } + xn=xbase.getChildNode(path); + if (xn.isEmpty()&&createIfMissing) xn=xbase.addChild(path); + return xn; +} + +XMLElementPosition XMLNode::positionOfText (int i) const { if (i>=d->nText ) i=d->nText-1; return findPosition(d,i,eNodeText ); } +XMLElementPosition XMLNode::positionOfClear (int i) const { if (i>=d->nClear) i=d->nClear-1; return findPosition(d,i,eNodeClear); } +XMLElementPosition XMLNode::positionOfChildNode(int i) const { if (i>=d->nChild) i=d->nChild-1; return findPosition(d,i,eNodeChild); } +XMLElementPosition XMLNode::positionOfText (XMLCSTR lpszValue) const { return positionOfText (indexText (lpszValue)); } +XMLElementPosition 
XMLNode::positionOfClear(XMLCSTR lpszValue) const { return positionOfClear(indexClear(lpszValue)); } +XMLElementPosition XMLNode::positionOfClear(XMLClear *a) const { if (a) return positionOfClear(a->lpszValue); return positionOfClear(); } +XMLElementPosition XMLNode::positionOfChildNode(XMLNode x) const +{ + if ((!d)||(!x.d)) return -1; + XMLNodeData *dd=x.d; + XMLNode *pc=d->pChild; + int i=d->nChild; + while (i--) if (pc[i].d==dd) return findPosition(d,i,eNodeChild); + return -1; +} +XMLElementPosition XMLNode::positionOfChildNode(XMLCSTR name, int count) const +{ + if (!name) return positionOfChildNode(count); + int j=0; + do { getChildNode(name,&j); if (j<0) return -1; } while (count--); + return findPosition(d,j-1,eNodeChild); +} + +XMLNode XMLNode::getChildNodeWithAttribute(XMLCSTR name,XMLCSTR attributeName,XMLCSTR attributeValue, int *k) const +{ + int i=0,j; + if (k) i=*k; + XMLNode x; + XMLCSTR t; + do + { + x=getChildNode(name,&i); + if (!x.isEmpty()) + { + if (attributeValue) + { + j=0; + do + { + t=x.getAttribute(attributeName,&j); + if (t&&(xstricmp(attributeValue,t)==0)) { if (k) *k=i; return x; } + } while (t); + } else + { + if (x.isAttributeSet(attributeName)) { if (k) *k=i; return x; } + } + } + } while (!x.isEmpty()); + return emptyXMLNode; +} + +// Find an attribute on an node. 
+XMLCSTR XMLNode::getAttribute(XMLCSTR lpszAttrib, int *j) const +{ + if (!d) return NULL; + int i=0,n=d->nAttribute; + if (j) i=*j; + XMLAttribute *pAttr=d->pAttribute+i; + for (; i<n; i++) + { + if (xstricmp(pAttr->lpszName, lpszAttrib)==0) + { + if (j) *j=i+1; + return pAttr->lpszValue; + } + pAttr++; + } + return NULL; +} + +char XMLNode::isAttributeSet(XMLCSTR lpszAttrib) const +{ + if (!d) return FALSE; + int i,n=d->nAttribute; + XMLAttribute *pAttr=d->pAttribute; + for (i=0; i<n; i++) + { + if (xstricmp(pAttr->lpszName, lpszAttrib)==0) + { + return TRUE; + } + pAttr++; + } + return FALSE; +} + +XMLCSTR XMLNode::getAttribute(XMLCSTR name, int j) const +{ + if (!d) return NULL; + int i=0; + while (j-->0) getAttribute(name,&i); + return getAttribute(name,&i); +} + +XMLNodeContents XMLNode::enumContents(int i) const +{ + XMLNodeContents c; + if (!d) { c.etype=eNodeNULL; return c; } + if (i<d->nAttribute) + { + c.etype=eNodeAttribute; + c.attrib=d->pAttribute[i]; + return c; + } + i-=d->nAttribute; + c.etype=(XMLElementType)(d->pOrder[i]&3); + i=(d->pOrder[i])>>2; + switch (c.etype) + { + case eNodeChild: c.child = d->pChild[i]; break; + case eNodeText: c.text = d->pText[i]; break; + case eNodeClear: c.clear = d->pClear[i]; break; + default: break; + } + return c; +} + +XMLCSTR XMLNode::getName() const { if (!d) return NULL; return d->lpszName; } +int XMLNode::nText() const { if (!d) return 0; return d->nText; } +int XMLNode::nChildNode() const { if (!d) return 0; return d->nChild; } +int XMLNode::nAttribute() const { if (!d) return 0; return d->nAttribute; } +int XMLNode::nClear() const { if (!d) return 0; return d->nClear; } +int XMLNode::nElement() const { if (!d) return 0; return d->nAttribute+d->nChild+d->nText+d->nClear; } +XMLClear XMLNode::getClear (int i) const { if ((!d)||(i>=d->nClear )) return emptyXMLClear; return d->pClear[i]; } +XMLAttribute XMLNode::getAttribute (int i) const { if ((!d)||(i>=d->nAttribute)) return emptyXMLAttribute; return 
d->pAttribute[i]; } +XMLCSTR XMLNode::getAttributeName (int i) const { if ((!d)||(i>=d->nAttribute)) return NULL; return d->pAttribute[i].lpszName; } +XMLCSTR XMLNode::getAttributeValue(int i) const { if ((!d)||(i>=d->nAttribute)) return NULL; return d->pAttribute[i].lpszValue; } +XMLCSTR XMLNode::getText (int i) const { if ((!d)||(i>=d->nText )) return NULL; return d->pText[i]; } +XMLNode XMLNode::getChildNode (int i) const { if ((!d)||(i>=d->nChild )) return emptyXMLNode; return d->pChild[i]; } +XMLNode XMLNode::getParentNode ( ) const { if ((!d)||(!d->pParent )) return emptyXMLNode; return XMLNode(d->pParent); } +char XMLNode::isDeclaration ( ) const { if (!d) return 0; return d->isDeclaration; } +char XMLNode::isEmpty ( ) const { return (d==NULL); } +XMLNode XMLNode::emptyNode ( ) { return XMLNode::emptyXMLNode; } + +XMLNode XMLNode::addChild(XMLCSTR lpszName, char isDeclaration, XMLElementPosition pos) + { return addChild_priv(0,stringDup(lpszName),isDeclaration,pos); } +XMLNode XMLNode::addChild_WOSD(XMLSTR lpszName, char isDeclaration, XMLElementPosition pos) + { return addChild_priv(0,lpszName,isDeclaration,pos); } +XMLAttribute *XMLNode::addAttribute(XMLCSTR lpszName, XMLCSTR lpszValue) + { return addAttribute_priv(0,stringDup(lpszName),stringDup(lpszValue)); } +XMLAttribute *XMLNode::addAttribute_WOSD(XMLSTR lpszName, XMLSTR lpszValuev) + { return addAttribute_priv(0,lpszName,lpszValuev); } +XMLCSTR XMLNode::addText(XMLCSTR lpszValue, XMLElementPosition pos) + { return addText_priv(0,stringDup(lpszValue),pos); } +XMLCSTR XMLNode::addText_WOSD(XMLSTR lpszValue, XMLElementPosition pos) + { return addText_priv(0,lpszValue,pos); } +XMLClear *XMLNode::addClear(XMLCSTR lpszValue, XMLCSTR lpszOpen, XMLCSTR lpszClose, XMLElementPosition pos) + { return addClear_priv(0,stringDup(lpszValue),lpszOpen,lpszClose,pos); } +XMLClear *XMLNode::addClear_WOSD(XMLSTR lpszValue, XMLCSTR lpszOpen, XMLCSTR lpszClose, XMLElementPosition pos) + { return 
addClear_priv(0,lpszValue,lpszOpen,lpszClose,pos); } +XMLCSTR XMLNode::updateName(XMLCSTR lpszName) + { return updateName_WOSD(stringDup(lpszName)); } +XMLAttribute *XMLNode::updateAttribute(XMLAttribute *newAttribute, XMLAttribute *oldAttribute) + { return updateAttribute_WOSD(stringDup(newAttribute->lpszValue),stringDup(newAttribute->lpszName),oldAttribute->lpszName); } +XMLAttribute *XMLNode::updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName,int i) + { return updateAttribute_WOSD(stringDup(lpszNewValue),stringDup(lpszNewName),i); } +XMLAttribute *XMLNode::updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName,XMLCSTR lpszOldName) + { return updateAttribute_WOSD(stringDup(lpszNewValue),stringDup(lpszNewName),lpszOldName); } +XMLCSTR XMLNode::updateText(XMLCSTR lpszNewValue, int i) + { return updateText_WOSD(stringDup(lpszNewValue),i); } +XMLCSTR XMLNode::updateText(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue) + { return updateText_WOSD(stringDup(lpszNewValue),lpszOldValue); } +XMLClear *XMLNode::updateClear(XMLCSTR lpszNewContent, int i) + { return updateClear_WOSD(stringDup(lpszNewContent),i); } +XMLClear *XMLNode::updateClear(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue) + { return updateClear_WOSD(stringDup(lpszNewValue),lpszOldValue); } +XMLClear *XMLNode::updateClear(XMLClear *newP,XMLClear *oldP) + { return updateClear_WOSD(stringDup(newP->lpszValue),oldP->lpszValue); } + +char XMLNode::setGlobalOptions(XMLCharEncoding _characterEncoding, char _guessWideCharChars, + char _dropWhiteSpace, char _removeCommentsInMiddleOfText) +{ + guessWideCharChars=_guessWideCharChars; dropWhiteSpace=_dropWhiteSpace; removeCommentsInMiddleOfText=_removeCommentsInMiddleOfText; +#ifdef _XMLWIDECHAR + if (_characterEncoding) characterEncoding=_characterEncoding; +#else + switch(_characterEncoding) + { + case char_encoding_UTF8: characterEncoding=_characterEncoding; XML_ByteTable=XML_utf8ByteTable; break; + case char_encoding_legacy: characterEncoding=_characterEncoding; 
XML_ByteTable=XML_legacyByteTable; break; + case char_encoding_ShiftJIS: characterEncoding=_characterEncoding; XML_ByteTable=XML_sjisByteTable; break; + case char_encoding_GB2312: characterEncoding=_characterEncoding; XML_ByteTable=XML_gb2312ByteTable; break; + case char_encoding_Big5: + case char_encoding_GBK: characterEncoding=_characterEncoding; XML_ByteTable=XML_gbk_big5_ByteTable; break; + default: return 1; + } +#endif + return 0; +} + +XMLNode::XMLCharEncoding XMLNode::guessCharEncoding(void *buf,int l, char useXMLEncodingAttribute) +{ +#ifdef _XMLWIDECHAR + return (XMLCharEncoding)0; +#else + if (l<25) return (XMLCharEncoding)0; + if (guessWideCharChars&&(myIsTextWideChar(buf,l))) return (XMLCharEncoding)0; + unsigned char *b=(unsigned char*)buf; + if ((b[0]==0xef)&&(b[1]==0xbb)&&(b[2]==0xbf)) return char_encoding_UTF8; + + // Match utf-8 model ? + XMLCharEncoding bestGuess=char_encoding_UTF8; + int i=0; + while (i<l) + switch (XML_utf8ByteTable[b[i]]) + { + case 4: i++; if ((i<l)&&(b[i]& 0xC0)!=0x80) { bestGuess=char_encoding_legacy; i=l; } // 10bbbbbb ? + case 3: i++; if ((i<l)&&(b[i]& 0xC0)!=0x80) { bestGuess=char_encoding_legacy; i=l; } // 10bbbbbb ? + case 2: i++; if ((i<l)&&(b[i]& 0xC0)!=0x80) { bestGuess=char_encoding_legacy; i=l; } // 10bbbbbb ? 
+ case 1: i++; break; + case 0: i=l; + } + if (!useXMLEncodingAttribute) return bestGuess; + // if encoding is specified and different from utf-8 than it's non-utf8 + // otherwise it's utf-8 + char bb[201]; + l=mmin(l,200); + memcpy(bb,buf,l); // copy buf into bb to be able to do "bb[l]=0" + bb[l]=0; + b=(unsigned char*)strstr(bb,"encoding"); + if (!b) return bestGuess; + b+=8; while XML_isSPACECHAR(*b) b++; if (*b!='=') return bestGuess; + b++; while XML_isSPACECHAR(*b) b++; if ((*b!='\'')&&(*b!='"')) return bestGuess; + b++; while XML_isSPACECHAR(*b) b++; + + if ((xstrnicmp((char*)b,"utf-8",5)==0)|| + (xstrnicmp((char*)b,"utf8",4)==0)) + { + if (bestGuess==char_encoding_legacy) return char_encoding_error; + return char_encoding_UTF8; + } + + if ((xstrnicmp((char*)b,"shiftjis",8)==0)|| + (xstrnicmp((char*)b,"shift-jis",9)==0)|| + (xstrnicmp((char*)b,"sjis",4)==0)) return char_encoding_ShiftJIS; + + if (xstrnicmp((char*)b,"GB2312",6)==0) return char_encoding_GB2312; + if (xstrnicmp((char*)b,"Big5",4)==0) return char_encoding_Big5; + if (xstrnicmp((char*)b,"GBK",3)==0) return char_encoding_GBK; + + return char_encoding_legacy; +#endif +} +#undef XML_isSPACECHAR + +////////////////////////////////////////////////////////// +// Here starts the base64 conversion functions. 
// +////////////////////////////////////////////////////////// + +static const char base64Fillchar = _CXML('='); // used to mark partial words at the end + +// this lookup table defines the base64 encoding +XMLCSTR base64EncodeTable=_CXML("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"); + +// Decode Table gives the index of any valid base64 character in the Base64 table] +// 96: '=' - 97: space char - 98: illegal char - 99: end of string +const unsigned char base64DecodeTable[] = { + 99,98,98,98,98,98,98,98,98,97, 97,98,98,97,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, //00 -29 + 98,98,97,98,98,98,98,98,98,98, 98,98,98,62,98,98,98,63,52,53, 54,55,56,57,58,59,60,61,98,98, //30 -59 + 98,96,98,98,98, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 15,16,17,18,19,20,21,22,23,24, //60 -89 + 25,98,98,98,98,98,98,26,27,28, 29,30,31,32,33,34,35,36,37,38, 39,40,41,42,43,44,45,46,47,48, //90 -119 + 49,50,51,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, //120 -149 + 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, //150 -179 + 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, //180 -209 + 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98,98,98,98,98, //210 -239 + 98,98,98,98,98,98,98,98,98,98, 98,98,98,98,98,98 //240 -255 +}; + +XMLParserBase64Tool::~XMLParserBase64Tool(){ freeBuffer(); } + +void XMLParserBase64Tool::freeBuffer(){ if (buf) free(buf); buf=NULL; buflen=0; } + +int XMLParserBase64Tool::encodeLength(int inlen, char formatted) +{ + unsigned int i=((inlen-1)/3*4+4+1); + if (formatted) i+=inlen/54; + return i; +} + +XMLSTR XMLParserBase64Tool::encode(unsigned char *inbuf, unsigned int inlen, char formatted) +{ + int i=encodeLength(inlen,formatted),k=17,eLen=inlen/3,j; + alloc(i*sizeof(XMLCHAR)); + XMLSTR curr=(XMLSTR)buf; + for(i=0;i<eLen;i++) + { + // Copy next three bytes into lower 24 
bits of int, paying attention to sign. + j=(inbuf[0]<<16)|(inbuf[1]<<8)|inbuf[2]; inbuf+=3; + // Encode the int into four chars + *(curr++)=base64EncodeTable[ j>>18 ]; + *(curr++)=base64EncodeTable[(j>>12)&0x3f]; + *(curr++)=base64EncodeTable[(j>> 6)&0x3f]; + *(curr++)=base64EncodeTable[(j )&0x3f]; + if (formatted) { if (!k) { *(curr++)=_CXML('\n'); k=18; } k--; } + } + eLen=inlen-eLen*3; // 0 - 2. + if (eLen==1) + { + *(curr++)=base64EncodeTable[ inbuf[0]>>2 ]; + *(curr++)=base64EncodeTable[(inbuf[0]<<4)&0x3F]; + *(curr++)=base64Fillchar; + *(curr++)=base64Fillchar; + } else if (eLen==2) + { + j=(inbuf[0]<<8)|inbuf[1]; + *(curr++)=base64EncodeTable[ j>>10 ]; + *(curr++)=base64EncodeTable[(j>> 4)&0x3f]; + *(curr++)=base64EncodeTable[(j<< 2)&0x3f]; + *(curr++)=base64Fillchar; + } + *(curr++)=0; + return (XMLSTR)buf; +} + +unsigned int XMLParserBase64Tool::decodeSize(XMLCSTR data,XMLError *xe) +{ + if (xe) *xe=eXMLErrorNone; + int size=0; + unsigned char c; + //skip any extra characters (e.g. 
newlines or spaces) + while (*data) + { +#ifdef _XMLWIDECHAR + if (*data>255) { if (xe) *xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; } +#endif + c=base64DecodeTable[(unsigned char)(*data)]; + if (c<97) size++; + else if (c==98) { if (xe) *xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; } + data++; + } + if (xe&&(size%4!=0)) *xe=eXMLErrorBase64DataSizeIsNotMultipleOf4; + if (size==0) return 0; + do { data--; size--; } while(*data==base64Fillchar); size++; + return (unsigned int)((size*3)/4); +} + +unsigned char XMLParserBase64Tool::decode(XMLCSTR data, unsigned char *buf, int len, XMLError *xe) +{ + if (xe) *xe=eXMLErrorNone; + int i=0,p=0; + unsigned char d,c; + for(;;) + { + +#ifdef _XMLWIDECHAR +#define BASE64DECODE_READ_NEXT_CHAR(c) \ + do { \ + if (data[i]>255){ c=98; break; } \ + c=base64DecodeTable[(unsigned char)data[i++]]; \ + }while (c==97); \ + if(c==98){ if(xe)*xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; } +#else +#define BASE64DECODE_READ_NEXT_CHAR(c) \ + do { c=base64DecodeTable[(unsigned char)data[i++]]; }while (c==97); \ + if(c==98){ if(xe)*xe=eXMLErrorBase64DecodeIllegalCharacter; return 0; } +#endif + + BASE64DECODE_READ_NEXT_CHAR(c) + if (c==99) { return 2; } + if (c==96) + { + if (p==(int)len) return 2; + if (xe) *xe=eXMLErrorBase64DecodeTruncatedData; + return 1; + } + + BASE64DECODE_READ_NEXT_CHAR(d) + if ((d==99)||(d==96)) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData; return 1; } + if (p==(int)len) { if (xe) *xe=eXMLErrorBase64DecodeBufferTooSmall; return 0; } + buf[p++]=(unsigned char)((c<<2)|((d>>4)&0x3)); + + BASE64DECODE_READ_NEXT_CHAR(c) + if (c==99) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData; return 1; } + if (p==(int)len) + { + if (c==96) return 2; + if (xe) *xe=eXMLErrorBase64DecodeBufferTooSmall; + return 0; + } + if (c==96) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData; return 1; } + buf[p++]=(unsigned char)(((d<<4)&0xf0)|((c>>2)&0xf)); + + BASE64DECODE_READ_NEXT_CHAR(d) + if (d==99 ) { if (xe) 
*xe=eXMLErrorBase64DecodeTruncatedData; return 1; } + if (p==(int)len) + { + if (d==96) return 2; + if (xe) *xe=eXMLErrorBase64DecodeBufferTooSmall; + return 0; + } + if (d==96) { if (xe) *xe=eXMLErrorBase64DecodeTruncatedData; return 1; } + buf[p++]=(unsigned char)(((c<<6)&0xc0)|d); + } +} +#undef BASE64DECODE_READ_NEXT_CHAR + +void XMLParserBase64Tool::alloc(int newsize) +{ + if ((!buf)&&(newsize)) { buf=malloc(newsize); buflen=newsize; return; } + if (newsize>buflen) { buf=realloc(buf,newsize); buflen=newsize; } +} + +unsigned char *XMLParserBase64Tool::decode(XMLCSTR data, int *outlen, XMLError *xe) +{ + if (xe) *xe=eXMLErrorNone; + unsigned int len=decodeSize(data,xe); + if (outlen) *outlen=len; + if (!len) return NULL; + alloc(len+1); + if(!decode(data,(unsigned char*)buf,len,xe)){ return NULL; } + return (unsigned char*)buf; +} + diff --git a/ext/mcpat/xmlParser.h b/ext/mcpat/xmlParser.h new file mode 100644 index 000000000..e29136cb9 --- /dev/null +++ b/ext/mcpat/xmlParser.h @@ -0,0 +1,764 @@ +/****************************************************************************/ +/*! \mainpage XMLParser library + * \section intro_sec Introduction + * + * This is a basic XML parser written in ANSI C++ for portability. + * It works by using recursion and a node tree for breaking + * down the elements of an XML document. + * + * @version V2.41 + * @author Frank Vanden Berghen + * + * The following license terms for the "XMLParser library from Business-Insight" apply to projects + * that are in some way related to + * the "mcpat project", including applications + * using "mcpat project" and tools developed + * for enhancing "mcpat project". All other projects + * (not related to "mcpat project") have to use the "XMLParser library from Business-Insight" + * code under the Aladdin Free Public License (AFPL) + * See the file "AFPL-license.txt" for more informations about the AFPL license. 
+ * (see http://www.artifex.com/downloads/doc/Public.htm for detailed AFPL terms) + * + * Redistribution and use of the "XMLParser library from Business-Insight" in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Frank Vanden Berghen nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY Business-Insight ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Business-Insight BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Copyright (c) 2002, Business-Insight + * <a href="http://www.Business-Insight.com">Business-Insight</a> + * All rights reserved. + * + * \section tutorial First Tutorial + * You can follow a simple <a href="../../xmlParser.html">Tutorial</a> to know the basics... + * + * \section usage General usage: How to include the XMLParser library inside your project. 
+ * + * The library is composed of two files: <a href="../../xmlParser.cpp">xmlParser.cpp</a> and + * <a href="../../xmlParser.h">xmlParser.h</a>. These are the ONLY 2 files that you need when + * using the library inside your own projects. + * + * All the functions of the library are documented inside the comments of the file + * <a href="../../xmlParser.h">xmlParser.h</a>. These comments can be transformed into + * full-fledged HTML documentation using the DOXYGEN software: simply type: "doxygen doxy.cfg" + * + * By default, the XMLParser library uses (char*) for string representation. To use the (wchar_t*) + * version of the library, you need to define the "_UNICODE" preprocessor definition variable + * (this is usually done inside your project definition file) (This is done automatically for you + * when using Visual Studio). + * + * \section example Advanced Tutorial and Many Examples of usage. + * + * Some very small introductory examples are described inside the Tutorial file + * <a href="../../xmlParser.html">xmlParser.html</a> + * + * Some additional small examples are also inside the file <a href="../../xmlTest.cpp">xmlTest.cpp</a> + * (for the "char*" version of the library) and inside the file + * <a href="../../xmlTestUnicode.cpp">xmlTestUnicode.cpp</a> (for the "wchar_t*" + * version of the library). If you have a question, please review these additional examples + * before sending an e-mail to the author.
+ * + * To build the examples: + * - linux/unix: type "make" + * - solaris: type "make -f makefile.solaris" + * - windows: Visual Studio: double-click on xmlParser.dsw + * (under Visual Studio .NET, the .dsp and .dsw files will be automatically converted to .vcproj and .sln files) + * + * In order to build the examples you need some additional files: + * - linux/unix: makefile + * - solaris: makefile.solaris + * - windows: Visual Studio: *.dsp, xmlParser.dsw and also xmlParser.lib and xmlParser.dll + * + * \section debugging Debugging with the XMLParser library + * + * \subsection debugwin Debugging under WINDOWS + * + * Inside Visual C++, the "debug versions" of the memory allocation functions are + * very slow: Do not forget to compile in "release mode" to get maximum speed. + * When I had to debug software that was using the XMLParser Library, it was usually + * a nightmare because the library was sooOOOoooo slow in debug mode (because of the + * slow memory allocations in Debug mode). To solve this + * problem, during debugging sessions, I am now using a very fast DLL version of the + * XMLParser Library (the DLL is compiled in release mode). Using the DLL version of + * the XMLParser Library allows me to have lightning XML parsing speed even in debug! + * Other than that, the DLL version is useless: In the release version of my tool, + * I always use the normal, ".cpp"-based, XMLParser Library (I simply include the + * <a href="../../xmlParser.cpp">xmlParser.cpp</a> and + * <a href="../../xmlParser.h">xmlParser.h</a> files into the project). + * + * The file <a href="../../XMLNodeAutoexp.txt">XMLNodeAutoexp.txt</a> contains some + * "tweaks" that substantially improve the display of the content of the XMLNode objects + * inside the Visual Studio Debugger. Believe me, once you have seen inside the debugger + * the "smooth" display of the XMLNode objects, you cannot live without it anymore!
 *
 * \subsection debuglinux Debugging under LINUX/UNIX
 *
 * The speed of the debug version of the XMLParser library is tolerable so no extra
 * work has been done.
 *
 ****************************************************************************/

#ifndef __INCLUDE_XML_NODE__
#define __INCLUDE_XML_NODE__

#include <stdlib.h>

#ifdef _UNICODE
// If you comment the next "define" line then the library will never "switch to" _UNICODE (wchar_t*) mode (16/32 bits per characters).
// This is useful when you get error messages like:
//    'XMLNode::openFileHelper' : cannot convert parameter 2 from 'const char [5]' to 'const wchar_t *'
// The _XMLWIDECHAR preprocessor variable forces the XMLParser library into either utf16/32-mode (the preprocessor variable
// must be defined) or utf8-mode (the preprocessor variable must be undefined).
#define _XMLWIDECHAR
#endif

#if defined(WIN32) || defined(UNDER_CE) || defined(_WIN32) || defined(WIN64) || defined(__BORLANDC__)
// comment the next line if you are under windows and the compiler is not Microsoft Visual Studio (6.0 or .NET) or Borland
#define _XMLWINDOWS
#endif

// XMLDLLENTRY decorates the exported declarations: dllexport/dllimport when the
// library is used as a DLL (_USE_XMLPARSER_DLL), nothing otherwise.
#ifdef XMLDLLENTRY
#undef XMLDLLENTRY
#endif
#ifdef _USE_XMLPARSER_DLL
#ifdef _DLL_EXPORTS_
#define XMLDLLENTRY __declspec(dllexport)
#else
#define XMLDLLENTRY __declspec(dllimport)
#endif
#else
#define XMLDLLENTRY
#endif

// uncomment the next line if you want no support for wchar_t* (no need for the <wchar.h> or <tchar.h> libraries anymore to compile)
//#define XML_NO_WIDE_CHAR

#ifdef XML_NO_WIDE_CHAR
#undef _XMLWINDOWS
#undef _XMLWIDECHAR
#endif

#ifdef _XMLWINDOWS
#include <tchar.h>
#else
// outside Windows the decoration is always defined as empty
#define XMLDLLENTRY
#ifndef XML_NO_WIDE_CHAR
#include <wchar.h> // to have 'wcsrtombs' for ANSI version
                   // to have 'mbsrtowcs' for WIDECHAR version
#endif
#endif

// Some common types for char set portable code
// (_CXML wraps a character or string literal so that it becomes a wide literal in wide-char mode)
#ifdef _XMLWIDECHAR
    #define _CXML(c) L ## c
    #define XMLCSTR const wchar_t *
    #define XMLSTR  wchar_t *
    #define XMLCHAR wchar_t
#else
    #define _CXML(c) c
    #define XMLCSTR const char *
    #define XMLSTR  char *
    #define XMLCHAR char
#endif
#ifndef FALSE
    #define FALSE 0
#endif /* FALSE */
#ifndef TRUE
    #define TRUE 1
#endif /* TRUE */


/// Enumeration for XML parse errors.
typedef enum XMLError
{
    eXMLErrorNone = 0,
    eXMLErrorMissingEndTag,
    eXMLErrorNoXMLTagFound,
    eXMLErrorEmpty,
    eXMLErrorMissingTagName,
    eXMLErrorMissingEndTagName,
    eXMLErrorUnmatchedEndTag,
    eXMLErrorUnmatchedEndClearTag,
    eXMLErrorUnexpectedToken,
    eXMLErrorNoElements,
    eXMLErrorFileNotFound,
    eXMLErrorFirstTagNotFound,
    eXMLErrorUnknownCharacterEntity,
    eXMLErrorCharacterCodeAbove255,
    eXMLErrorCharConversionError,
    eXMLErrorCannotOpenWriteFile,
    eXMLErrorCannotWriteFile,

    // errors reported by the base64 encoding/decoding tool
    eXMLErrorBase64DataSizeIsNotMultipleOf4,
    eXMLErrorBase64DecodeIllegalCharacter,
    eXMLErrorBase64DecodeTruncatedData,
    eXMLErrorBase64DecodeBufferTooSmall
} XMLError;


/// Enumeration used to manage type of data. Use in conjunction with structure XMLNodeContents
typedef enum XMLElementType
{
    eNodeChild=0,
    eNodeAttribute=1,
    eNodeText=2,
    eNodeClear=3,
    eNodeNULL=4
} XMLElementType;

/// Structure used to obtain error details if the parse fails.
typedef struct XMLResults
{
    enum XMLError error;
    int  nLine,nColumn;
} XMLResults;

/// Structure for XML clear (unformatted) node (usually comments)
typedef struct XMLClear {
    XMLCSTR lpszValue; XMLCSTR lpszOpenTag; XMLCSTR lpszCloseTag;
} XMLClear;

/// Structure for XML attribute.
typedef struct XMLAttribute {
    XMLCSTR lpszName; XMLCSTR lpszValue;
} XMLAttribute;

/// XMLElementPosition are not interchangeable with simple indexes
typedef int XMLElementPosition;

struct XMLNodeContents;

/** @defgroup XMLParserGeneral The XML parser */

/// Main Class representing a XML node
/**
 * All operations are performed using this class.
+ * \note The constructors that build a node are private, so use instead one of these four methods to get your first instance of XMLNode: + * <ul> + * <li> XMLNode::parseString </li> + * <li> XMLNode::parseFile </li> + * <li> XMLNode::openFileHelper </li> + * <li> XMLNode::createXMLTopNode (or XMLNode::createXMLTopNode_WOSD)</li> + * </ul> */ +typedef struct XMLDLLENTRY XMLNode +{ + private: + + struct XMLNodeDataTag; + + /// This constructor is private, so use instead one of: XMLNode::parseString, XMLNode::parseFile, XMLNode::openFileHelper, XMLNode::createXMLTopNode + XMLNode(struct XMLNodeDataTag *pParent, XMLSTR lpszName, char isDeclaration); + /// This constructor is private, so use instead one of: XMLNode::parseString, XMLNode::parseFile, XMLNode::openFileHelper, XMLNode::createXMLTopNode + XMLNode(struct XMLNodeDataTag *p); + + public: + static XMLCSTR getVersion();///< Return the XMLParser library version number + + /** @defgroup conversions Parsing XML files/strings to an XMLNode structure and Rendering XMLNode's to files/string. + * @ingroup XMLParserGeneral + * @{ */ + + /// Parse an XML string and return the root of a XMLNode tree representing the string. + static XMLNode parseString (XMLCSTR lpXMLString, XMLCSTR tag=NULL, XMLResults *pResults=NULL); + /**< The "parseString" function parses an XML string and returns the root of a XMLNode tree. The "opposite" of this function is + * the function "createXMLString" that re-creates an XML string from an XMLNode tree. If the XML document is corrupted, the + * "parseString" method will initialize the "pResults" variable with some information that can be used to trace the error. + * If you still want to parse the file, you can use the APPROXIMATE_PARSING option as explained inside the note at the + * beginning of the "xmlParser.cpp" file. + * + * @param lpXMLString the XML string to parse + * @param tag the name of the first tag inside the XML file.
If the tag parameter is omitted, this function returns a node that represents the head of the xml document including the declaration term (<? ... ?>). + * @param pResults a pointer to a XMLResults variable that will contain some information that can be used to trace the XML parsing error. You can have a user-friendly explanation of the parsing error with the "getError" function. + */ + + /// Parse an XML file and return the root of a XMLNode tree representing the file. + static XMLNode parseFile (XMLCSTR filename, XMLCSTR tag=NULL, XMLResults *pResults=NULL); + /**< The "parseFile" function parse an XML file and return the root of a XMLNode tree. The "opposite" of this function is + * the function "writeToFile" that re-creates an XML file from an XMLNode tree. If the XML document is corrupted, the + * "parseFile" method will initialize the "pResults" variable with some information that can be used to trace the error. + * If you still want to parse the file, you can use the APPROXIMATE_PARSING option as explained inside the note at the + * beginning of the "xmlParser.cpp" file. + * + * @param filename the path to the XML file to parse + * @param tag the name of the first tag inside the XML file. If the tag parameter is omitted, this function returns a node that represents the head of the xml document including the declaration term (<? ... ?>). + * @param pResults a pointer to a XMLResults variable that will contain some information that can be used to trace the XML parsing error. You can have a user-friendly explanation of the parsing error with the "getError" function. + */ + + /// Parse an XML file and return the root of a XMLNode tree representing the file. A very crude error checking is made. An attempt to guess the Char Encoding used in the file is made. + static XMLNode openFileHelper(XMLCSTR filename, XMLCSTR tag=NULL); + /**< The "openFileHelper" function reports to the screen all the warnings and errors that occurred during parsing of the XML file. 
+ * This function also tries to guess char Encoding (UTF-8, ASCII or Shift-JIS) based on the first 200 bytes of the file. Since each + * application has its own way to report and deal with errors, you should rather use the "parseFile" function to parse XML files + * and program yourself thereafter an "error reporting" tailored for your needs (instead of using the very crude "error reporting" + * mechanism included inside the "openFileHelper" function). + * + * If the XML document is corrupted, the "openFileHelper" method will: + * - display an error message on the console (or inside a messageBox for windows). + * - stop execution (exit). + * + * I strongly suggest that you write your own "openFileHelper" method tailored to your needs. If you still want to parse + * the file, you can use the APPROXIMATE_PARSING option as explained inside the note at the beginning of the "xmlParser.cpp" file. + * + * @param filename the path of the XML file to parse. + * @param tag the name of the first tag inside the XML file. If the tag parameter is omitted, this function returns a node that represents the head of the xml document including the declaration term (<? ... ?>). + */ + + static XMLCSTR getError(XMLError error); ///< this gives you a user-friendly explanation of the parsing error + + /// Create an XML string starting from the current XMLNode. + XMLSTR createXMLString(int nFormat=1, int *pnSize=NULL) const; + /**< The returned string should be free'd using the "freeXMLString" function. + * + * If nFormat==0, no formatting is required otherwise this returns a user-friendly XML string from a given element + * with appropriate white spaces and carriage returns. If pnSize is given it returns the size in characters of the string.
*/ + + /// Save the content of an xmlNode inside a file + XMLError writeToFile(XMLCSTR filename, + const char *encoding=NULL, + char nFormat=1) const; + /**< If nFormat==0, no formatting is required otherwise the XML is formatted in a user-friendly way with appropriate white spaces and carriage returns. + * If the global parameter "characterEncoding==XMLNode::char_encoding_UTF8", then the "encoding" parameter is ignored and always set to "utf-8". + * If the global parameter "characterEncoding==XMLNode::char_encoding_ShiftJIS", then the "encoding" parameter is ignored and always set to "SHIFT-JIS". + * If "_XMLWIDECHAR" is defined, then the "encoding" parameter is ignored and always set to "utf-16". + * If no "encoding" parameter is given the "ISO-8859-1" encoding is used. */ + /** @} */ + + /** @defgroup navigate Navigate the XMLNode structure + * @ingroup XMLParserGeneral + * @{ */ + XMLCSTR getName() const; ///< name of the node + XMLCSTR getText(int i=0) const; ///< return ith text field + int nText() const; ///< nbr of text field + XMLNode getParentNode() const; ///< return the parent node + XMLNode getChildNode(int i=0) const; ///< return ith child node + XMLNode getChildNode(XMLCSTR name, int i) const; ///< return ith child node with specific name (return an empty node if failing). If i==-1, this returns the last XMLNode with the given name. + XMLNode getChildNode(XMLCSTR name, int *i=NULL) const; ///< return next child node with specific name (return an empty node if failing) + XMLNode getChildNodeWithAttribute(XMLCSTR tagName, + XMLCSTR attributeName, + XMLCSTR attributeValue=NULL, + int *i=NULL) const; ///< return child node with specific name/attribute (return an empty node if failing) + XMLNode getChildNodeByPath(XMLCSTR path, char createNodeIfMissing=0, XMLCHAR sep='/'); + ///< return the first child node with specific path + XMLNode getChildNodeByPathNonConst(XMLSTR path, char createNodeIfMissing=0, XMLCHAR sep='/'); + ///< return the first child node with specific path.
+ + int nChildNode(XMLCSTR name) const; ///< return the number of child node with specific name + int nChildNode() const; ///< nbr of child node + XMLAttribute getAttribute(int i=0) const; ///< return ith attribute + XMLCSTR getAttributeName(int i=0) const; ///< return ith attribute name + XMLCSTR getAttributeValue(int i=0) const; ///< return ith attribute value + char isAttributeSet(XMLCSTR name) const; ///< test if an attribute with a specific name is given + XMLCSTR getAttribute(XMLCSTR name, int i) const; ///< return ith attribute content with specific name (return a NULL if failing) + XMLCSTR getAttribute(XMLCSTR name, int *i=NULL) const; ///< return next attribute content with specific name (return a NULL if failing) + int nAttribute() const; ///< nbr of attribute + XMLClear getClear(int i=0) const; ///< return ith clear field (comments) + int nClear() const; ///< nbr of clear field + XMLNodeContents enumContents(XMLElementPosition i) const; ///< enumerate all the different contents (attribute,child,text, clear) of the current XMLNode. The order is reflecting the order of the original file/string. NOTE: 0 <= i < nElement(); + int nElement() const; ///< nbr of different contents for current node + char isEmpty() const; ///< is this node Empty? + char isDeclaration() const; ///< is this node a declaration <? .... ?> + XMLNode deepCopy() const; ///< deep copy (duplicate/clone) a XMLNode + static XMLNode emptyNode(); ///< return XMLNode::emptyXMLNode; + /** @} */ + + ~XMLNode(); + XMLNode(const XMLNode &A); ///< to allow shallow/fast copy: + XMLNode& operator=( const XMLNode& A ); ///< to allow shallow/fast copy: + + XMLNode(): d(NULL){}; + static XMLNode emptyXMLNode; + static XMLClear emptyXMLClear; + static XMLAttribute emptyXMLAttribute; + + /** @defgroup xmlModify Create or Update the XMLNode structure + * @ingroup XMLParserGeneral + * The functions in this group allows you to create from scratch (or update) a XMLNode structure. 
Start by creating your top + * node with the "createXMLTopNode" function and then add new nodes with the "addChild" function. The parameter 'pos' gives + * the position where the childNode, the text or the XMLClearTag will be inserted. The default value (pos=-1) inserts at the + * end. The value (pos=0) inserts at the beginning (Insertion at the beginning is slower than at the end). <br> + * + * REMARK: 0 <= pos < nChildNode()+nText()+nClear() <br> + */ + + /** @defgroup creation Creating from scratch a XMLNode structure + * @ingroup xmlModify + * @{ */ + static XMLNode createXMLTopNode(XMLCSTR lpszName, char isDeclaration=FALSE); ///< Create the top node of an XMLNode structure + XMLNode addChild(XMLCSTR lpszName, char isDeclaration=FALSE, XMLElementPosition pos=-1); ///< Add a new child node + XMLNode addChild(XMLNode nodeToAdd, XMLElementPosition pos=-1); ///< If the "nodeToAdd" has a parent, it will be detached from its parent before being attached to the current XMLNode + XMLAttribute *addAttribute(XMLCSTR lpszName, XMLCSTR lpszValuev); ///< Add a new attribute + XMLCSTR addText(XMLCSTR lpszValue, XMLElementPosition pos=-1); ///< Add a new text content + XMLClear *addClear(XMLCSTR lpszValue, XMLCSTR lpszOpen=NULL, XMLCSTR lpszClose=NULL, XMLElementPosition pos=-1); + /**< Add a new clear tag + * @param lpszOpen default value "<![CDATA[" + * @param lpszClose default value "]]>" + */ + /** @} */ + + /** @defgroup xmlUpdate Updating Nodes + * @ingroup xmlModify + * Some update functions: + * @{ + */ + XMLCSTR updateName(XMLCSTR lpszName); ///< change node's name + XMLAttribute *updateAttribute(XMLAttribute *newAttribute, XMLAttribute *oldAttribute); ///< if the attribute to update is missing, a new one will be added + XMLAttribute *updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName=NULL,int i=0); ///< if the attribute to update is missing, a new one will be added + XMLAttribute *updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName,XMLCSTR
lpszOldName);///< set lpszNewName=NULL if you don't want to change the name of the attribute if the attribute to update is missing, a new one will be added + XMLCSTR updateText(XMLCSTR lpszNewValue, int i=0); ///< if the text to update is missing, a new one will be added + XMLCSTR updateText(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue); ///< if the text to update is missing, a new one will be added + XMLClear *updateClear(XMLCSTR lpszNewContent, int i=0); ///< if the clearTag to update is missing, a new one will be added + XMLClear *updateClear(XMLClear *newP,XMLClear *oldP); ///< if the clearTag to update is missing, a new one will be added + XMLClear *updateClear(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue); ///< if the clearTag to update is missing, a new one will be added + /** @} */ + + /** @defgroup xmlDelete Deleting Nodes or Attributes + * @ingroup xmlModify + * Some deletion functions: + * @{ + */ + /// The "deleteNodeContent" function forces the deletion of the content of this XMLNode and the subtree. + void deleteNodeContent(); + /**< \note The XMLNode instances that are referring to the part of the subtree that has been deleted CANNOT be used anymore!!. Unexpected results will occur if you continue using them. 
*/ + void deleteAttribute(int i=0); ///< Delete the ith attribute of the current XMLNode + void deleteAttribute(XMLCSTR lpszName); ///< Delete the attribute with the given name (the "strcmp" function is used to find the right attribute) + void deleteAttribute(XMLAttribute *anAttribute); ///< Delete the attribute with the name "anAttribute->lpszName" (the "strcmp" function is used to find the right attribute) + void deleteText(int i=0); ///< Delete the Ith text content of the current XMLNode + void deleteText(XMLCSTR lpszValue); ///< Delete the text content "lpszValue" inside the current XMLNode (direct "pointer-to-pointer" comparison is used to find the right text) + void deleteClear(int i=0); ///< Delete the Ith clear tag inside the current XMLNode + void deleteClear(XMLCSTR lpszValue); ///< Delete the clear tag "lpszValue" inside the current XMLNode (direct "pointer-to-pointer" comparison is used to find the clear tag) + void deleteClear(XMLClear *p); ///< Delete the clear tag "p" inside the current XMLNode (direct "pointer-to-pointer" comparison on the lpszName of the clear tag is used to find the clear tag) + /** @} */ + + /** @defgroup xmlWOSD ???_WOSD functions. + * @ingroup xmlModify + * The strings given as parameters for the "add" and "update" methods that have a name with + * the postfix "_WOSD" (that means "WithOut String Duplication")(for example "addText_WOSD") + * will be free'd by the XMLNode class. For example, it means that this is incorrect: + * \code + * xNode.addText_WOSD("foo"); + * xNode.updateAttribute_WOSD("#newcolor" ,NULL,"color"); + * \endcode + * In opposition, this is correct: + * \code + * xNode.addText("foo"); + * xNode.addText_WOSD(stringDup("foo")); + * xNode.updateAttribute("#newcolor" ,NULL,"color"); + * xNode.updateAttribute_WOSD(stringDup("#newcolor"),NULL,"color"); + * \endcode + * Typically, you will never do: + * \code + * char *b=(char*)malloc(...); + * xNode.addText(b); + * free(b); + * \endcode + * ... 
but rather: + * \code + * char *b=(char*)malloc(...); + * xNode.addText_WOSD(b); + * \endcode + * ('free(b)' is performed by the XMLNode class) + * @{ */ + static XMLNode createXMLTopNode_WOSD(XMLSTR lpszName, char isDeclaration=FALSE); ///< Create the top node of an XMLNode structure + XMLNode addChild_WOSD(XMLSTR lpszName, char isDeclaration=FALSE, XMLElementPosition pos=-1); ///< Add a new child node + XMLAttribute *addAttribute_WOSD(XMLSTR lpszName, XMLSTR lpszValue); ///< Add a new attribute + XMLCSTR addText_WOSD(XMLSTR lpszValue, XMLElementPosition pos=-1); ///< Add a new text content + XMLClear *addClear_WOSD(XMLSTR lpszValue, XMLCSTR lpszOpen=NULL, XMLCSTR lpszClose=NULL, XMLElementPosition pos=-1); ///< Add a new clear Tag + + XMLCSTR updateName_WOSD(XMLSTR lpszName); ///< change node's name + XMLAttribute *updateAttribute_WOSD(XMLAttribute *newAttribute, XMLAttribute *oldAttribute); ///< if the attribute to update is missing, a new one will be added + XMLAttribute *updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName=NULL,int i=0); ///< if the attribute to update is missing, a new one will be added + XMLAttribute *updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName,XMLCSTR lpszOldName); ///< set lpszNewName=NULL if you don't want to change the name of the attribute if the attribute to update is missing, a new one will be added + XMLCSTR updateText_WOSD(XMLSTR lpszNewValue, int i=0); ///< if the text to update is missing, a new one will be added + XMLCSTR updateText_WOSD(XMLSTR lpszNewValue, XMLCSTR lpszOldValue); ///< if the text to update is missing, a new one will be added + XMLClear *updateClear_WOSD(XMLSTR lpszNewContent, int i=0); ///< if the clearTag to update is missing, a new one will be added + XMLClear *updateClear_WOSD(XMLClear *newP,XMLClear *oldP); ///< if the clearTag to update is missing, a new one will be added + XMLClear *updateClear_WOSD(XMLSTR lpszNewValue, XMLCSTR lpszOldValue); ///< if the clearTag to update is 
missing, a new one will be added + /** @} */ + + /** @defgroup xmlPosition Position helper functions (use in conjunction with the update&add functions) + * @ingroup xmlModify + * These are some useful functions when you want to insert a childNode, a text or a XMLClearTag in the + * middle (at a specified position) of a XMLNode tree already constructed. The value returned by these + * methods is to be used as last parameter (parameter 'pos') of addChild, addText or addClear. + * @{ */ + XMLElementPosition positionOfText(int i=0) const; + XMLElementPosition positionOfText(XMLCSTR lpszValue) const; + XMLElementPosition positionOfClear(int i=0) const; + XMLElementPosition positionOfClear(XMLCSTR lpszValue) const; + XMLElementPosition positionOfClear(XMLClear *a) const; + XMLElementPosition positionOfChildNode(int i=0) const; + XMLElementPosition positionOfChildNode(XMLNode x) const; + XMLElementPosition positionOfChildNode(XMLCSTR name, int i=0) const; ///< return the position of the ith childNode with the specified name; if (name==NULL), return the position of the ith childNode + /** @} */ + + /// Enumeration for XML character encoding. + typedef enum XMLCharEncoding + { + char_encoding_error=0, + char_encoding_UTF8=1, + char_encoding_legacy=2, + char_encoding_ShiftJIS=3, + char_encoding_GB2312=4, + char_encoding_Big5=5, + char_encoding_GBK=6 // this is actually the same as Big5 + } XMLCharEncoding; + + /** \addtogroup conversions + * @{ */ + + /// Sets the global options for the conversions + static char setGlobalOptions(XMLCharEncoding characterEncoding=XMLNode::char_encoding_UTF8, char guessWideCharChars=1, + char dropWhiteSpace=1, char removeCommentsInMiddleOfText=1); + /**< The "setGlobalOptions" function allows you to change four global parameters that affect string & file + * parsing. First of all, you most-probably will never have to change these four global parameters.
+ * + * @param guessWideCharChars If "guessWideCharChars"=1 and if this library is compiled in WideChar mode, then the + * XMLNode::parseFile and XMLNode::openFileHelper functions will test if the file contains ASCII + * characters. If this is the case, then the file will be loaded and converted in memory to + * WideChar before being parsed. If 0, no conversion will be performed. + * + * @param guessWideCharChars If "guessWideCharChars"=1 and if this library is compiled in ASCII/UTF8/char* mode, then the + * XMLNode::parseFile and XMLNode::openFileHelper functions will test if the file contains WideChar + * characters. If this is the case, then the file will be loaded and converted in memory to + * ASCII/UTF8/char* before being parsed. If 0, no conversion will be performed. + * + * @param characterEncoding This parameter is only meaningful when compiling in char* mode (multibyte character mode). + * In wchar_t* (wide char mode), this parameter is ignored. This parameter should be one of the + * XMLNode::XMLCharEncoding values, for example XMLNode::char_encoding_UTF8, XMLNode::char_encoding_legacy or + * XMLNode::char_encoding_ShiftJIS. + * + * @param dropWhiteSpace In most situations, text fields containing only white spaces (and carriage returns) + * are useless. Even more, these "empty" text fields are annoying because they increase the + * complexity of the user's code for parsing. So, 99% of the time, it's better to drop + * the "empty" text fields. However, the XML specification indicates that no white spaces + * should be lost when parsing the file. So to be perfectly XML-compliant, you should set + * dropWhiteSpace=0. A note of caution: if you set "dropWhiteSpace=0", the parser will be + * slower and your code will be more complex.
+ * + * @param removeCommentsInMiddleOfText To explain this parameter, let's consider this code: + * \code + * XMLNode x=XMLNode::parseString("<a>foo<!-- hello -->bar<!DOCTYPE world >chu</a>","a"); + * \endcode + * If removeCommentsInMiddleOfText=0, then we will have: + * \code + * x.getText(0) -> "foo" + * x.getText(1) -> "bar" + * x.getText(2) -> "chu" + * x.getClear(0) --> "<!-- hello -->" + * x.getClear(1) --> "<!DOCTYPE world >" + * \endcode + * If removeCommentsInMiddleOfText=1, then we will have: + * \code + * x.getText(0) -> "foobar" + * x.getText(1) -> "chu" + * x.getClear(0) --> "<!DOCTYPE world >" + * \endcode + * + * \return "0" when there are no errors. If you try to set an unrecognized encoding then the return value will be "1" to signal an error. + * + * \note Sometime, it's useful to set "guessWideCharChars=0" to disable any conversion + * because the test to detect the file-type (ASCII/UTF8/char* or WideChar) may fail (rarely). */ + + /// Guess the character encoding of the string (ascii, utf8 or shift-JIS) + static XMLCharEncoding guessCharEncoding(void *buffer, int bufLen, char useXMLEncodingAttribute=1); + /**< The "guessCharEncoding" function try to guess the character encoding. You most-probably will never + * have to use this function. It then returns the appropriate value of the global parameter + * "characterEncoding" described in the XMLNode::setGlobalOptions. The guess is based on the content of a buffer of length + * "bufLen" bytes that contains the first bytes (minimum 25 bytes; 200 bytes is a good value) of the + * file to be parsed. The XMLNode::openFileHelper function is using this function to automatically compute + * the value of the "characterEncoding" global parameter. There are several heuristics used to do the + * guess. One of the heuristic is based on the "encoding" attribute. 
The original XML specifications + * forbids to use this attribute to do the guess but you can still use it if you set + * "useXMLEncodingAttribute" to 1 (this is the default behavior and the behavior of most parsers). + * If an inconsistency in the encoding is detected, then the return value is "0". */ + /** @} */ + + private: + // these are functions and structures used internally by the XMLNode class (don't bother about them): + + typedef struct XMLNodeDataTag // to allow shallow copy and "intelligent/smart" pointers (automatic delete): + { + XMLCSTR lpszName; // Element name (=NULL if root) + int nChild, // Number of child nodes + nText, // Number of text fields + nClear, // Number of Clear fields (comments) + nAttribute; // Number of attributes + char isDeclaration; // Whether node is an XML declaration - '<?xml ?>' + struct XMLNodeDataTag *pParent; // Pointer to parent element (=NULL if root) + XMLNode *pChild; // Array of child nodes + XMLCSTR *pText; // Array of text fields + XMLClear *pClear; // Array of clear fields + XMLAttribute *pAttribute; // Array of attributes + int *pOrder; // order of the child_nodes,text_fields,clear_fields + int ref_count; // for garbage collection (smart pointers) + } XMLNodeData; + XMLNodeData *d; + + char parseClearTag(void *px, void *pa); + char maybeAddTxT(void *pa, XMLCSTR tokenPStr); + int ParseXMLElement(void *pXML); + void *addToOrder(int memInc, int *_pos, int nc, void *p, int size, XMLElementType xtype); + int indexText(XMLCSTR lpszValue) const; + int indexClear(XMLCSTR lpszValue) const; + XMLNode addChild_priv(int,XMLSTR,char,int); + XMLAttribute *addAttribute_priv(int,XMLSTR,XMLSTR); + XMLCSTR addText_priv(int,XMLSTR,int); + XMLClear *addClear_priv(int,XMLSTR,XMLCSTR,XMLCSTR,int); + void emptyTheNode(char force); + static inline XMLElementPosition findPosition(XMLNodeData *d, int index, XMLElementType xtype); + static int CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker, int nFormat); + static int 
removeOrderElement(XMLNodeData *d, XMLElementType t, int index); + static void exactMemory(XMLNodeData *d); + static int detachFromParent(XMLNodeData *d); +} XMLNode; + +/// This structure is given by the function XMLNode::enumContents. +typedef struct XMLNodeContents +{ + /// This dictates what the content of the XMLNodeContents is + enum XMLElementType etype; + /**< The members below should be a union to access the appropriate data, but the compiler does not allow a union of objects with constructors... too bad. */ + XMLNode child; + XMLAttribute attrib; + XMLCSTR text; + XMLClear clear; + +} XMLNodeContents; + +/** @defgroup StringAlloc String Allocation/Free functions + * @ingroup xmlModify + * @{ */ +/// Duplicate (copy in a new allocated buffer) the source string. +XMLDLLENTRY XMLSTR stringDup(XMLCSTR source, int cbData=-1); +/**< This is + * a very handy function when used with all the "XMLNode::*_WOSD" functions (\link xmlWOSD \endlink). + * @param cbData the number of chars to duplicate. The declared default is -1, which presumably means "duplicate the whole string" (TODO confirm against xmlParser.cpp). New strings allocated with + * this function should be free'd using the "freeXMLString" function. */ + +/// to free the string allocated inside the "stringDup" function or the "createXMLString" function. +XMLDLLENTRY void freeXMLString(XMLSTR t); // {free(t);} +/** @} */ + +/** @defgroup atoX ato? like functions + * @ingroup XMLParserGeneral + * The "xmlto?" functions are equivalent to the atoi, atol, atof functions. + * The only difference is: If the variable "xmlString" is NULL, then the return value + * is "defautValue". These 6 functions are only here as "convenience" functions for the + * user (they are not used inside the XMLparser). If you don't need them, you can + * delete them without any trouble.
+ * + * @{ */ +XMLDLLENTRY char xmltob(XMLCSTR xmlString,char defautValue=0); +XMLDLLENTRY int xmltoi(XMLCSTR xmlString,int defautValue=0); +XMLDLLENTRY long xmltol(XMLCSTR xmlString,long defautValue=0); +XMLDLLENTRY double xmltof(XMLCSTR xmlString,double defautValue=.0); +XMLDLLENTRY XMLCSTR xmltoa(XMLCSTR xmlString,XMLCSTR defautValue=_CXML("")); +XMLDLLENTRY XMLCHAR xmltoc(XMLCSTR xmlString,XMLCHAR defautValue=_CXML('\0')); +/** @} */ + +/** @defgroup ToXMLStringTool Helper class to create XML files using "printf", "fprintf", "cout",... functions. + * @ingroup XMLParserGeneral + * @{ */ +/// Helper class to create XML files using "printf", "fprintf", "cout",... functions. +/** The ToXMLStringTool class helps you create XML files using "printf", "fprintf", "cout",... functions. + * The "ToXMLStringTool" class is processing strings so that all the characters + * &,",',<,> are replaced by their XML equivalent: + * \verbatim &amp;, &quot;, &apos;, &lt;, &gt; \endverbatim + * Using the "ToXMLStringTool class" and the "fprintf function" is THE most efficient + * way to produce VERY large XML documents VERY fast. + * \note If you are creating from scratch an XML file using the provided XMLNode class + * you must not use the "ToXMLStringTool" class (because the "XMLNode" class does the + * processing job for you during rendering).*/ +typedef struct XMLDLLENTRY ToXMLStringTool +{ +public: + ToXMLStringTool(): buf(NULL),buflen(0){} + ~ToXMLStringTool(); + void freeBuffer();///<call this function when you have finished using this object to release memory used by the internal buffer. + + XMLSTR toXML(XMLCSTR source);///< returns a pointer to an internal buffer that contains a XML-encoded string based on the "source" parameter. + + /** The "toXMLUnSafe" function is deprecated because there is a possibility of + * "destination-buffer-overflow". It converts the string + * "source" to the string "dest".
*/ + static XMLSTR toXMLUnSafe(XMLSTR dest,XMLCSTR source); ///< deprecated: use "toXML" instead + static int lengthXMLString(XMLCSTR source); ///< deprecated: use "toXML" instead + +private: + XMLSTR buf; + int buflen; +} ToXMLStringTool; +/** @} */ + +/** @defgroup XMLParserBase64Tool Helper class to include binary data inside XML strings using "Base64 encoding". + * @ingroup XMLParserGeneral + * @{ */ +/// Helper class to include binary data inside XML strings using "Base64 encoding". +/** The "XMLParserBase64Tool" class allows you to include any binary data (images, sounds,...) + * into an XML document using "Base64 encoding". This class is completely + * separated from the rest of the xmlParser library and can be removed without any problem. + * To include some binary data into an XML file, you must convert the binary data into + * standard text (using "encode"). To retrieve the original binary data from the + * b64-encoded text included inside the XML file, use "decode". Alternatively, these + * functions can also be used to "encrypt/decrypt" some critical data contained inside + * the XML (it's not a strong encryption at all, but sometimes it can be useful). */ +typedef struct XMLDLLENTRY XMLParserBase64Tool +{ +public: + XMLParserBase64Tool(): buf(NULL),buflen(0){} + ~XMLParserBase64Tool(); + void freeBuffer();///< Call this function when you have finished using this object to release memory used by the internal buffer. + + /** + * @param formatted If "formatted"=true, some space will be reserved for a carriage-return every 72 chars. */ + static int encodeLength(int inBufLen, char formatted=0); ///< return the length of the base64 string that encodes a data buffer of size inBufLen bytes. + + /** + * The "base64Encode" function returns a string containing the base64 encoding of "inByteLen" bytes + * from "inByteBuf". If "formatted" parameter is true, then there will be a carriage-return every 72 chars. 
+ * The string will be free'd when the XMLParserBase64Tool object is deleted. + * All returned strings are sharing the same memory space. */ + XMLSTR encode(unsigned char *inByteBuf, unsigned int inByteLen, char formatted=0); ///< returns a pointer to an internal buffer containing the base64 string containing the binary data encoded from "inByteBuf" + + /// returns the number of bytes which will be decoded from "inString". + static unsigned int decodeSize(XMLCSTR inString, XMLError *xe=NULL); + + /** + * The "decode" function returns a pointer to a buffer containing the binary data decoded from "inString". + * The output buffer will be free'd when the XMLParserBase64Tool object is deleted. + * All output buffers share the same memory space. + * @param inString If "inString" is malformed, NULL will be returned */ + unsigned char* decode(XMLCSTR inString, int *outByteLen=NULL, XMLError *xe=NULL); ///< returns a pointer to an internal buffer containing the binary data decoded from "inString" + + /** + * decodes data from "inString" to "outByteBuf". You need to provide the size (in bytes) of "outByteBuf" + * in "inMaxByteOutBuflen". If "outByteBuf" is not large enough or if data is malformed, then "FALSE" + * will be returned; otherwise "TRUE". */ + static unsigned char decode(XMLCSTR inString, unsigned char *outByteBuf, int inMaxByteOutBuflen, XMLError *xe=NULL); ///< deprecated. + +private: + void *buf; + int buflen; + void alloc(int newsize); +}XMLParserBase64Tool; +/** @} */ + +#undef XMLDLLENTRY + +#endif |