1 files changed, 736 insertions, 0 deletions
diff --git a/ext/mcpat/memoryctrl.cc b/ext/mcpat/memoryctrl.cc
new file mode 100644
index 000000000..ae3bc75ec
--- /dev/null
+++ b/ext/mcpat/memoryctrl.cc
@@ -0,0 +1,736 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+#include "XML_Parse.h"
+#include "basic_circuit.h"
+#include "basic_components.h"
+#include "const.h"
+#include "io.h"
+#include "logic.h"
+#include "memoryctrl.h"
+#include "parameter.h"
+
+/* overview of MC models:
+ * McPAT memory controllers are modeled according to a large number of industrial data points.
+ * The basic memory controller architecture is based on the Synopsys designs
+ * (DesignWare DDR2/DDR3-Lite memory controllers and DDR2/DDR3-Lite protocol controllers)
+ * as in the Cadence ChipEstimator Tool.
+ *
+ * An MC has 3 parts as shown in this design. McPAT models both a high-performance MC,
+ * based on Niagara processor designs and curve fitting, and a low-power MC, based on data points in
+ * the Cadence ChipEstimator Tool.
+ *
+ * The frontend is modeled analytically; the backend is modeled empirically according to
+ * the DDR2/DDR3-Lite protocol controllers in the Cadence ChipEstimator Tool.
+ * The PHY is modeled based on
+ * "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces," ISSCC 2006,
+ * and "A 14mW 6.25Gb/s Transceiver in 90nm CMOS for Serial Chip-to-Chip Communication," ISSCC 2007.
+ *
+ * In the Cadence ChipEstimator Tool there are two types of memory controllers: full memory controllers
+ * that include the frontend, such as the DesignWare DDR2/DDR3-Lite memory controllers, and backend-only
+ * memory controllers, such as the DDR2/DDR3-Lite protocol controllers (except for the DesignWare DDR2/DDR3-Lite
+ * memory controllers, all memory controller IP in the Cadence ChipEstimator Tool are backend memory controllers,
+ * such as DDRC 1600A and DDRC 800A). Thus, to some extent the area and power difference between the DesignWare
+ * DDR2/DDR3-Lite memory controllers and the DDR2/DDR3-Lite protocol controllers can serve as an estimate of the
+ * frontend power and area, which is very close to the analytically modeled results of the frontend for Niagara2@65nm.
+ *
+ */
+
+// Builds the backend model from a private copy of the interface parameters,
+// then evaluates its area and power figures once via compute().
+MCBackend::MCBackend(InputParameter* interface_ip_,
+                     const MCParam & mcp_,
+                     enum MemoryCtrl_type mc_type_)
+    : l_ip(*interface_ip_), mc_type(mc_type_), mcp(mcp_)
+{
+  local_result = init_interface(&l_ip);  // uses this object's own copy, l_ip
+  compute();                             // fills area and power_t
+}
+
+
+// Evaluates the backend's area (um^2), dynamic energy and leakage power (W) from
+// empirical data points and stores them in area and power_t. mcp.type == 0 derives the
+// energy from a 4.32 W @ 1 GHz, 65 nm reference controller (10% attributed to the
+// backend); any other value scales an average of DDR2/3 protocol controllers taken from
+// Cadence ChipEstimator. Only mc_type == MC is supported: for any other controller type
+// the error is reported and the process exits with a failure status.
+// The patch format requires every line of this block to keep its leading '+'.
+void MCBackend::compute()
+{
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+
+  if (mc_type != MC)
+  {//skip old model
+          //Exit with a non-zero status so that callers can detect the failure (this used to
+          //be exit(0)). The flash-controller code that followed the exit() call could never
+          //run and read the uninitialized mc_power, so it has been removed.
+          cout<<"Unknown memory controllers"<<endl;
+          exit(1);
+  }
+
+  if (mcp.type == 0)
+  {
+          //area = (2.2927*log(peakDataTransferRate)-14.504)*memDataWidth/144.0*(l_ip.F_sz_um/0.09);
+          area.set_area((2.7927*log(mcp.peakDataTransferRate*2)-19.862)/2.0*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6);//um^2
+          //assuming the approximately same scaling factor as seen in processors.
+          //C_MCB=0.2/1.3/1.3/266/64/0.09*g_ip.F_sz_um;//based on AMD Geode processor which has a very basic mc on chip.
+          //C_MCB = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power numbers.The base power (W) is divided by device frequency and vdd and scale to target process.
+          //mc_power = 0.0291*2;//29.1mW@200MHz @130nm From Power Analysis of SystemLevel OnChip Communication Architectures by Lahiri et
+          double mc_power = 4.32*0.1;//4.32W@1GHz @65nm Cadence ChipEstimator 10% for backend
+          double C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;//equivalent per-bit capacitance of the backend
+          power_t.readOp.dynamic = C_MCB*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(mcp.dataBusWidth/*+mcp.addressBusWidth*/);//per access energy in memory controller
+          power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+  }
+  else
+  {
+          double NMOS_sizing = g_tp.min_w_nmos_;
+          double PMOS_sizing = g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
+          area.set_area(0.15*mcp.dataBusWidth/72.0*(l_ip.F_sz_um/0.065)* (l_ip.F_sz_um/0.065)*mcp.num_channels*1e6);//um^2
+          //Average on DDR2/3 protocol controller and DDRC 1600/800A in Cadence ChipEstimate
+          //Scaling to technology and DIMM feature. The base IP support DDR3-1600(PC3 12800)
+          double backend_dyn = 0.9e-9/800e6*mcp.clockRate/12800*mcp.peakDataTransferRate*mcp.dataBusWidth/72.0*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(l_ip.F_sz_nm/65.0);
+          //NOTE(review): the original comment said 5000 gates (Cadence ChipEstimator) but the code uses 50000; confirm.
+          double backend_gates = 50000*mcp.dataBusWidth/64.0;
+
+          power_t.readOp.dynamic = backend_dyn;
+          power_t.readOp.leakage = (backend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = (backend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand)*g_tp.peri_global.Vdd;//unit W
+  }
+
+  //leakage figure for longer-channel devices: subthreshold leakage times the reduction factor
+  double long_channel_device_reduction = longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+// Computes backend power from the per-access figures in power_t. With is_tdp set,
+// reads and writes each get half of every channel and the result goes to power;
+// otherwise the counts in mcp.reads/mcp.writes are used and the result goes to
+// rt_power, which also adds 10% of the previously computed peak power.
+void MCBackend::computeEnergy(bool is_tdp)
+{
+        //backend uses internal data buswidth
+        if (is_tdp)
+        {
+                //init stats for Peak
+                stats_t.readAc.access   = 0.5*mcp.num_channels;
+                stats_t.writeAc.access  = 0.5*mcp.num_channels;
+                tdp_stats = stats_t;
+
+                power = power_t;
+                power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*power_t.readOp.dynamic;
+        }
+        else
+        {
+                //init stats for runtime power (RTP); these are runtime statistics, so they are
+                //recorded in rtp_stats and no longer overwrite the peak values kept in tdp_stats
+                stats_t.readAc.access   = mcp.reads;
+                stats_t.writeAc.access  = mcp.writes;
+                rtp_stats = stats_t;
+
+                rt_power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access)*mcp.llcBlockSize*8.0/mcp.dataBusWidth*power_t.readOp.dynamic;
+                rt_power = rt_power + power_t*pppm_lkg;
+                //Assume 10% of peak power is consumed by routine job including memory refreshing and scrubbing
+                rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+        }
+}
+
+
+// Builds the PHY model from a copy of the interface parameters and runs compute() once.
+MCPHY::MCPHY(InputParameter* interface_ip_,
+             const MCParam & mcp_,
+             enum MemoryCtrl_type mc_type_)
+    : l_ip(*interface_ip_), mc_type(mc_type_), mcp(mcp_)
+{
+  local_result = init_interface(&l_ip);  // uses this object's own copy, l_ip
+  compute();                             // fills area and power_t
+}
+
+// Evaluates the PHY's area (um^2), dynamic power per Gb/s and leakage power (W) and stores
+// them in area and power_t. mcp.type == 0 uses the curve fit from the Niagara 1/2 die
+// photos; any other value scales a DesignWare 16-bit DDR3 PHY data point. For controller
+// types other than MC only the area is set.
+void MCPHY::compute()
+{
+  //PHY uses internal data buswidth but the actual off-chip datawidth is 64bits + ecc
+  /*
+   * according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation memory interfaces ," ISSCC 2006;
+   * From Cadence ChipEstimator for normal I/O around 0.4~0.8 mW/Gb/s
+   */
+  double pn_ratio = pmos_to_nmos_sz_ratio();
+
+  if (mc_type != MC)
+  {
+          //no power figures are assigned here for this controller type
+          area.set_area(0.4e6/2*mcp.dataBusWidth/8);//area based on Cadence ChipEstimator for 8bit bus
+  }
+  else if (mcp.type == 0)
+  {
+          double watt_per_gbps = mcp.LVDS? 0.01:0.04;
+          //Based on die photos from Niagara 1 and 2.
+          //TODO merge this into undifferentiated core.PHY only achieves square root of the ideal scaling.
+          //area = (6.4323*log(peakDataTransferRate)-34.76)*memDataWidth/128.0*(l_ip.F_sz_um/0.09);
+          area.set_area((6.4323*log(mcp.peakDataTransferRate*2)-48.134)*mcp.dataBusWidth/128.0*(l_ip.F_sz_um/0.09)*mcp.num_channels*1e6/2);//TODO:/2
+          //This is from curve fitting based on Niagara 1 and 2's PHY die photo.
+          //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
+          //power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change from Bytes to bits.
+          power_t.readOp.dynamic = watt_per_gbps*sqrt(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+          power_t.readOp.leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pn_ratio, 1, inv)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = area.get_area()/2 *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pn_ratio, 1, inv)*g_tp.peri_global.Vdd;//unit W
+  }
+  else
+  {
+          double nmos_w = g_tp.min_w_nmos_;
+          double pmos_w = g_tp.min_w_nmos_*pn_ratio;
+          //Designware/synopsis 16bit DDR3 PHY is 1.3mm (WITH IOs) at 40nm for upto DDR3 2133 (PC3 17066)
+          double non_io_share = 0.2;
+          area.set_area(1.3*non_io_share/2133.0e6*mcp.clockRate/17066*mcp.peakDataTransferRate*mcp.dataBusWidth/16.0*(l_ip.F_sz_um/0.040)* (l_ip.F_sz_um/0.040)*mcp.num_channels*1e6);//um^2
+          double gate_count = 200000*mcp.dataBusWidth/64.0;
+          double watt_per_gbps = 0.01;
+          //This is power not energy, 10mw/Gb/s @90nm for each channel and scaling down
+          power_t.readOp.dynamic = watt_per_gbps*(l_ip.F_sz_um/0.09)*g_tp.peri_global.Vdd/1.2*g_tp.peri_global.Vdd/1.2;
+          power_t.readOp.leakage = (mcp.withPHY? gate_count:0)*cmos_Isub_leakage(nmos_w, pmos_w, 2, nand)*g_tp.peri_global.Vdd;//unit W
+          power_t.readOp.gate_leakage = (mcp.withPHY? gate_count:0)*cmos_Ig_leakage(nmos_w, pmos_w, 2, nand)*g_tp.peri_global.Vdd;//unit W
+  }
+
+//  double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power numbers are based on 72 bit DIMM interface
+//  power_t.readOp.dynamic *= phy_factor;
+//  power_t.readOp.leakage *= phy_factor;
+//  power_t.readOp.gate_leakage *= phy_factor;
+
+  //leakage figure for longer-channel devices: subthreshold leakage times the reduction factor
+  double lc_reduction = longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage = power_t.readOp.leakage * lc_reduction;
+}
+
+
+// Computes the PHY's peak (TDP) or runtime dynamic figure from power_t.
+//   is_tdp == true : reads and writes each get half of every channel; the result is stored
+//                    in power, scaled by the peak transfer rate and divided by the clock rate.
+//   is_tdp == false: uses mcp.reads/mcp.writes; the result is stored in rt_power and also
+//                    adds 10% of the peak figure held in power, so the TDP pass has to
+//                    have been run first.
+void MCPHY::computeEnergy(bool is_tdp)
+{
+        if (is_tdp)
+        {
+                //init stats for Peak
+                stats_t.readAc.access   = 0.5*mcp.num_channels; //time share on buses
+                stats_t.writeAc.access  = 0.5*mcp.num_channels;
+                tdp_stats = stats_t;
+
+                double data_transfer_unit = (mc_type == MC)? 72:16;/*DIMM data width*/
+                power = power_t;
+                // dividing by the clock rate matches the final computation, where *clock is used
+                power.readOp.dynamic = power.readOp.dynamic * (mcp.peakDataTransferRate*8*1e6/1e9/*change to Gbs*/)*mcp.dataBusWidth/data_transfer_unit*mcp.num_channels/mcp.clockRate;
+        }
+        else
+        {
+                //init stats for runtime power (RTP); these are runtime statistics, so they are
+                //recorded in rtp_stats and no longer overwrite the peak values kept in tdp_stats
+                stats_t.readAc.access   = mcp.reads;
+                stats_t.writeAc.access  = mcp.writes;
+                rtp_stats = stats_t;
+
+                rt_power = power_t;
+                //NOTE(review): "/mcp.executionTime*(mcp.executionTime)" cancels out except when
+                //executionTime is 0; kept as is so that results stay bit-identical.
+                rt_power.readOp.dynamic=power_t.readOp.dynamic*(stats_t.readAc.access + stats_t.writeAc.access)*(mcp.llcBlockSize)*8/1e9/mcp.executionTime*(mcp.executionTime);
+
+                //add 10% of the peak dynamic figure on top of the access-driven part
+                rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+        }
+}
+
+MCFrontEnd::MCFrontEnd(ParseXML *XML_interface,InputParameter* interface_ip_, const MCParam & mcp_, enum MemoryCtrl_type mc_type_)
+:XML(XML_interface),
+ interface_ip(*interface_ip_),
+ mc_type(mc_type_),
+ mcp(mcp_),
+ MC_arb(0),
+ frontendBuffer(0),
+ readBuffer(0),
+ writeBuffer(0)
+{
+  /* Sets up the three front-end arrays (request reorder buffer, read buffer, write buffer)
+   * and the selection/arbitration logic. All computations are for a single MC.
+   */
+
+  int tag, data;
+  bool is_default =true;//indication for default setup
+
+  /* MC frontend engine channels share the same engines but logically partitioned
+   * For all hardware inside MC. different channels do not share resources.
+   * TODO: add decoding/mux stage to steer memory requests to different channels.
+   */
+
+  //memory request reorder buffer: one entry per request in the per-channel window, with a search port per channel
+  tag							   = mcp.addressBusWidth  + EXTRA_TAG_BITS + mcp.opcodeW;
+  data    					 	   = int(ceil((XML->sys.physical_address_width + mcp.opcodeW)/8.0));//entry size in bytes
+  interface_ip.cache_sz            = data*XML->sys.mc.req_window_size_per_channel;
+  interface_ip.line_sz             = data;
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = 1.0/mcp.clockRate;
+  interface_ip.latency             = 1.0/mcp.clockRate;
+  interface_ip.is_cache			   = true;
+  interface_ip.pure_cam            = false;
+  interface_ip.pure_ram            = false;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 0;
+  interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports     = 0;
+  interface_ip.num_search_ports     = XML->sys.mc.memory_channels_per_mc;
+  frontendBuffer = new ArrayST(&interface_ip, "MC ReorderBuffer", Uncore_device);
+  frontendBuffer->area.set_area(frontendBuffer->area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);//array area times the channel count
+  area.set_area(area.get_area()+ frontendBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);//same amount added to the front-end total
+
+  //selection and arbitration logic, sized by the per-channel request window
+  MC_arb = new selection_logic(is_default, XML->sys.mc.req_window_size_per_channel,1,&interface_ip, Uncore_device);
+
+  //read buffers. interface_ip is reused: fields not reassigned below keep the values set for the reorder buffer
+  data    					 	   = (int)ceil(mcp.dataBusWidth/8.0);//Support key words first operation //8 means converting bit to Byte
+  interface_ip.cache_sz            = data*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
+  interface_ip.line_sz             = data;
+  interface_ip.assoc               = 1;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 1;
+  interface_ip.throughput          = 1.0/mcp.clockRate;
+  interface_ip.latency             = 1.0/mcp.clockRate;
+  interface_ip.is_cache			   = false;
+  interface_ip.pure_cam            = false;
+  interface_ip.pure_ram            = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 0;//XML->sys.mc.memory_channels_per_mc*2>2?2:XML->sys.mc.memory_channels_per_mc*2;
+  interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports     = 0;
+  readBuffer = new ArrayST(&interface_ip, "MC ReadBuffer", Uncore_device);
+  readBuffer->area.set_area(readBuffer->area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);//array area times the channel count
+  area.set_area(area.get_area()+ readBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);//same amount added to the front-end total
+
+  //write buffer: is_cache/pure_cam/pure_ram are not reassigned, so the read-buffer values (false/false/true) still apply
+  data    					 	   = (int)ceil(mcp.dataBusWidth/8.0);//Support key words first operation //8 means converting bit to Byte
+  interface_ip.cache_sz            = data*XML->sys.mc.IO_buffer_size_per_channel;//*llcBlockSize;
+  interface_ip.line_sz             = data;
+  interface_ip.assoc               = 1;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          = 1.0/mcp.clockRate;
+  interface_ip.latency             = 1.0/mcp.clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports        = 0;
+  interface_ip.num_rd_ports        = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports        = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports     = 0;
+  writeBuffer = new ArrayST(&interface_ip, "MC writeBuffer", Uncore_device);
+  writeBuffer->area.set_area(writeBuffer->area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);//array area times the channel count
+  area.set_area(area.get_area()+ writeBuffer->local_result.area*XML->sys.mc.memory_channels_per_mc);//same amount added to the front-end total
+}
+
+// Sets the access statistics of the reorder buffer, read buffer and write buffer, derives
+// their dynamic figures, and accumulates those plus leakage into power (is_tdp) or rt_power.
+void MCFrontEnd::computeEnergy(bool is_tdp)
+{
+        ArrayST &rob   = *frontendBuffer;
+        ArrayST &rdBuf = *readBuffer;
+        ArrayST &wrBuf = *writeBuffer;
+
+        if (is_tdp)
+        {
+                //init stats for Peak
+                rob.stats_t.readAc.access  = rob.l_ip.num_search_ports;
+                rob.stats_t.writeAc.access = rob.l_ip.num_wr_ports;
+                rob.tdp_stats = rob.stats_t;
+
+                rdBuf.stats_t.readAc.access  = rdBuf.l_ip.num_rd_ports*mcp.frontend_duty_cycle;
+                rdBuf.stats_t.writeAc.access = rdBuf.l_ip.num_wr_ports*mcp.frontend_duty_cycle;
+                rdBuf.tdp_stats = rdBuf.stats_t;
+
+                wrBuf.stats_t.readAc.access  = wrBuf.l_ip.num_rd_ports*mcp.frontend_duty_cycle;
+                wrBuf.stats_t.writeAc.access = wrBuf.l_ip.num_wr_ports*mcp.frontend_duty_cycle;
+                wrBuf.tdp_stats = wrBuf.stats_t;
+        }
+        else
+        {
+                //init stats for runtime power (RTP)
+                //For each channel, each memory word need to check the address data to achieve best scheduling results.
+                //and this need to be done on all physical DIMMs in each logical memory DIMM *mcp.dataBusWidth/72
+                rob.stats_t.readAc.access  = XML->sys.mc.memory_reads *mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
+                rob.stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth*mcp.dataBusWidth/72;
+                rob.rtp_stats = rob.stats_t;
+
+                rdBuf.stats_t.readAc.access  = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first
+                rdBuf.stats_t.writeAc.access = XML->sys.mc.memory_reads*mcp.llcBlockSize*8.0/mcp.dataBusWidth;//support key word first
+                rdBuf.rtp_stats = rdBuf.stats_t;
+
+                wrBuf.stats_t.readAc.access  = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
+                wrBuf.stats_t.writeAc.access = XML->sys.mc.memory_writes*mcp.llcBlockSize*8.0/mcp.dataBusWidth;
+                wrBuf.rtp_stats = wrBuf.stats_t;
+        }
+
+        //each evaluation starts from reset per-array accumulators
+        rob.power_t.reset();
+        rdBuf.power_t.reset();
+        wrBuf.power_t.reset();
+        //reorder buffer: every read and write is charged a search, plus its own read or write
+        rob.power_t.readOp.dynamic += (rob.stats_t.readAc.access +
+                          rob.stats_t.writeAc.access)*rob.local_result.power.searchOp.dynamic
+                        + rob.stats_t.readAc.access * rob.local_result.power.readOp.dynamic
+                        + rob.stats_t.writeAc.access*rob.local_result.power.writeOp.dynamic;
+        rdBuf.power_t.readOp.dynamic += (rdBuf.stats_t.readAc.access*
+                        rdBuf.local_result.power.readOp.dynamic+
+                rdBuf.stats_t.writeAc.access*rdBuf.local_result.power.writeOp.dynamic);
+        wrBuf.power_t.readOp.dynamic += (wrBuf.stats_t.readAc.access*
+                        wrBuf.local_result.power.readOp.dynamic+
+                wrBuf.stats_t.writeAc.access*wrBuf.local_result.power.writeOp.dynamic);
+
+        if (is_tdp)
+        {
+                power = power + rob.power_t + rdBuf.power_t + wrBuf.power_t +
+                        (rob.local_result.power +
+                                        rdBuf.local_result.power +
+                                        wrBuf.local_result.power)*pppm_lkg;
+        }
+        else
+        {
+                rt_power = rt_power + rob.power_t + rdBuf.power_t + wrBuf.power_t +
+                        (rob.local_result.power +
+                                        rdBuf.local_result.power +
+                                        wrBuf.local_result.power)*pppm_lkg;
+                rt_power.readOp.dynamic = rt_power.readOp.dynamic + power.readOp.dynamic*0.1*mcp.clockRate*mcp.num_mcs*mcp.executionTime;
+        }
+}
+
+// Prints area, dynamic and leakage figures of the reorder buffer, read buffer and write
+// buffer to cout, indented by `indent` spaces (values by indent+2). plevel is not used here.
+void MCFrontEnd::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        string pad(indent, ' ');
+        string pad2(indent+2, ' ');
+        if (!is_tdp)
+        {
+                cout << pad << "Front End ROB:" << endl;
+                cout << pad2 << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << pad2 << "Peak Dynamic = " << frontendBuffer->rt_power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << pad2 << "Subthreshold Leakage = " << frontendBuffer->rt_power.readOp.leakage <<" W" << endl;
+                cout << pad2 << "Gate Leakage = " << frontendBuffer->rt_power.readOp.gate_leakage << " W" << endl;
+                cout <<endl;
+                cout << pad<< "Read Buffer:" << endl;
+                cout << pad2 << "Area = " << readBuffer->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << pad2 << "Peak Dynamic = " << readBuffer->rt_power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << pad2 << "Subthreshold Leakage = " << readBuffer->rt_power.readOp.leakage  << " W" << endl;
+                cout << pad2 << "Gate Leakage = " << readBuffer->rt_power.readOp.gate_leakage  << " W" << endl;
+                cout <<endl;
+                cout << pad << "Write Buffer:" << endl;
+                cout << pad2 << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << pad2 << "Peak Dynamic = " << writeBuffer->rt_power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << pad2 << "Subthreshold Leakage = " << writeBuffer->rt_power.readOp.leakage  << " W" << endl;
+                cout << pad2 << "Gate Leakage = " << writeBuffer->rt_power.readOp.gate_leakage  << " W" << endl;
+        }
+        else
+        {
+                cout << pad << "Front End ROB:" << endl;
+                cout << pad2 << "Area = " << frontendBuffer->area.get_area()*1e-6<< " mm^2" << endl;
+                cout << pad2 << "Peak Dynamic = " << frontendBuffer->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << pad2 << "Subthreshold Leakage = " << frontendBuffer->power.readOp.leakage <<" W" << endl;
+                cout << pad2 << "Gate Leakage = " << frontendBuffer->power.readOp.gate_leakage << " W" << endl;
+                cout << pad2 << "Runtime Dynamic = " << frontendBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+                //blank separator line between the sections
+                cout <<endl;
+                cout << pad<< "Read Buffer:" << endl;
+                cout << pad2 << "Area = " << readBuffer->area.get_area()*1e-6  << " mm^2" << endl;
+                cout << pad2 << "Peak Dynamic = " << readBuffer->power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << pad2 << "Subthreshold Leakage = " << readBuffer->power.readOp.leakage  << " W" << endl;
+                cout << pad2 << "Gate Leakage = " << readBuffer->power.readOp.gate_leakage  << " W" << endl;
+                cout << pad2 << "Runtime Dynamic = " << readBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+                cout <<endl;
+                cout << pad << "Write Buffer:" << endl;
+                cout << pad2 << "Area = " << writeBuffer->area.get_area() *1e-6 << " mm^2" << endl;
+                cout << pad2 << "Peak Dynamic = " << writeBuffer->power.readOp.dynamic*mcp.clockRate  << " W" << endl;
+                cout << pad2 << "Subthreshold Leakage = " << writeBuffer->power.readOp.leakage  << " W" << endl;
+                cout << pad2 << "Gate Leakage = " << writeBuffer->power.readOp.gate_leakage  << " W" << endl;
+                cout << pad2 << "Runtime Dynamic = " << writeBuffer->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+                cout <<endl;
+        }
+}
+
+
+MemoryController::MemoryController(ParseXML *XML_interface,InputParameter* interface_ip_, enum MemoryCtrl_type mc_type_)
+    : XML(XML_interface),
+      interface_ip(*interface_ip_),
+      mc_type(mc_type_),
+      frontend(0),
+      transecEngine(0),
+      PHY(0),
+      pipeLogic(0)
+{
+  /* Builds the front end, the backend (transecEngine) and, when configured, the PHY, and
+   * sums their areas into this controller's area. All computations are for a single MC. */
+  interface_ip.wire_is_mat_type = 2;
+  interface_ip.wire_os_mat_type = 2;
+  interface_ip.wt               = Global;
+  set_mc_param();
+  const bool has_phy = mcp.type==0 || (mcp.type==1&&mcp.withPHY);//evaluated after set_mc_param()
+  double total_area = area.get_area();
+  frontend = new MCFrontEnd(XML, &interface_ip, mcp, mc_type);
+  total_area = total_area + frontend->area.get_area();
+  transecEngine = new MCBackend(&interface_ip, mcp, mc_type);
+  total_area = total_area + transecEngine->area.get_area();
+  if (has_phy)
+  {
+          PHY = new MCPHY(&interface_ip, mcp, mc_type);
+          total_area = total_area + PHY->area.get_area();
+  }
+  area.set_area(total_area);
+  //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
+//  transecEngine.initialize(&interface_ip);
+//  transecEngine.peakDataTransferRate = XML->sys.mem.peak_transfer_rate;
+//  transecEngine.memDataWidth = dataBusWidth;
+//  transecEngine.memRank = XML->sys.mem.number_ranks;
+//  //transecEngine.memAccesses=XML->sys.mc.memory_accesses;
+//  //transecEngine.llcBlocksize=llcBlockSize;
+//  transecEngine.compute();
+//  transecEngine.area.set_area(XML->sys.mc.memory_channels_per_mc*transecEngine.area.get_area()) ;
+//  area.set_area(area.get_area()+ transecEngine.area.get_area());
+//  ///cout<<"area="<<area<<endl;
+////
+//  //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+//  PHY.initialize(&interface_ip);
+//  PHY.peakDataTransferRate = XML->sys.mem.peak_transfer_rate;
+//  PHY.memDataWidth = dataBusWidth;
+//  //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+//  //PHY.llcBlocksize=llcBlockSize;
+//  PHY.compute();
+//  PHY.area.set_area(XML->sys.mc.memory_channels_per_mc*PHY.area.get_area()) ;
+//  area.set_area(area.get_area()+ PHY.area.get_area());
+  ///cout<<"area="<<area<<endl;
+//
+//  interface_ip.pipeline_stages = 5;//normal memory controller has five stages in the pipeline.
+//  interface_ip.per_stage_vector = addressBusWidth + XML->sys.core[0].opcode_width + dataBusWidth;
+//  pipeLogic = new pipeline(is_default, &interface_ip);
+//  //pipeLogic.init_pipeline(is_default, &interface_ip);
+//  pipeLogic->compute_pipeline();
+//  area.set_area(area.get_area()+ pipeLogic->area.get_area()*1e-6);
+//  area.set_area((area.get_area()+mc_area*1e-6)*1.1);//placement and routing overhead
+//
+//
+////  //clock
+////  clockNetwork.init_wire_external(is_default, &interface_ip);
+////  clockNetwork.clk_area           =area*1.1;//10% of placement overhead. rule of thumb
+////  clockNetwork.end_wiring_level   =5;//toplevel metal
+////  clockNetwork.start_wiring_level =5;//toplevel metal
+////  clockNetwork.num_regs           = pipeLogic.tot_stage_vector;
+////  clockNetwork.optimize_wire();
+}
+/**
+ * Run the energy computation of each memory-controller sub-block and add
+ * the results to this object's totals.
+ *
+ * The front end and the transaction engine are always evaluated. The PHY is
+ * evaluated only when mcp.type is 0, or when mcp.type is 1 and mcp.withPHY
+ * is set.
+ *
+ * @param is_tdp  true:  accumulate the sub-blocks' `power` into `power`;
+ *                false: accumulate their `rt_power` into `rt_power`.
+ */
+void MemoryController::computeEnergy(bool is_tdp)
+{
+        frontend->computeEnergy(is_tdp);
+        transecEngine->computeEnergy(is_tdp);
+
+        // Single evaluation of the "this controller has a PHY" predicate.
+        const bool has_phy = (mcp.type == 0) || (mcp.type == 1 && mcp.withPHY);
+        if (has_phy)
+                PHY->computeEnergy(is_tdp);
+
+        if (!is_tdp)
+        {
+                // Runtime numbers go into rt_power only.
+                rt_power = rt_power + frontend->rt_power + transecEngine->rt_power;
+                if (has_phy)
+                        rt_power = rt_power + PHY->rt_power;
+                return;
+        }
+
+        // TDP numbers go into power only.
+        power = power + frontend->power + transecEngine->power;
+        if (has_phy)
+                power = power + PHY->power;
+}
+
+/**
+ * Print the memory controller's area and power report to cout.
+ *
+ * With is_tdp set, the report has a controller summary followed by one
+ * section each for the front end engine, the transaction engine and (when
+ * mcp.type is 0, or mcp.type is 1 with mcp.withPHY set) the PHY. Otherwise
+ * only a short controller summary is printed.
+ *
+ * Peak dynamic power is printed as readOp.dynamic * mcp.clockRate and
+ * runtime dynamic power as rt_power.readOp.dynamic / mcp.executionTime.
+ * Areas are scaled by 1e-6 and labelled mm^2.
+ *
+ * @param indent  number of spaces used for first-level indentation;
+ *                second-level lines use indent + 2.
+ * @param plevel  when greater than 2, the front end's own displayEnergy()
+ *                is called after the front end section.
+ * @param is_tdp  selects the full report (true) or the short one (false).
+ */
+void MemoryController::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
+{
+        const string pad(indent, ' ');
+        const string pad_next(indent+2, ' ');
+        bool long_channel = XML->sys.longer_channel_device;
+
+        // Both report flavours open with the same, unindented heading.
+        cout << "Memory Controller:" << endl;
+
+        if (!is_tdp)
+        {
+                // Short report: controller totals only, at the second indent level.
+                cout << pad_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl;
+                cout << pad_next << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl;
+                cout << pad_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl;
+                cout << pad_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+                cout << endl;
+                return;
+        }
+
+        // ---- Controller totals ----
+        cout << pad << "Area = " << area.get_area()*1e-6 << " mm^2" << endl;
+        cout << pad << "Peak Dynamic = " << power.readOp.dynamic*mcp.clockRate << " W" << endl;
+        cout << pad << "Subthreshold Leakage = "
+             << (long_channel ? power.readOp.longer_channel_leakage : power.readOp.leakage)
+             << " W" << endl;
+        cout << pad << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+        cout << pad << "Runtime Dynamic = " << rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout << endl;
+
+        // ---- Front end ----
+        cout << pad << "Front End Engine:" << endl;
+        cout << pad_next << "Area = " << frontend->area.get_area()*1e-6 << " mm^2" << endl;
+        cout << pad_next << "Peak Dynamic = " << frontend->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+        cout << pad_next << "Subthreshold Leakage = "
+             << (long_channel ? frontend->power.readOp.longer_channel_leakage : frontend->power.readOp.leakage)
+             << " W" << endl;
+        cout << pad_next << "Gate Leakage = " << frontend->power.readOp.gate_leakage << " W" << endl;
+        cout << pad_next << "Runtime Dynamic = " << frontend->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout << endl;
+        if (plevel > 2)
+        {
+                // NOTE(review): is_tdp is passed in the second argument position here;
+                // confirm this matches MCFrontEnd::displayEnergy's parameter order.
+                frontend->displayEnergy(indent+4,is_tdp);
+        }
+
+        // ---- Transaction engine ----
+        cout << pad << "Transaction Engine:" << endl;
+        cout << pad_next << "Area = " << transecEngine->area.get_area()*1e-6 << " mm^2" << endl;
+        cout << pad_next << "Peak Dynamic = " << transecEngine->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+        cout << pad_next << "Subthreshold Leakage = "
+             << (long_channel ? transecEngine->power.readOp.longer_channel_leakage : transecEngine->power.readOp.leakage)
+             << " W" << endl;
+        cout << pad_next << "Gate Leakage = " << transecEngine->power.readOp.gate_leakage << " W" << endl;
+        cout << pad_next << "Runtime Dynamic = " << transecEngine->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout << endl;
+
+        // ---- PHY (only for configurations that have one) ----
+        const bool has_phy = (mcp.type == 0) || (mcp.type == 1 && mcp.withPHY);
+        if (!has_phy)
+                return;
+
+        cout << pad << "PHY:" << endl;
+        cout << pad_next << "Area = " << PHY->area.get_area()*1e-6 << " mm^2" << endl;
+        cout << pad_next << "Peak Dynamic = " << PHY->power.readOp.dynamic*mcp.clockRate << " W" << endl;
+        cout << pad_next << "Subthreshold Leakage = "
+             << (long_channel ? PHY->power.readOp.longer_channel_leakage : PHY->power.readOp.leakage)
+             << " W" << endl;
+        cout << pad_next << "Gate Leakage = " << PHY->power.readOp.gate_leakage << " W" << endl;
+        cout << pad_next << "Runtime Dynamic = " << PHY->rt_power.readOp.dynamic/mcp.executionTime << " W" << endl;
+        cout << endl;
+}
+
+/**
+ * Fill in `mcp` from the parsed XML configuration (XML->sys and
+ * XML->sys.mc).
+ *
+ * Only mc_type == MC is handled; the flash-controller variant is kept
+ * below as commented-out code. For any other mc_type an error message is
+ * printed and the process terminates with a non-zero exit status, so the
+ * calling shell or script can detect the failure.
+ */
+void MemoryController::set_mc_param()
+{
+
+        if (mc_type==MC)
+        {
+          mcp.clockRate       =XML->sys.mc.mc_clock*2;//DDR double pumped
+          mcp.clockRate       *= 1e6; // presumably MHz -> Hz; confirm the unit of mc_clock
+          // Execution time = total cycles / (core clock rate * 1e6).
+          mcp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+
+          // Widths below are the configured value plus ceil(value/8).
+          mcp.llcBlockSize    =int(ceil(XML->sys.mc.llc_line_length/8.0))+XML->sys.mc.llc_line_length;//ecc overhead
+          mcp.dataBusWidth    =int(ceil(XML->sys.mc.databus_width/8.0)) + XML->sys.mc.databus_width;
+          mcp.addressBusWidth =int(ceil(XML->sys.mc.addressbus_width));//XML->sys.physical_address_width;
+          mcp.opcodeW         =16;
+          mcp.num_mcs         = XML->sys.mc.number_mcs;
+          mcp.num_channels    = XML->sys.mc.memory_channels_per_mc;
+          mcp.reads  = XML->sys.mc.memory_reads;
+          mcp.writes = XML->sys.mc.memory_writes;
+          //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
+          mcp.peakDataTransferRate = XML->sys.mc.peak_transfer_rate;
+          mcp.memRank = XML->sys.mc.number_ranks;
+          //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+          //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+          //PHY.llcBlocksize=llcBlockSize;
+          mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
+          mcp.LVDS = XML->sys.mc.LVDS;
+          mcp.type = XML->sys.mc.type;
+          mcp.withPHY = XML->sys.mc.withPHY;
+        }
+//	else if (mc_type==FLASHC)
+//	{
+//		mcp.clockRate       =XML->sys.flashc.mc_clock*2;//DDR double pumped
+//		mcp.clockRate       *= 1e6;
+//		mcp.executionTime   = XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+//
+//		mcp.llcBlockSize    =int(ceil(XML->sys.flashc.llc_line_length/8.0))+XML->sys.flashc.llc_line_length;//ecc overhead
+//		mcp.dataBusWidth    =int(ceil(XML->sys.flashc.databus_width/8.0)) + XML->sys.flashc.databus_width;
+//		mcp.addressBusWidth =int(ceil(XML->sys.flashc.addressbus_width));//XML->sys.physical_address_width;
+//		mcp.opcodeW         =16;
+//		mcp.num_mcs         = XML->sys.flashc.number_mcs;
+//		mcp.num_channels    = XML->sys.flashc.memory_channels_per_mc;
+//		mcp.reads  = XML->sys.flashc.memory_reads;
+//		mcp.writes = XML->sys.flashc.memory_writes;
+//		//+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers, Run the RTL code from OpenSparc.
+//		mcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
+//		mcp.memRank = XML->sys.flashc.number_ranks;
+//		//++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+//		//PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+//		//PHY.llcBlocksize=llcBlockSize;
+//		mcp.frontend_duty_cycle = 0.5;//for max power, the actual off-chip links is bidirectional but time shared
+//		mcp.LVDS = XML->sys.flashc.LVDS;
+//		mcp.type = XML->sys.flashc.type;
+//	}
+        else
+        {
+                cout<<"Unknown memory controller type: neither DRAM controller nor Flash controller" <<endl;
+                // Fatal configuration error: exit with a failure status
+                // (status 0 would report success to the caller).
+                exit(1);
+        }
+}
+
+/**
+ * Destroy the objects held in MC_arb, frontendBuffer, readBuffer and
+ * writeBuffer, resetting each pointer to null afterwards.
+ */
+MCFrontEnd ::~MCFrontEnd()
+{
+        // delete on a null pointer does nothing, so no guards are required.
+        delete MC_arb;
+        MC_arb = 0;
+
+        delete frontendBuffer;
+        frontendBuffer = 0;
+
+        delete readBuffer;
+        readBuffer = 0;
+
+        delete writeBuffer;
+        writeBuffer = 0;
+}
+
+/**
+ * Destroy the sub-blocks held in frontend, transecEngine, PHY and
+ * pipeLogic, resetting each pointer to null afterwards.
+ */
+MemoryController ::~MemoryController()
+{
+        // delete on a null pointer does nothing, so sub-blocks that were
+        // never created need no special handling.
+        delete frontend;
+        frontend = 0;
+
+        delete transecEngine;
+        transecEngine = 0;
+
+        delete PHY;
+        PHY = 0;
+
+        delete pipeLogic;
+        pipeLogic = 0;
+}
+