1 files changed, 744 insertions, 800 deletions
diff --git a/ext/mcpat/logic.cc b/ext/mcpat/logic.cc
index 11519d863..43823e77b 100644
--- a/ext/mcpat/logic.cc
+++ b/ext/mcpat/logic.cc
@@ -2,6 +2,7 @@
  *                                McPAT
  *                      SOFTWARE LICENSE AGREEMENT
  *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *            Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
  *                          All Rights Reserved
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,416 +26,500 @@
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ***************************************************************************/
 
+#include "common.h"
 #include "logic.h"
 
-
 //selection_logic
-selection_logic::selection_logic(
-    bool   _is_default,
-    int    win_entries_,
-    int    issue_width_,
-    const InputParameter *configure_interface,
-    enum Device_ty device_ty_,
-    enum Core_type core_ty_)
-    //const ParseXML *_XML_interface)
- :is_default(_is_default),
-  win_entries(win_entries_),
-  issue_width(issue_width_),
-  device_ty(device_ty_),
-  core_ty(core_ty_)
- {
-        //uca_org_t result2;
-        l_ip=*configure_interface;
-        local_result = init_interface(&l_ip);
-        //init_tech_params(l_ip.F_sz_um, false);
-        //win_entries=numIBEntries;//IQentries;
-                //issue_width=issueWidth;
-        selection_power();
-        double sckRation = g_tp.sckt_co_eff;
-        power.readOp.dynamic *= sckRation;
-        power.writeOp.dynamic *= sckRation;
-        power.searchOp.dynamic *= sckRation;
-
-        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
-        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-         }
-
-void selection_logic::selection_power()
-{//based on cost effective superscalar processor TR pp27-31
-  double Ctotal, Cor, Cpencode;
-  int num_arbiter;
-  double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
-
-  //TODO: the 0.8um process data is used.
-  WSelORn    	=  	12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
-  WSelORprequ   = 	50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process
-  WSelPn     	= 	12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process
-  WSelPp     	=  	18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process
-  WSelEnn    	=  	6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process
-  WSelEnp    	=  	12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process
-
-
-  Ctotal=0;
-  num_arbiter=1;
-  while(win_entries > 4)
-    {
-      win_entries = (int)ceil((double)win_entries / 4.0);
-      num_arbiter += win_entries;
-    }
-  //the 4-input OR logic to generate anyreq
-  Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def);
-  power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd;
-
-  //The total capacity of the 4-bit priority encoder
-  Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) +
-    2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) +
-    3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) +
-    4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic
-    2*4*gate_C(WSelEnn+WSelEnp,20.0)+
-    4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic
-    (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal
-
-  Ctotal += issue_width * num_arbiter*(Cor+Cpencode);
-
-  power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip
-  power.readOp.leakage = issue_width * num_arbiter *
-      (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
-       + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
-       + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
-       + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
-       + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
-                  )*g_tp.peri_global.Vdd;
-  power.readOp.gate_leakage = issue_width * num_arbiter *
-      (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
-       + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
-       + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
-       + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
-       + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
-        )*g_tp.peri_global.Vdd;
+selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default,
+                                 int _win_entries, int issue_width_,
+                                 const InputParameter *configure_interface,
+                                 string _name, double _accesses,
+                                 double clockRate_, enum Device_ty device_ty_,
+                                 enum Core_type core_ty_)
+    : McPATComponent(_xml_data), is_default(_is_default),
+      win_entries(_win_entries),
+      issue_width(issue_width_),
+      accesses(_accesses),
+      device_ty(device_ty_),
+      core_ty(core_ty_) {
+    clockRate = clockRate_;
+    name = _name;
+    l_ip = *configure_interface;
+    local_result = init_interface(&l_ip, name);
+}
+
+void selection_logic::computeArea() {
+    output_data.area = local_result.area;
 }
 
+void selection_logic::computeEnergy() {
+    //based on cost effective superscalar processor TR pp27-31
+    double Ctotal, Cor, Cpencode;
+    int num_arbiter;
+    double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
+
+    //the 0.8um process data is used.
+    //this was 10 micron for the 0.8 micron process
+    WSelORn	= 12.5 * l_ip.F_sz_um;
+    //this was 40 micron for the 0.8 micron process
+    WSelORprequ = 50 * l_ip.F_sz_um;
+    //this was 10mcron for the 0.8 micron process
+    WSelPn = 12.5 * l_ip.F_sz_um;
+    //this was 15 micron for the 0.8 micron process
+    WSelPp = 18.75 * l_ip.F_sz_um;
+    //this was 5 micron for the 0.8 micron process
+    WSelEnn	= 6.25 * l_ip.F_sz_um;
+    //this was 10 micron for the 0.8 micron process
+    WSelEnp	= 12.5 * l_ip.F_sz_um;
+
+    Ctotal = 0;
+    num_arbiter = 1;
+    while (win_entries > 4) {
+        win_entries = (int)ceil((double)win_entries / 4.0);
+        num_arbiter += win_entries;
+    }
+    //the 4-input OR logic to generate anyreq
+    Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
+    power.readOp.gate_leakage =
+        cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;
+
+    //The total capacity of the 4-bit priority encoder
+    Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
+        2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
+        3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
+        4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic
+        2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
+        4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
+        2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic
+        (2 * 4 + 2 * 3 + 2 * 2 + 2) *
+        gate_C(WSelPn + WSelPp, 10.0);//requests signal
+
+    Ctotal += issue_width * num_arbiter * (Cor + Cpencode);
+
+    //2 means the abitration signal need to travel round trip
+    power.readOp.dynamic =
+        Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2;
+    power.readOp.leakage = issue_width * num_arbiter *
+        (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
+         + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p
+         + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p
+         + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
+         + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals
+            ) * g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage = issue_width * num_arbiter *
+        (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p
+         + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p
+         + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p
+         + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic
+         + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals
+            ) * g_tp.peri_global.Vdd;
+    double sckRation = g_tp.sckt_co_eff;
+    power.readOp.dynamic *= sckRation;
+    power.writeOp.dynamic *= sckRation;
+    power.searchOp.dynamic *= sckRation;
+
+    double long_channel_device_reduction =
+        longer_channel_device_reduction(device_ty, core_ty);
+    power.readOp.longer_channel_leakage =
+        power.readOp.leakage * long_channel_device_reduction;
+
+    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
+    output_data.subthreshold_leakage_power = power.readOp.leakage;
+    output_data.gate_leakage_power = power.readOp.gate_leakage;
+    output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses;
+}
 
 dep_resource_conflict_check::dep_resource_conflict_check(
-        const InputParameter *configure_interface,
-        const CoreDynParam & dyn_p_,
-        int   compare_bits_,
-    bool   _is_default)
- :  l_ip(*configure_interface),
-    coredynp(dyn_p_),
-    compare_bits(compare_bits_),
-        is_default(_is_default)
-{
-        Wcompn    	=  	25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
-        Wevalinvp   = 	25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process
-        Wevalinvn   = 	100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process
-        Wcomppreequ =  	50 * l_ip.F_sz_um;//this was 40.0  micron for the 0.8 micron process
-        WNORn    	=  	6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process
-        WNORp    	=  	38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process
-
-        local_result = init_interface(&l_ip);
-
-        if (coredynp.core_ty==Inorder)
-                    compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator
-        else
-                compare_bits += 16 + 8 + 8;
-
-                conflict_check_power();
-        double sckRation = g_tp.sckt_co_eff;
-        power.readOp.dynamic *= sckRation;
-        power.writeOp.dynamic *= sckRation;
-        power.searchOp.dynamic *= sckRation;
+    XMLNode* _xml_data, const string _name,
+    const InputParameter *configure_interface,
+    const CoreParameters & dyn_p_, int compare_bits_,
+    double clockRate_, bool _is_default)
+    : McPATComponent(_xml_data), l_ip(*configure_interface),
+      coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) {
+
+    name = _name;
+    clockRate = clockRate_;
+    //this was 20.0 micron for the 0.8 micron process
+    Wcompn = 25 * l_ip.F_sz_um;
+    //this was 20.0 micron for the 0.8 micron process
+    Wevalinvp = 25 * l_ip.F_sz_um;
+    //this was 80.0 mcron for the 0.8 micron process
+    Wevalinvn = 100 * l_ip.F_sz_um;
+    //this was 40.0  micron for the 0.8 micron process
+    Wcomppreequ = 50 * l_ip.F_sz_um;
+    //this was 5.4 micron for the 0.8 micron process
+    WNORn =	6.75 * l_ip.F_sz_um;
+    //this was 30.5 micron for the 0.8 micron process
+    WNORp =	38.125 * l_ip.F_sz_um;
+
+    // To make CACTI happy.
+    l_ip.cache_sz = MIN_BUFFER_SIZE;
+    local_result = init_interface(&l_ip, name);
+
+    if (coredynp.core_ty == Inorder)
+        //TODO: opcode bits + log(shared resources) + REG TAG BITS -->
+        //opcode comparator
+        compare_bits += 16 + 8 + 8;
+    else
+        compare_bits += 16 + 8 + 8;
+
+    conflict_check_power();
+    double sckRation = g_tp.sckt_co_eff;
+    power.readOp.dynamic *= sckRation;
+    power.writeOp.dynamic *= sckRation;
+    power.searchOp.dynamic *= sckRation;
 
 }
 
-void dep_resource_conflict_check::conflict_check_power()
-{
-        double Ctotal;
-        int num_comparators;
-        num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
-        //When decode-width ==1, no dcl logic
+void dep_resource_conflict_check::conflict_check_power() {
+    double Ctotal;
+    int num_comparators;
+    //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for
+    //dest to dest comparision.
+    num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
+                           coredynp.decodeW);
 
-        Ctotal = num_comparators * compare_cap();
-        //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme);
+    Ctotal = num_comparators * compare_cap();
 
-        power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/;
-        power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);
+    power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd *
+        g_tp.peri_global.Vdd /*AF*/;
+    power.readOp.leakage = num_comparators * compare_bits * 2 *
+        simplified_nmos_leakage(Wcompn,  false);
 
-        double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
-        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-        power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
+    double long_channel_device_reduction =
+        longer_channel_device_reduction(Core_device, coredynp.core_ty);
+    power.readOp.longer_channel_leakage	=
+        power.readOp.leakage * long_channel_device_reduction;
+    power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
+        cmos_Ig_leakage(Wcompn, 0, 2, nmos);
 
 }
 
 /* estimate comparator power consumption (this comparator is similar
    to the tag-match structure in a CAM */
-double dep_resource_conflict_check::compare_cap()
-{
-  double c1, c2;
-
-  WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in.
-  /* bottom part of comparator */
-  c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+
-  drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def);
-
-  /* top part of comparator */
-  c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+
-                  drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) +  gate_C(WNORn + WNORp,10.0) +
-                  drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def);
-  return(c1 + c2);
+double dep_resource_conflict_check::compare_cap() {
+    double c1, c2;
+
+    //resize the big NOR gate at the DCL according to fan in.
+    WNORp = WNORp * compare_bits / 2.0;
+    /* bottom part of comparator */
+    c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
+                           drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
+        drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);
+
+    /* top part of comparator */
+    c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
+                           drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
+                           drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
+        gate_C(WNORn + WNORp, 10.0) +
+        drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits *
+        drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);
+    return(c1 + c2);
 
 }
 
 void dep_resource_conflict_check::leakage_feedback(double temperature)
 {
   l_ip.temp = (unsigned int)round(temperature/10.0)*10;
-  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
+  uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
 
   // This is part of conflict_check_power()
-  int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision.
-  power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn,  false);
-
-  double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
-  power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-  power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos);
+  // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest
+  // to dest comparison.
+  int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) -
+                             coredynp.decodeW);
+  power.readOp.leakage = num_comparators * compare_bits * 2 *
+      simplified_nmos_leakage(Wcompn,  false);
+
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Core_device, coredynp.core_ty);
+  power.readOp.longer_channel_leakage = power.readOp.leakage *
+      long_channel_device_reduction;
+  power.readOp.gate_leakage = num_comparators * compare_bits * 2 *
+      cmos_Ig_leakage(Wcompn, 0, 2, nmos);
 }
 
-//TODO: add inverter and transmission gate base DFF.
 
 DFFCell::DFFCell(
-                bool _is_dram,
-                double _WdecNANDn,
-                double _WdecNANDp,
-                double _cell_load,
-                const InputParameter *configure_interface)
-:is_dram(_is_dram),
-cell_load(_cell_load),
-WdecNANDn(_WdecNANDn),
-WdecNANDp(_WdecNANDp)
-{//this model is based on the NAND2 based DFF.
-                        l_ip=*configure_interface;
-//			area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um);
-                        area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def)
-                                + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def));
+    bool _is_dram,
+    double _WdecNANDn,
+    double _WdecNANDp,
+    double _cell_load,
+    const InputParameter *configure_interface)
+        : is_dram(_is_dram),
+        cell_load(_cell_load),
+        WdecNANDn(_WdecNANDn),
+        WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF.
+    l_ip = *configure_interface;
+    area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp,
+                                        g_tp.cell_h_def)
+                  + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn,
+                                      g_tp.cell_h_def));
 
 
 }
 
 
-double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out)
-{
-  double Ctotal = 0;
-  //printf("WdecNANDn = %E\n", WdecNANDn);
+double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
+    double Ctotal = 0;
 
-  /* part 1: drain cap of NAND gate */
-  Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
+    /* part 1: drain cap of NAND gate */
+    Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
 
-  /* part 2: gate cap of NAND gates */
-  Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
+    /* part 2: gate cap of NAND gates */
+    Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
 
-  return Ctotal;
+    return Ctotal;
 }
 
 
-void DFFCell::compute_DFF_cell()
-{
-        double c1, c2, c3, c4, c5, c6;
-           /* node 5 and node 6 are identical to node 1 in capacitance */
-           c1 = c5 = c6 = fpfp_node_cap(2, 1);
-           c2 = fpfp_node_cap(2, 3);
-           c3 = fpfp_node_cap(3, 2);
-           c4 = fpfp_node_cap(2, 2);
-
-           //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
-           clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
-           e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
-
-           /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
-           e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
-           e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
-           e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
-
-           /* static power */
-           e_switch.readOp.leakage +=  (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
-                                           + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
-           e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF
-                                           + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd;
-           //printf("leakage =%E\n",cmos_Ileak(1, is_dram) );
+void DFFCell::compute_DFF_cell() {
+    double c1, c2, c3, c4, c5, c6;
+    /* node 5 and node 6 are identical to node 1 in capacitance */
+    c1 = c5 = c6 = fpfp_node_cap(2, 1);
+    c2 = fpfp_node_cap(2, 3);
+    c3 = fpfp_node_cap(3, 2);
+    c4 = fpfp_node_cap(2, 2);
+
+    //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2
+    clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
+    e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) *
+        0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
+
+    /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */
+    e_keep_1.readOp.dynamic +=
+        c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
+    e_keep_0.readOp.dynamic +=
+        c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ;
+    e_clock.readOp.dynamic +=
+        clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;;
+
+    /* static power */
+    e_switch.readOp.leakage +=
+        (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) *
+         5//5 NAND2 and 1 NAND3 in a DFF
+         + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
+        g_tp.peri_global.Vdd;
+    e_switch.readOp.gate_leakage +=
+        (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) *
+         5//5 NAND2 and 1 NAND3 in a DFF
+         + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
+        g_tp.peri_global.Vdd;
 }
 
-Pipeline::Pipeline(
-                const InputParameter *configure_interface,
-                const CoreDynParam & dyn_p_,
-                enum Device_ty device_ty_,
-                bool _is_core_pipeline,
-                bool _is_default)
-: l_ip(*configure_interface),
-  coredynp(dyn_p_),
-  device_ty(device_ty_),
-  is_core_pipeline(_is_core_pipeline),
-  is_default(_is_default),
-  num_piperegs(0.0)
-
-  {
-        local_result = init_interface(&l_ip);
-        if (!coredynp.Embedded)
-                process_ind = true;
-        else
-                process_ind = false;
-        WNANDn = (process_ind)? 25 *   l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was  20 micron for the 0.8 micron process
-        WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was  30 micron for the 0.8 micron process
-        load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false);
-        compute();
+Pipeline::Pipeline(XMLNode* _xml_data,
+                   const InputParameter *configure_interface,
+                   const CoreParameters & dyn_p_,
+                   enum Device_ty device_ty_,
+                   bool _is_core_pipeline,
+                   bool _is_default)
+    : McPATComponent(_xml_data), l_ip(*configure_interface),
+      coredynp(dyn_p_), device_ty(device_ty_),
+      is_core_pipeline(_is_core_pipeline), is_default(_is_default),
+      num_piperegs(0.0) {
+    name = "Pipeline?";
+
+    local_result = init_interface(&l_ip, name);
+    if (!coredynp.Embedded) {
+        process_ind = true;
+    } else {
+        process_ind = false;
+    }
+    //this was  20 micron for the 0.8 micron process
+    WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;
+    //this was  30 micron for the 0.8 micron process
+    WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ *
+        pmos_to_nmos_sz_ratio();
+    load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
+    compute();
 
 }
 
-void Pipeline::compute()
-{
-        compute_stage_vector();
-        DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip);
-        pipe_reg.compute_DFF_cell();
-
-        double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
-        //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
-        //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
-        double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg;
-        double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
-        double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage;
-        power.readOp.dynamic	+=pipe_reg_power;
-        power.readOp.leakage	+=pipe_reg_leakage;
-        power.readOp.gate_leakage	+=pipe_reg_gate_leakage;
-        area.set_area(num_piperegs * pipe_reg.area.get_area());
-
-        double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty);
-        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-
-
-        double sckRation = g_tp.sckt_co_eff;
-        power.readOp.dynamic *= sckRation;
-        power.writeOp.dynamic *= sckRation;
-        power.searchOp.dynamic *= sckRation;
-        double macro_layout_overhead = g_tp.macro_layout_overhead;
+void Pipeline::compute() {
+    compute_stage_vector();
+    DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
+    pipe_reg.compute_DFF_cell();
+
+    double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic;
+    //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider
+    //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power.
+    double pipe_reg_power = num_piperegs *
+        (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic +
+         pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg;
+    double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage;
+    double pipe_reg_gate_leakage = num_piperegs *
+        pipe_reg.e_switch.readOp.gate_leakage;
+    power.readOp.dynamic	+= pipe_reg_power;
+    power.readOp.leakage	+= pipe_reg_leakage;
+    power.readOp.gate_leakage	+= pipe_reg_gate_leakage;
+    area.set_area(num_piperegs * pipe_reg.area.get_area());
+
+    double long_channel_device_reduction =
+        longer_channel_device_reduction(device_ty, coredynp.core_ty);
+    power.readOp.longer_channel_leakage	= power.readOp.leakage *
+        long_channel_device_reduction;
+
+
+    double sckRation = g_tp.sckt_co_eff;
+    power.readOp.dynamic *= sckRation;
+    power.writeOp.dynamic *= sckRation;
+    power.searchOp.dynamic *= sckRation;
+    double macro_layout_overhead = g_tp.macro_layout_overhead;
         if (!coredynp.Embedded)
-                area.set_area(area.get_area()*macro_layout_overhead);
-}
-
-void Pipeline::compute_stage_vector()
-{
-        double num_stages, tot_stage_vector, per_stage_vector;
-        int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length;
-        //Hthread = thread_clock_gated? 1:num_thread;
+                area.set_area(area.get_area() * macro_layout_overhead);
 
-  if (!is_core_pipeline)
-  {
-        num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput
-  }
-  else
-  {
-        if (coredynp.core_ty==Inorder)
-        {
-                /* assume 6 pipe stages and try to estimate bits per pipe stage */
-                /* pipe stage 0/IF */
-                num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads;
-                /* pipe stage IF/ID */
-                num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;
-                /* pipe stage IF/ThreadSEL */
-                if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states
-                /* pipe stage ID/EXE */
-                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads;
-                /* pipe stage EXE/MEM */
-                num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
-                /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
-                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/);
-//		/* pipe stage 5/6 */
-//		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
-//		/* pipe stage 6/7 */
-//		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/);
-//		/* pipe stage 7/8 */
-//		num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/);
-//		/* assume 50% extra in control signals (rule of thumb) */
-                num_stages=6;
+    output_data.area = area.get_area() / 1e6;
+    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
+    output_data.subthreshold_leakage_power = power.readOp.leakage;
+    output_data.gate_leakage_power = power.readOp.gate_leakage;
+    output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles;
+}
 
-        }
-        else
-        {
-                /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
-                /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
-
-                /* pipe stage 0/1F*/
-                num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC
-                /* pipe stage IF/ID */
-                num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID
-                /* pipe stage 1D/Renaming*/
-                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage.
-                /* pipe stage Renaming/wire_drive */
-                num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width);
-                /* pipe stage Renaming/IssueQ */
-                num_piperegs += coredynp.issueW*(coredynp.instruction_length  + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest
-                /* pipe stage IssueQ/Dispatch */
-                num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
-                /* pipe stage Dispatch/EXE */
-
-                num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
-                /* 2^opcode_length means the total decoded signal for the opcode*/
-                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
-                /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
-                num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);
-                /* pipe stage EXE/MEM, data need to be read/write, address*/
-                num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed
-                /* pipe stage MEM/WB; result data, writeback regs */
-                num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/);
-                /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
-                num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads;
-//		if (multithreaded)
-//		{
-//
-//		}
-                num_stages=12;
+void Pipeline::compute_stage_vector() {
+    double num_stages, tot_stage_vector, per_stage_vector;
+    int opcode_length = coredynp.x86 ?
+        coredynp.micro_opcode_length : coredynp.opcode_width;
+
+    if (!is_core_pipeline) {
+        //The number of pipeline stages are calculated based on the achievable
+        //throughput and required throughput
+        num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector;
+    } else {
+        if (coredynp.core_ty == Inorder) {
+            /* assume 6 pipe stages and try to estimate bits per pipe stage */
+            /* pipe stage 0/IF */
+            num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
+            /* pipe stage IF/ID */
+            num_piperegs += coredynp.fetchW *
+                (coredynp.instruction_length + coredynp.pc_width) *
+                coredynp.num_hthreads;
+            /* pipe stage IF/ThreadSEL */
+            if (coredynp.multithreaded) {
+                num_piperegs += coredynp.num_hthreads *
+                    coredynp.perThreadState; //8 bit thread states
+            }
+            /* pipe stage ID/EXE */
+            num_piperegs += coredynp.decodeW *
+                (coredynp.instruction_length + coredynp.pc_width +
+                 pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
+                coredynp.num_hthreads;
+            /* pipe stage EXE/MEM */
+            num_piperegs += coredynp.issueW *
+                (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 *
+                 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
+            /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/
+            num_piperegs += coredynp.issueW *
+                (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 *
+                 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/);
+            num_stages = 6;
+        } else {
+            /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
+            /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */
+
+            /* pipe stage 0/1F*/
+            num_piperegs +=
+                coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC
+            /* pipe stage IF/ID */
+            num_piperegs += coredynp.fetchW *
+                (coredynp.instruction_length + coredynp.pc_width) *
+                coredynp.num_hthreads;//PC is used to feed branch predictor in ID
+            /* pipe stage 1D/Renaming*/
+            num_piperegs += coredynp.decodeW *
+                (coredynp.instruction_length + coredynp.pc_width) *
+                coredynp.num_hthreads;//PC is for branch exe in later stage.
+            /* pipe stage Renaming/wire_drive */
+            num_piperegs += coredynp.decodeW *
+                (coredynp.instruction_length + coredynp.pc_width);
+            /* pipe stage Renaming/IssueQ */
+            //3*coredynp.phy_ireg_width means 2 sources and 1 dest
+            num_piperegs += coredynp.issueW *
+                (coredynp.instruction_length  + coredynp.pc_width + 3 *
+                 coredynp.phy_ireg_width) * coredynp.num_hthreads;
+            /* pipe stage IssueQ/Dispatch */
+            num_piperegs += coredynp.issueW *
+                (coredynp.instruction_length + 3 * coredynp.phy_ireg_width);
+            /* pipe stage Dispatch/EXE */
+
+            num_piperegs += coredynp.issueW *
+                (3 * coredynp.phy_ireg_width + coredynp.pc_width +
+                 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
+            /* 2^opcode_length means the total decoded signal for the opcode*/
+            num_piperegs += coredynp.issueW *
+                (2 * coredynp.int_data_width + pow(2.0, opcode_length)
+                 /*+2*powers (2,reg_length)*/);
+            /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/
+            num_piperegs += coredynp.issueW *
+                (2 * coredynp.int_data_width + pow(2.0, opcode_length)
+                 /*+2*powers (2,reg_length)*/);
+            /* pipe stage EXE/MEM, data need to be read/write, address*/
+            //memory Opcode still need to be passed
+            num_piperegs += coredynp.issueW *
+                (coredynp.int_data_width + coredynp.v_address_width +
+                 pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/);
+            /* pipe stage MEM/WB; result data, writeback regs */
+            num_piperegs += coredynp.issueW *
+                (coredynp.int_data_width + coredynp.phy_ireg_width
+                 /* powers (2,opcode_length) +
+                    (2,opcode_length)+2*powers (2,reg_length)*/);
+            /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/
+            num_piperegs += coredynp.commitW *
+                (coredynp.int_data_width + coredynp.v_address_width +
+                 coredynp.phy_ireg_width
+                 /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
+                coredynp.num_hthreads;
+            num_stages = 12;
 
         }
 
         /* assume 50% extra in control registers and interrupt registers (rule of thumb) */
         num_piperegs = num_piperegs * 1.5;
-        tot_stage_vector=num_piperegs;
-        per_stage_vector=tot_stage_vector/num_stages;
-
-        if (coredynp.core_ty==Inorder)
-        {
-                if (coredynp.pipeline_stages>6)
-                        num_piperegs= per_stage_vector*coredynp.pipeline_stages;
+        tot_stage_vector = num_piperegs;
+        per_stage_vector = tot_stage_vector / num_stages;
+
+        if (coredynp.core_ty == Inorder) {
+            if (coredynp.pipeline_stages > 6)
+                num_piperegs = per_stage_vector * coredynp.pipeline_stages;
+        } else { //OOO
+            if (coredynp.pipeline_stages > 12)
+                num_piperegs = per_stage_vector * coredynp.pipeline_stages;
         }
-        else//OOO
-        {
-                if (coredynp.pipeline_stages>12)
-                        num_piperegs= per_stage_vector*coredynp.pipeline_stages;
-        }
-  }
+    }
 
 }
 
-FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_)
-:XML(XML_interface),
- ithCore(ithCore_),
- interface_ip(*interface_ip_),
- coredynp(dyn_p_),
- fu_type(fu_type_)
-{
-    double area_t;//, leakage, gate_leakage;
+FunctionalUnit::FunctionalUnit(XMLNode* _xml_data,
+                               InputParameter* interface_ip_,
+                               const CoreParameters & _core_params,
+                               const CoreStatistics & _core_stats,
+                               enum FU_type fu_type_)
+    : McPATComponent(_xml_data),
+      interface_ip(*interface_ip_), core_params(_core_params),
+      core_stats(_core_stats), fu_type(fu_type_) {
+    double area_t;
+    double leakage;
+    double gate_leakage;
     double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
-        clockRate = coredynp.clockRate;
-        executionTime = coredynp.executionTime;
-
-        //XML_interface=_XML_interface;
-        uca_org_t result2;
-        result2 = init_interface(&interface_ip);
-        if (XML->sys.Embedded)
-        {
-                if (fu_type == FPU)
-                {
-                        num_fu=coredynp.num_fpus;
+    clockRate = core_params.clockRate;
+
+    uca_org_t result2;
+    // Temp name for the following function call
+    name = "Functional Unit";
+
+    result2 = init_interface(&interface_ip, name);
+
+        if (core_params.Embedded) {
+            if (fu_type == FPU) {
+                num_fu=core_params.num_fpus;
                         //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
                         area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number
                         //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60%
@@ -449,10 +534,8 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParam
                         per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ)
                         //FPU power from Sandia's processor sizing tech report
                         FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
-                }
-                else if (fu_type == ALU)
-                {
-                        num_fu=coredynp.num_alus;
+            } else if (fu_type == ALU) {
+                num_fu=core_params.num_alus;
                         area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
                         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
                         gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
@@ -462,10 +545,8 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParam
                         per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
                         FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
 
-                }
-                else if (fu_type == MUL)
-                {
-                        num_fu=coredynp.num_muls;
+            } else if (fu_type == MUL) {
+                num_fu=core_params.num_muls;
                         area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
                         leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
                         gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
@@ -474,197 +555,117 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParam
                         base_energy = 0;
                         per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
                         FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
-                }
-                else
-                {
+            } else {
                         cout<<"Unknown Functional Unit Type"<<endl;
                         exit(0);
                 }
                 per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy
+        } else {
+            if (fu_type == FPU) {
+                name = "Floating Point Unit(s)";
+                num_fu = core_params.num_fpus;
+                area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
+                                       90.0);//this is um^2
+                if (g_ip->F_sz_nm > 90)
+                    area_t = 8.47 * 1e6 *
+                        g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
+            leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+            gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+            //W The base energy of ALU average numbers from Intel 4G and
+            //773Mhz (Wattch)
+            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3;
+            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
+                            1.2);
+            per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
+            FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
+        } else if (fu_type == ALU) {
+            name = "Integer ALU(s)";
+            num_fu = core_params.num_alus;
+            //this is um^2 ALU + MUl
+            area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff;
+            leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+            gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+            //W The base energy of ALU average numbers from Intel 4G and 773Mhz
+            //(Wattch)
+            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3;
+            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
+                            1.2);
+            per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
+            FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
+        } else if (fu_type == MUL) {
+            name = "Multiply/Divide Unit(s)";
+            num_fu = core_params.num_muls;
+            //this is um^2 ALU + MUl
+            area_t = 280 * 260 * 2 * 3 *
+                g_tp.scaling_factor.logic_scaling_co_eff;
+            leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
+            gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
+            //W The base energy of ALU average numbers from Intel 4G and 773Mhz
+            //(Wattch)
+            base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2;
+            base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 /
+                            1.2);
+            per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
+            FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
+        } else {
+            cout << "Unknown Functional Unit Type" << endl;
+            exit(0);
         }
-        else
-        {
-                if (fu_type == FPU)
-                {
-                        num_fu=coredynp.num_fpus;
-                        //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
-                        area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2
-                        if (g_ip->F_sz_nm>90)
-                                area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2
-                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
-                        gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
-                        //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles.
-                        base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
-                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
-                        per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ)
-                        FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
-                }
-                else if (fu_type == ALU)
-                {
-                        num_fu=coredynp.num_alus;
-                        area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
-                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
-                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
-                        base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
-                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
-                        per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ)
-                        FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU
+    }
 
-                }
-                else if (fu_type == MUL)
-                {
-                        num_fu=coredynp.num_muls;
-                        area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl
-                        leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W
-                        gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;
-                        base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch)
-                        base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
-                        per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch
-                        FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data
-                }
-                else
-                {
-                        cout<<"Unknown Functional Unit Type"<<endl;
-                        exit(0);
-                }
-        }
-        //IEXEU, simple ALU and FPU
-        //  double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation.
-        //
-        //  C_ALU	  = 0.025e-9;//F
-        //  C_EXEU  = 0.05e-9; //F
-        //  C_FPU	  = 0.35e-9;//F
     area.set_area(area_t*num_fu);
-    leakage *= num_fu;
-    gate_leakage *=num_fu;
-        double macro_layout_overhead = g_tp.macro_layout_overhead;
-//	if (!XML->sys.Embedded)
-                area.set_area(area.get_area()*macro_layout_overhead);
-}
-
-void FunctionalUnit::computeEnergy(bool is_tdp)
-{
-        double pppm_t[4]    = {1,1,1,1};
-        double FU_duty_cycle;
-        if (is_tdp)
-        {
-
-
-                set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction.
-                if (fu_type == FPU)
-                {
-                        stats_t.readAc.access = num_fu;
-                        tdp_stats = stats_t;
-                        FU_duty_cycle = coredynp.FPU_duty_cycle;
-                }
-                else if (fu_type == ALU)
-                {
-                        stats_t.readAc.access = 1*num_fu;
-                        tdp_stats = stats_t;
-                        FU_duty_cycle = coredynp.ALU_duty_cycle;
-                }
-                else if (fu_type == MUL)
-                {
-                        stats_t.readAc.access = num_fu;
-                        tdp_stats = stats_t;
-                        FU_duty_cycle = coredynp.MUL_duty_cycle;
-                }
-
-            //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access;
-            power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate;
-                double sckRation = g_tp.sckt_co_eff;
-                power.readOp.dynamic *= sckRation*FU_duty_cycle;
-                power.writeOp.dynamic *= sckRation;
-                power.searchOp.dynamic *= sckRation;
-
-            power.readOp.leakage = leakage;
-            power.readOp.gate_leakage = gate_leakage;
-            double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
-            power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-
-        }
-        else
-        {
-                if (fu_type == FPU)
-                {
-                        stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses;
-                        rtp_stats = stats_t;
-                }
-                else if (fu_type == ALU)
-                {
-                        stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses;
-                        rtp_stats = stats_t;
-                }
-                else if (fu_type == MUL)
-                {
-                        stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses;
-                        rtp_stats = stats_t;
-                }
-
-            //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access;
-            rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime;
-                double sckRation = g_tp.sckt_co_eff;
-                rt_power.readOp.dynamic *= sckRation;
-                rt_power.writeOp.dynamic *= sckRation;
-                rt_power.searchOp.dynamic *= sckRation;
-
-        }
-
-
+    power.readOp.leakage = leakage * num_fu;
+    power.readOp.gate_leakage = gate_leakage * num_fu;
+
+    double long_channel_device_reduction =
+        longer_channel_device_reduction(Core_device, core_params.core_ty);
+    power.readOp.longer_channel_leakage	=
+        power.readOp.leakage * long_channel_device_reduction;
+    double macro_layout_overhead = g_tp.macro_layout_overhead;
+    area.set_area(area.get_area()*macro_layout_overhead);
 }
 
-void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
-{
-        string indent_str(indent, ' ');
-        string indent_str_next(indent+2, ' ');
-        bool long_channel = XML->sys.longer_channel_device;
-
-//	cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl;
-        if (is_tdp)
-        {
-                if (fu_type == FPU)
-                {
-                        cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl;
-                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
-                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
-//			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
-                        cout << indent_str_next<< "Subthreshold Leakage = "
-                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
-                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
-                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
-                        cout <<endl;
-                }
-                else if (fu_type == ALU)
-                {
-                        cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl;
-                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
-                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
-//			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
-                        cout << indent_str_next<< "Subthreshold Leakage = "
-                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
-                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
-                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
-                        cout <<endl;
-                }
-                else if (fu_type == MUL)
-                {
-                        cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl;
-                        cout << indent_str_next << "Area = " << area.get_area()*1e-6  << " mm^2" << endl;
-                        cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate  << " W" << endl;
-//			cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage  << " W" << endl;
-                        cout << indent_str_next<< "Subthreshold Leakage = "
-                                                << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
-                        cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage  << " W" << endl;
-                        cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
-                        cout <<endl;
-
-                }
+void FunctionalUnit::computeEnergy() {
+    double pppm_t[4]    = {1, 1, 1, 1};
+    double FU_duty_cycle;
+    double sckRation = g_tp.sckt_co_eff;
+
+    // TDP power calculation
+    //2 means two source operands needs to be passed for each int instruction.
+    set_pppm(pppm_t, 2, 2, 2, 2);
+    tdp_stats.readAc.access = num_fu;
+    if (fu_type == FPU) {
+        FU_duty_cycle = core_stats.FPU_duty_cycle;
+    } else if (fu_type == ALU) {
+        FU_duty_cycle = core_stats.ALU_duty_cycle;
+    } else if (fu_type == MUL) {
+        FU_duty_cycle = core_stats.MUL_duty_cycle;
+    }
 
-        }
-        else
-        {
-        }
+    power.readOp.dynamic =
+        per_access_energy * tdp_stats.readAc.access + base_energy / clockRate;
+    power.readOp.dynamic *= sckRation * FU_duty_cycle;
+
+    // Runtime power calculation
+    if (fu_type == FPU) {
+        rtp_stats.readAc.access = core_stats.fpu_accesses;
+    } else if (fu_type == ALU) {
+        rtp_stats.readAc.access = core_stats.ialu_accesses;
+    } else if (fu_type == MUL) {
+        rtp_stats.readAc.access = core_stats.mul_accesses;
+    }
 
+    rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access +
+        base_energy * execution_time;
+    rt_power.readOp.dynamic *= sckRation;
+
+    output_data.area = area.get_area() / 1e6;
+    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
+    output_data.subthreshold_leakage_power =
+        (longer_channel_device) ? power.readOp.longer_channel_leakage :
+        power.readOp.leakage;
+    output_data.gate_leakage_power = power.readOp.gate_leakage;
+    output_data.runtime_dynamic_energy = rt_power.readOp.dynamic;
 }
 
 void FunctionalUnit::leakage_feedback(double temperature)
@@ -672,7 +673,8 @@ void FunctionalUnit::leakage_feedback(double temperature)
   // Update the temperature and initialize the global interfaces.
   interface_ip.temp = (unsigned int)round(temperature/10.0)*10;
 
-  uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy
+  // init_result is dummy
+  uca_org_t init_result = init_interface(&interface_ip, name);
 
   // This is part of FunctionalUnit()
   double area_t, leakage, gate_leakage;
@@ -706,277 +708,220 @@ void FunctionalUnit::leakage_feedback(double temperature)
 
   power.readOp.leakage = leakage*num_fu;
   power.readOp.gate_leakage = gate_leakage*num_fu;
-  power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty);
+  power.readOp.longer_channel_leakage =
+      longer_channel_device_reduction(Core_device, core_params.core_ty);
 }
 
-UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_,  bool embedded_)
-:XML(XML_interface),
- ithCore(ithCore_),
- interface_ip(*interface_ip_),
- coredynp(dyn_p_),
- core_ty(coredynp.core_ty),
- embedded(XML->sys.Embedded),
- pipeline_stage(coredynp.pipeline_stages),
- num_hthreads(coredynp.num_hthreads),
- issue_width(coredynp.issueW),
- exist(exist_)
-// is_default(_is_default)
-{
-        if (!exist) return;
-        double undifferentiated_core=0;
-        double core_tx_density=0;
-        double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_,
+                       const CoreParameters & dyn_p_,
+                       bool exist_)
+        : McPATComponent(_xml_data),
+        interface_ip(*interface_ip_), coredynp(dyn_p_),
+        core_ty(coredynp.core_ty), embedded(coredynp.Embedded),
+        pipeline_stage(coredynp.pipeline_stages),
+        num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
+        exist(exist_) {
+    if (!exist) return;
+
+    name = "Undifferentiated Core";
+    clockRate = coredynp.clockRate;
+
+    double undifferentiated_core = 0;
+    double core_tx_density = 0;
+    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
         double undifferentiated_core_coe;
-        //XML_interface=_XML_interface;
-        uca_org_t result2;
-        result2 = init_interface(&interface_ip);
-
-        //Compute undifferentiated core area at 90nm.
-        if (embedded==false)
-        {
-                //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
-                if (core_ty==OOO)
-                {
-                        //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO
-                        undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0;
-                }
-                else if (core_ty==Inorder)
-                {
-                        //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder
-                        undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0;
-                }
-                else
-                {
-                        cout<<"invalid core type"<<endl;
-                        exit(0);
-                }
-                undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716);
+    uca_org_t result2;
+    result2 = init_interface(&interface_ip, name);
+
+    //Compute undifferentiated core area at 90nm.
+    if (embedded == false) {
+        //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements
+        if (core_ty == OOO) {
+            undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ?
+                (3.57 * log(pipeline_stage) - 1.2643) : 0;
+        } else if (core_ty == Inorder) {
+            undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ?
+                (-2.19 * log(pipeline_stage) + 6.55) : 0;
+        } else {
+            cout << "invalid core type" << endl;
+            exit(0);
         }
-        else
-        {
-                //Based on the results in paper "parametrized processor models" Sandia Labs
-                if (XML->sys.opt_clockrate)
+        undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
+    } else {
+        //Based on the results in paper "parametrized processor models" Sandia Labs
+                if (opt_for_clk)
                         undifferentiated_core_coe = 0.05;
                 else
                         undifferentiated_core_coe = 0;
-                undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe;
-                undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426);
-        }
-
-        undifferentiated_core 		    *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2
-        core_tx_density                 = g_tp.scaling_factor.core_tx_density;
-        //undifferentiated_core 		    = 3*1e6;
-        //undifferentiated_core			*= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*;
-        power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
-        power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;
-
-        double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
-        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-        area.set_area(undifferentiated_core);
-
-        scktRatio = g_tp.sckt_co_eff;
-        power.readOp.dynamic *= scktRatio;
-        power.writeOp.dynamic *= scktRatio;
-        power.searchOp.dynamic *= scktRatio;
-        macro_PR_overhead = g_tp.macro_layout_overhead;
-        area.set_area(area.get_area()*macro_PR_overhead);
-
-
-
-//		double vt=g_tp.peri_global.Vth;
-//		double velocity_index=1.1;
-//		double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false);
-//		double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in;
-//		double w_nmos=g_tp.min_w_nmos_;
-//		double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;
-//		double i_on_n=1.0;
-//		double i_on_p=1.0;
-//		double i_on_n_in=1.0;
-//		double i_on_p_in=1;
-//		double vdd=g_tp.peri_global.Vdd;
-
-//		power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
-//		power.readOp.dynamic=c_out*vdd*vdd/2;
-
-//		cout<<power.readOp.dynamic << "dynamic" <<endl;
-//		cout<<power.readOp.sc << "sc" << endl;
-
-//		power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
-//		power.readOp.dynamic=c_out*vdd*vdd/2;
-//
-//		cout<<power.readOp.dynamic << "dynamic" <<endl;
-//		cout<<power.readOp.sc << "sc" << endl;
-
-
-
-}
-
-
-void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp)
-{
-        string indent_str(indent, ' ');
-        string indent_str_next(indent+2, ' ');
-        bool long_channel = XML->sys.longer_channel_device;
-
-        if (is_tdp)
-        {
-                cout << indent_str << "UndiffCore:" << endl;
-                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
-                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
-                //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
-                cout << indent_str_next<< "Subthreshold Leakage = "
-                                        << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl;
-                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
-                //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
-                cout <<endl;
-        }
-        else
-        {
-                cout << indent_str << "UndiffCore:" << endl;
-                cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl;
-                cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl;
-                cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl;
-                cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
-                //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl;
-                cout <<endl;
-        }
+                undifferentiated_core = (0.4109 * pipeline_stage - 0.776) *
+                    undifferentiated_core_coe;
+                undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
+    }
 
+    undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
+        1e6;//change from mm^2 to um^2
+    core_tx_density                 = g_tp.scaling_factor.core_tx_density;
+    power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W
+    power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;
+
+    double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty);
+    power.readOp.longer_channel_leakage	=
+        power.readOp.leakage * long_channel_device_reduction;
+    area.set_area(undifferentiated_core);
+
+    scktRatio = g_tp.sckt_co_eff;
+    power.readOp.dynamic *= scktRatio;
+    power.writeOp.dynamic *= scktRatio;
+    power.searchOp.dynamic *= scktRatio;
+    macro_PR_overhead = g_tp.macro_layout_overhead;
+    area.set_area(area.get_area()*macro_PR_overhead);
+
+    output_data.area = area.get_area() / 1e6;
+    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
+    output_data.subthreshold_leakage_power =
+        longer_channel_device ? power.readOp.longer_channel_leakage :
+        power.readOp.leakage;
+    output_data.gate_leakage_power = power.readOp.gate_leakage;
 }
 
-inst_decoder::inst_decoder(
-                bool   _is_default,
-                const InputParameter *configure_interface,
-                int opcode_length_,
-                int num_decoders_,
-                bool x86_,
-            enum Device_ty device_ty_,
-            enum Core_type core_ty_)
-:is_default(_is_default),
- opcode_length(opcode_length_),
- num_decoders(num_decoders_),
- x86(x86_),
- device_ty(device_ty_),
- core_ty(core_ty_)
- {
-                        /*
-                         * Instruction decoder is different from n to 2^n decoders
-                         * that are commonly used in row decoders in memory arrays.
-                         * The RISC instruction decoder is typically a very simple device.
-                         * We can decode an instruction by simply
-                         * separating the machine word into small parts using wire slices
-                         * The RISC instruction decoder can be approximate by the n to 2^n decoders,
-                         * although this approximation usually underestimate power since each decoded
-                         * instruction normally has more than 1 active signal.
-                         *
-                         * However, decoding a CISC instruction word is much more difficult
-                         * than the RISC case. A CISC decoder is typically set up as a state machine.
-                         * The machine reads the opcode field to determine
-                         * what type of instruction it is,
-                         * and where the other data values are.
-                         * The instruction word is read in piece by piece,
-                         * and decisions are made at each stage as to
-                         * how the remainder of the instruction word will be read.
-                         * (sequencer and ROM are usually needed)
-                         * An x86 decoder can be even more complex since
-                         * it involve  both decoding instructions into u-ops and
-                         * merge u-ops when doing micro-ops fusion.
-                         */
-                        bool is_dram=false;
-                        double pmos_to_nmos_sizing_r;
-                        double load_nmos_width, load_pmos_width;
-                        double C_driver_load, R_wire_load;
-                        Area cell;
-
-                        l_ip=*configure_interface;
-                        local_result = init_interface(&l_ip);
-                        cell.h =g_tp.cell_h_def;
-                        cell.w =g_tp.cell_h_def;
-
-                        num_decoder_segments = (int)ceil(opcode_length/18.0);
-                        if (opcode_length > 18)	opcode_length = 18;
-                        num_decoded_signals= (int)pow(2.0,opcode_length);
-                        pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
-                        load_nmos_width=g_tp.max_w_nmos_ /2;
-                        load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
-                        C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited
-                        R_wire_load   = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
-
-                        final_dec = new Decoder(
-                                        num_decoded_signals,
-                                        false,
-                                        C_driver_load,
-                                        R_wire_load,
-                                        false/*is_fa*/,
-                                        false/*is_dram*/,
-                                        false/*wl_tr*/, //to use peri device
-                                        cell);
-
-                        PredecBlk * predec_blk1 = new PredecBlk(
-                                        num_decoded_signals,
-                                        final_dec,
-                                        0,//Assuming predec and dec are back to back
-                                        0,
-                                        1,//Each Predec only drives one final dec
-                                        false/*is_dram*/,
-                                        true);
-                        PredecBlk * predec_blk2 = new PredecBlk(
-                                        num_decoded_signals,
-                                        final_dec,
-                                        0,//Assuming predec and dec are back to back
-                                        0,
-                                        1,//Each Predec only drives one final dec
-                                        false/*is_dram*/,
-                                        false);
-
-                        PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
-                        PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
-
-                        pre_dec            = new Predec(predec_blk_drv1, predec_blk_drv2);
-
-                        double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders;
-                        //double w_decoder    = area_decoder / area.get_h();
-                        double area_pre_dec = (predec_blk_drv1->area.get_area() +
-                                        predec_blk_drv2->area.get_area() +
-                                        predec_blk1->area.get_area() +
-                                        predec_blk2->area.get_area())*
-                                        num_decoder_segments*num_decoders;
-                        area.set_area(area.get_area()+ area_decoder + area_pre_dec);
-                        double macro_layout_overhead   = g_tp.macro_layout_overhead;
-                        double chip_PR_overhead        = g_tp.chip_layout_overhead;
-                        area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);
-
-                        inst_decoder_delay_power();
-
-                        double sckRation = g_tp.sckt_co_eff;
-                        power.readOp.dynamic *= sckRation;
-                        power.writeOp.dynamic *= sckRation;
-                        power.searchOp.dynamic *= sckRation;
-
-                        double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty);
-                        power.readOp.longer_channel_leakage	= power.readOp.leakage*long_channel_device_reduction;
-
+InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name,
+                                       bool _is_default,
+                                       const InputParameter *configure_interface,
+                                       int opcode_length_, int num_decoders_,
+                                       bool x86_,
+                                       double clockRate_,
+                                       enum Device_ty device_ty_,
+                                       enum Core_type core_ty_)
+    : McPATComponent(_xml_data), is_default(_is_default),
+      opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_),
+      device_ty(device_ty_), core_ty(core_ty_) {
+    /*
+     * Instruction decoder is different from n to 2^n decoders
+     * that are commonly used in row decoders in memory arrays.
+     * The RISC instruction decoder is typically a very simple device.
+     * We can decode an instruction by simply
+     * separating the machine word into small parts using wire slices
+     * The RISC instruction decoder can be approximate by the n to 2^n decoders,
+     * although this approximation usually underestimate power since each decoded
+     * instruction normally has more than 1 active signal.
+     *
+     * However, decoding a CISC instruction word is much more difficult
+     * than the RISC case. A CISC decoder is typically set up as a state machine.
+     * The machine reads the opcode field to determine
+     * what type of instruction it is,
+     * and where the other data values are.
+     * The instruction word is read in piece by piece,
+     * and decisions are made at each stage as to
+     * how the remainder of the instruction word will be read.
+     * (sequencer and ROM are usually needed)
+     * An x86 decoder can be even more complex since
+     * it involve  both decoding instructions into u-ops and
+     * merge u-ops when doing micro-ops fusion.
+     */
+    name = _name;
+    clockRate = clockRate_;
+    bool is_dram = false;
+    double pmos_to_nmos_sizing_r;
+    double load_nmos_width, load_pmos_width;
+    double C_driver_load, R_wire_load;
+    Area cell;
+
+    l_ip = *configure_interface;
+    local_result = init_interface(&l_ip, name);
+    cell.h = g_tp.cell_h_def;
+    cell.w = g_tp.cell_h_def;
+
+    num_decoder_segments = (int)ceil(opcode_length / 18.0);
+    if (opcode_length > 18)	opcode_length = 18;
+    num_decoded_signals = (int)pow(2.0, opcode_length);
+    pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+    load_nmos_width = g_tp.max_w_nmos_ / 2;
+    load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
+    C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram);
+    R_wire_load   = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
+
+    final_dec = new Decoder(
+        num_decoded_signals,
+        false,
+        C_driver_load,
+        R_wire_load,
+        false/*is_fa*/,
+        false/*is_dram*/,
+        false/*wl_tr*/, //to use peri device
+        cell);
+
+    PredecBlk * predec_blk1 = new PredecBlk(
+        num_decoded_signals,
+        final_dec,
+        0,//Assuming predec and dec are back to back
+        0,
+        1,//Each Predec only drives one final dec
+        false/*is_dram*/,
+        true);
+    PredecBlk * predec_blk2 = new PredecBlk(
+        num_decoded_signals,
+        final_dec,
+        0,//Assuming predec and dec are back to back
+        0,
+        1,//Each Predec only drives one final dec
+        false/*is_dram*/,
+        false);
+
+    PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
+    PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
+
+    pre_dec            = new Predec(predec_blk_drv1, predec_blk_drv2);
+
+    double area_decoder = final_dec->area.get_area() * num_decoded_signals *
+        num_decoder_segments * num_decoders;
+    //double w_decoder    = area_decoder / area.get_h();
+    double area_pre_dec = (predec_blk_drv1->area.get_area() +
+                           predec_blk_drv2->area.get_area() +
+                           predec_blk1->area.get_area() +
+                           predec_blk2->area.get_area()) *
+                          num_decoder_segments * num_decoders;
+    area.set_area(area.get_area() + area_decoder + area_pre_dec);
+    double macro_layout_overhead   = g_tp.macro_layout_overhead;
+    double chip_PR_overhead        = g_tp.chip_layout_overhead;
+    area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead);
+
+    inst_decoder_delay_power();
+
+    double sckRation = g_tp.sckt_co_eff;
+    power.readOp.dynamic *= sckRation;
+    power.writeOp.dynamic *= sckRation;
+    power.searchOp.dynamic *= sckRation;
+
+    double long_channel_device_reduction =
+        longer_channel_device_reduction(device_ty, core_ty);
+    power.readOp.longer_channel_leakage	= power.readOp.leakage *
+        long_channel_device_reduction;
+
+    output_data.area = area.get_area() / 1e6;
+    output_data.peak_dynamic_power = power.readOp.dynamic * clockRate;
+    output_data.subthreshold_leakage_power = power.readOp.leakage;
+    output_data.gate_leakage_power = power.readOp.gate_leakage;
 }
 
-void inst_decoder::inst_decoder_delay_power()
-{
+void InstructionDecoder::inst_decoder_delay_power() {
 
-        double dec_outrisetime;
-        double inrisetime=0, outrisetime;
-        double pppm_t[4]    = {1,1,1,1};
-        double squencer_passes = x86?2:1;
+    double dec_outrisetime;
+    double inrisetime = 0, outrisetime;
+    double pppm_t[4]    = {1, 1, 1, 1};
+    double squencer_passes = x86 ? 2 : 1;
 
-        outrisetime = pre_dec->compute_delays(inrisetime);
-        dec_outrisetime = final_dec->compute_delays(outrisetime);
-        set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
-    power = power + pre_dec->power*pppm_t;
+    outrisetime = pre_dec->compute_delays(inrisetime);
+    dec_outrisetime = final_dec->compute_delays(outrisetime);
+    set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments);
+    power = power + pre_dec->power * pppm_t;
     set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals,
-                num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
-    power = power + final_dec->power*pppm_t;
+             num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments);
+    power = power + final_dec->power * pppm_t;
 }
-void inst_decoder::leakage_feedback(double temperature)
-{
+
+void InstructionDecoder::leakage_feedback(double temperature) {
   l_ip.temp = (unsigned int)round(temperature/10.0)*10;
-  uca_org_t init_result = init_interface(&l_ip); // init_result is dummy
+  uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy
 
   final_dec->leakage_feedback(temperature);
   pre_dec->leakage_feedback(temperature);
@@ -1000,15 +945,14 @@ void inst_decoder::leakage_feedback(double temperature)
   power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction;
 }
 
-inst_decoder::~inst_decoder()
-{
-          local_result.cleanup();
+InstructionDecoder::~InstructionDecoder() {
+    local_result.cleanup();
 
-          delete final_dec;
+    delete final_dec;
 
-          delete pre_dec->blk1;
-          delete pre_dec->blk2;
-          delete pre_dec->drv1;
-          delete pre_dec->drv2;
-          delete pre_dec;
+    delete pre_dec->blk1;
+    delete pre_dec->blk2;
+    delete pre_dec->drv1;
+    delete pre_dec->drv2;
+    delete pre_dec;
 }