diff options
Diffstat (limited to 'ext/mcpat/logic.cc')
-rw-r--r-- | ext/mcpat/logic.cc | 1544 |
1 files changed, 744 insertions, 800 deletions
diff --git a/ext/mcpat/logic.cc b/ext/mcpat/logic.cc index 11519d863..43823e77b 100644 --- a/ext/mcpat/logic.cc +++ b/ext/mcpat/logic.cc @@ -2,6 +2,7 @@ * McPAT * SOFTWARE LICENSE AGREEMENT * Copyright 2012 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2010-2013 Advanced Micro Devices, Inc. * All Rights Reserved * * Redistribution and use in source and binary forms, with or without @@ -25,416 +26,500 @@ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ***************************************************************************/ +#include "common.h" #include "logic.h" - //selection_logic -selection_logic::selection_logic( - bool _is_default, - int win_entries_, - int issue_width_, - const InputParameter *configure_interface, - enum Device_ty device_ty_, - enum Core_type core_ty_) - //const ParseXML *_XML_interface) - :is_default(_is_default), - win_entries(win_entries_), - issue_width(issue_width_), - device_ty(device_ty_), - core_ty(core_ty_) - { - //uca_org_t result2; - l_ip=*configure_interface; - local_result = init_interface(&l_ip); - //init_tech_params(l_ip.F_sz_um, false); - //win_entries=numIBEntries;//IQentries; - //issue_width=issueWidth; - selection_power(); - double sckRation = g_tp.sckt_co_eff; - power.readOp.dynamic *= sckRation; - power.writeOp.dynamic *= sckRation; - power.searchOp.dynamic *= sckRation; - - double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - } - -void selection_logic::selection_power() -{//based on cost effective superscalar processor TR pp27-31 - double Ctotal, Cor, Cpencode; - int num_arbiter; - double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp; - - //TODO: the 0.8um process data is used. - WSelORn = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process - WSelORprequ = 50 * l_ip.F_sz_um;//this was 40 micron for the 0.8 micron process - WSelPn = 12.5 * l_ip.F_sz_um;//this was 10mcron for the 0.8 micron process - WSelPp = 18.75 * l_ip.F_sz_um;//this was 15 micron for the 0.8 micron process - WSelEnn = 6.25 * l_ip.F_sz_um;//this was 5 micron for the 0.8 micron process - WSelEnp = 12.5 * l_ip.F_sz_um;//this was 10 micron for the 0.8 micron process - - - Ctotal=0; - num_arbiter=1; - while(win_entries > 4) - { - win_entries = (int)ceil((double)win_entries / 4.0); - num_arbiter += win_entries; - } - //the 4-input OR logic to generate anyreq - Cor = 4 * drain_C_(WSelORn,NCH,1,1, g_tp.cell_h_def) + drain_C_(WSelORprequ,PCH,1,1, g_tp.cell_h_def); - power.readOp.gate_leakage = cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor)*g_tp.peri_global.Vdd; - - //The total capacity of the 4-bit priority encoder - Cpencode = drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,1, 1, g_tp.cell_h_def) + - 2*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,2, 1, g_tp.cell_h_def) + - 3*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,3, 1, g_tp.cell_h_def) + - 4*drain_C_(WSelPn,NCH,1, 1, g_tp.cell_h_def) + drain_C_(WSelPp,PCH,4, 1, g_tp.cell_h_def) +//precompute priority logic - 2*4*gate_C(WSelEnn+WSelEnp,20.0)+ - 4*drain_C_(WSelEnn,NCH,1, 1, g_tp.cell_h_def) + 2*4*drain_C_(WSelEnp,PCH,1, 1, g_tp.cell_h_def)+//enable logic - (2*4+2*3+2*2+2)*gate_C(WSelPn+WSelPp,10.0);//requests signal - - Ctotal += issue_width * num_arbiter*(Cor+Cpencode); - - power.readOp.dynamic = Ctotal*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*2;//2 means the abitration signal need to travel round trip - power.readOp.leakage = issue_width * num_arbiter * - (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p - + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p - + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p - + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic - + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals - )*g_tp.peri_global.Vdd; - power.readOp.gate_leakage = issue_width * num_arbiter * - (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p - + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p - + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p - + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic - + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals - )*g_tp.peri_global.Vdd; +selection_logic::selection_logic(XMLNode* _xml_data, bool _is_default, + int _win_entries, int issue_width_, + const InputParameter *configure_interface, + string _name, double _accesses, + double clockRate_, enum Device_ty device_ty_, + enum Core_type core_ty_) + : McPATComponent(_xml_data), is_default(_is_default), + win_entries(_win_entries), + issue_width(issue_width_), + accesses(_accesses), + device_ty(device_ty_), + core_ty(core_ty_) { + clockRate = clockRate_; + name = _name; + l_ip = *configure_interface; + local_result = init_interface(&l_ip, name); +} + +void selection_logic::computeArea() { + output_data.area = local_result.area; } +void selection_logic::computeEnergy() { + //based on cost effective superscalar processor TR pp27-31 + double Ctotal, Cor, Cpencode; + int num_arbiter; + double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp; + + //the 0.8um process data is used. + //this was 10 micron for the 0.8 micron process + WSelORn = 12.5 * l_ip.F_sz_um; + //this was 40 micron for the 0.8 micron process + WSelORprequ = 50 * l_ip.F_sz_um; + //this was 10mcron for the 0.8 micron process + WSelPn = 12.5 * l_ip.F_sz_um; + //this was 15 micron for the 0.8 micron process + WSelPp = 18.75 * l_ip.F_sz_um; + //this was 5 micron for the 0.8 micron process + WSelEnn = 6.25 * l_ip.F_sz_um; + //this was 10 micron for the 0.8 micron process + WSelEnp = 12.5 * l_ip.F_sz_um; + + Ctotal = 0; + num_arbiter = 1; + while (win_entries > 4) { + win_entries = (int)ceil((double)win_entries / 4.0); + num_arbiter += win_entries; + } + //the 4-input OR logic to generate anyreq + Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def); + power.readOp.gate_leakage = + cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd; + + //The total capacity of the 4-bit priority encoder + Cpencode = drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) + + 2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) + + 3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) + + 4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(WSelPp, PCH, 4, 1, g_tp.cell_h_def) +//precompute priority logic + 2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) + + 4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) + + 2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) +//enable logic + (2 * 4 + 2 * 3 + 2 * 2 + 2) * + gate_C(WSelPn + WSelPp, 10.0);//requests signal + + Ctotal += issue_width * num_arbiter * (Cor + Cpencode); + + //2 means the abitration signal need to travel round trip + power.readOp.dynamic = + Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd * 2; + power.readOp.leakage = issue_width * num_arbiter * + (cmos_Isub_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p + + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)//grant2p + + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)//grant3p + + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic + + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant sIsubnals + ) * g_tp.peri_global.Vdd; + power.readOp.gate_leakage = issue_width * num_arbiter * + (cmos_Ig_leakage(WSelPn, WSelPp, 2, nor)/*approximate precompute with a nor gate*///grant1p + + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)//grant2p + + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)//grant3p + + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor)*4//enable logic + + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv)*2*3//for each grant there are two inverters, there are 3 grant signals + ) * g_tp.peri_global.Vdd; + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + double long_channel_device_reduction = + longer_channel_device_reduction(device_ty, core_ty); + power.readOp.longer_channel_leakage = + power.readOp.leakage * long_channel_device_reduction; + + output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; + output_data.subthreshold_leakage_power = power.readOp.leakage; + output_data.gate_leakage_power = power.readOp.gate_leakage; + output_data.runtime_dynamic_energy = power.readOp.dynamic * accesses; +} dep_resource_conflict_check::dep_resource_conflict_check( - const InputParameter *configure_interface, - const CoreDynParam & dyn_p_, - int compare_bits_, - bool _is_default) - : l_ip(*configure_interface), - coredynp(dyn_p_), - compare_bits(compare_bits_), - is_default(_is_default) -{ - Wcompn = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process - Wevalinvp = 25 * l_ip.F_sz_um;//this was 20.0 micron for the 0.8 micron process - Wevalinvn = 100 * l_ip.F_sz_um;//this was 80.0 mcron for the 0.8 micron process - Wcomppreequ = 50 * l_ip.F_sz_um;//this was 40.0 micron for the 0.8 micron process - WNORn = 6.75 * l_ip.F_sz_um;//this was 5.4 micron for the 0.8 micron process - WNORp = 38.125 * l_ip.F_sz_um;//this was 30.5 micron for the 0.8 micron process - - local_result = init_interface(&l_ip); - - if (coredynp.core_ty==Inorder) - compare_bits += 16 + 8 + 8;//TODO: opcode bits + log(shared resources) + REG TAG BITS-->opcode comparator - else - compare_bits += 16 + 8 + 8; - - conflict_check_power(); - double sckRation = g_tp.sckt_co_eff; - power.readOp.dynamic *= sckRation; - power.writeOp.dynamic *= sckRation; - power.searchOp.dynamic *= sckRation; + XMLNode* _xml_data, const string _name, + const InputParameter *configure_interface, + const CoreParameters & dyn_p_, int compare_bits_, + double clockRate_, bool _is_default) + : McPATComponent(_xml_data), l_ip(*configure_interface), + coredynp(dyn_p_), compare_bits(compare_bits_), is_default(_is_default) { + + name = _name; + clockRate = clockRate_; + //this was 20.0 micron for the 0.8 micron process + Wcompn = 25 * l_ip.F_sz_um; + //this was 20.0 micron for the 0.8 micron process + Wevalinvp = 25 * l_ip.F_sz_um; + //this was 80.0 mcron for the 0.8 micron process + Wevalinvn = 100 * l_ip.F_sz_um; + //this was 40.0 micron for the 0.8 micron process + Wcomppreequ = 50 * l_ip.F_sz_um; + //this was 5.4 micron for the 0.8 micron process + WNORn = 6.75 * l_ip.F_sz_um; + //this was 30.5 micron for the 0.8 micron process + WNORp = 38.125 * l_ip.F_sz_um; + + // To make CACTI happy. + l_ip.cache_sz = MIN_BUFFER_SIZE; + local_result = init_interface(&l_ip, name); + + if (coredynp.core_ty == Inorder) + //TODO: opcode bits + log(shared resources) + REG TAG BITS --> + //opcode comparator + compare_bits += 16 + 8 + 8; + else + compare_bits += 16 + 8 + 8; + + conflict_check_power(); + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; } -void dep_resource_conflict_check::conflict_check_power() -{ - double Ctotal; - int num_comparators; - num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision. - //When decode-width ==1, no dcl logic +void dep_resource_conflict_check::conflict_check_power() { + double Ctotal; + int num_comparators; + //2(N*N-N) is used for source to dest comparison, (N*N-N) is used for + //dest to dest comparision. + num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) - + coredynp.decodeW); - Ctotal = num_comparators * compare_cap(); - //printf("%i,%s\n",XML_interface->sys.core[0].predictor.predictor_entries,XML_interface->sys.core[0].predictor.prediction_scheme); + Ctotal = num_comparators * compare_cap(); - power.readOp.dynamic=Ctotal*/*CLOCKRATE*/g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/*AF*/; - power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false); + power.readOp.dynamic = Ctotal * /*CLOCKRATE*/ g_tp.peri_global.Vdd * + g_tp.peri_global.Vdd /*AF*/; + power.readOp.leakage = num_comparators * compare_bits * 2 * + simplified_nmos_leakage(Wcompn, false); - double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos); + double long_channel_device_reduction = + longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = + power.readOp.leakage * long_channel_device_reduction; + power.readOp.gate_leakage = num_comparators * compare_bits * 2 * + cmos_Ig_leakage(Wcompn, 0, 2, nmos); } /* estimate comparator power consumption (this comparator is similar to the tag-match structure in a CAM */ -double dep_resource_conflict_check::compare_cap() -{ - double c1, c2; - - WNORp = WNORp * compare_bits/2.0;//resize the big NOR gate at the DCL according to fan in. - /* bottom part of comparator */ - c2 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def))+ - drain_C_(Wevalinvp,PCH,1,1, g_tp.cell_h_def) + drain_C_(Wevalinvn,NCH,1,1, g_tp.cell_h_def); - - /* top part of comparator */ - c1 = (compare_bits)*(drain_C_(Wcompn,NCH,1,1, g_tp.cell_h_def)+drain_C_(Wcompn,NCH,2,1, g_tp.cell_h_def)+ - drain_C_(Wcomppreequ,NCH,1,1, g_tp.cell_h_def)) + gate_C(WNORn + WNORp,10.0) + - drain_C_(WNORp,NCH,2,1, g_tp.cell_h_def) + compare_bits*drain_C_(WNORn,NCH,2,1, g_tp.cell_h_def); - return(c1 + c2); +double dep_resource_conflict_check::compare_cap() { + double c1, c2; + + //resize the big NOR gate at the DCL according to fan in. + WNORp = WNORp * compare_bits / 2.0; + /* bottom part of comparator */ + c2 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) + + drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) + + drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def); + + /* top part of comparator */ + c1 = (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) + + drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) + + drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) + + gate_C(WNORn + WNORp, 10.0) + + drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) + compare_bits * + drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def); + return(c1 + c2); } void dep_resource_conflict_check::leakage_feedback(double temperature) { l_ip.temp = (unsigned int)round(temperature/10.0)*10; - uca_org_t init_result = init_interface(&l_ip); // init_result is dummy + uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy // This is part of conflict_check_power() - int num_comparators = 3*((coredynp.decodeW) * (coredynp.decodeW)-coredynp.decodeW);//2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest to dest comparision. - power.readOp.leakage=num_comparators*compare_bits*2*simplified_nmos_leakage(Wcompn, false); - - double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - power.readOp.gate_leakage=num_comparators*compare_bits*2*cmos_Ig_leakage(Wcompn, 0, 2, nmos); + // 2(N*N-N) is used for source to dest comparison, (N*N-N) is used for dest + // to dest comparison. + int num_comparators = 3 * ((coredynp.decodeW) * (coredynp.decodeW) - + coredynp.decodeW); + power.readOp.leakage = num_comparators * compare_bits * 2 * + simplified_nmos_leakage(Wcompn, false); + + double long_channel_device_reduction = + longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage * + long_channel_device_reduction; + power.readOp.gate_leakage = num_comparators * compare_bits * 2 * + cmos_Ig_leakage(Wcompn, 0, 2, nmos); } -//TODO: add inverter and transmission gate base DFF. DFFCell::DFFCell( - bool _is_dram, - double _WdecNANDn, - double _WdecNANDp, - double _cell_load, - const InputParameter *configure_interface) -:is_dram(_is_dram), -cell_load(_cell_load), -WdecNANDn(_WdecNANDn), -WdecNANDp(_WdecNANDp) -{//this model is based on the NAND2 based DFF. - l_ip=*configure_interface; -// area.set_area(730*l_ip.F_sz_um*l_ip.F_sz_um); - area.set_area(5*compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, g_tp.cell_h_def) - + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, g_tp.cell_h_def)); + bool _is_dram, + double _WdecNANDn, + double _WdecNANDp, + double _cell_load, + const InputParameter *configure_interface) + : is_dram(_is_dram), + cell_load(_cell_load), + WdecNANDn(_WdecNANDn), + WdecNANDp(_WdecNANDp) { //this model is based on the NAND2 based DFF. + l_ip = *configure_interface; + area.set_area(5 * compute_gate_area(NAND, 2,WdecNANDn,WdecNANDp, + g_tp.cell_h_def) + + compute_gate_area(NAND, 2,WdecNANDn,WdecNANDn, + g_tp.cell_h_def)); } -double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) -{ - double Ctotal = 0; - //printf("WdecNANDn = %E\n", WdecNANDn); +double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) { + double Ctotal = 0; - /* part 1: drain cap of NAND gate */ - Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram); + /* part 1: drain cap of NAND gate */ + Ctotal += drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) + fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram); - /* part 2: gate cap of NAND gates */ - Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); + /* part 2: gate cap of NAND gates */ + Ctotal += fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); - return Ctotal; + return Ctotal; } -void DFFCell::compute_DFF_cell() -{ - double c1, c2, c3, c4, c5, c6; - /* node 5 and node 6 are identical to node 1 in capacitance */ - c1 = c5 = c6 = fpfp_node_cap(2, 1); - c2 = fpfp_node_cap(2, 3); - c3 = fpfp_node_cap(3, 2); - c4 = fpfp_node_cap(2, 2); - - //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 - clock_cap= 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); - e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2*cell_load)*0.5*g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; - - /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */ - e_keep_1.readOp.dynamic += c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; - e_keep_0.readOp.dynamic += c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; - e_clock.readOp.dynamic += clock_cap* g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; - - /* static power */ - e_switch.readOp.leakage += (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF - + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd; - e_switch.readOp.gate_leakage += (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand)*5//5 NAND2 and 1 NAND3 in a DFF - + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand))*g_tp.peri_global.Vdd; - //printf("leakage =%E\n",cmos_Ileak(1, is_dram) ); +void DFFCell::compute_DFF_cell() { + double c1, c2, c3, c4, c5, c6; + /* node 5 and node 6 are identical to node 1 in capacitance */ + c1 = c5 = c6 = fpfp_node_cap(2, 1); + c2 = fpfp_node_cap(2, 3); + c3 = fpfp_node_cap(3, 2); + c4 = fpfp_node_cap(2, 2); + + //cap-load of the clock signal in each Dff, actually the clock signal only connected to one NAND2 + clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram); + e_switch.readOp.dynamic += (c4 + c1 + c2 + c3 + c5 + c6 + 2 * cell_load) * + 0.5 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; + + /* no 1/2 for e_keep and e_clock because clock signal switches twice in one cycle */ + e_keep_1.readOp.dynamic += + c3 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; + e_keep_0.readOp.dynamic += + c2 * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd ; + e_clock.readOp.dynamic += + clock_cap * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;; + + /* static power */ + e_switch.readOp.leakage += + (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) * + 5//5 NAND2 and 1 NAND3 in a DFF + + cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) * + g_tp.peri_global.Vdd; + e_switch.readOp.gate_leakage += + (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) * + 5//5 NAND2 and 1 NAND3 in a DFF + + cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) * + g_tp.peri_global.Vdd; } -Pipeline::Pipeline( - const InputParameter *configure_interface, - const CoreDynParam & dyn_p_, - enum Device_ty device_ty_, - bool _is_core_pipeline, - bool _is_default) -: l_ip(*configure_interface), - coredynp(dyn_p_), - device_ty(device_ty_), - is_core_pipeline(_is_core_pipeline), - is_default(_is_default), - num_piperegs(0.0) - - { - local_result = init_interface(&l_ip); - if (!coredynp.Embedded) - process_ind = true; - else - process_ind = false; - WNANDn = (process_ind)? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ;//this was 20 micron for the 0.8 micron process - WNANDp = (process_ind)? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_*pmos_to_nmos_sz_ratio();//this was 30 micron for the 0.8 micron process - load_per_pipeline_stage = 2*gate_C(WNANDn + WNANDp, 0, false); - compute(); +Pipeline::Pipeline(XMLNode* _xml_data, + const InputParameter *configure_interface, + const CoreParameters & dyn_p_, + enum Device_ty device_ty_, + bool _is_core_pipeline, + bool _is_default) + : McPATComponent(_xml_data), l_ip(*configure_interface), + coredynp(dyn_p_), device_ty(device_ty_), + is_core_pipeline(_is_core_pipeline), is_default(_is_default), + num_piperegs(0.0) { + name = "Pipeline?"; + + local_result = init_interface(&l_ip, name); + if (!coredynp.Embedded) { + process_ind = true; + } else { + process_ind = false; + } + //this was 20 micron for the 0.8 micron process + WNANDn = (process_ind) ? 25 * l_ip.F_sz_um : g_tp.min_w_nmos_ ; + //this was 30 micron for the 0.8 micron process + WNANDp = (process_ind) ? 37.5 * l_ip.F_sz_um : g_tp.min_w_nmos_ * + pmos_to_nmos_sz_ratio(); + load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false); + compute(); } -void Pipeline::compute() -{ - compute_stage_vector(); - DFFCell pipe_reg(false, WNANDn,WNANDp, load_per_pipeline_stage, &l_ip); - pipe_reg.compute_DFF_cell(); - - double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; - //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider - //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. - double pipe_reg_power = num_piperegs * (pipe_reg.e_switch.readOp.dynamic+pipe_reg.e_keep_0.readOp.dynamic+pipe_reg.e_keep_1.readOp.dynamic)/3+clock_power_pipereg; - double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; - double pipe_reg_gate_leakage = num_piperegs * pipe_reg.e_switch.readOp.gate_leakage; - power.readOp.dynamic +=pipe_reg_power; - power.readOp.leakage +=pipe_reg_leakage; - power.readOp.gate_leakage +=pipe_reg_gate_leakage; - area.set_area(num_piperegs * pipe_reg.area.get_area()); - - double long_channel_device_reduction = longer_channel_device_reduction(device_ty, coredynp.core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - - - double sckRation = g_tp.sckt_co_eff; - power.readOp.dynamic *= sckRation; - power.writeOp.dynamic *= sckRation; - power.searchOp.dynamic *= sckRation; - double macro_layout_overhead = g_tp.macro_layout_overhead; +void Pipeline::compute() { + compute_stage_vector(); + DFFCell pipe_reg(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip); + pipe_reg.compute_DFF_cell(); + + double clock_power_pipereg = num_piperegs * pipe_reg.e_clock.readOp.dynamic; + //******************pipeline power: currently, we average all the possibilities of the states of DFFs in the pipeline. A better way to do it is to consider + //the harming distance of two consecutive signals, However McPAT does not have plan to do this in near future as it focuses on worst case power. + double pipe_reg_power = num_piperegs * + (pipe_reg.e_switch.readOp.dynamic + pipe_reg.e_keep_0.readOp.dynamic + + pipe_reg.e_keep_1.readOp.dynamic) / 3 + clock_power_pipereg; + double pipe_reg_leakage = num_piperegs * pipe_reg.e_switch.readOp.leakage; + double pipe_reg_gate_leakage = num_piperegs * + pipe_reg.e_switch.readOp.gate_leakage; + power.readOp.dynamic += pipe_reg_power; + power.readOp.leakage += pipe_reg_leakage; + power.readOp.gate_leakage += pipe_reg_gate_leakage; + area.set_area(num_piperegs * pipe_reg.area.get_area()); + + double long_channel_device_reduction = + longer_channel_device_reduction(device_ty, coredynp.core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage * + long_channel_device_reduction; + + + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + double macro_layout_overhead = g_tp.macro_layout_overhead; if (!coredynp.Embedded) - area.set_area(area.get_area()*macro_layout_overhead); -} - -void Pipeline::compute_stage_vector() -{ - double num_stages, tot_stage_vector, per_stage_vector; - int opcode_length = coredynp.x86? coredynp.micro_opcode_length:coredynp.opcode_length; - //Hthread = thread_clock_gated? 1:num_thread; + area.set_area(area.get_area() * macro_layout_overhead); - if (!is_core_pipeline) - { - num_piperegs=l_ip.pipeline_stages*l_ip.per_stage_vector;//The number of pipeline stages are calculated based on the achievable throughput and required throughput - } - else - { - if (coredynp.core_ty==Inorder) - { - /* assume 6 pipe stages and try to estimate bits per pipe stage */ - /* pipe stage 0/IF */ - num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads; - /* pipe stage IF/ID */ - num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads; - /* pipe stage IF/ThreadSEL */ - if (coredynp.multithreaded) num_piperegs += coredynp.num_hthreads*coredynp.perThreadState; //8 bit thread states - /* pipe stage ID/EXE */ - num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width + pow(2.0,opcode_length)+ 2*coredynp.int_data_width)*coredynp.num_hthreads; - /* pipe stage EXE/MEM */ - num_piperegs += coredynp.issueW*(3 * coredynp.arch_ireg_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/); - /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/ - num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length) + 8*2*coredynp.int_data_width/*+2*powers (2,reg_length)*/); -// /* pipe stage 5/6 */ -// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/); -// /* pipe stage 6/7 */ -// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/*+2*powers (2,reg_length)*/); -// /* pipe stage 7/8 */ -// num_piperegs += issueWidth*(data_width + powers (2,opcode_length)/**2*powers (2,reg_length)*/); -// /* assume 50% extra in control signals (rule of thumb) */ - num_stages=6; + output_data.area = area.get_area() / 1e6; + output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; + output_data.subthreshold_leakage_power = power.readOp.leakage; + output_data.gate_leakage_power = power.readOp.gate_leakage; + output_data.runtime_dynamic_energy = power.readOp.dynamic * total_cycles; +} - } - else - { - /* assume 12 stage pipe stages and try to estimate bits per pipe stage */ - /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */ - - /* pipe stage 0/1F*/ - num_piperegs += coredynp.pc_width*2*coredynp.num_hthreads ;//PC and Next PC - /* pipe stage IF/ID */ - num_piperegs += coredynp.fetchW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is used to feed branch predictor in ID - /* pipe stage 1D/Renaming*/ - num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width)*coredynp.num_hthreads;//PC is for branch exe in later stage. - /* pipe stage Renaming/wire_drive */ - num_piperegs += coredynp.decodeW*(coredynp.instruction_length + coredynp.pc_width); - /* pipe stage Renaming/IssueQ */ - num_piperegs += coredynp.issueW*(coredynp.instruction_length + coredynp.pc_width + 3*coredynp.phy_ireg_width)*coredynp.num_hthreads;//3*coredynp.phy_ireg_width means 2 sources and 1 dest - /* pipe stage IssueQ/Dispatch */ - num_piperegs += coredynp.issueW*(coredynp.instruction_length + 3 * coredynp.phy_ireg_width); - /* pipe stage Dispatch/EXE */ - - num_piperegs += coredynp.issueW*(3 * coredynp.phy_ireg_width + coredynp.pc_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); - /* 2^opcode_length means the total decoded signal for the opcode*/ - num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); - /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/ - num_piperegs += coredynp.issueW*(2*coredynp.int_data_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/); - /* pipe stage EXE/MEM, data need to be read/write, address*/ - num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.v_address_width + pow(2.0,opcode_length)/*+2*powers (2,reg_length)*/);//memory Opcode still need to be passed - /* pipe stage MEM/WB; result data, writeback regs */ - num_piperegs += coredynp.issueW*(coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/); - /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/ - num_piperegs += coredynp.commitW*(coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width/*+ powers (2,opcode_length)*2*powers (2,reg_length)*/)*coredynp.num_hthreads; -// if (multithreaded) -// { -// -// } - num_stages=12; +void Pipeline::compute_stage_vector() { + double num_stages, tot_stage_vector, per_stage_vector; + int opcode_length = coredynp.x86 ? + coredynp.micro_opcode_length : coredynp.opcode_width; + + if (!is_core_pipeline) { + //The number of pipeline stages are calculated based on the achievable + //throughput and required throughput + num_piperegs = l_ip.pipeline_stages * l_ip.per_stage_vector; + } else { + if (coredynp.core_ty == Inorder) { + /* assume 6 pipe stages and try to estimate bits per pipe stage */ + /* pipe stage 0/IF */ + num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads; + /* pipe stage IF/ID */ + num_piperegs += coredynp.fetchW * + (coredynp.instruction_length + coredynp.pc_width) * + coredynp.num_hthreads; + /* pipe stage IF/ThreadSEL */ + if (coredynp.multithreaded) { + num_piperegs += coredynp.num_hthreads * + coredynp.perThreadState; //8 bit thread states + } + /* pipe stage ID/EXE */ + num_piperegs += coredynp.decodeW * + (coredynp.instruction_length + coredynp.pc_width + + pow(2.0, opcode_length) + 2 * coredynp.int_data_width) * + coredynp.num_hthreads; + /* pipe stage EXE/MEM */ + num_piperegs += coredynp.issueW * + (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) + 8 * + 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/); + /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal for the opcode*/ + num_piperegs += coredynp.issueW * + (2 * coredynp.int_data_width + pow(2.0, opcode_length) + 8 * + 2 * coredynp.int_data_width/*+2*powers (2,reg_length)*/); + num_stages = 6; + } else { + /* assume 12 stage pipe stages and try to estimate bits per pipe stage */ + /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM */ + + /* pipe stage 0/1F*/ + num_piperegs += + coredynp.pc_width * 2 * coredynp.num_hthreads ;//PC and Next PC + /* pipe stage IF/ID */ + num_piperegs += coredynp.fetchW * + (coredynp.instruction_length + coredynp.pc_width) * + coredynp.num_hthreads;//PC is used to feed branch predictor in ID + /* pipe stage 1D/Renaming*/ + num_piperegs += coredynp.decodeW * + (coredynp.instruction_length + coredynp.pc_width) * + coredynp.num_hthreads;//PC is for branch exe in later stage. + /* pipe stage Renaming/wire_drive */ + num_piperegs += coredynp.decodeW * + (coredynp.instruction_length + coredynp.pc_width); + /* pipe stage Renaming/IssueQ */ + //3*coredynp.phy_ireg_width means 2 sources and 1 dest + num_piperegs += coredynp.issueW * + (coredynp.instruction_length + coredynp.pc_width + 3 * + coredynp.phy_ireg_width) * coredynp.num_hthreads; + /* pipe stage IssueQ/Dispatch */ + num_piperegs += coredynp.issueW * + (coredynp.instruction_length + 3 * coredynp.phy_ireg_width); + /* pipe stage Dispatch/EXE */ + + num_piperegs += coredynp.issueW * + (3 * coredynp.phy_ireg_width + coredynp.pc_width + + pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/); + /* 2^opcode_length means the total decoded signal for the opcode*/ + num_piperegs += coredynp.issueW * + (2 * coredynp.int_data_width + pow(2.0, opcode_length) + /*+2*powers (2,reg_length)*/); + /*2 source operands in EXE; Assume 2EXE stages* since we do not really distinguish OP*/ + num_piperegs += coredynp.issueW * + (2 * coredynp.int_data_width + pow(2.0, opcode_length) + /*+2*powers (2,reg_length)*/); + /* pipe stage EXE/MEM, data need to be read/write, address*/ + //memory Opcode still need to be passed + num_piperegs += coredynp.issueW * + (coredynp.int_data_width + coredynp.v_address_width + + pow(2.0, opcode_length)/*+2*powers (2,reg_length)*/); + /* pipe stage MEM/WB; result data, writeback regs */ + num_piperegs += coredynp.issueW * + (coredynp.int_data_width + coredynp.phy_ireg_width + /* powers (2,opcode_length) + + (2,opcode_length)+2*powers (2,reg_length)*/); + /* pipe stage WB/CM ; result data, regs need to be updated, address for resolve memory ops in ROB's top*/ + num_piperegs += coredynp.commitW * + (coredynp.int_data_width + coredynp.v_address_width + + coredynp.phy_ireg_width + /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) * + coredynp.num_hthreads; + num_stages = 12; } /* assume 50% extra in control registers and interrupt registers (rule of thumb) */ num_piperegs = num_piperegs * 1.5; - tot_stage_vector=num_piperegs; - per_stage_vector=tot_stage_vector/num_stages; - - if (coredynp.core_ty==Inorder) - { - if (coredynp.pipeline_stages>6) - num_piperegs= per_stage_vector*coredynp.pipeline_stages; + tot_stage_vector = num_piperegs; + per_stage_vector = tot_stage_vector / num_stages; + + if (coredynp.core_ty == Inorder) { + if (coredynp.pipeline_stages > 6) + num_piperegs = per_stage_vector * coredynp.pipeline_stages; + } else { //OOO + if (coredynp.pipeline_stages > 12) + num_piperegs = per_stage_vector * coredynp.pipeline_stages; } - else//OOO - { - if (coredynp.pipeline_stages>12) - num_piperegs= per_stage_vector*coredynp.pipeline_stages; - } - } + } } -FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParameter* interface_ip_,const CoreDynParam & dyn_p_, enum FU_type fu_type_) -:XML(XML_interface), - ithCore(ithCore_), - interface_ip(*interface_ip_), - coredynp(dyn_p_), - fu_type(fu_type_) -{ - double area_t;//, leakage, gate_leakage; +FunctionalUnit::FunctionalUnit(XMLNode* _xml_data, + InputParameter* interface_ip_, + const CoreParameters & _core_params, + const CoreStatistics & _core_stats, + enum FU_type fu_type_) + : McPATComponent(_xml_data), + interface_ip(*interface_ip_), core_params(_core_params), + core_stats(_core_stats), fu_type(fu_type_) { + double area_t; + double leakage; + double gate_leakage; double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); - clockRate = coredynp.clockRate; - executionTime = coredynp.executionTime; - - //XML_interface=_XML_interface; - uca_org_t result2; - result2 = init_interface(&interface_ip); - if (XML->sys.Embedded) - { - if (fu_type == FPU) - { - num_fu=coredynp.num_fpus; + clockRate = core_params.clockRate; + + uca_org_t result2; + // Temp name for the following function call + name = "Functional Unit"; + + result2 = init_interface(&interface_ip, name); + + if (core_params.Embedded) { + if (fu_type == FPU) { + num_fu=core_params.num_fpus; //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 area_t = 4.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 The base number //4.47 contains both VFP and NEON processing unit, VFP is about 40% and NEON is about 60% @@ -449,10 +534,8 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParam per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per Hz energy(nJ) //FPU power from Sandia's processor sizing tech report FU_height=(18667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data - } - else if (fu_type == ALU) - { - num_fu=coredynp.num_alus; + } else if (fu_type == ALU) { + num_fu=core_params.num_alus; area_t = 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; @@ -462,10 +545,8 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParam per_access_energy = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU - } - else if (fu_type == MUL) - { - num_fu=coredynp.num_muls; + } else if (fu_type == MUL) { + num_fu=core_params.num_muls; area_t = 280*260*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; @@ -474,197 +555,117 @@ FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_, InputParam base_energy = 0; per_access_energy = 1.15*2/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data - } - else - { + } else { cout<<"Unknown Functional Unit Type"<<endl; exit(0); } per_access_energy *=0.5;//According to ARM data embedded processor has much lower per acc energy + } else { + if (fu_type == FPU) { + name = "Floating Point Unit(s)"; + num_fu = core_params.num_fpus; + area_t = 8.47 * 1e6 * (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 / + 90.0);//this is um^2 + if (g_ip->F_sz_nm > 90) + area_t = 8.47 * 1e6 * + g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + //W The base energy of ALU average numbers from Intel 4G and + //773Mhz (Wattch) + base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 3; + base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / + 1.2); + per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ) + FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data + } else if (fu_type == ALU) { + name = "Integer ALU(s)"; + num_fu = core_params.num_alus; + //this is um^2 ALU + MUl + area_t = 280 * 260 * 2 * g_tp.scaling_factor.logic_scaling_co_eff; + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; + //W The base energy of ALU average numbers from Intel 4G and 773Mhz + //(Wattch) + base_energy = core_params.core_ty == Inorder ? 0 : 89e-3; + base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / + 1.2); + per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) + FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU + } else if (fu_type == MUL) { + name = "Multiply/Divide Unit(s)"; + num_fu = core_params.num_muls; + //this is um^2 ALU + MUl + area_t = 280 * 260 * 2 * 3 * + g_tp.scaling_factor.logic_scaling_co_eff; + leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W + gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; + //W The base energy of ALU average numbers from Intel 4G and 773Mhz + //(Wattch) + base_energy = core_params.core_ty == Inorder ? 0 : 89e-3 * 2; + base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / + 1.2); + per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch + FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data + } else { + cout << "Unknown Functional Unit Type" << endl; + exit(0); } - else - { - if (fu_type == FPU) - { - num_fu=coredynp.num_fpus; - //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 - area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is um^2 - if (g_ip->F_sz_nm>90) - area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 - leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W - gate_leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W - //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in FPU usually it can have up to 20 cycles. - base_energy = coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) - base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); - per_access_energy = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per op energy(nJ) - FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data - } - else if (fu_type == ALU) - { - num_fu=coredynp.num_alus; - area_t = 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl - leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W - gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; - base_energy = coredynp.core_ty==Inorder? 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) - base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); - per_access_energy = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ) - FU_height=(6222*num_fu)*interface_ip.F_sz_um;//integer ALU + } - } - else if (fu_type == MUL) - { - num_fu=coredynp.num_muls; - area_t = 280*260*2*3*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU + MUl - leakage = area_t *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2;//unit W - gate_leakage = area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_, 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd/2; - base_energy = coredynp.core_ty==Inorder? 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and 773Mhz (Wattch) - base_energy *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); - per_access_energy = 1.15*2/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9; //This is per cycle energy(nJ), coefficient based on Wattch - FU_height=(9334*num_fu )*interface_ip.F_sz_um;//divider/mul from Sun's data - } - else - { - cout<<"Unknown Functional Unit Type"<<endl; - exit(0); - } - } - //IEXEU, simple ALU and FPU - // double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and FPU. Based on Intel and Sun 90nm process fabracation. - // - // C_ALU = 0.025e-9;//F - // C_EXEU = 0.05e-9; //F - // C_FPU = 0.35e-9;//F area.set_area(area_t*num_fu); - leakage *= num_fu; - gate_leakage *=num_fu; - double macro_layout_overhead = g_tp.macro_layout_overhead; -// if (!XML->sys.Embedded) - area.set_area(area.get_area()*macro_layout_overhead); -} - -void FunctionalUnit::computeEnergy(bool is_tdp) -{ - double pppm_t[4] = {1,1,1,1}; - double FU_duty_cycle; - if (is_tdp) - { - - - set_pppm(pppm_t, 2, 2, 2, 2);//2 means two source operands needs to be passed for each int instruction. - if (fu_type == FPU) - { - stats_t.readAc.access = num_fu; - tdp_stats = stats_t; - FU_duty_cycle = coredynp.FPU_duty_cycle; - } - else if (fu_type == ALU) - { - stats_t.readAc.access = 1*num_fu; - tdp_stats = stats_t; - FU_duty_cycle = coredynp.ALU_duty_cycle; - } - else if (fu_type == MUL) - { - stats_t.readAc.access = num_fu; - tdp_stats = stats_t; - FU_duty_cycle = coredynp.MUL_duty_cycle; - } - - //power.readOp.dynamic = base_energy/clockRate + energy*stats_t.readAc.access; - power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy/clockRate; - double sckRation = g_tp.sckt_co_eff; - power.readOp.dynamic *= sckRation*FU_duty_cycle; - power.writeOp.dynamic *= sckRation; - power.searchOp.dynamic *= sckRation; - - power.readOp.leakage = leakage; - power.readOp.gate_leakage = gate_leakage; - double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - - } - else - { - if (fu_type == FPU) - { - stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses; - rtp_stats = stats_t; - } - else if (fu_type == ALU) - { - stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses; - rtp_stats = stats_t; - } - else if (fu_type == MUL) - { - stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses; - rtp_stats = stats_t; - } - - //rt_power.readOp.dynamic = base_energy*executionTime + energy*stats_t.readAc.access; - rt_power.readOp.dynamic = per_access_energy*stats_t.readAc.access + base_energy*executionTime; - double sckRation = g_tp.sckt_co_eff; - rt_power.readOp.dynamic *= sckRation; - rt_power.writeOp.dynamic *= sckRation; - rt_power.searchOp.dynamic *= sckRation; - - } - - + power.readOp.leakage = leakage * num_fu; + power.readOp.gate_leakage = gate_leakage * num_fu; + + double long_channel_device_reduction = + longer_channel_device_reduction(Core_device, core_params.core_ty); + power.readOp.longer_channel_leakage = + power.readOp.leakage * long_channel_device_reduction; + double macro_layout_overhead = g_tp.macro_layout_overhead; + area.set_area(area.get_area()*macro_layout_overhead); } -void FunctionalUnit::displayEnergy(uint32_t indent,int plevel,bool is_tdp) -{ - string indent_str(indent, ' '); - string indent_str_next(indent+2, ' '); - bool long_channel = XML->sys.longer_channel_device; - -// cout << indent_str_next << "Results Broadcast Bus Area = " << bypass->area.get_area() *1e-6 << " mm^2" << endl; - if (is_tdp) - { - if (fu_type == FPU) - { - cout << indent_str << "Floating Point Units (FPUs) (Count: "<< coredynp.num_fpus <<" ):" << endl; - cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; - cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; -// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; - cout << indent_str_next<< "Subthreshold Leakage = " - << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; - cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; - cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; - cout <<endl; - } - else if (fu_type == ALU) - { - cout << indent_str << "Integer ALUs (Count: "<< coredynp.num_alus <<" ):" << endl; - cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; - cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; -// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; - cout << indent_str_next<< "Subthreshold Leakage = " - << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; - cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; - cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; - cout <<endl; - } - else if (fu_type == MUL) - { - cout << indent_str << "Complex ALUs (Mul/Div) (Count: "<< coredynp.num_muls <<" ):" << endl; - cout << indent_str_next << "Area = " << area.get_area()*1e-6 << " mm^2" << endl; - cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; -// cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage << " W" << endl; - cout << indent_str_next<< "Subthreshold Leakage = " - << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; - cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; - cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; - cout <<endl; - - } +void FunctionalUnit::computeEnergy() { + double pppm_t[4] = {1, 1, 1, 1}; + double FU_duty_cycle; + double sckRation = g_tp.sckt_co_eff; + + // TDP power calculation + //2 means two source operands needs to be passed for each int instruction. + set_pppm(pppm_t, 2, 2, 2, 2); + tdp_stats.readAc.access = num_fu; + if (fu_type == FPU) { + FU_duty_cycle = core_stats.FPU_duty_cycle; + } else if (fu_type == ALU) { + FU_duty_cycle = core_stats.ALU_duty_cycle; + } else if (fu_type == MUL) { + FU_duty_cycle = core_stats.MUL_duty_cycle; + } - } - else - { - } + power.readOp.dynamic = + per_access_energy * tdp_stats.readAc.access + base_energy / clockRate; + power.readOp.dynamic *= sckRation * FU_duty_cycle; + + // Runtime power calculation + if (fu_type == FPU) { + rtp_stats.readAc.access = core_stats.fpu_accesses; + } else if (fu_type == ALU) { + rtp_stats.readAc.access = core_stats.ialu_accesses; + } else if (fu_type == MUL) { + rtp_stats.readAc.access = core_stats.mul_accesses; + } + rt_power.readOp.dynamic = per_access_energy * rtp_stats.readAc.access + + base_energy * execution_time; + rt_power.readOp.dynamic *= sckRation; + + output_data.area = area.get_area() / 1e6; + output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; + output_data.subthreshold_leakage_power = + (longer_channel_device) ? power.readOp.longer_channel_leakage : + power.readOp.leakage; + output_data.gate_leakage_power = power.readOp.gate_leakage; + output_data.runtime_dynamic_energy = rt_power.readOp.dynamic; } void FunctionalUnit::leakage_feedback(double temperature) @@ -672,7 +673,8 @@ void FunctionalUnit::leakage_feedback(double temperature) // Update the temperature and initialize the global interfaces. interface_ip.temp = (unsigned int)round(temperature/10.0)*10; - uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy + // init_result is dummy + uca_org_t init_result = init_interface(&interface_ip, name); // This is part of FunctionalUnit() double area_t, leakage, gate_leakage; @@ -706,277 +708,220 @@ void FunctionalUnit::leakage_feedback(double temperature) power.readOp.leakage = leakage*num_fu; power.readOp.gate_leakage = gate_leakage*num_fu; - power.readOp.longer_channel_leakage = longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = + longer_channel_device_reduction(Core_device, core_params.core_ty); } -UndiffCore::UndiffCore(ParseXML* XML_interface, int ithCore_, InputParameter* interface_ip_, const CoreDynParam & dyn_p_, bool exist_, bool embedded_) -:XML(XML_interface), - ithCore(ithCore_), - interface_ip(*interface_ip_), - coredynp(dyn_p_), - core_ty(coredynp.core_ty), - embedded(XML->sys.Embedded), - pipeline_stage(coredynp.pipeline_stages), - num_hthreads(coredynp.num_hthreads), - issue_width(coredynp.issueW), - exist(exist_) -// is_default(_is_default) -{ - if (!exist) return; - double undifferentiated_core=0; - double core_tx_density=0; - double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); +UndiffCore::UndiffCore(XMLNode* _xml_data, InputParameter* interface_ip_, + const CoreParameters & dyn_p_, + bool exist_) + : McPATComponent(_xml_data), + interface_ip(*interface_ip_), coredynp(dyn_p_), + core_ty(coredynp.core_ty), embedded(coredynp.Embedded), + pipeline_stage(coredynp.pipeline_stages), + num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW), + exist(exist_) { + if (!exist) return; + + name = "Undifferentiated Core"; + clockRate = coredynp.clockRate; + + double undifferentiated_core = 0; + double core_tx_density = 0; + double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); double undifferentiated_core_coe; - //XML_interface=_XML_interface; - uca_org_t result2; - result2 = init_interface(&interface_ip); - - //Compute undifferentiated core area at 90nm. - if (embedded==false) - { - //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements - if (core_ty==OOO) - { - //undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage -2.3685*pipeline_stage + 10.405);//OOO - undifferentiated_core = (3.57*log(pipeline_stage)-1.2643)>0?(3.57*log(pipeline_stage)-1.2643):0; - } - else if (core_ty==Inorder) - { - //undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder - undifferentiated_core = (-2.19*log(pipeline_stage)+6.55)>0?(-2.19*log(pipeline_stage)+6.55):0; - } - else - { - cout<<"invalid core type"<<endl; - exit(0); - } - undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0716); + uca_org_t result2; + result2 = init_interface(&interface_ip, name); + + //Compute undifferentiated core area at 90nm. + if (embedded == false) { + //Based on the results of polynomial/log curve fitting based on undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott, Opteron die measurements + if (core_ty == OOO) { + undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0 ? + (3.57 * log(pipeline_stage) - 1.2643) : 0; + } else if (core_ty == Inorder) { + undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0 ? + (-2.19 * log(pipeline_stage) + 6.55) : 0; + } else { + cout << "invalid core type" << endl; + exit(0); } - else - { - //Based on the results in paper "parametrized processor models" Sandia Labs - if (XML->sys.opt_clockrate) + undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716); + } else { + //Based on the results in paper "parametrized processor models" Sandia Labs + if (opt_for_clk) undifferentiated_core_coe = 0.05; else undifferentiated_core_coe = 0; - undifferentiated_core = (0.4109* pipeline_stage - 0.776)*undifferentiated_core_coe; - undifferentiated_core *= (1+ logtwo(num_hthreads)* 0.0426); - } - - undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff*1e6;//change from mm^2 to um^2 - core_tx_density = g_tp.scaling_factor.core_tx_density; - //undifferentiated_core = 3*1e6; - //undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*; - power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W - power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd; - - double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - area.set_area(undifferentiated_core); - - scktRatio = g_tp.sckt_co_eff; - power.readOp.dynamic *= scktRatio; - power.writeOp.dynamic *= scktRatio; - power.searchOp.dynamic *= scktRatio; - macro_PR_overhead = g_tp.macro_layout_overhead; - area.set_area(area.get_area()*macro_PR_overhead); - - - -// double vt=g_tp.peri_global.Vth; -// double velocity_index=1.1; -// double c_in=gate_C(g_tp.min_w_nmos_, g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false); -// double c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) + drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1, g_tp.cell_h_def, false) + c_in; -// double w_nmos=g_tp.min_w_nmos_; -// double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r; -// double i_on_n=1.0; -// double i_on_p=1.0; -// double i_on_n_in=1.0; -// double i_on_p_in=1; -// double vdd=g_tp.peri_global.Vdd; - -// power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); -// power.readOp.dynamic=c_out*vdd*vdd/2; - -// cout<<power.readOp.dynamic << "dynamic" <<endl; -// cout<<power.readOp.sc << "sc" << endl; - -// power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd); -// power.readOp.dynamic=c_out*vdd*vdd/2; -// -// cout<<power.readOp.dynamic << "dynamic" <<endl; -// cout<<power.readOp.sc << "sc" << endl; - - - -} - - -void UndiffCore::displayEnergy(uint32_t indent,int plevel,bool is_tdp) -{ - string indent_str(indent, ' '); - string indent_str_next(indent+2, ' '); - bool long_channel = XML->sys.longer_channel_device; - - if (is_tdp) - { - cout << indent_str << "UndiffCore:" << endl; - cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; - cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; - //cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; - cout << indent_str_next<< "Subthreshold Leakage = " - << (long_channel? power.readOp.longer_channel_leakage:power.readOp.leakage) <<" W" << endl; - cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; - //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; - cout <<endl; - } - else - { - cout << indent_str << "UndiffCore:" << endl; - cout << indent_str_next << "Area = " << area.get_area()*1e-6<< " mm^2" << endl; - cout << indent_str_next << "Peak Dynamic = " << power.readOp.dynamic*clockRate << " W" << endl; - cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage <<" W" << endl; - cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl; - //cout << indent_str_next << "Runtime Dynamic = " << rt_power.readOp.dynamic/executionTime << " W" << endl; - cout <<endl; - } + undifferentiated_core = (0.4109 * pipeline_stage - 0.776) * + undifferentiated_core_coe; + undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426); + } + undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff * + 1e6;//change from mm^2 to um^2 + core_tx_density = g_tp.scaling_factor.core_tx_density; + power.readOp.leakage = undifferentiated_core*(core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd;//unit W + power.readOp.gate_leakage = undifferentiated_core*(core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_, 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1, inv)*g_tp.peri_global.Vdd; + + double long_channel_device_reduction = longer_channel_device_reduction(Core_device, coredynp.core_ty); + power.readOp.longer_channel_leakage = + power.readOp.leakage * long_channel_device_reduction; + area.set_area(undifferentiated_core); + + scktRatio = g_tp.sckt_co_eff; + power.readOp.dynamic *= scktRatio; + power.writeOp.dynamic *= scktRatio; + power.searchOp.dynamic *= scktRatio; + macro_PR_overhead = g_tp.macro_layout_overhead; + area.set_area(area.get_area()*macro_PR_overhead); + + output_data.area = area.get_area() / 1e6; + output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; + output_data.subthreshold_leakage_power = + longer_channel_device ? power.readOp.longer_channel_leakage : + power.readOp.leakage; + output_data.gate_leakage_power = power.readOp.gate_leakage; } -inst_decoder::inst_decoder( - bool _is_default, - const InputParameter *configure_interface, - int opcode_length_, - int num_decoders_, - bool x86_, - enum Device_ty device_ty_, - enum Core_type core_ty_) -:is_default(_is_default), - opcode_length(opcode_length_), - num_decoders(num_decoders_), - x86(x86_), - device_ty(device_ty_), - core_ty(core_ty_) - { - /* - * Instruction decoder is different from n to 2^n decoders - * that are commonly used in row decoders in memory arrays. - * The RISC instruction decoder is typically a very simple device. - * We can decode an instruction by simply - * separating the machine word into small parts using wire slices - * The RISC instruction decoder can be approximate by the n to 2^n decoders, - * although this approximation usually underestimate power since each decoded - * instruction normally has more than 1 active signal. - * - * However, decoding a CISC instruction word is much more difficult - * than the RISC case. A CISC decoder is typically set up as a state machine. - * The machine reads the opcode field to determine - * what type of instruction it is, - * and where the other data values are. - * The instruction word is read in piece by piece, - * and decisions are made at each stage as to - * how the remainder of the instruction word will be read. - * (sequencer and ROM are usually needed) - * An x86 decoder can be even more complex since - * it involve both decoding instructions into u-ops and - * merge u-ops when doing micro-ops fusion. - */ - bool is_dram=false; - double pmos_to_nmos_sizing_r; - double load_nmos_width, load_pmos_width; - double C_driver_load, R_wire_load; - Area cell; - - l_ip=*configure_interface; - local_result = init_interface(&l_ip); - cell.h =g_tp.cell_h_def; - cell.w =g_tp.cell_h_def; - - num_decoder_segments = (int)ceil(opcode_length/18.0); - if (opcode_length > 18) opcode_length = 18; - num_decoded_signals= (int)pow(2.0,opcode_length); - pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); - load_nmos_width=g_tp.max_w_nmos_ /2; - load_pmos_width= g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; - C_driver_load = 1024*gate_C(load_nmos_width + load_pmos_width, 0, is_dram); //TODO: this number 1024 needs to be revisited - R_wire_load = 3000*l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um; - - final_dec = new Decoder( - num_decoded_signals, - false, - C_driver_load, - R_wire_load, - false/*is_fa*/, - false/*is_dram*/, - false/*wl_tr*/, //to use peri device - cell); - - PredecBlk * predec_blk1 = new PredecBlk( - num_decoded_signals, - final_dec, - 0,//Assuming predec and dec are back to back - 0, - 1,//Each Predec only drives one final dec - false/*is_dram*/, - true); - PredecBlk * predec_blk2 = new PredecBlk( - num_decoded_signals, - final_dec, - 0,//Assuming predec and dec are back to back - 0, - 1,//Each Predec only drives one final dec - false/*is_dram*/, - false); - - PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); - PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false); - - pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2); - - double area_decoder = final_dec->area.get_area() * num_decoded_signals * num_decoder_segments*num_decoders; - //double w_decoder = area_decoder / area.get_h(); - double area_pre_dec = (predec_blk_drv1->area.get_area() + - predec_blk_drv2->area.get_area() + - predec_blk1->area.get_area() + - predec_blk2->area.get_area())* - num_decoder_segments*num_decoders; - area.set_area(area.get_area()+ area_decoder + area_pre_dec); - double macro_layout_overhead = g_tp.macro_layout_overhead; - double chip_PR_overhead = g_tp.chip_layout_overhead; - area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead); - - inst_decoder_delay_power(); - - double sckRation = g_tp.sckt_co_eff; - power.readOp.dynamic *= sckRation; - power.writeOp.dynamic *= sckRation; - power.searchOp.dynamic *= sckRation; - - double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); - power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; - +InstructionDecoder::InstructionDecoder(XMLNode* _xml_data, const string _name, + bool _is_default, + const InputParameter *configure_interface, + int opcode_length_, int num_decoders_, + bool x86_, + double clockRate_, + enum Device_ty device_ty_, + enum Core_type core_ty_) + : McPATComponent(_xml_data), is_default(_is_default), + opcode_length(opcode_length_), num_decoders(num_decoders_), x86(x86_), + device_ty(device_ty_), core_ty(core_ty_) { + /* + * Instruction decoder is different from n to 2^n decoders + * that are commonly used in row decoders in memory arrays. + * The RISC instruction decoder is typically a very simple device. + * We can decode an instruction by simply + * separating the machine word into small parts using wire slices + * The RISC instruction decoder can be approximate by the n to 2^n decoders, + * although this approximation usually underestimate power since each decoded + * instruction normally has more than 1 active signal. + * + * However, decoding a CISC instruction word is much more difficult + * than the RISC case. A CISC decoder is typically set up as a state machine. + * The machine reads the opcode field to determine + * what type of instruction it is, + * and where the other data values are. + * The instruction word is read in piece by piece, + * and decisions are made at each stage as to + * how the remainder of the instruction word will be read. + * (sequencer and ROM are usually needed) + * An x86 decoder can be even more complex since + * it involve both decoding instructions into u-ops and + * merge u-ops when doing micro-ops fusion. + */ + name = _name; + clockRate = clockRate_; + bool is_dram = false; + double pmos_to_nmos_sizing_r; + double load_nmos_width, load_pmos_width; + double C_driver_load, R_wire_load; + Area cell; + + l_ip = *configure_interface; + local_result = init_interface(&l_ip, name); + cell.h = g_tp.cell_h_def; + cell.w = g_tp.cell_h_def; + + num_decoder_segments = (int)ceil(opcode_length / 18.0); + if (opcode_length > 18) opcode_length = 18; + num_decoded_signals = (int)pow(2.0, opcode_length); + pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio(); + load_nmos_width = g_tp.max_w_nmos_ / 2; + load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r; + C_driver_load = 1024 * gate_C(load_nmos_width + load_pmos_width, 0, is_dram); + R_wire_load = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um; + + final_dec = new Decoder( + num_decoded_signals, + false, + C_driver_load, + R_wire_load, + false/*is_fa*/, + false/*is_dram*/, + false/*wl_tr*/, //to use peri device + cell); + + PredecBlk * predec_blk1 = new PredecBlk( + num_decoded_signals, + final_dec, + 0,//Assuming predec and dec are back to back + 0, + 1,//Each Predec only drives one final dec + false/*is_dram*/, + true); + PredecBlk * predec_blk2 = new PredecBlk( + num_decoded_signals, + final_dec, + 0,//Assuming predec and dec are back to back + 0, + 1,//Each Predec only drives one final dec + false/*is_dram*/, + false); + + PredecBlkDrv * predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false); + PredecBlkDrv * predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false); + + pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2); + + double area_decoder = final_dec->area.get_area() * num_decoded_signals * + num_decoder_segments * num_decoders; + //double w_decoder = area_decoder / area.get_h(); + double area_pre_dec = (predec_blk_drv1->area.get_area() + + predec_blk_drv2->area.get_area() + + predec_blk1->area.get_area() + + predec_blk2->area.get_area()) * + num_decoder_segments * num_decoders; + area.set_area(area.get_area() + area_decoder + area_pre_dec); + double macro_layout_overhead = g_tp.macro_layout_overhead; + double chip_PR_overhead = g_tp.chip_layout_overhead; + area.set_area(area.get_area()*macro_layout_overhead*chip_PR_overhead); + + inst_decoder_delay_power(); + + double sckRation = g_tp.sckt_co_eff; + power.readOp.dynamic *= sckRation; + power.writeOp.dynamic *= sckRation; + power.searchOp.dynamic *= sckRation; + + double long_channel_device_reduction = + longer_channel_device_reduction(device_ty, core_ty); + power.readOp.longer_channel_leakage = power.readOp.leakage * + long_channel_device_reduction; + + output_data.area = area.get_area() / 1e6; + output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; + output_data.subthreshold_leakage_power = power.readOp.leakage; + output_data.gate_leakage_power = power.readOp.gate_leakage; } -void inst_decoder::inst_decoder_delay_power() -{ +void InstructionDecoder::inst_decoder_delay_power() { - double dec_outrisetime; - double inrisetime=0, outrisetime; - double pppm_t[4] = {1,1,1,1}; - double squencer_passes = x86?2:1; + double dec_outrisetime; + double inrisetime = 0, outrisetime; + double pppm_t[4] = {1, 1, 1, 1}; + double squencer_passes = x86 ? 2 : 1; - outrisetime = pre_dec->compute_delays(inrisetime); - dec_outrisetime = final_dec->compute_delays(outrisetime); - set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); - power = power + pre_dec->power*pppm_t; + outrisetime = pre_dec->compute_delays(inrisetime); + dec_outrisetime = final_dec->compute_delays(outrisetime); + set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments, squencer_passes*num_decoder_segments, num_decoder_segments); + power = power + pre_dec->power * pppm_t; set_pppm(pppm_t, squencer_passes*num_decoder_segments, num_decoder_segments*num_decoded_signals, - num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); - power = power + final_dec->power*pppm_t; + num_decoder_segments*num_decoded_signals, squencer_passes*num_decoder_segments); + power = power + final_dec->power * pppm_t; } -void inst_decoder::leakage_feedback(double temperature) -{ + +void InstructionDecoder::leakage_feedback(double temperature) { l_ip.temp = (unsigned int)round(temperature/10.0)*10; - uca_org_t init_result = init_interface(&l_ip); // init_result is dummy + uca_org_t init_result = init_interface(&l_ip, name); // init_result is dummy final_dec->leakage_feedback(temperature); pre_dec->leakage_feedback(temperature); @@ -1000,15 +945,14 @@ void inst_decoder::leakage_feedback(double temperature) power.readOp.longer_channel_leakage = power.readOp.leakage*long_channel_device_reduction; } -inst_decoder::~inst_decoder() -{ - local_result.cleanup(); +InstructionDecoder::~InstructionDecoder() { + local_result.cleanup(); - delete final_dec; + delete final_dec; - delete pre_dec->blk1; - delete pre_dec->blk2; - delete pre_dec->drv1; - delete pre_dec->drv2; - delete pre_dec; + delete pre_dec->blk1; + delete pre_dec->blk2; + delete pre_dec->drv1; + delete pre_dec->drv2; + delete pre_dec; } |