diff options
Diffstat (limited to 'ext/mcpat/array.cc')
-rw-r--r-- | ext/mcpat/array.cc | 386 |
1 files changed, 198 insertions, 188 deletions
diff --git a/ext/mcpat/array.cc b/ext/mcpat/array.cc index 975f82fad..0e46afe03 100644 --- a/ext/mcpat/array.cc +++ b/ext/mcpat/array.cc @@ -2,6 +2,7 @@ * McPAT * SOFTWARE LICENSE AGREEMENT * Copyright 2012 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2010-2013 Advanced Micro Devices, Inc. * All Rights Reserved * * Redistribution and use in source and binary forms, with or without @@ -25,232 +26,242 @@ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ***************************************************************************/ -#define GLOBALVAR -#include <cassert> -#include <cmath> #include <iostream> +#include <math.h> #include "area.h" #include "array.h" +#include "common.h" #include "decoder.h" -#include "globalvar.h" #include "parameter.h" using namespace std; -ArrayST::ArrayST(const InputParameter *configure_interface, - string _name, - enum Device_ty device_ty_, - bool opt_local_, - enum Core_type core_ty_, - bool _is_default) -:l_ip(*configure_interface), - name(_name), - device_ty(device_ty_), - opt_local(opt_local_), - core_ty(core_ty_), - is_default(_is_default) - { - - if (l_ip.cache_sz<64) l_ip.cache_sz=64; - l_ip.error_checking();//not only do the error checking but also fill some missing parameters - optimize_array(); +double ArrayST::area_efficiency_threshold = 20.0; +int ArrayST::ed = 0; +//Fixed number, make sure timing can be satisfied. +int ArrayST::delay_wt = 100; +int ArrayST::cycle_time_wt = 1000; +//Fixed number, This is used to exhaustive search for individual components. +int ArrayST::area_wt = 10; +//Fixed number, This is used to exhaustive search for individual components. +int ArrayST::dynamic_power_wt = 10; +int ArrayST::leakage_power_wt = 10; +//Fixed number, make sure timing can be satisfied. +int ArrayST::delay_dev = 1000000; +int ArrayST::cycle_time_dev = 100; +//Fixed number, This is used to exhaustive search for individual components. +int ArrayST::area_dev = 1000000; +//Fixed number, This is used to exhaustive search for individual components. +int ArrayST::dynamic_power_dev = 1000000; +int ArrayST::leakage_power_dev = 1000000; +int ArrayST::cycle_time_dev_threshold = 10; + + +ArrayST::ArrayST(XMLNode* _xml_data, + const InputParameter *configure_interface, string _name, + enum Device_ty device_ty_, double _clockRate, + bool opt_local_, enum Core_type core_ty_, bool _is_default) + : McPATComponent(_xml_data), l_ip(*configure_interface), + device_ty(device_ty_), opt_local(opt_local_), core_ty(core_ty_), + is_default(_is_default) { + name = _name; + clockRate = _clockRate; + if (l_ip.cache_sz < MIN_BUFFER_SIZE) + l_ip.cache_sz = MIN_BUFFER_SIZE; + + if (!l_ip.error_checking(name)) { + exit(1); + } -} + output_data.reset(); + computeEnergy(); + computeArea(); +} -void ArrayST::compute_base_power() - { - //l_ip.out_w =l_ip.line_sz*8; - local_result=cacti_interface(&l_ip); +void ArrayST::compute_base_power() { + local_result = cacti_interface(&l_ip); +} - } +void ArrayST::computeArea() { + area.set_area(local_result.area); + output_data.area = local_result.area / 1e6; +} -void ArrayST::optimize_array() -{ - list<uca_org_t > candidate_solutions(0); - list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter; +void ArrayST::computeEnergy() { + list<uca_org_t > candidate_solutions(0); + list<uca_org_t >::iterator candidate_iter, min_dynamic_energy_iter; - uca_org_t * temp_res = 0; - local_result.valid=false; + uca_org_t* temp_res = NULL; + local_result.valid = false; - double throughput=l_ip.throughput, latency=l_ip.latency; - double area_efficiency_threshold = 20.0; - bool throughput_overflow=true, latency_overflow=true; - compute_base_power(); + double throughput = l_ip.throughput; + double latency = l_ip.latency; + bool throughput_overflow = true; + bool latency_overflow = true; + compute_base_power(); - if ((local_result.cycle_time - throughput) <= 1e-10 ) - throughput_overflow=false; - if ((local_result.access_time - latency)<= 1e-10) - latency_overflow=false; + if ((local_result.cycle_time - throughput) <= 1e-10 ) + throughput_overflow = false; + if ((local_result.access_time - latency) <= 1e-10) + latency_overflow = false; - if (opt_for_clk && opt_local) - { - if (throughput_overflow || latency_overflow) - { - l_ip.ed=0; + if (opt_for_clk && opt_local) { + if (throughput_overflow || latency_overflow) { + l_ip.ed = ed; - l_ip.delay_wt = 100;//Fixed number, make sure timing can be satisfied. - l_ip.cycle_time_wt = 1000; + l_ip.delay_wt = delay_wt; + l_ip.cycle_time_wt = cycle_time_wt; - l_ip.area_wt = 10;//Fixed number, This is used to exhaustive search for individual components. - l_ip.dynamic_power_wt = 10;//Fixed number, This is used to exhaustive search for individual components. - l_ip.leakage_power_wt = 10; + l_ip.area_wt = area_wt; + l_ip.dynamic_power_wt = dynamic_power_wt; + l_ip.leakage_power_wt = leakage_power_wt; - l_ip.delay_dev = 1000000;//Fixed number, make sure timing can be satisfied. - l_ip.cycle_time_dev = 100; + l_ip.delay_dev = delay_dev; + l_ip.cycle_time_dev = cycle_time_dev; - l_ip.area_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components. - l_ip.dynamic_power_dev = 1000000;//Fixed number, This is used to exhaustive search for individual components. - l_ip.leakage_power_dev = 1000000; + l_ip.area_dev = area_dev; + l_ip.dynamic_power_dev = dynamic_power_dev; + l_ip.leakage_power_dev = leakage_power_dev; - throughput_overflow=true; //Reset overflow flag before start optimization iterations - latency_overflow=true; + //Reset overflow flag before start optimization iterations + throughput_overflow = true; + latency_overflow = true; - temp_res = &local_result; //Clean up the result for optimized for ED^2P - temp_res->cleanup(); - } + //Clean up the result for optimized for ED^2P + temp_res = &local_result; + temp_res->cleanup(); + } - while ((throughput_overflow || latency_overflow)&&l_ip.cycle_time_dev > 10)// && l_ip.delay_dev > 10 - { - compute_base_power(); - - l_ip.cycle_time_dev-=10;//This is the time_dev to be used for next iteration - - // from best area to worst area -->worst timing to best timing - if ((((local_result.cycle_time - throughput) <= 1e-10 ) && (local_result.access_time - latency)<= 1e-10)|| - (local_result.data_array2->area_efficiency < area_efficiency_threshold && l_ip.assoc == 0)) - { //if no satisfiable solution is found,the most aggressive one is left - candidate_solutions.push_back(local_result); - //output_data_csv(candidate_solutions.back()); - if (((local_result.cycle_time - throughput) <= 1e-10) && ((local_result.access_time - latency)<= 1e-10)) - //ensure stop opt not because of cam - { - throughput_overflow=false; - latency_overflow=false; - } - - } - else - { - //TODO: whether checking the partial satisfied results too, or just change the mark??? - if ((local_result.cycle_time - throughput) <= 1e-10) - throughput_overflow=false; - if ((local_result.access_time - latency)<= 1e-10) - latency_overflow=false; - - if (l_ip.cycle_time_dev > 10) - { //if not >10 local_result is the last result, it cannot be cleaned up - temp_res = &local_result; //Only solutions not saved in the list need to be cleaned up - temp_res->cleanup(); - } - } -// l_ip.cycle_time_dev-=10; -// l_ip.delay_dev-=10; + while ((throughput_overflow || latency_overflow) && + l_ip.cycle_time_dev > cycle_time_dev_threshold) { + compute_base_power(); + + //This is the time_dev to be used for next iteration + l_ip.cycle_time_dev -= cycle_time_dev_threshold; + + // from best area to worst area -->worst timing to best timing + if ((((local_result.cycle_time - throughput) <= 1e-10 ) && + (local_result.access_time - latency) <= 1e-10) || + (local_result.data_array2->area_efficiency < + area_efficiency_threshold && l_ip.assoc == 0)) { + //if no satisfiable solution is found,the most aggressive one + //is left + candidate_solutions.push_back(local_result); + if (((local_result.cycle_time - throughput) <= 1e-10) && + ((local_result.access_time - latency) <= 1e-10)) { + //ensure stop opt not because of cam + throughput_overflow = false; + latency_overflow = false; + } + } else { + if ((local_result.cycle_time - throughput) <= 1e-10) + throughput_overflow = false; + if ((local_result.access_time - latency) <= 1e-10) + latency_overflow = false; + + //if not >10 local_result is the last result, it cannot be + //cleaned up + if (l_ip.cycle_time_dev > cycle_time_dev_threshold) { + //Only solutions not saved in the list need to be + //cleaned up + temp_res = &local_result; + temp_res->cleanup(); } + } + } - if (l_ip.assoc > 0) - { - //For array structures except CAM and FA, Give warning but still provide a result with best timing found - if (throughput_overflow==true) - cout<< "Warning: " << name<<" array structure cannot satisfy throughput constraint." << endl; - if (latency_overflow==true) - cout<< "Warning: " << name<<" array structure cannot satisfy latency constraint." << endl; + if (l_ip.assoc > 0) { + //For array structures except CAM and FA, Give warning but still + //provide a result with best timing found + if (throughput_overflow == true) + cout << "Warning: " << name + << " array structure cannot satisfy throughput constraint." + << endl; + if (latency_overflow == true) + cout << "Warning: " << name + << " array structure cannot satisfy latency constraint." + << endl; } -// else -// { -// /*According to "Content-Addressable Memory (CAM) Circuits and -// Architectures": A Tutorial and Survey -// by Kostas Pagiamtzis et al. -// CAM structures can be heavily pipelined and use look-ahead techniques, -// therefore timing can be relaxed. But McPAT does not model the advanced -// techniques. If continue optimizing, the area efficiency will be too low -// */ -// //For CAM and FA, stop opt if area efficiency is too low -// if (throughput_overflow==true) -// cout<< "Warning: " <<" McPAT stopped optimization on throughput for "<< name -// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl; -// if (latency_overflow==true) -// cout<< "Warning: " <<" McPAT stopped optimization on latency for "<< name -// <<" array structure because its area efficiency is below "<<area_efficiency_threshold<<"% " << endl; -// } - - //double min_dynamic_energy, min_dynamic_power, min_leakage_power, min_cycle_time; - double min_dynamic_energy=BIGNUM; - if (candidate_solutions.empty()==false) - { - local_result.valid=true; - for (candidate_iter = candidate_solutions.begin(); candidate_iter != candidate_solutions.end(); ++candidate_iter) - - { - if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic) - { - min_dynamic_energy = (candidate_iter)->power.readOp.dynamic; - min_dynamic_energy_iter = candidate_iter; - local_result = *(min_dynamic_energy_iter); - //TODO: since results are reordered results and l_ip may miss match. Therefore, the final output spread sheets may show the miss match. - - } - else - { - candidate_iter->cleanup() ; - } - - } + double min_dynamic_energy = BIGNUM; + if (candidate_solutions.empty() == false) { + local_result.valid = true; + for (candidate_iter = candidate_solutions.begin(); + candidate_iter != candidate_solutions.end(); + ++candidate_iter) { + if (min_dynamic_energy > + (candidate_iter)->power.readOp.dynamic) { + min_dynamic_energy = + (candidate_iter)->power.readOp.dynamic; + min_dynamic_energy_iter = candidate_iter; + local_result = *(min_dynamic_energy_iter); + } else { + candidate_iter->cleanup() ; + } + } - } - candidate_solutions.clear(); - } - double long_channel_device_reduction = longer_channel_device_reduction(device_ty,core_ty); - - double macro_layout_overhead = g_tp.macro_layout_overhead; - double chip_PR_overhead = g_tp.chip_layout_overhead; - double total_overhead = macro_layout_overhead*chip_PR_overhead; - local_result.area *= total_overhead; - - //maintain constant power density - double pppm_t[4] = {total_overhead,1,1,total_overhead}; - - double sckRation = g_tp.sckt_co_eff; - local_result.power.readOp.dynamic *= sckRation; - local_result.power.writeOp.dynamic *= sckRation; - local_result.power.searchOp.dynamic *= sckRation; - local_result.power.readOp.leakage *= l_ip.nbanks; - local_result.power.readOp.longer_channel_leakage = - local_result.power.readOp.leakage*long_channel_device_reduction; - local_result.power = local_result.power* pppm_t; - - local_result.data_array2->power.readOp.dynamic *= sckRation; - local_result.data_array2->power.writeOp.dynamic *= sckRation; - local_result.data_array2->power.searchOp.dynamic *= sckRation; - local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; - local_result.data_array2->power.readOp.longer_channel_leakage = - local_result.data_array2->power.readOp.leakage*long_channel_device_reduction; - local_result.data_array2->power = local_result.data_array2->power* pppm_t; - - - if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) - { - local_result.tag_array2->power.readOp.dynamic *= sckRation; - local_result.tag_array2->power.writeOp.dynamic *= sckRation; - local_result.tag_array2->power.searchOp.dynamic *= sckRation; - local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; - local_result.tag_array2->power.readOp.longer_channel_leakage = - local_result.tag_array2->power.readOp.leakage*long_channel_device_reduction; - local_result.tag_array2->power = local_result.tag_array2->power* pppm_t; } + candidate_solutions.clear(); + } + double long_channel_device_reduction = + longer_channel_device_reduction(device_ty, core_ty); + + double macro_layout_overhead = g_tp.macro_layout_overhead; + double chip_PR_overhead = g_tp.chip_layout_overhead; + double total_overhead = macro_layout_overhead * chip_PR_overhead; + local_result.area *= total_overhead; + + //maintain constant power density + double pppm_t[4] = {total_overhead, 1, 1, total_overhead}; + + double sckRation = g_tp.sckt_co_eff; + local_result.power.readOp.dynamic *= sckRation; + local_result.power.writeOp.dynamic *= sckRation; + local_result.power.searchOp.dynamic *= sckRation; + local_result.power.readOp.leakage *= l_ip.nbanks; + local_result.power.readOp.longer_channel_leakage = + local_result.power.readOp.leakage * long_channel_device_reduction; + local_result.power = local_result.power * pppm_t; + + local_result.data_array2->power.readOp.dynamic *= sckRation; + local_result.data_array2->power.writeOp.dynamic *= sckRation; + local_result.data_array2->power.searchOp.dynamic *= sckRation; + local_result.data_array2->power.readOp.leakage *= l_ip.nbanks; + local_result.data_array2->power.readOp.longer_channel_leakage = + local_result.data_array2->power.readOp.leakage * + long_channel_device_reduction; + local_result.data_array2->power = local_result.data_array2->power * pppm_t; + + + if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) { + local_result.tag_array2->power.readOp.dynamic *= sckRation; + local_result.tag_array2->power.writeOp.dynamic *= sckRation; + local_result.tag_array2->power.searchOp.dynamic *= sckRation; + local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks; + local_result.tag_array2->power.readOp.longer_channel_leakage = + local_result.tag_array2->power.readOp.leakage * + long_channel_device_reduction; + local_result.tag_array2->power = + local_result.tag_array2->power * pppm_t; + } + power = local_result.power; + + output_data.peak_dynamic_power = power.readOp.dynamic * clockRate; + output_data.subthreshold_leakage_power = power.readOp.leakage; + output_data.gate_leakage_power = power.readOp.gate_leakage; } void ArrayST::leakage_feedback(double temperature) @@ -296,7 +307,6 @@ void ArrayST::leakage_feedback(double temperature) } } -ArrayST:: ~ArrayST() -{ - local_result.cleanup(); +ArrayST::~ArrayST() { + local_result.cleanup(); } |