diff options
Diffstat (limited to 'ext/mcpat/cacti/nuca.cc')
-rw-r--r-- | ext/mcpat/cacti/nuca.cc | 612 |
1 files changed, 612 insertions, 0 deletions
diff --git a/ext/mcpat/cacti/nuca.cc b/ext/mcpat/cacti/nuca.cc new file mode 100644 index 000000000..2aabe843f --- /dev/null +++ b/ext/mcpat/cacti/nuca.cc @@ -0,0 +1,612 @@ +/***************************************************************************** + * McPAT/CACTI + * SOFTWARE LICENSE AGREEMENT + * Copyright 2012 Hewlett-Packard Development Company, L.P. + * All Rights Reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.” + * + ***************************************************************************/ + + + +#include <cassert> + +#include "Ucache.h" +#include "nuca.h" + +unsigned int MIN_BANKSIZE=65536; +#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */ +#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */ +#define CONTR_2_BANK_LAT 0 + +int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */]; + + Nuca::Nuca( + TechnologyParameter::DeviceType *dt = &(g_tp.peri_global) + ):deviceType(dt) +{ + init_cont(); +} + +void +Nuca::init_cont() +{ + FILE *cont; + char line[5000]; + char jk[5000]; + cont = fopen("contention.dat", "r"); + if (!cont) { + cout << "contention.dat file is missing!\n"; + exit(0); + } + + for(int i=0; i<2; i++) { + for(int j=2; j<5; j++) { + for(int k=0; k<ROUTER_TYPES; k++) { + for(int l=0;l<7; l++) { + int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/]; + assert(fscanf(cont, "%[^\n]\n", line) != EOF); + sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3], + &temp[4], &temp[5], &temp[6], &temp[7]); + } + } + } + } + fclose(cont); +} + + void +Nuca::print_cont_stats() +{ + for(int i=0; i<2; i++) { + for(int j=2; j<5; j++) { + for(int k=0; k<ROUTER_TYPES; k++) { + for(int l=0;l<7; l++) { + for(int m=0;l<7; l++) { + cout << cont_stats[i][j][k][l][m] << " "; + } + cout << endl; + } + } + } + } + cout << endl; +} + +Nuca::~Nuca(){ + for (int i = wt_min; i <= wt_max; i++) { + delete wire_vertical[i]; + delete wire_horizontal[i]; + } +} + +/* converts latency (in s) to cycles depending upon the FREQUENCY (in GHz) */ + int +Nuca::calc_cycles(double lat, double oper_freq) +{ + //TODO: convert latch delay to FO4 */ + double cycle_time = (1.0/(oper_freq*1e9)); /*s*/ + cycle_time -= LATCH_DELAY; + cycle_time -= FIXED_OVERHEAD; + + return (int)ceil(lat/cycle_time); +} + + +nuca_org_t::~nuca_org_t() { + // if(h_wire) delete h_wire; + // if(v_wire) delete v_wire; + // if(router) delete router; +} + +/* + * Version - 6.0 + * + * Perform exhaustive search across different bank organizatons, + * router configurations, grid organizations, and wire models and + * find an optimal NUCA organization + * For different bank count values + * 1. Optimal bank organization is calculated + * 2. For each bank organization, find different NUCA organizations + * using various router configurations, grid organizations, + * and wire models. + * 3. NUCA model with the least cost is picked for + * this particular bank count + * Finally include contention statistics and find the optimal + * NUCA configuration + */ + void +Nuca::sim_nuca() +{ + /* temp variables */ + int it, ro, wr; + int num_cyc; + unsigned int i, j, k; + unsigned int r, c; + int l2_c; + int bank_count = 0; + uca_org_t ures; + nuca_org_t *opt_n; + mem_array tag, data; + list<nuca_org_t *> nuca_list; + Router *router_s[ROUTER_TYPES]; + router_s[0] = new Router(64.0, 8, 4, &(g_tp.peri_global)); + router_s[0]->print_router(); + router_s[1] = new Router(128.0, 8, 4, &(g_tp.peri_global)); + router_s[1]->print_router(); + router_s[2] = new Router(256.0, 8, 4, &(g_tp.peri_global)); + router_s[2]->print_router(); + + int core_in; // to store no. of cores + + /* to search diff grid organizations */ + double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat, + curr_acclat; + double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power, + avg_leakage_power; + + double opt_acclat = INF, opt_avg_lat = INF, opt_tot_lat = INF; + int opt_rows = 0; + int opt_columns = 0; + double opt_totno_hops = 0; + double opt_avg_hop = 0; + double opt_dyn_power = 0, opt_leakage_power = 0; + min_values_t minval; + + int bank_start = 0; + + int flit_width = 0; + + /* vertical and horizontal hop latency values */ + int ver_hop_lat, hor_hop_lat; /* in cycles */ + + + /* no. of different bank sizes to consider */ + int iterations; + + + g_ip->nuca_cache_sz = g_ip->cache_sz; + nuca_list.push_back(new nuca_org_t()); + + if (g_ip->cache_level == 0) l2_c = 1; + else l2_c = 0; + + if (g_ip->cores <= 4) core_in = 2; + else if (g_ip->cores <= 8) core_in = 3; + else if (g_ip->cores <= 16) core_in = 4; + else {cout << "Number of cores should be <= 16!\n"; exit(0);} + + + // set the lower bound to an appropriate value. this depends on cache associativity + if (g_ip->assoc > 2) { + i = 2; + while (i != g_ip->assoc) { + MIN_BANKSIZE *= 2; + i *= 2; + } + } + + iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE); + + if (g_ip->force_wiretype) + { + if (g_ip->wt == Low_swing) { + wt_min = Low_swing; + wt_max = Low_swing; + } + else { + wt_min = Global; + wt_max = Low_swing-1; + } + } + else { + wt_min = Global; + wt_max = Low_swing; + } + if (g_ip->nuca_bank_count != 0) { // simulate just one bank + if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 && + g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 && + g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) { + fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n"); + } + bank_start = (int)logtwo((double)g_ip->nuca_bank_count); + iterations = bank_start+1; + g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count; + } + cout << "Simulating various NUCA configurations\n"; + for (it=bank_start; it<iterations; it++) { /* different bank count values */ + ures.tag_array2 = &tag; + ures.data_array2 = &data; + /* + * find the optimal bank organization + */ + solve(&ures); +// output_UCA(&ures); + bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz; + cout << "====" << g_ip->cache_sz << "\n"; + + for (wr=wt_min; wr<=wt_max; wr++) { + + for (ro=0; ro<ROUTER_TYPES; ro++) + { + flit_width = (int) router_s[ro]->flit_size; //initialize router + nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time; + + /* calculate router and wire parameters */ + + double vlength = ures.cache_ht; /* length of the wire (u)*/ + double hlength = ures.cache_len; // u + + /* find delay, area, and power for wires */ + wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength); + wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength); + + + hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); + ver_hop_lat = calc_cycles(wire_vertical[wr]->delay, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); + + /* + * assume a grid like topology and explore for optimal network + * configuration using different row and column count values. + */ + for (c=1; c<=(unsigned int)bank_count; c++) { + while (bank_count%c != 0) c++; + r = bank_count/c; + + /* + * to find the avg access latency of a NUCA cache, uncontended + * access time to each bank from the + * cache controller is calculated. + * avg latency = + * sum of the access latencies to individual banks)/bank + * count value. + */ + totno_hops = totno_hhops = totno_vhops = tot_lat = 0; + k = 1; + for (i=0; i<r; i++) { + for (j=0; j<c; j++) { + /* + * vertical hops including the + * first hop from the cache controller + */ + curr_hop = i + 1; + curr_hop += j; /* horizontal hops */ + totno_hhops += j; + totno_vhops += (i+1); + curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT + + j * hor_hop_lat); + + tot_lat += curr_acclat; + totno_hops += curr_hop; + } + } + avg_lat = tot_lat/bank_count; + avg_hop = totno_hops/bank_count; + avg_hhop = totno_hhops/bank_count; + avg_vhop = totno_vhops/bank_count; + + /* net access latency */ + curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) + + calc_cycles(ures.access_time, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001)); + + /* avg access lat of nuca */ + avg_dyn_power = + avg_hop * + (router_s[ro]->power.readOp.dynamic) + avg_hhop * + (wire_horizontal[wr]->power.readOp.dynamic) * + (g_ip->block_sz*8 + 64) + avg_vhop * + (wire_vertical[wr]->power.readOp.dynamic) * + (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic; + + avg_leakage_power = + bank_count * router_s[ro]->power.readOp.leakage + + avg_hhop * (wire_horizontal[wr]->power.readOp.leakage* + wire_horizontal[wr]->delay) * flit_width + + avg_vhop * (wire_vertical[wr]->power.readOp.leakage * + wire_horizontal[wr]->delay); + + if (curr_acclat < opt_acclat) { + opt_acclat = curr_acclat; + opt_tot_lat = tot_lat; + opt_avg_lat = avg_lat; + opt_totno_hops = totno_hops; + opt_avg_hop = avg_hop; + opt_rows = r; + opt_columns = c; + opt_dyn_power = avg_dyn_power; + opt_leakage_power = avg_leakage_power; + } + totno_hops = 0; + tot_lat = 0; + totno_hhops = 0; + totno_vhops = 0; + } + nuca_list.back()->wire_pda.power.readOp.dynamic = + opt_avg_hop * flit_width * + (wire_horizontal[wr]->power.readOp.dynamic + + wire_vertical[wr]->power.readOp.dynamic); + nuca_list.back()->avg_hops = opt_avg_hop; + /* network delay/power */ + nuca_list.back()->h_wire = wire_horizontal[wr]; + nuca_list.back()->v_wire = wire_vertical[wr]; + nuca_list.back()->router = router_s[ro]; + /* bank delay/power */ + + nuca_list.back()->bank_pda.delay = ures.access_time; + nuca_list.back()->bank_pda.power = ures.power; + nuca_list.back()->bank_pda.area.h = ures.cache_ht; + nuca_list.back()->bank_pda.area.w = ures.cache_len; + nuca_list.back()->bank_pda.cycle_time = ures.cycle_time; + + num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/, + 1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/)); + if(num_cyc%2 != 0) num_cyc++; + if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles + + if (it < 7) { + nuca_list.back()->nuca_pda.delay = opt_acclat + + cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; + nuca_list.back()->contention = + cont_stats[l2_c][core_in][ro][it][num_cyc/2-1]; + } + else { + nuca_list.back()->nuca_pda.delay = opt_acclat + + cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; + nuca_list.back()->contention = + cont_stats[l2_c][core_in][ro][7][num_cyc/2-1]; + } + nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power; + nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power; + + /* array organization */ + nuca_list.back()->bank_count = bank_count; + nuca_list.back()->rows = opt_rows; + nuca_list.back()->columns = opt_columns; + calculate_nuca_area (nuca_list.back()); + + minval.update_min_values(nuca_list.back()); + nuca_list.push_back(new nuca_org_t()); + opt_acclat = BIGNUM; + + } + } + g_ip->cache_sz /= 2; + } + + delete(nuca_list.back()); + nuca_list.pop_back(); + opt_n = find_optimal_nuca(&nuca_list, &minval); + print_nuca(opt_n); + g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count; + + list<nuca_org_t *>::iterator niter; + for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter) + { + delete *niter; + } + nuca_list.clear(); + + for(int i=0; i < ROUTER_TYPES; i++) + { + delete router_s[i]; + } + g_ip->display_ip(); + // g_ip->force_cache_config = true; + // g_ip->ndwl = 8; + // g_ip->ndbl = 16; + // g_ip->nspd = 4; + // g_ip->ndcm = 1; + // g_ip->ndsam1 = 8; + // g_ip->ndsam2 = 32; + +} + + + void +Nuca::print_nuca (nuca_org_t *fr) +{ + printf("\n---------- CACTI version 6.5, Non-uniform Cache Access " + "----------\n\n"); + printf("Optimal number of banks - %d\n", fr->bank_count); + printf("Grid organization rows x columns - %d x %d\n", + fr->rows, fr->columns); + printf("Network frequency - %g GHz\n", + (1/fr->nuca_pda.cycle_time)*1e3); + printf("Cache dimension (mm x mm) - %g x %g\n", + fr->nuca_pda.area.h, + fr->nuca_pda.area.w); + + fr->router->print_router(); + + printf("\n\nWire stats:\n"); + if (fr->h_wire->wt == Global) { + printf("\tWire type - Full swing global wires with least " + "possible delay\n"); + } + else if (fr->h_wire->wt == Global_5) { + printf("\tWire type - Full swing global wires with " + "5%% delay penalty\n"); + } + else if (fr->h_wire->wt == Global_10) { + printf("\tWire type - Full swing global wires with " + "10%% delay penalty\n"); + } + else if (fr->h_wire->wt == Global_20) { + printf("\tWire type - Full swing global wires with " + "20%% delay penalty\n"); + } + else if (fr->h_wire->wt == Global_30) { + printf("\tWire type - Full swing global wires with " + "30%% delay penalty\n"); + } + else if(fr->h_wire->wt == Low_swing) { + printf("\tWire type - Low swing wires\n"); + } + + printf("\tHorizontal link delay - %g (ns)\n", + fr->h_wire->delay*1e9); + printf("\tVertical link delay - %g (ns)\n", + fr->v_wire->delay*1e9); + printf("\tDelay/length - %g (ns/mm)\n", + fr->h_wire->delay*1e9/fr->bank_pda.area.w); + printf("\tHorizontal link energy -dynamic/access %g (nJ)\n" + "\t -leakage %g (nW)\n\n", + fr->h_wire->power.readOp.dynamic*1e9, + fr->h_wire->power.readOp.leakage*1e9); + printf("\tVertical link energy -dynamic/access %g (nJ)\n" + "\t -leakage %g (nW)\n\n", + fr->v_wire->power.readOp.dynamic*1e9, + fr->v_wire->power.readOp.leakage*1e9); + printf("\n\n"); + fr->v_wire->print_wire(); + printf("\n\nBank stats:\n"); +} + + + nuca_org_t * +Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval) +{ + double cost = 0; + double min_cost = BIGNUM; + nuca_org_t *res = NULL; + float d, a, dp, lp, c; + int v; + dp = g_ip->dynamic_power_wt_nuca; + lp = g_ip->leakage_power_wt_nuca; + a = g_ip->area_wt_nuca; + d = g_ip->delay_wt_nuca; + c = g_ip->cycle_time_wt_nuca; + + list<nuca_org_t *>::iterator niter; + + + for (niter = n->begin(); niter != n->end(); niter++) { + fprintf(stderr, "\n-----------------------------" + "---------------\n"); + + + printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t " + "bank_dpower = %g \tleak = %g \tcycle = %g\n", + (*niter)->bank_count, + (*niter)->nuca_pda.delay, + (*niter)->nuca_pda.power.readOp.dynamic, + (*niter)->h_wire->wt, + (*niter)->bank_pda.power.readOp.dynamic, + (*niter)->nuca_pda.power.readOp.leakage, + (*niter)->nuca_pda.cycle_time); + + + if (g_ip->ed == 1) { + cost = ((*niter)->nuca_pda.delay/minval->min_delay)* + ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn); + if (min_cost > cost) { + min_cost = cost; + res = ((*niter)); + } + } + else if (g_ip->ed == 2) { + cost = ((*niter)->nuca_pda.delay/minval->min_delay)* + ((*niter)->nuca_pda.delay/minval->min_delay)* + ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn); + if (min_cost > cost) { + min_cost = cost; + res = ((*niter)); + } + } + else { + /* + * check whether the current organization + * meets the input deviation constraints + */ + v = check_nuca_org((*niter), minval); + if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling + + if (v) { + cost = (d * ((*niter)->nuca_pda.delay/minval->min_delay) + + c * ((*niter)->nuca_pda.cycle_time/minval->min_cyc) + + dp * ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn) + + lp * ((*niter)->nuca_pda.power.readOp.leakage/minval->min_leakage) + + a * ((*niter)->nuca_pda.area.get_area()/minval->min_area)); + fprintf(stderr, "cost = %g\n", cost); + + if (min_cost > cost) { + min_cost = cost; + res = ((*niter)); + } + } + else { + niter = n->erase(niter); + if (niter !=n->begin()) + niter --; + } + } + } + return res; +} + + int +Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval) +{ + if (((n->nuca_pda.delay - minval->min_delay)*100/minval->min_delay) > g_ip->delay_dev_nuca) { + return 0; + } + if (((n->nuca_pda.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 > + g_ip->dynamic_power_dev_nuca) { + return 0; + } + if (((n->nuca_pda.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 > + g_ip->leakage_power_dev_nuca) { + return 0; + } + if (((n->nuca_pda.cycle_time - minval->min_cyc)/minval->min_cyc)*100 > + g_ip->cycle_time_dev_nuca) { + return 0; + } + if (((n->nuca_pda.area.get_area() - minval->min_area)/minval->min_area)*100 > + g_ip->area_dev_nuca) { + return 0; + } + return 1; +} + + void +Nuca::calculate_nuca_area (nuca_org_t *nuca) +{ + nuca->nuca_pda.area.h= + nuca->rows * ((nuca->h_wire->wire_width + + nuca->h_wire->wire_spacing) + * nuca->router->flit_size + + nuca->bank_pda.area.h); + + nuca->nuca_pda.area.w = + nuca->columns * ((nuca->v_wire->wire_width + + nuca->v_wire->wire_spacing) + * nuca->router->flit_size + + nuca->bank_pda.area.w); +} + |