/*****************************************************************************
 *                                McPAT
 *                      SOFTWARE LICENSE AGREEMENT
 *            Copyright 2012 Hewlett-Packard Development Company, L.P.
 *            Copyright (c) 2010-2013 Advanced Micro Devices, Inc.
 *                          All Rights Reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ***************************************************************************/

#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <string>

#include "basic_circuit.h"
#include "basic_components.h"
#include "common.h"
#include "const.h"
#include "io.h"
#include "logic.h"
#include "memoryctrl.h"
#include "parameter.h"

/* Overview of the MC models:
 * McPAT memory controllers are modeled according to a large number of
 * industrial data points. The basic memory controller architecture is based
 * on the Synopsys designs (DesignWare DDR2/DDR3-Lite memory controllers and
 * DDR2/DDR3-Lite protocol controllers) as listed in the Cadence
 * ChipEstimator tool.
 *
 * An MC has three parts, as shown in this model. McPAT models both
 * high-performance MCs, based on Niagara processor designs and curve
 * fitting, and low-power MCs, based on data points in the Cadence
 * ChipEstimator tool.
 *
 * The frontend is modeled analytically; the backend is modeled empirically
 * according to the DDR2/DDR3-Lite protocol controllers in the Cadence
 * ChipEstimator tool. The PHY is modeled based on
 * "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for Next-Generation Memory
 * Interfaces," ISSCC 2006, and "A 14mW 6.25Gb/s Transceiver in 90nm CMOS
 * for Serial Chip-to-Chip Communication," ISSCC 2007.
 *
 * In the Cadence ChipEstimator tool there are two types of memory
 * controllers: full memory controllers that include the frontend, such as
 * the DesignWare DDR2/DDR3-Lite memory controllers, and backend-only memory
 * controllers, such as the DDR2/DDR3-Lite protocol controllers (except for
 * the DesignWare DDR2/DDR3-Lite memory controllers, all memory controller
 * IP in the Cadence ChipEstimator tool is backend-only, e.g. DDRC 1600A and
 * DDRC 800A). Thus, to some extent, the area and power difference between
 * the DesignWare DDR2/DDR3-Lite memory controllers and the DDR2/DDR3-Lite
 * protocol controllers can serve as an estimate of the frontend power and
 * area, which is very close to the analytically modeled frontend results
 * for Niagara2 @ 65nm.
 */

MCBackend::MCBackend(XMLNode* _xml_data, InputParameter* interface_ip_,
                     const MCParameters & mcp_, const MCStatistics & mcs_)
    : McPATComponent(_xml_data), l_ip(*interface_ip_), mcp(mcp_), mcs(mcs_) {
    name = "Transaction Engine";
    local_result = init_interface(&l_ip, name);

    // Set up stats for the power calculations
    tdp_stats.reset();
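    // At TDP, each channel is assumed to be fully utilized at the MC clock
    // rate, with accesses split evenly between reads and writes.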
    tdp_stats.readAc.access = 0.5 * mcp.num_channels * mcp.clockRate;
    tdp_stats.writeAc.access = 0.5 * mcp.num_channels * mcp.clockRate;
    rtp_stats.reset();
    rtp_stats.readAc.access = mcs.reads;
    rtp_stats.writeAc.access = mcs.writes;
}

void MCBackend::computeArea() {
    // The area is in mm^2
    if (mcp.mc_type == MC) {
        if (mcp.type == 0) {
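            // High-performance MC (type 0): area (mm^2) comes from a
            // logarithmic curve fit to the peak transfer rate, normalized
            // to a 128-bit data bus at 90nm and scaled with feature size
            // and the number of channels.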
            output_data.area = (2.7927 * log(mcp.peak_transfer_rate * 2) -
                                19.862) / 2.0 * mcp.dataBusWidth / 128.0 *
                (l_ip.F_sz_um / 0.09) * mcp.num_channels;
        } else {
            output_data.area = 0.15 * mcp.dataBusWidth / 72.0 *
                (l_ip.F_sz_um / 0.065) * (l_ip.F_sz_um / 0.065) *
                mcp.num_channels;
        }
    } else {
        //skip old model
        cout << "Unknown memory controllers" << endl;
        exit(0);
        //area based on Cadence ChipEstimator for 8bit bus
        output_data.area = 0.243 * mcp.dataBusWidth / 8;
    }
}


void MCBackend::computeEnergy() {
    double C_MCB, mc_power;
    double backend_dyn;
    double backend_gates;
    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
    double NMOS_sizing = g_tp.min_w_nmos_;
    double PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
    double area_um2 = output_data.area * 1e6;

    if (mcp.mc_type == MC) {
        if (mcp.type == 0) {
            //Assuming approximately the same scaling factor as seen in processors.
            //C_MCB = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power numbers. The base power (W) is divided by device frequency and Vdd^2 and scaled to the target process.
            //mc_power = 0.0291*2;//29.1mW @ 200MHz @ 130nm, from "Power Analysis of System-Level On-Chip Communication Architectures" by Lahiri et al.
            mc_power = 4.32*0.1;//4.32W @ 1GHz @ 65nm from Cadence ChipEstimator; 10% is attributed to the backend
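            // C_MCB is the effective switched capacitance per bit per
            // access: the backend power budget divided by frequency (1GHz),
            // bus width (72 bits), and Vdd^2 (1.1V), then scaled to the
            // target feature size.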
            C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
            //per access energy in memory controller
            power.readOp.dynamic = C_MCB * g_tp.peri_global.Vdd *
                g_tp.peri_global.Vdd *
                (mcp.dataBusWidth/*+mcp.addressBusWidth*/);
            power.readOp.leakage = area_um2 / 2 *
                (g_tp.scaling_factor.core_tx_density) *
                cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 1, inv) *
                g_tp.peri_global.Vdd;//unit W
            power.readOp.gate_leakage = area_um2 / 2 *
                (g_tp.scaling_factor.core_tx_density) *
                cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 1, inv) *
                g_tp.peri_global.Vdd;//unit W
        } else {
            //Average of the DDR2/3 protocol controllers and DDRC 1600A/800A
            //in the Cadence ChipEstimator tool
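            // Base design point (an interpretation of the constants below):
            // roughly 0.9nJ per access for an 800MHz, 72-bit, DDR3-1600
            // (12800 MB/s) backend at 65nm and 1.1V, scaled linearly with
            // clock rate, peak transfer rate, bus width, and feature size,
            // and quadratically with Vdd.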
            backend_dyn = 0.9e-9 / 800e6 * mcp.clockRate / 12800 *
                mcp.peak_transfer_rate* mcp.dataBusWidth / 72.0 *
                g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd / 1.1 *
                (l_ip.F_sz_nm/65.0);
            //Scaling to technology and DIMM features. The base IP supports
            //DDR3-1600 (PC3-12800).
            //The gate count below is based on Cadence ChipEstimator data.
            backend_gates = 50000 * mcp.dataBusWidth / 64.0;

            power.readOp.dynamic = backend_dyn;
            power.readOp.leakage = (backend_gates) *
                cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
                g_tp.peri_global.Vdd;//unit W
            power.readOp.gate_leakage = (backend_gates) *
                cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
                g_tp.peri_global.Vdd;//unit W
        }
    } else {
        //skip old model
        cout<<"Unknown memory controllers"<<endl;exit(0);
        //mc_power = 4.32*0.1;//4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
        C_MCB = mc_power/1e9/72/1.1/1.1*l_ip.F_sz_um/0.065;
        power.readOp.leakage = area_um2 / 2 *
            (g_tp.scaling_factor.core_tx_density) *
            cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 1, inv) *
            g_tp.peri_global.Vdd;//unit W
        power.readOp.gate_leakage = area_um2 / 2 *
            (g_tp.scaling_factor.core_tx_density) *
            cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 1, inv) *
            g_tp.peri_global.Vdd;//unit W
        power.readOp.dynamic *= 1.2;
        power.readOp.leakage *= 1.2;
        power.readOp.gate_leakage *= 1.2;
        //The flash controller has about 20% more backend power since the
        //BCH ECC in flash is complex and power-hungry
    }
  double long_channel_device_reduction =
      longer_channel_device_reduction(Uncore_device);
  power.readOp.longer_channel_leakage = power.readOp.leakage *
      long_channel_device_reduction;

  // Output leakage power calculations
  output_data.subthreshold_leakage_power =
      longer_channel_device ? power.readOp.longer_channel_leakage :
      power.readOp.leakage;
  output_data.gate_leakage_power = power.readOp.gate_leakage;

  // Peak dynamic power calculation
  output_data.peak_dynamic_power = power.readOp.dynamic *
      (tdp_stats.readAc.access + tdp_stats.writeAc.access);

  // Runtime dynamic energy calculation
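  // Each access transfers one LLC block, which takes
  // llcBlockSize * BITS_PER_BYTE / dataBusWidth bus-width transfers, each
  // charged at the per-access dynamic energy.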
  output_data.runtime_dynamic_energy =
      power.readOp.dynamic *
      (rtp_stats.readAc.access + rtp_stats.writeAc.access) *
      mcp.llcBlockSize * BITS_PER_BYTE / mcp.dataBusWidth +
      // Original McPAT code: Assume 10% of peak power is consumed by
      // routine jobs such as memory refresh and scrubbing
      power.readOp.dynamic * 0.1 * execution_time;
}

MCPHY::MCPHY(XMLNode* _xml_data, InputParameter* interface_ip_,
             const MCParameters & mcp_, const MCStatistics & mcs_)
    : McPATComponent(_xml_data), l_ip(*interface_ip_), mcp(mcp_), mcs(mcs_) {
    name = "Physical Interface (PHY)";
    local_result = init_interface(&l_ip, name);

    // Set up stats for the power calculations
    // TODO: Figure out why TDP stats aren't used
    tdp_stats.reset();
    tdp_stats.readAc.access = 0.5 * mcp.num_channels;
    tdp_stats.writeAc.access = 0.5 * mcp.num_channels;
    rtp_stats.reset();
    rtp_stats.readAc.access = mcs.reads;
    rtp_stats.writeAc.access = mcs.writes;
}

void MCPHY::computeArea() {
    if (mcp.mc_type == MC) {
        if (mcp.type == 0) {
            //Based on die photos from Niagara 1 and 2.
            //TODO: merge this into the undifferentiated core. The PHY only
            //achieves the square root of ideal scaling.
            output_data.area = (6.4323 * log(mcp.peak_transfer_rate * 2) -
                                48.134) * mcp.dataBusWidth / 128.0 *
                (l_ip.F_sz_um / 0.09) * mcp.num_channels / 2;//TODO:/2
        } else {
            //The DesignWare/Synopsys 16-bit DDR3 PHY is 1.3mm^2 (with I/Os)
            //at 40nm for up to DDR3-2133 (PC3-17066)
            double non_IO_percentage = 0.2;
            output_data.area = 1.3 * non_IO_percentage / 2133.0e6 *
                mcp.clockRate / 17066 * mcp.peak_transfer_rate *
                mcp.dataBusWidth / 16.0 * (l_ip.F_sz_um / 0.040)*
                (l_ip.F_sz_um / 0.040) * mcp.num_channels;//mm^2
        }
    } else {
        //area based on Cadence ChipEstimator for 8bit bus
        output_data.area = 0.4e6 / 2 * mcp.dataBusWidth / 8 / 1e6;
    }
}

void MCPHY::computeEnergy() {
    //The PHY uses the internal data bus width, but the actual off-chip data width is 64 bits + ECC
    double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
    /*
     * According to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for Next-Generation Memory Interfaces," ISSCC 2006;
     * from Cadence ChipEstimator, normal I/O is around 0.4~0.8 mW/Gb/s
     */
    double power_per_gb_per_s, phy_dyn,phy_gates;
    double NMOS_sizing = g_tp.min_w_nmos_;
    double PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
    double area_um2 = output_data.area * 1e6;

    if (mcp.mc_type == MC) {
        if (mcp.type == 0) {
            power_per_gb_per_s = mcp.LVDS ? 0.01 : 0.04;
            //This is from curve fitting based on Niagara 1 and 2's PHY die photo.
            //This is power, not energy: 10mW/Gb/s @ 90nm for each channel, scaled down with technology
            //power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change from Bytes to bits.
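            // The per-Gb/s power is scaled by sqrt(feature size / 90nm),
            // since the PHY only achieves square-root scaling, and by
            // (Vdd / 1.2V)^2 relative to the 90nm reference design.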
            power.readOp.dynamic = power_per_gb_per_s *
                sqrt(l_ip.F_sz_um / 0.09) * g_tp.peri_global.Vdd / 1.2 *
                g_tp.peri_global.Vdd / 1.2;
            power.readOp.leakage = area_um2 / 2 *
                (g_tp.scaling_factor.core_tx_density) *
                cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 1, inv) *
                g_tp.peri_global.Vdd;//unit W
            power.readOp.gate_leakage = area_um2 / 2 *
                (g_tp.scaling_factor.core_tx_density) *
                cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 1, inv) *
                g_tp.peri_global.Vdd;//unit W
        } else {
            phy_gates = 200000 * mcp.dataBusWidth / 64.0;
            power_per_gb_per_s = 0.01;
            //This is power, not energy: 10mW/Gb/s @ 90nm for each channel, scaled down with technology
            power.readOp.dynamic = power_per_gb_per_s * (l_ip.F_sz_um / 0.09) *
                g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
            power.readOp.leakage = (mcp.withPHY ? phy_gates : 0) *
                cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
                g_tp.peri_global.Vdd;//unit W
            power.readOp.gate_leakage = (mcp.withPHY ? phy_gates : 0) *
                cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
                g_tp.peri_global.Vdd;//unit W
        }
    }

//  double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power numbers are based on 72 bit DIMM interface
//  power_t.readOp.dynamic *= phy_factor;
//  power_t.readOp.leakage *= phy_factor;
//  power_t.readOp.gate_leakage *= phy_factor;

    double long_channel_device_reduction =
        longer_channel_device_reduction(Uncore_device);
    power.readOp.longer_channel_leakage =
        power.readOp.leakage * long_channel_device_reduction;

    // Leakage power calculations
    output_data.subthreshold_leakage_power =
        longer_channel_device ? power.readOp.longer_channel_leakage :
        power.readOp.leakage;
    output_data.gate_leakage_power = power.readOp.gate_leakage;

    // Peak dynamic power calculation
    double data_transfer_unit = (mcp.mc_type == MC)? 72:16;/*DIMM data width*/
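    // power.readOp.dynamic is in W per Gb/s; peak_transfer_rate (MB/s) *
    // BITS_PER_BYTE / 1e3 converts the peak bandwidth to Gb/s, which is
    // then scaled by the ratio of the internal bus width to the DIMM
    // interface width and by the number of channels.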
    output_data.peak_dynamic_power = power.readOp.dynamic *
        (mcp.peak_transfer_rate * BITS_PER_BYTE / 1e3) * mcp.dataBusWidth /
        data_transfer_unit * mcp.num_channels / mcp.clockRate;

    // Runtime dynamic energy calculation
    output_data.runtime_dynamic_energy =
        power.readOp.dynamic *
        (rtp_stats.readAc.access + rtp_stats.writeAc.access) *
        mcp.llcBlockSize * BITS_PER_BYTE / 1e9 +
        // Original McPAT code: Assume 10% of peak power is consumed by
        // routine jobs such as memory refresh and scrubbing
        power.readOp.dynamic * 0.1 * execution_time;
}

MCFrontEnd::MCFrontEnd(XMLNode* _xml_data, InputParameter* interface_ip_,
                       const MCParameters & mcp_, const MCStatistics & mcs_)
    : McPATComponent(_xml_data), frontendBuffer(NULL), readBuffer(NULL),
      writeBuffer(NULL), MC_arb(NULL), interface_ip(*interface_ip_),
    mcp(mcp_), mcs(mcs_) {
    int tag, data;
    bool is_default = true;//indication for default setup

    /* MC frontend engine: channels share the same engine but are logically
     * partitioned. For all hardware inside the MC, different channels do not
     * share resources.
     * TODO: add a decoding/mux stage to steer memory requests to different
     * channels.
     */

    name = "Front End";

    // Memory Request Reorder Buffer
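    // The reorder buffer is modeled as a CAM-like cache searchable by
    // address: each entry holds a request's address and opcode, the buffer
    // is sized to req_window_size_per_channel entries, and each channel
    // gets one read, one write, and one search port.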
    tag = mcp.addressbus_width + EXTRA_TAG_BITS + mcp.opcodeW;
    data = int(ceil((physical_address_width + mcp.opcodeW) / BITS_PER_BYTE));

    interface_ip.cache_sz = data * mcp.req_window_size_per_channel;
    interface_ip.line_sz = data;
    interface_ip.assoc = mcp.reorder_buffer_assoc;
    interface_ip.nbanks = mcp.reorder_buffer_nbanks;
    interface_ip.out_w = interface_ip.line_sz * BITS_PER_BYTE;
    interface_ip.specific_tag = tag > 0;
    interface_ip.tag_w = tag;
    interface_ip.access_mode = Normal;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 0;
    interface_ip.num_rd_ports = mcp.num_channels;
    interface_ip.num_wr_ports = interface_ip.num_rd_ports;
    interface_ip.num_se_rd_ports = 0;
    interface_ip.num_search_ports = mcp.num_channels;
    interface_ip.is_cache = true;
    interface_ip.pure_cam = false;
    interface_ip.pure_ram = false;
    interface_ip.throughput = 1.0 / mcp.clockRate;
    interface_ip.latency = 1.0 / mcp.clockRate;
    frontendBuffer = new CacheArray(xml_data, &interface_ip, "Reorder Buffer",
                                    Uncore_device, mcp.clockRate);
    children.push_back(frontendBuffer);

    frontendBuffer->tdp_stats.reset();
    frontendBuffer->tdp_stats.readAc.access =
        frontendBuffer->l_ip.num_search_ports +
        frontendBuffer->l_ip.num_wr_ports;
    frontendBuffer->tdp_stats.writeAc.access =
        frontendBuffer->l_ip.num_search_ports;
    frontendBuffer->tdp_stats.searchAc.access =
        frontendBuffer->l_ip.num_wr_ports;
    frontendBuffer->rtp_stats.reset();
    // TODO: These stats assume that access power is calculated per buffer
    // bit, which requires the stats to take into account the number of
    // bits for each buffer slot. This should be revised...
    //For each channel, each memory word needs its address checked against
    //the buffered requests to achieve the best scheduling results, and this
    //needs to be done for all physical DIMMs behind each logical memory
    //DIMM, hence the * mcp.dataBusWidth / 72 factor.
    frontendBuffer->rtp_stats.readAc.access = mcs.reads * mcp.llcBlockSize *
        BITS_PER_BYTE / mcp.dataBusWidth * mcp.dataBusWidth / 72;
    frontendBuffer->rtp_stats.writeAc.access = mcs.writes * mcp.llcBlockSize *
        BITS_PER_BYTE / mcp.dataBusWidth * mcp.dataBusWidth / 72;
    frontendBuffer->rtp_stats.searchAc.access =
        frontendBuffer->rtp_stats.readAc.access +
        frontendBuffer->rtp_stats.writeAc.access;

    // Read Buffers
    //Support critical-word-first operation
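    // The read buffer is modeled as a pure RAM sized to
    // IO_buffer_size_per_channel entries of one bus width each, with one
    // read and one write port per channel.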
    data = (int)ceil(mcp.dataBusWidth / BITS_PER_BYTE);

    interface_ip.cache_sz = data * mcp.IO_buffer_size_per_channel;
    interface_ip.line_sz = data;
    interface_ip.assoc = mcp.read_buffer_assoc;
    interface_ip.nbanks = mcp.read_buffer_nbanks;
    interface_ip.out_w = interface_ip.line_sz * BITS_PER_BYTE;
    interface_ip.specific_tag = mcp.read_buffer_tag_width > 0;
    interface_ip.tag_w = mcp.read_buffer_tag_width;
    interface_ip.access_mode = Sequential;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 0;
    interface_ip.num_rd_ports = mcp.num_channels;
    interface_ip.num_wr_ports = interface_ip.num_rd_ports;
    interface_ip.num_se_rd_ports = 0;
    interface_ip.num_search_ports = 0;
    interface_ip.is_cache = false;
    interface_ip.pure_cam = false;
    interface_ip.pure_ram = true;
    interface_ip.throughput = 1.0 / mcp.clockRate;
    interface_ip.latency = 1.0 / mcp.clockRate;
    readBuffer = new CacheArray(xml_data, &interface_ip, "Read Buffer",
                                Uncore_device, mcp.clockRate);
    children.push_back(readBuffer);

    readBuffer->tdp_stats.reset();
    readBuffer->tdp_stats.readAc.access = readBuffer->l_ip.num_rd_ports *
        mcs.duty_cycle;
    readBuffer->tdp_stats.writeAc.access = readBuffer->l_ip.num_wr_ports *
        mcs.duty_cycle;
    readBuffer->rtp_stats.reset();
    readBuffer->rtp_stats.readAc.access = mcs.reads * mcp.llcBlockSize *
        BITS_PER_BYTE / mcp.dataBusWidth;
    readBuffer->rtp_stats.writeAc.access = mcs.reads * mcp.llcBlockSize *
        BITS_PER_BYTE / mcp.dataBusWidth;

    // Write Buffer
    //Support critical-word-first operation
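    // The write buffer mirrors the read buffer organization: a pure RAM
    // with IO_buffer_size_per_channel entries of one bus width each.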
    data = (int)ceil(mcp.dataBusWidth / BITS_PER_BYTE);

    interface_ip.cache_sz = data * mcp.IO_buffer_size_per_channel;
    interface_ip.line_sz = data;
    interface_ip.assoc = mcp.write_buffer_assoc;
    interface_ip.nbanks = mcp.write_buffer_nbanks;
    interface_ip.out_w = interface_ip.line_sz * BITS_PER_BYTE;
    interface_ip.specific_tag = mcp.write_buffer_tag_width > 0;
    interface_ip.tag_w = mcp.write_buffer_tag_width;
    interface_ip.access_mode = Normal;
    interface_ip.obj_func_dyn_energy = 0;
    interface_ip.obj_func_dyn_power = 0;
    interface_ip.obj_func_leak_power = 0;
    interface_ip.obj_func_cycle_t = 1;
    interface_ip.num_rw_ports = 0;
    interface_ip.num_rd_ports = mcp.num_channels;
    interface_ip.num_wr_ports = interface_ip.num_rd_ports;
    interface_ip.num_se_rd_ports = 0;
    interface_ip.num_search_ports = 0;
    interface_ip.is_cache = false;
    interface_ip.pure_cam = false;
    interface_ip.pure_ram = true;
    interface_ip.throughput = 1.0 / mcp.clockRate;
    interface_ip.latency = 1.0 / mcp.clockRate;
    writeBuffer = new CacheArray(xml_data, &interface_ip, "Write Buffer",
                                 Uncore_device, mcp.clockRate);
    children.push_back(writeBuffer);

    writeBuffer->tdp_stats.reset();
    writeBuffer->tdp_stats.readAc.access = writeBuffer->l_ip.num_rd_ports *
        mcs.duty_cycle;
    writeBuffer->tdp_stats.writeAc.access = writeBuffer->l_ip.num_wr_ports *
        mcs.duty_cycle;
    writeBuffer->rtp_stats.reset();
    writeBuffer->rtp_stats.readAc.access = mcs.reads * mcp.llcBlockSize *
        BITS_PER_BYTE / mcp.dataBusWidth;
    writeBuffer->rtp_stats.writeAc.access = mcs.writes * mcp.llcBlockSize *
        BITS_PER_BYTE / mcp.dataBusWidth;

    // TODO: Set up selection logic as a leaf node in tree
    //selection and arbitration logic
    MC_arb =
        new selection_logic(xml_data, is_default,
                            mcp.req_window_size_per_channel, 1, &interface_ip,
                            "Arbitration Logic", (mcs.reads + mcs.writes),
                            mcp.clockRate, Uncore_device);
    // MC_arb is not included in the roll-up due to the uninitialized area
    //children.push_back(MC_arb);
}

MemoryController::MemoryController(XMLNode* _xml_data,
                                   InputParameter* interface_ip_)
    : McPATComponent(_xml_data), interface_ip(*interface_ip_) {
    name = "Memory Controller";
    set_mc_param();
    // TODO: Pass params and stats as pointers
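    // The memory controller is assembled from child components: the front
    // end (buffers and arbitration), the transaction engine (backend), and,
    // for configurations that include one, the PHY.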
    children.push_back(new MCFrontEnd(xml_data, &interface_ip, mcp, mcs));
    children.push_back(new MCBackend(xml_data, &interface_ip, mcp, mcs));

    if (mcp.type==0 || (mcp.type == 1 && mcp.withPHY)) {
        children.push_back(new MCPHY(xml_data, &interface_ip, mcp, mcs));
    }
}

void MemoryController::initialize_params() {
    memset(&mcp, 0, sizeof(MCParameters));
}

void MemoryController::set_mc_param() {
    initialize_params();

    int num_children = xml_data->nChildNode("param");
    int tech_type;
    int mat_type;
    int i;
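    // Walk the <param> child nodes of this component's XML node; each
    // ASSIGN_*_IF macro assigns the parsed value to the given field when
    // node_name matches its key.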
    for (i = 0; i < num_children; i++) {
        XMLNode* paramNode = xml_data->getChildNodePtr("param", &i);
        XMLCSTR node_name = paramNode->getAttribute("name");
        XMLCSTR value = paramNode->getAttribute("value");

        if (!node_name)
            warnMissingParamName(paramNode->getAttribute("id"));

        ASSIGN_FP_IF("mc_clock", mcp.clockRate);
        ASSIGN_INT_IF("tech_type", tech_type);
        ASSIGN_ENUM_IF("mc_type", mcp.mc_type, MemoryCtrl_type);
        ASSIGN_FP_IF("num_mcs", mcp.num_mcs);
        ASSIGN_INT_IF("llc_line_length", mcp.llc_line_length);
        ASSIGN_INT_IF("databus_width", mcp.databus_width);
        ASSIGN_INT_IF("memory_channels_per_mc", mcp.num_channels);
        ASSIGN_INT_IF("req_window_size_per_channel",
                      mcp.req_window_size_per_channel);
        ASSIGN_INT_IF("IO_buffer_size_per_channel",
                      mcp.IO_buffer_size_per_channel);
        ASSIGN_INT_IF("addressbus_width", mcp.addressbus_width);
        ASSIGN_INT_IF("opcode_width", mcp.opcodeW);
        ASSIGN_INT_IF("type", mcp.type);
        ASSIGN_ENUM_IF("LVDS", mcp.LVDS, bool);
        ASSIGN_ENUM_IF("withPHY", mcp.withPHY, bool);
        ASSIGN_INT_IF("peak_transfer_rate", mcp.peak_transfer_rate);
        ASSIGN_INT_IF("number_ranks", mcp.number_ranks);
        ASSIGN_INT_IF("reorder_buffer_assoc", mcp.reorder_buffer_assoc);
        ASSIGN_INT_IF("reorder_buffer_nbanks", mcp.reorder_buffer_nbanks);
        ASSIGN_INT_IF("read_buffer_assoc", mcp.read_buffer_assoc);
        ASSIGN_INT_IF("read_buffer_nbanks", mcp.read_buffer_nbanks);
        ASSIGN_INT_IF("read_buffer_tag_width", mcp.read_buffer_tag_width);
        ASSIGN_INT_IF("write_buffer_assoc", mcp.write_buffer_assoc);
        ASSIGN_INT_IF("write_buffer_nbanks", mcp.write_buffer_nbanks);
        ASSIGN_INT_IF("write_buffer_tag_width", mcp.write_buffer_tag_width);
        ASSIGN_INT_IF("wire_mat_type", mat_type);
        ASSIGN_ENUM_IF("wire_type", interface_ip.wt, Wire_type);

        else {
            warnUnrecognizedParam(node_name);
        }
    }

    if (mcp.mc_type != MC) {
        cout << "Unknown memory controller type: Only DRAM controller is "
             << "supported for now" << endl;
        exit(0);
    }

    // Change from MHz to Hz
    mcp.clockRate *= 1e6;

    interface_ip.data_arr_ram_cell_tech_type    = tech_type;
    interface_ip.data_arr_peri_global_tech_type = tech_type;
    interface_ip.tag_arr_ram_cell_tech_type     = tech_type;
    interface_ip.tag_arr_peri_global_tech_type  = tech_type;
    interface_ip.wire_is_mat_type = mat_type;
    interface_ip.wire_os_mat_type = mat_type;

    num_children = xml_data->nChildNode("stat");
    for (i = 0; i < num_children; i++) {
        XMLNode* statNode = xml_data->getChildNodePtr("stat", &i);
        XMLCSTR node_name = statNode->getAttribute("name");
        XMLCSTR value = statNode->getAttribute("value");

        if (!node_name)
            warnMissingStatName(statNode->getAttribute("id"));

        ASSIGN_FP_IF("duty_cycle", mcs.duty_cycle);
        ASSIGN_FP_IF("perc_load", mcs.perc_load);
        ASSIGN_FP_IF("memory_reads", mcs.reads);
        ASSIGN_INT_IF("memory_writes", mcs.writes);

        else {
            warnUnrecognizedStat(node_name);
        }
    }

    // Add ECC overhead
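    // One ECC byte (or bit) is added per eight data bytes (or bits): e.g.,
    // a 64-byte LLC line becomes 72 bytes and a 64-bit data bus becomes 72
    // bits, matching a standard ECC DIMM interface.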
    mcp.llcBlockSize = int(ceil(mcp.llc_line_length / BITS_PER_BYTE)) +
        mcp.llc_line_length;
    mcp.dataBusWidth = int(ceil(mcp.databus_width / BITS_PER_BYTE)) +
        mcp.databus_width;
}

MCFrontEnd::~MCFrontEnd() {

    if (MC_arb) {
        delete MC_arb;
        MC_arb = NULL;
    }
    if (frontendBuffer) {
        delete frontendBuffer;
        frontendBuffer = NULL;
    }
    if (readBuffer) {
        delete readBuffer;
        readBuffer = NULL;
    }
    if (writeBuffer) {
        delete writeBuffer;
        writeBuffer = NULL;
    }
}

MemoryController::~MemoryController() {
    // TODO: use default constructor to delete children
}