/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: John Kalamatianos,
 *          Joe Gross
 */

#include "gpu-compute/lds_state.hh"

// NOTE(review): the names of the three system headers included at this
// point were lost from this copy of the file. The headers below cover the
// standard library facilities this file visibly uses (std::min/std::max,
// assert, std::numeric_limits, std::pair/std::make_pair, std::vector);
// confirm against the upstream file and re-add anything that is missing.
#include <algorithm>
#include <cassert>
#include <limits>
#include <utility>
#include <vector>

#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"

/**
 * the default constructor that works with SWIG
 *
 * Copies the size, address range, bank conflict penalty and bank count out
 * of the parameter object and rejects configurations where the bank count
 * is not a positive power of two or the size is not a positive even number.
 *
 * @param params the parameter object this LDS is built from
 */
LdsState::LdsState(const Params *params) :
    MemObject(params),
    tickEvent(this),
    cuPort(name() + ".port", this),
    maximumSize(params->size),
    range(params->range),
    bankConflictPenalty(params->bankConflictPenalty),
    banks(params->banks)
{
    fatal_if(params->banks <= 0,
             "Number of LDS banks should be positive number");
    // x & (x - 1) clears the lowest set bit, so the result is zero only
    // when exactly one bit is set
    fatal_if((params->banks & (params->banks - 1)) != 0,
             "Number of LDS banks should be a power of 2");
    fatal_if(params->size <= 0,
             "cannot allocate an LDS with a size less than 1");
    fatal_if(params->size % 2,
             "the LDS should be an even number");
}

/**
 * Needed by the SWIG compiler
 *
 * @return a newly allocated LdsState built from this parameter object
 */
LdsState *
LdsStateParams::create()
{
    return new LdsState(this);
}

/**
 * set the parent and name based on the parent
 *
 * @param x_parent the compute unit that owns this LDS; must not be nullptr
 *                 and must differ from the parent that is currently set
 */
void
LdsState::setParent(ComputeUnit *x_parent)
{
    // check that this gets assigned to the same thing each time
    fatal_if(!x_parent, "x_parent should not be nullptr");
    fatal_if(x_parent == parent,
             "should not be setting the parent twice");

    parent = x_parent;
    _name = x_parent->name() + ".LdsState";
}

/**
 * derive the gpu mem packet from the packet and then count the bank conflicts
 *
 * @param packet the packet whose sender state chain is searched for the
 *               LDS port sender state
 * @param bankAccesses incremented once for every bank access counted
 * @return the bank conflict count of the instruction carried by the packet
 */
unsigned
LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses)
{
    // walk to the oldest sender state on the packet; the null guard lets a
    // packet without any sender state reach the fatal_if below instead of
    // dereferencing a null pointer here
    Packet::SenderState *baseSenderState = packet->senderState;
    while (baseSenderState && baseSenderState->predecessor) {
        baseSenderState = baseSenderState->predecessor;
    }

    const ComputeUnit::LDSPort::SenderState *senderState =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState);

    fatal_if(!senderState,
             "did not get the right sort of sender state");

    GPUDynInstPtr gpuDynInst = senderState->getMemInst();

    return countBankConflicts(gpuDynInst, bankAccesses);
}

/**
 * Count the total number of bank conflicts for the local memory packet
 *
 * The work items are handled in groups of min(wavefront size, banks)
 * lanes. For each group the highest number of accesses landing on a single
 * bank is determined, and the return value is the sum of those per-group
 * maxima.
 *
 * @param gpuDynInst the memory instruction providing exec_mask and addr
 * @param numBankAccesses incremented once for every access that is counted
 *                        against a bank
 * @return the sum over all groups of the busiest bank's access count
 */
unsigned
LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst,
                             unsigned *numBankAccesses)
{
    // NOTE(review): the element type of the address vector was lost from
    // this copy of the file; Addr is assumed to be the type of
    // gpuDynInst->addr[] -- confirm against gpu_dyn_inst.hh
    // marker stored for lanes that must not be counted (inactive lanes and
    // masked duplicates)
    const Addr inactive = std::numeric_limits<Addr>::max();

    int bank_conflicts = 0;
    // per-bank access counters, reset for every group
    std::vector<int> bank;
    // the number of LDS banks being touched by the memory instruction
    int numBanks = std::min(parent->wfSize(), banks);
    // if the wavefront size is larger than the number of LDS banks, we
    // need to iterate over all work items to calculate the total
    // number of bank conflicts
    int groups = (parent->wfSize() > numBanks) ?
        (parent->wfSize() / numBanks) : 1;

    for (int i = 0; i < groups; i++) {
        // Address Array holding all the work item addresses of an
        // instruction
        std::vector<Addr> addr_array;
        addr_array.resize(numBanks, 0);
        bank.clear();
        bank.resize(banks, 0);
        int max_bank = 0;

        // populate the address array for all active work items
        for (int j = 0; j < numBanks; j++) {
            const int lane = (i * numBanks) + j;
            if (gpuDynInst->exec_mask[lane]) {
                addr_array[j] = gpuDynInst->addr[lane];
            } else {
                addr_array[j] = inactive;
            }
        }

        if (gpuDynInst->isLoad() || gpuDynInst->isStore()) {
            // mask identical addresses: only the first lane using a given
            // address stays in the array
            for (int j = 0; j < numBanks; ++j) {
                if (addr_array[j] == inactive) {
                    continue;
                }
                for (int j0 = 0; j0 < j; j0++) {
                    if (addr_array[j] == addr_array[j0]) {
                        addr_array[j] = inactive;
                        // once masked, no later comparison can match
                        break;
                    }
                }
            }
        }

        // calculate bank conflicts
        for (int j = 0; j < numBanks; ++j) {
            if (addr_array[j] != inactive) {
                int bankId = addr_array[j] % banks;
                bank[bankId]++;
                max_bank = std::max(max_bank, bank[bankId]);
                // Count the number of LDS banks accessed.
                // Since we have masked identical addresses all remaining
                // accesses will need to be serialized if they access
                // the same bank (bank conflict).
                (*numBankAccesses)++;
            }
        }
        bank_conflicts += max_bank;
    }

    panic_if(bank_conflicts > parent->wfSize(),
             "Max bank conflicts should match num of work items per instr");

    return bank_conflicts;
}

/**
 * receive the packet from the CU
 *
 * @param packet the timing request to hand to the owning LDS
 * @return whatever LdsState::processPacket returns for the packet
 */
bool
LdsState::CuSidePort::recvTimingReq(PacketPtr packet)
{
    return ownerLds->processPacket(packet);
}

/**
 * get the memory instruction stored in the packet's sender state
 *
 * Unlike countBankConflicts(PacketPtr, unsigned *), this looks only at the
 * sender state directly attached to the packet and does not follow the
 * predecessor chain.
 *
 * @param packet the packet carrying an LDS port sender state
 * @return the memory instruction recorded in that sender state
 */
GPUDynInstPtr
LdsState::getDynInstr(PacketPtr packet)
{
    ComputeUnit::LDSPort::SenderState *ss =
        dynamic_cast<ComputeUnit::LDSPort::SenderState *>(
            packet->senderState);

    // dynamic_cast yields nullptr for a missing or foreign sender state;
    // fail the same way countBankConflicts does rather than dereference it
    fatal_if(!ss, "did not get the right sort of sender state");

    return ss->getMemInst();
}

/**
 * process an incoming packet, add it to the return queue
 *
 * @param packet the request received from the compute unit
 * @return the result of pushing the packet onto the return queue
 */
bool
LdsState::processPacket(PacketPtr packet)
{
    unsigned bankAccesses = 0;

    // the number of conflicts this packet will have when accessing the LDS
    unsigned bankConflicts = countBankConflicts(packet, &bankAccesses);

    // count the total number of physical LDS bank accessed
    parent->ldsBankAccesses += bankAccesses;

    // count the LDS bank conflicts. A number set to 1 indicates one
    // access per bank maximum so there are no bank conflicts
    // NOTE(review): bankConflicts is unsigned and is 0 when no lane of the
    // instruction is active, in which case this subtraction wraps around --
    // confirm whether such packets can reach the LDS
    parent->ldsBankConflictDist.sample(bankConflicts-1);

    GPUDynInstPtr dynInst = getDynInstr(packet);

    // account for the LDS bank conflict overhead; anything that is neither
    // a load nor a store is charged the load bus length
    int busLength = (dynInst->isLoad()) ? parent->loadBusLength() :
        (dynInst->isStore()) ? parent->storeBusLength() :
        parent->loadBusLength();

    // delay for accessing the LDS
    Tick processingTime =
        parent->shader->ticks(bankConflicts * bankConflictPenalty) +
        parent->shader->ticks(busLength);

    // choose (delay + last packet in queue) or (now + delay) as the time to
    // return this
    Tick doneAt = earliestReturnTime() + processingTime;

    // then store it for processing
    return returnQueuePush(std::make_pair(doneAt, packet));
}

/**
 * add this to the queue of packets to be returned
 *
 * @param thePair the tick at which the packet is due and the packet itself
 * @return always true
 */
bool
LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair)
{
    // TODO add time limits (e.g. one packet per cycle) and queue size limits
    // and implement flow control
    returnQueue.push(thePair);

    // if there is no set wakeup time, look through the queue
    if (!tickEvent.scheduled()) {
        process();
    }

    return true;
}

/**
 * receive a packet in functional mode
 *
 * Functional accesses are not supported; this always terminates with fatal.
 */
void
LdsState::CuSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("not implemented");
}

/**
 * receive a retry for a response
 *
 * Clears the owner's retry flag and lets it look through its return queue
 * again.
 */
void
LdsState::CuSidePort::recvRespRetry()
{
    // TODO verify that this is the right way to do this
    assert(ownerLds->isRetryResp());
    ownerLds->setRetryResp(false);
    ownerLds->process();
}

/**
 * receive a retry
 *
 * Not supported; this always terminates with fatal.
 */
void
LdsState::CuSidePort::recvRetry()
{
    fatal("not implemented");
}

/**
 * look for packets to return at this time
 *
 * Sends back every queued packet that is due at or before the current clock
 * edge, then makes sure the tick event is scheduled no later than the time
 * of the packet now at the front of the queue.
 *
 * @return always true
 */
bool
LdsState::process()
{
    Tick now = clockEdge();

    // send back completed packets
    while (!returnQueue.empty() && returnQueue.front().first <= now) {
        PacketPtr packet = returnQueue.front().second;

        // shared with processPacket, including the check that the packet
        // carries an LDS port sender state
        GPUDynInstPtr gpuDynInst = getDynInstr(packet);

        gpuDynInst->initiateAcc(gpuDynInst);

        packet->makeTimingResponse();

        returnQueue.pop();

        bool success = cuPort.sendTimingResp(packet);

        if (!success) {
            retryResp = true;
            panic("have not handled timing responses being NACK'd when sent "
                  "back");
        }
    }

    // determine the next wakeup time
    if (!returnQueue.empty()) {
        Tick next = returnQueue.front().first;

        if (tickEvent.scheduled()) {
            // only move the wakeup when the front packet is due earlier
            // than the wakeup already set
            if (next < tickEvent.when()) {
                tickEvent.deschedule();
                tickEvent.schedule(next);
            }
        } else {
            tickEvent.schedule(next);
        }
    }

    return true;
}

/**
 * wake up at this time and perform specified actions
 */
void
LdsState::TickEvent::process()
{
    ldsState->process();
}