diff options
Diffstat (limited to 'src/gpu-compute/lds_state.cc')
-rw-r--r-- | src/gpu-compute/lds_state.cc | 341 |
1 files changed, 341 insertions, 0 deletions
diff --git a/src/gpu-compute/lds_state.cc b/src/gpu-compute/lds_state.cc new file mode 100644 index 000000000..91ee8009a --- /dev/null +++ b/src/gpu-compute/lds_state.cc @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2014-2015 Advanced Micro Devices, Inc. + * All rights reserved. + * + * For use for simulation and test purposes only + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Author: John Kalamatianos, Joe Gross + */ + +#include "gpu-compute/lds_state.hh" + +#include <array> +#include <cstdio> +#include <cstdlib> + +#include "gpu-compute/compute_unit.hh" +#include "gpu-compute/gpu_dyn_inst.hh" +#include "gpu-compute/shader.hh" + +/** + * the default constructor that works with SWIG + */ +LdsState::LdsState(const Params *params) : + MemObject(params), + tickEvent(this), + cuPort(name() + ".port", this), + maximumSize(params->size), + range(params->range), + bankConflictPenalty(params->bankConflictPenalty), + banks(params->banks) +{ + fatal_if(params->banks <= 0, + "Number of LDS banks should be positive number"); + fatal_if((params->banks & (params->banks - 1)) != 0, + "Number of LDS banks should be a power of 2"); + fatal_if(params->size <= 0, + "cannot allocate an LDS with a size less than 1"); + fatal_if(params->size % 2, + "the LDS should be an even number"); +} + +/** + * Needed by the SWIG compiler + */ +LdsState * +LdsStateParams::create() +{ + return new LdsState(this); +} + +/** + * set the parent and name based on the parent + */ +void +LdsState::setParent(ComputeUnit *x_parent) +{ + // check that this gets assigned to the same thing each time + fatal_if(!x_parent, "x_parent should not be nullptr"); + fatal_if(x_parent == parent, + "should not be setting the parent twice"); + + parent = x_parent; + _name = x_parent->name() + ".LdsState"; +} + +/** + * derive the gpu mem packet from the packet and then count the bank conflicts + */ +unsigned +LdsState::countBankConflicts(PacketPtr packet, unsigned *bankAccesses) +{ + Packet::SenderState *baseSenderState = packet->senderState; + while (baseSenderState->predecessor) { + baseSenderState = baseSenderState->predecessor; + } + const ComputeUnit::LDSPort::SenderState *senderState = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>(baseSenderState); + + fatal_if(!senderState, + "did not get the right sort of sender state"); + + GPUDynInstPtr gpuDynInst = senderState->getMemInst(); + + return countBankConflicts(gpuDynInst, bankAccesses); +} + +// Count the total number of bank conflicts for the local memory packet +unsigned +LdsState::countBankConflicts(GPUDynInstPtr gpuDynInst, + unsigned *numBankAccesses) +{ + int bank_conflicts = 0; + std::vector<int> bank; + // the number of LDS banks being touched by the memory instruction + int numBanks = std::min(parent->wfSize(), banks); + // if the wavefront size is larger than the number of LDS banks, we + // need to iterate over all work items to calculate the total + // number of bank conflicts + int groups = (parent->wfSize() > numBanks) ? + (parent->wfSize() / numBanks) : 1; + for (int i = 0; i < groups; i++) { + // Address Array holding all the work item addresses of an instruction + std::vector<Addr> addr_array; + addr_array.resize(numBanks, 0); + bank.clear(); + bank.resize(banks, 0); + int max_bank = 0; + + // populate the address array for all active work items + for (int j = 0; j < numBanks; j++) { + if (gpuDynInst->exec_mask[(i*numBanks)+j]) { + addr_array[j] = gpuDynInst->addr[(i*numBanks)+j]; + } else { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + + if (gpuDynInst->m_op == Enums::MO_LD || + gpuDynInst->m_op == Enums::MO_ST) { + // mask identical addresses + for (int j = 0; j < numBanks; ++j) { + for (int j0 = 0; j0 < j; j0++) { + if (addr_array[j] != std::numeric_limits<Addr>::max() + && addr_array[j] == addr_array[j0]) { + addr_array[j] = std::numeric_limits<Addr>::max(); + } + } + } + } + // calculate bank conflicts + for (int j = 0; j < numBanks; ++j) { + if (addr_array[j] != std::numeric_limits<Addr>::max()) { + int bankId = addr_array[j] % banks; + bank[bankId]++; + max_bank = std::max(max_bank, bank[bankId]); + // Count the number of LDS banks accessed. + // Since we have masked identical addresses all remaining + // accesses will need to be serialized if they access + // the same bank (bank conflict). + (*numBankAccesses)++; + } + } + bank_conflicts += max_bank; + } + panic_if(bank_conflicts > parent->wfSize(), + "Max bank conflicts should match num of work items per instr"); + return bank_conflicts; +} + +/** + * receive the packet from the CU + */ +bool +LdsState::CuSidePort::recvTimingReq(PacketPtr packet) +{ + return ownerLds->processPacket(packet); +} + +GPUDynInstPtr +LdsState::getDynInstr(PacketPtr packet) +{ + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + return ss->getMemInst(); +} + +/** + * process an incoming packet, add it to the return queue + */ +bool +LdsState::processPacket(PacketPtr packet) +{ + unsigned bankAccesses = 0; + // the number of conflicts this packet will have when accessing the LDS + unsigned bankConflicts = countBankConflicts(packet, &bankAccesses); + // count the total number of physical LDS bank accessed + parent->ldsBankAccesses += bankAccesses; + // count the LDS bank conflicts. A number set to 1 indicates one + // access per bank maximum so there are no bank conflicts + parent->ldsBankConflictDist.sample(bankConflicts-1); + + GPUDynInstPtr dynInst = getDynInstr(packet); + // account for the LDS bank conflict overhead + int busLength = (dynInst->m_op == Enums::MO_LD) ? parent->loadBusLength() : + (dynInst->m_op == Enums::MO_ST) ? parent->storeBusLength() : + parent->loadBusLength(); + // delay for accessing the LDS + Tick processingTime = + parent->shader->ticks(bankConflicts * bankConflictPenalty) + + parent->shader->ticks(busLength); + // choose (delay + last packet in queue) or (now + delay) as the time to + // return this + Tick doneAt = earliestReturnTime() + processingTime; + // then store it for processing + return returnQueuePush(std::make_pair(doneAt, packet)); +} + +/** + * add this to the queue of packets to be returned + */ +bool +LdsState::returnQueuePush(std::pair<Tick, PacketPtr> thePair) +{ + // TODO add time limits (e.g. one packet per cycle) and queue size limits + // and implement flow control + returnQueue.push(thePair); + + // if there is no set wakeup time, look through the queue + if (!tickEvent.scheduled()) { + process(); + } + + return true; +} + +/** + * receive a packet in functional mode + */ +void +LdsState::CuSidePort::recvFunctional(PacketPtr pkt) +{ + fatal("not implemented"); +} + +/** + * receive a retry for a response + */ +void +LdsState::CuSidePort::recvRespRetry() +{ + // TODO verify that this is the right way to do this + assert(ownerLds->isRetryResp()); + ownerLds->setRetryResp(false); + ownerLds->process(); +} + +/** + * receive a retry + */ +void +LdsState::CuSidePort::recvRetry() +{ + fatal("not implemented"); +} + +/** + * look for packets to return at this time + */ +bool +LdsState::process() +{ + Tick now = clockEdge(); + + // send back completed packets + while (!returnQueue.empty() && returnQueue.front().first <= now) { + PacketPtr packet = returnQueue.front().second; + + ComputeUnit::LDSPort::SenderState *ss = + dynamic_cast<ComputeUnit::LDSPort::SenderState *>( + packet->senderState); + + GPUDynInstPtr gpuDynInst = ss->getMemInst(); + + gpuDynInst->initiateAcc(gpuDynInst); + + packet->makeTimingResponse(); + + returnQueue.pop(); + + bool success = cuPort.sendTimingResp(packet); + + if (!success) { + retryResp = true; + panic("have not handled timing responses being NACK'd when sent" + "back"); + } + } + + // determine the next wakeup time + if (!returnQueue.empty()) { + + Tick next = returnQueue.front().first; + + if (tickEvent.scheduled()) { + + if (next < tickEvent.when()) { + + tickEvent.deschedule(); + tickEvent.schedule(next); + } + } else { + tickEvent.schedule(next); + } + } + + return true; +} + +/** + * wake up at this time and perform specified actions + */ +void +LdsState::TickEvent::process() +{ + ldsState->process(); +} + +/** + * + */ +void +LdsState::regStats() +{ +} |