Diffstat (limited to 'src/gpu-compute/shader.cc')
-rw-r--r--  src/gpu-compute/shader.cc  412
1 files changed, 412 insertions, 0 deletions
diff --git a/src/gpu-compute/shader.cc b/src/gpu-compute/shader.cc
new file mode 100644
index 000000000..e8d7946ff
--- /dev/null
+++ b/src/gpu-compute/shader.cc
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * For use for simulation and test purposes only
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Steve Reinhardt
+ */
+
+#include "gpu-compute/shader.hh"
+
+#include <limits>
+
+#include "arch/x86/linux/linux.hh"
+#include "base/chunk_generator.hh"
+#include "debug/GPUDisp.hh"
+#include "debug/GPUMem.hh"
+#include "debug/HSAIL.hh"
+#include "gpu-compute/dispatcher.hh"
+#include "gpu-compute/gpu_static_inst.hh"
+#include "gpu-compute/qstruct.hh"
+#include "gpu-compute/wavefront.hh"
+#include "mem/packet.hh"
+#include "mem/ruby/system/RubySystem.hh"
+#include "sim/sim_exit.hh"
+
+Shader::Shader(const Params *p) : SimObject(p),
+ clock(p->clk_domain->clockPeriod()), cpuThread(nullptr), gpuTc(nullptr),
+ cpuPointer(p->cpu_pointer), tickEvent(this), timingSim(p->timing),
+ hsail_mode(SIMT), impl_kern_boundary_sync(p->impl_kern_boundary_sync),
+ separate_acquire_release(p->separate_acquire_release), coissue_return(1),
+ trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
+ globalMemSize(p->globalmem), nextSchedCu(0), sa_n(0), tick_cnt(0),
+ box_tick_cnt(0), start_tick_cnt(0)
+{
+
+ cuList.resize(n_cu);
+
+ for (int i = 0; i < n_cu; ++i) {
+ cuList[i] = p->CUs[i];
+ assert(i == cuList[i]->cu_id);
+ cuList[i]->shader = this;
+ }
+}
+
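+// Carve a region of 'length' bytes (rounded up to a whole page) out of the
+// GPU process's mmap space, growing mmap_end downward or upward depending on
+// the guest convention, and back it with memory via allocateMem().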
+Addr
+Shader::mmap(int length)
+{
+
+ Addr start;
+
+ // round up length to the next page
+ length = roundUp(length, TheISA::PageBytes);
+
+ if (X86Linux64::mmapGrowsDown()) {
+ DPRINTF(HSAIL, "GROWS DOWN");
+        start = gpuTc->getProcessPtr()->mmap_end - length;
+ gpuTc->getProcessPtr()->mmap_end = start;
+ } else {
+ DPRINTF(HSAIL, "GROWS UP");
+ start = gpuTc->getProcessPtr()->mmap_end;
+ gpuTc->getProcessPtr()->mmap_end += length;
+
+ // assertion to make sure we don't overwrite the stack (it grows down)
+ assert(gpuTc->getProcessPtr()->mmap_end <
+ gpuTc->getProcessPtr()->stack_base -
+ gpuTc->getProcessPtr()->max_stack_size);
+
+ }
+
+    DPRINTF(HSAIL, "Shader::mmap start = %#x, length = %#x\n", start, length);
+
+    gpuTc->getProcessPtr()->allocateMem(start, length);
+
+ return start;
+}
+
+void
+Shader::init()
+{
+ // grab the threadContext of the thread running on the CPU
+ assert(cpuPointer);
+ gpuTc = cpuPointer->getContext(0);
+ assert(gpuTc);
+}
+
+Shader::~Shader()
+{
+ for (int j = 0; j < n_cu; ++j)
+ delete cuList[j];
+}
+
+void
+Shader::updateThreadContext(int tid)
+{
+ // thread context of the thread which dispatched work
+ assert(cpuPointer);
+ gpuTc = cpuPointer->getContext(tid);
+ assert(gpuTc);
+}
+
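+// Re-activate the dispatching thread context on the host CPU if it is
+// currently suspended; any CPU other than the recorded cpuPointer triggers
+// a panic.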
+void
+Shader::hostWakeUp(BaseCPU *cpu)
+{
+ if (cpuPointer == cpu) {
+ if (gpuTc->status() == ThreadContext::Suspended)
+ cpu->activateContext(gpuTc->threadId());
+ } else {
+        // Make sure both the dispatcher and the shader are trying to
+        // wake up the same host. Hack here to enable kernel launch
+        // from multiple CPUs.
+ panic("Dispatcher wants to wakeup a different host");
+ }
+}
+
+Shader*
+ShaderParams::create()
+{
+ return new Shader(this);
+}
+
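+// Advance the shader by one cycle: record the current tick, apply any
+// ScheduleAdd updates that have come due, then clock every compute unit.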
+void
+Shader::exec()
+{
+ tick_cnt = curTick();
+ box_tick_cnt = curTick() - start_tick_cnt;
+
+ // apply any scheduled adds
+ for (int i = 0; i < sa_n; ++i) {
+ if (sa_when[i] <= tick_cnt) {
+ *sa_val[i] += sa_x[i];
+ sa_val.erase(sa_val.begin() + i);
+ sa_x.erase(sa_x.begin() + i);
+ sa_when.erase(sa_when.begin() + i);
+ --sa_n;
+ --i;
+ }
+ }
+
+    // clock all of the CUs
+ for (int i = 0; i < n_cu; ++i)
+ cuList[i]->exec();
+}
+
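+// Offer workgroups from 'ndr' to the compute units in round-robin order,
+// starting at nextSchedCu. Returns true if at least one workgroup was
+// dispatched during this call.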
+bool
+Shader::dispatch_workgroups(NDRange *ndr)
+{
+ bool scheduledSomething = false;
+ int cuCount = 0;
+ int curCu = nextSchedCu;
+
+ while (cuCount < n_cu) {
+ //Every time we try a CU, update nextSchedCu
+ nextSchedCu = (nextSchedCu + 1) % n_cu;
+
+        // dispatch a workgroup iff the following two conditions are met:
+        // (a) wg_disp_rem is true - there are unassigned workgroups in the grid
+        // (b) cuList[curCu] has enough free slots for this workgroup
+ if (ndr->wg_disp_rem && cuList[curCu]->ReadyWorkgroup(ndr)) {
+ scheduledSomething = true;
+ DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d\n", curCu);
+
+ // ticks() member function translates cycles to simulation ticks.
+ if (!tickEvent.scheduled()) {
+ schedule(tickEvent, curTick() + this->ticks(1));
+ }
+
+ cuList[curCu]->StartWorkgroup(ndr);
+ ndr->wgId[0]++;
+ ndr->globalWgId++;
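+            // Advance the 3-D workgroup index: when one dimension is
+            // exhausted, reset it and carry into the next; once all three
+            // dimensions are done, mark the grid fully dispatched.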
+ if (ndr->wgId[0] * ndr->q.wgSize[0] >= ndr->q.gdSize[0]) {
+ ndr->wgId[0] = 0;
+ ndr->wgId[1]++;
+
+ if (ndr->wgId[1] * ndr->q.wgSize[1] >= ndr->q.gdSize[1]) {
+ ndr->wgId[1] = 0;
+ ndr->wgId[2]++;
+
+ if (ndr->wgId[2] * ndr->q.wgSize[2] >= ndr->q.gdSize[2]) {
+ ndr->wg_disp_rem = false;
+ break;
+ }
+ }
+ }
+ }
+
+ ++cuCount;
+ curCu = nextSchedCu;
+ }
+
+ return scheduledSomething;
+}
+
+void
+Shader::handshake(GpuDispatcher *_dispatcher)
+{
+ dispatcher = _dispatcher;
+}
+
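+// Perform a functional (timing-free) access for 'req'. Accesses that cross
+// a Ruby block boundary are split into two requests; each is translated
+// through the TLB and then sent functionally through a CU memory port.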
+void
+Shader::doFunctionalAccess(RequestPtr req, MemCmd cmd, void *data,
+ bool suppress_func_errors, int cu_id)
+{
+ unsigned block_size = RubySystem::getBlockSizeBytes();
+ unsigned size = req->getSize();
+
+ Addr tmp_addr;
+ BaseTLB::Mode trans_mode;
+
+ if (cmd == MemCmd::ReadReq) {
+ trans_mode = BaseTLB::Read;
+ } else if (cmd == MemCmd::WriteReq) {
+ trans_mode = BaseTLB::Write;
+ } else {
+ fatal("unexcepted MemCmd\n");
+ }
+
+ tmp_addr = req->getVaddr();
+ Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
+
+ assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
+
+ // Misaligned access
+ if (split_addr > tmp_addr) {
+        RequestPtr req1, req2;
+        req->splitOnVaddr(split_addr, req1, req2);
+
+        // pkt1 carries the lower portion of the split access (req1) and
+        // pkt2 the upper portion (req2), matching the data buffer offsets
+        // used for new_pkt1 and new_pkt2 below.
+        PacketPtr pkt1 = new Packet(req1, cmd);
+        PacketPtr pkt2 = new Packet(req2, cmd);
+
+ functionalTLBAccess(pkt1, cu_id, trans_mode);
+ functionalTLBAccess(pkt2, cu_id, trans_mode);
+
+ PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
+ PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
+
+ new_pkt1->dataStatic(data);
+ new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
+
+ if (suppress_func_errors) {
+ new_pkt1->setSuppressFuncError();
+ new_pkt2->setSuppressFuncError();
+ }
+
+ // fixme: this should be cuList[cu_id] if cu_id != n_cu
+ // The latter requires a memPort in the dispatcher
+ cuList[0]->memPort[0]->sendFunctional(new_pkt1);
+ cuList[0]->memPort[0]->sendFunctional(new_pkt2);
+
+ delete new_pkt1;
+ delete new_pkt2;
+ delete pkt1;
+ delete pkt2;
+ } else {
+ PacketPtr pkt = new Packet(req, cmd);
+ functionalTLBAccess(pkt, cu_id, trans_mode);
+ PacketPtr new_pkt = new Packet(pkt->req, cmd);
+ new_pkt->dataStatic(data);
+
+ if (suppress_func_errors) {
+ new_pkt->setSuppressFuncError();
+        }
+
+ // fixme: this should be cuList[cu_id] if cu_id != n_cu
+ // The latter requires a memPort in the dispatcher
+ cuList[0]->memPort[0]->sendFunctional(new_pkt);
+
+ delete new_pkt;
+ delete pkt;
+ }
+}
+
+bool
+Shader::busy()
+{
+ for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
+ if (!cuList[i_cu]->isDone()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
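+// Register a deferred increment: add 'x' to '*val' once the shader's tick
+// count reaches tick_cnt + 'when'. Pending adds are applied in Shader::exec().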
+void
+Shader::ScheduleAdd(uint32_t *val, Tick when, int x)
+{
+ sa_val.push_back(val);
+ sa_when.push_back(tick_cnt + when);
+ sa_x.push_back(x);
+ ++sa_n;
+}
+
+Shader::TickEvent::TickEvent(Shader *_shader)
+ : Event(CPU_Tick_Pri), shader(_shader)
+{
+}
+
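+// Periodic tick handler: while any compute unit still has work, execute one
+// shader cycle and reschedule the event one shader clock later.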
+void
+Shader::TickEvent::process()
+{
+ if (shader->busy()) {
+ shader->exec();
+ shader->schedule(this, curTick() + shader->ticks(1));
+ }
+}
+
+const char*
+Shader::TickEvent::description() const
+{
+ return "Shader tick";
+}
+
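+// Walk 'size' bytes starting at 'address' in cache-block-sized chunks,
+// issuing a functional access for each chunk; ReadMem and WriteMem below are
+// thin wrappers around this.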
+void
+Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ MemCmd cmd, bool suppress_func_errors)
+{
+ uint8_t *data_buf = (uint8_t*)ptr;
+
+ for (ChunkGenerator gen(address, size, RubySystem::getBlockSizeBytes());
+ !gen.done(); gen.next()) {
+ Request *req = new Request(0, gen.addr(), gen.size(), 0,
+ cuList[0]->masterId(), 0, 0, 0);
+
+ doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
+ data_buf += gen.size();
+ delete req;
+ }
+}
+
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
+}
+
+void
+Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ bool suppress_func_errors)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, suppress_func_errors);
+}
+
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
+}
+
+void
+Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
+ bool suppress_func_errors)
+{
+ AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
+ suppress_func_errors);
+}
+
+/*
+ * Send a packet through the appropriate TLB functional port.
+ * If cu_id=n_cu, then this is the dispatcher's TLB.
+ * Otherwise it's the TLB of the cu_id compute unit.
+ */
+void
+Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
+{
+ // update senderState. Need to know the gpuTc and the TLB mode
+ pkt->senderState =
+ new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);
+
+ if (cu_id == n_cu) {
+ dispatcher->tlbPort->sendFunctional(pkt);
+ } else {
+        // Even when the perLaneTLB flag is turned on, it's OK to send all
+        // accesses through lane 0, since the lane number is not known here.
+        // This isn't important since these are functional accesses.
+ cuList[cu_id]->tlbPort[0]->sendFunctional(pkt);
+ }
+
+ /* safe_cast the senderState */
+ TheISA::GpuTLB::TranslationState *sender_state =
+ safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
+
+ delete sender_state->tlbEntry;
+ delete pkt->senderState;
+}