From 612f8f074fa1099cf70faf495d46cc647762a031 Mon Sep 17 00:00:00 2001 From: ARM gem5 Developers Date: Fri, 24 Jan 2014 15:29:34 -0600 Subject: arm: Add support for ARMv8 (AArch64 & AArch32) Note: AArch64 and AArch32 interworking is not supported. If you use an AArch64 kernel you are restricted to AArch64 user-mode binaries. This will be addressed in a later patch. Note: Virtualization is only supported in AArch32 mode. This will also be fixed in a later patch. Contributors: Giacomo Gabrielli (TrustZone, LPAE, system-level AArch64, AArch64 NEON, validation) Thomas Grocutt (AArch32 Virtualization, AArch64 FP, validation) Mbou Eyole (AArch64 NEON, validation) Ali Saidi (AArch64 Linux support, code integration, validation) Edmund Grimley-Evans (AArch64 FP) William Wang (AArch64 Linux support) Rene De Jong (AArch64 Linux support, performance opt.) Matt Horsnell (AArch64 MP, validation) Matt Evans (device models, code integration, validation) Chris Adeniyi-Jones (AArch64 syscall-emulation) Prakash Ramrakhyani (validation) Dam Sunwoo (validation) Chander Sudanthi (validation) Stephan Diestelhorst (validation) Andreas Hansson (code integration, performance opt.) Eric Van Hensbergen (performance opt.) Gabe Black --- src/arch/arm/isa/insts/neon64_mem.isa | 471 ++++++++++++++++++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100644 src/arch/arm/isa/insts/neon64_mem.isa (limited to 'src/arch/arm/isa/insts/neon64_mem.isa') diff --git a/src/arch/arm/isa/insts/neon64_mem.isa b/src/arch/arm/isa/insts/neon64_mem.isa new file mode 100644 index 000000000..32a37f87e --- /dev/null +++ b/src/arch/arm/isa/insts/neon64_mem.isa @@ -0,0 +1,471 @@ +// -*- mode: c++ -*- + +// Copyright (c) 2012-2013 ARM Limited +// All rights reserved +// +// The license below extends only to copyright in the software and shall +// not be construed as granting a license to any other intellectual +// property including but not limited to intellectual property relating +// to a hardware implementation of the functionality of the software +// licensed hereunder. You may use the software subject to the license +// terms below provided that you ensure that this notice is replicated +// unmodified and in its entirety in all distributions of the software, +// modified or unmodified, in source code or in binary form. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Mbou Eyole +// Giacomo Gabrielli + +let {{ + + header_output = '' + decoder_output = '' + exec_output = '' + + def mkMemAccMicroOp(name): + global header_output, decoder_output, exec_output + SPAlignmentCheckCodeNeon = ''' + if (baseIsSP && bits(XURa, 3, 0) && + SPAlignmentCheckEnabled(xc->tcBase())) { + return new SPAlignmentFault(); + } + ''' + eaCode = SPAlignmentCheckCodeNeon + ''' + EA = XURa + imm; + ''' + memDecl = ''' + const int MaxNumBytes = 16; + union MemUnion { + uint8_t bytes[MaxNumBytes]; + uint32_t floatRegBits[MaxNumBytes / 4]; + }; + ''' + + # Do endian conversion for all the elements + convCode = ''' + VReg x = {0, 0}; + + x.lo = (((XReg) memUnion.floatRegBits[1]) << 32) | + (XReg) memUnion.floatRegBits[0]; + x.hi = (((XReg) memUnion.floatRegBits[3]) << 32) | + (XReg) memUnion.floatRegBits[2]; + + const unsigned eCount = 16 / (1 << eSize); + + if (isBigEndian64(xc->tcBase())) { + for (unsigned i = 0; i < eCount; i++) { + switch (eSize) { + case 0x3: // 64-bit + writeVecElem(&x, (XReg) gtobe( + (uint64_t) readVecElem(x, i, eSize)), i, eSize); + break; + case 0x2: // 32-bit + writeVecElem(&x, (XReg) gtobe( + (uint32_t) readVecElem(x, i, eSize)), i, eSize); + break; + case 0x1: // 16-bit + writeVecElem(&x, (XReg) gtobe( + (uint16_t) readVecElem(x, i, eSize)), i, eSize); + break; + default: // 8-bit + break; // Nothing to do here + } + } + } else { + for (unsigned i = 0; i < eCount; i++) { + switch (eSize) { + case 0x3: // 64-bit + writeVecElem(&x, (XReg) gtole( + (uint64_t) readVecElem(x, i, eSize)), i, eSize); + break; + case 0x2: // 32-bit + writeVecElem(&x, (XReg) gtole( + (uint32_t) readVecElem(x, i, eSize)), i, eSize); + break; + case 0x1: // 16-bit + writeVecElem(&x, (XReg) gtole( + (uint16_t) readVecElem(x, i, eSize)), i, eSize); + break; + default: // 8-bit + break; // Nothing to do here + } + } + } + + memUnion.floatRegBits[0] = (uint32_t) x.lo; + memUnion.floatRegBits[1] = (uint32_t) (x.lo >> 32); + memUnion.floatRegBits[2] = (uint32_t) x.hi; + memUnion.floatRegBits[3] = (uint32_t) (x.hi >> 32); + ''' + + # Offload everything into registers + regSetCode = '' + for reg in range(4): + regSetCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(memUnion.floatRegBits[%(reg)d]); + ''' % { 'reg' : reg } + + # Pull everything in from registers + regGetCode = '' + for reg in range(4): + regGetCode += ''' + memUnion.floatRegBits[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { 'reg' : reg } + + loadMemAccCode = convCode + regSetCode + storeMemAccCode = regGetCode + convCode + + loadIop = InstObjParams(name + 'ld', + 'MicroNeonLoad64', + 'MicroNeonMemOp', + { 'mem_decl' : memDecl, + 'memacc_code' : loadMemAccCode, + 'ea_code' : simd64EnabledCheckCode + eaCode, + }, + [ 'IsMicroop', 'IsMemRef', 'IsLoad' ]) + storeIop = InstObjParams(name + 'st', + 'MicroNeonStore64', + 'MicroNeonMemOp', + { 'mem_decl' : memDecl, + 'memacc_code' : storeMemAccCode, + 'ea_code' : simd64EnabledCheckCode + eaCode, + }, + [ 'IsMicroop', 'IsMemRef', 'IsStore' ]) + + exec_output += NeonLoadExecute64.subst(loadIop) + \ + NeonLoadInitiateAcc64.subst(loadIop) + \ + NeonLoadCompleteAcc64.subst(loadIop) + \ + NeonStoreExecute64.subst(storeIop) + \ + NeonStoreInitiateAcc64.subst(storeIop) + \ + NeonStoreCompleteAcc64.subst(storeIop) + header_output += MicroNeonMemDeclare64.subst(loadIop) + \ + MicroNeonMemDeclare64.subst(storeIop) + + def mkMarshalMicroOp(name, Name): + global header_output, decoder_output, exec_output + + getInputCodeOp1L = '' + for v in range(4): + for p in range(4): + getInputCodeOp1L += ''' + writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)d_uw, + %(p)d, 0x2); + ''' % { 'v' : v, 'p' : p } + + getInputCodeOp1S = '' + for v in range(4): + for p in range(4): + getInputCodeOp1S += ''' + writeVecElem(&input[%(v)d], (XReg) AA64FpOp1P%(p)dV%(v)dS_uw, + %(p)d, 0x2); + ''' % { 'v' : v, 'p' : p } + + if name == 'deint_neon_uop': + + eCode = ''' + VReg input[4]; // input data from scratch area + VReg output[2]; // output data to arch. SIMD regs + VReg temp; + temp.lo = 0; + temp.hi = 0; + ''' + for p in range(4): + eCode += ''' + writeVecElem(&temp, (XReg) AA64FpDestP%(p)dV1L_uw, %(p)d, 0x2); + ''' % { 'p' : p } + eCode += getInputCodeOp1L + + # Note that numRegs is not always the same as numStructElems; in + # particular, for LD1/ST1, numStructElems is 1 but numRegs can be + # 1, 2, 3 or 4 + + eCode += ''' + output[0].lo = 0; + output[0].hi = 0; + output[1].lo = 0; + output[1].hi = 0; + + int eCount = dataSize / (8 << eSize); + int eSizeBytes = 1 << eSize; // element size in bytes + int numBytes = step * dataSize / 4; + int totNumBytes = numRegs * dataSize / 8; + + int structElemNo, pos, a, b; + XReg data; + + for (int r = 0; r < 2; ++r) { + for (int i = 0; i < eCount; ++i) { + if (numBytes < totNumBytes) { + structElemNo = r + (step * 2); + if (numStructElems == 1) { + pos = (eSizeBytes * i) + + (eCount * structElemNo * eSizeBytes); + } else { + pos = (numStructElems * eSizeBytes * i) + + (structElemNo * eSizeBytes); + } + a = pos / 16; + b = (pos % 16) / eSizeBytes; + data = (XReg) readVecElem(input[a], (XReg) b, + eSize); + writeVecElem(&output[r], data, i, eSize); + numBytes += eSizeBytes; + } + } + } + ''' + for p in range(4): + eCode += ''' + AA64FpDestP%(p)dV0L_uw = (uint32_t) readVecElem(output[0], + %(p)d, 0x2); + ''' % { 'p' : p } + eCode += ''' + if ((numRegs % 2 == 0) || (numRegs == 3 && step == 0)) { + ''' + for p in range(4): + eCode += ''' + AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem( + output[1], %(p)d, 0x2); + ''' % { 'p' : p } + eCode += ''' + } else { + ''' + for p in range(4): + eCode += ''' + AA64FpDestP%(p)dV1L_uw = (uint32_t) readVecElem(temp, + %(p)d, 0x2); + ''' % { 'p' : p } + eCode += ''' + } + ''' + + iop = InstObjParams(name, Name, 'MicroNeonMixOp64', + { 'code' : eCode }, ['IsMicroop']) + header_output += MicroNeonMixDeclare64.subst(iop) + exec_output += MicroNeonMixExecute64.subst(iop) + + elif name == 'int_neon_uop': + + eCode = ''' + VReg input[4]; // input data from arch. SIMD regs + VReg output[2]; // output data to scratch area + ''' + + eCode += getInputCodeOp1S + + # Note that numRegs is not always the same as numStructElems; in + # particular, for LD1/ST1, numStructElems is 1 but numRegs can be + # 1, 2, 3 or 4 + + eCode += ''' + int eCount = dataSize / (8 << eSize); + int eSizeBytes = 1 << eSize; + int totNumBytes = numRegs * dataSize / 8; + int numOutputElems = 128 / (8 << eSize); + int stepOffset = step * 32; + + for (int i = 0; i < 2; ++i) { + output[i].lo = 0; + output[i].hi = 0; + } + + int r = 0, k = 0, i, j; + XReg data; + + for (int pos = stepOffset; pos < 32 + stepOffset; + pos += eSizeBytes) { + if (pos < totNumBytes) { + if (numStructElems == 1) { + i = (pos / eSizeBytes) % eCount; + j = pos / (eCount * eSizeBytes); + } else { + i = pos / (numStructElems * eSizeBytes); + j = (pos % (numStructElems * eSizeBytes)) / + eSizeBytes; + } + data = (XReg) readVecElem(input[j], (XReg) i, eSize); + writeVecElem(&output[r], data, k, eSize); + k++; + if (k == numOutputElems){ + k = 0; + ++r; + } + } + } + ''' + for v in range(2): + for p in range(4): + eCode += ''' + AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem( + output[%(v)d], %(p)d, 0x2); + ''' % { 'v': v, 'p': p} + + iop = InstObjParams(name, Name, 'MicroNeonMixOp64', + { 'code' : eCode }, ['IsMicroop']) + header_output += MicroNeonMixDeclare64.subst(iop) + exec_output += MicroNeonMixExecute64.subst(iop) + + elif name == 'unpack_neon_uop': + + eCode = ''' + VReg input[4]; //input data from scratch area + VReg output[2]; //output data to arch. SIMD regs + ''' + + eCode += getInputCodeOp1L + + # Fill output regs with register data initially. Note that + # elements in output register outside indexed lanes are left + # untouched + for v in range(2): + for p in range(4): + eCode += ''' + writeVecElem(&output[%(v)d], (XReg) AA64FpDestP%(p)dV%(v)dL_uw, + %(p)d, 0x2); + ''' % { 'v': v, 'p': p} + eCode += ''' + int eCount = dataSize / (8 << eSize); + int eCount128 = 128 / (8 << eSize); + int eSizeBytes = 1 << eSize; + int totNumBytes = numStructElems * eSizeBytes; + int numInputElems = eCount128; + int stepOffset = step * 2 * eSizeBytes; + int stepLimit = 2 * eSizeBytes; + + int r = 0, i, j; + XReg data; + + for (int pos = stepOffset; pos < stepLimit + stepOffset; + pos += eSizeBytes) { + if (pos < totNumBytes) { + r = pos / eSizeBytes; + j = r / numInputElems; + i = r % numInputElems; + data = (XReg) readVecElem(input[j], (XReg) i, eSize); + + if (replicate) { + for (int i = 0; i < eCount128; ++i) { + if (i < eCount) { + writeVecElem(&output[r % 2], data, i, + eSize); + } else { // zero extend if necessary + writeVecElem(&output[r % 2], (XReg) 0, i, + eSize); + } + } + } else { + writeVecElem(&output[r % 2], data, lane, eSize); + } + } + } + ''' + for v in range(2): + for p in range(4): + eCode += ''' + AA64FpDestP%(p)dV%(v)dL_uw = (uint32_t) readVecElem( + output[%(v)d], %(p)d, 0x2); + ''' % { 'v' : v, 'p' : p } + + iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64', + { 'code' : eCode }, ['IsMicroop']) + header_output += MicroNeonMixLaneDeclare64.subst(iop) + exec_output += MicroNeonMixExecute64.subst(iop) + + elif name == 'pack_neon_uop': + + eCode = ''' + VReg input[4]; // input data from arch. SIMD regs + VReg output[2]; // output data to scratch area + ''' + + eCode += getInputCodeOp1S + + eCode += ''' + int eSizeBytes = 1 << eSize; + int numOutputElems = 128 / (8 << eSize); + int totNumBytes = numStructElems * eSizeBytes; + int stepOffset = step * 32; + int stepLimit = 32; + + int r = 0, i, j; + XReg data; + + for (int i = 0; i < 2; ++i) { + output[i].lo = 0; + output[i].hi = 0; + } + + for (int pos = stepOffset; pos < stepLimit + stepOffset; + pos += eSizeBytes) { + if (pos < totNumBytes) { + r = pos / 16; + j = pos / eSizeBytes; + i = (pos / eSizeBytes) % numOutputElems; + data = (XReg) readVecElem(input[j], lane, eSize); + writeVecElem(&output[r % 2], data, i, eSize); + } + } + ''' + + for v in range(2): + for p in range(4): + eCode += ''' + AA64FpDestP%(p)dV%(v)d_uw = (uint32_t) readVecElem( + output[%(v)d], %(p)d, 0x2); + ''' % { 'v' : v, 'p' : p } + + iop = InstObjParams(name, Name, 'MicroNeonMixLaneOp64', + { 'code' : eCode }, ['IsMicroop']) + header_output += MicroNeonMixLaneDeclare64.subst(iop) + exec_output += MicroNeonMixExecute64.subst(iop) + + # Generate instructions + mkMemAccMicroOp('mem_neon_uop') + mkMarshalMicroOp('deint_neon_uop', 'MicroDeintNeon64') + mkMarshalMicroOp('int_neon_uop', 'MicroIntNeon64') + mkMarshalMicroOp('unpack_neon_uop', 'MicroUnpackNeon64') + mkMarshalMicroOp('pack_neon_uop', 'MicroPackNeon64') + +}}; + +let {{ + + iop = InstObjParams('vldmult64', 'VldMult64', 'VldMultOp64', '', []) + header_output += VMemMultDeclare64.subst(iop) + decoder_output += VMemMultConstructor64.subst(iop) + + iop = InstObjParams('vstmult64', 'VstMult64', 'VstMultOp64', '', []) + header_output += VMemMultDeclare64.subst(iop) + decoder_output += VMemMultConstructor64.subst(iop) + + iop = InstObjParams('vldsingle64', 'VldSingle64', 'VldSingleOp64', '', []) + header_output += VMemSingleDeclare64.subst(iop) + decoder_output += VMemSingleConstructor64.subst(iop) + + iop = InstObjParams('vstsingle64', 'VstSingle64', 'VstSingleOp64', '', []) + header_output += VMemSingleDeclare64.subst(iop) + decoder_output += VMemSingleConstructor64.subst(iop) + +}}; -- cgit v1.2.3