diff options
author | ARM gem5 Developers <none@none> | 2014-01-24 15:29:34 -0600 |
---|---|---|
committer | ARM gem5 Developers <none@none> | 2014-01-24 15:29:34 -0600 |
commit | 612f8f074fa1099cf70faf495d46cc647762a031 (patch) | |
tree | bd1e99c43bf15292395eadd4b7ae3f5c823545c3 /src/arch/arm/isa/insts/neon64.isa | |
parent | f3585c841e964c98911784a187fc4f081a02a0a6 (diff) | |
download | gem5-612f8f074fa1099cf70faf495d46cc647762a031.tar.xz |
arm: Add support for ARMv8 (AArch64 & AArch32)
Note: AArch64 and AArch32 interworking is not supported. If you use an AArch64
kernel you are restricted to AArch64 user-mode binaries. This will be addressed
in a later patch.
Note: Virtualization is only supported in AArch32 mode. This will also be fixed
in a later patch.
Contributors:
Giacomo Gabrielli (TrustZone, LPAE, system-level AArch64, AArch64 NEON, validation)
Thomas Grocutt (AArch32 Virtualization, AArch64 FP, validation)
Mbou Eyole (AArch64 NEON, validation)
Ali Saidi (AArch64 Linux support, code integration, validation)
Edmund Grimley-Evans (AArch64 FP)
William Wang (AArch64 Linux support)
Rene De Jong (AArch64 Linux support, performance opt.)
Matt Horsnell (AArch64 MP, validation)
Matt Evans (device models, code integration, validation)
Chris Adeniyi-Jones (AArch64 syscall-emulation)
Prakash Ramrakhyani (validation)
Dam Sunwoo (validation)
Chander Sudanthi (validation)
Stephan Diestelhorst (validation)
Andreas Hansson (code integration, performance opt.)
Eric Van Hensbergen (performance opt.)
Gabe Black
Diffstat (limited to 'src/arch/arm/isa/insts/neon64.isa')
-rw-r--r-- | src/arch/arm/isa/insts/neon64.isa | 3355 |
1 file changed, 3355 insertions, 0 deletions
diff --git a/src/arch/arm/isa/insts/neon64.isa b/src/arch/arm/isa/insts/neon64.isa new file mode 100644 index 000000000..e065761f4 --- /dev/null +++ b/src/arch/arm/isa/insts/neon64.isa @@ -0,0 +1,3355 @@ +// -*- mode: c++ -*- + +// Copyright (c) 2012-2013 ARM Limited +// All rights reserved +// +// The license below extends only to copyright in the software and shall +// not be construed as granting a license to any other intellectual +// property including but not limited to intellectual property relating +// to a hardware implementation of the functionality of the software +// licensed hereunder. You may use the software subject to the license +// terms below provided that you ensure that this notice is replicated +// unmodified and in its entirety in all distributions of the software, +// modified or unmodified, in source code or in binary form. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer; +// redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution; +// neither the name of the copyright holders nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Giacomo Gabrielli +// Mbou Eyole + +let {{ + + header_output = "" + exec_output = "" + + # FP types (FP operations always work with unsigned representations) + floatTypes = ("uint32_t", "uint64_t") + smallFloatTypes = ("uint32_t",) + + def threeEqualRegInstX(name, Name, opClass, types, rCount, op, + readDest=False, pairwise=False, scalar=False, + byElem=False): + assert (not pairwise) or ((not byElem) and (not scalar)) + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1, destReg; + ''' + if byElem: + # 2nd register operand has to be read fully + eWalkCode += ''' + FullRegVect srcReg2; + ''' + else: + eWalkCode += ''' + RegVect srcReg2; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + if byElem: + # 2nd operand has to be read fully + for reg in range(rCount, 4): + eWalkCode += ''' + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + if pairwise: + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(2 * i < eCount ? 
+ srcReg1.elements[2 * i] : + srcReg2.elements[2 * i - eCount]); + Element srcElem2 = gtoh(2 * i < eCount ? + srcReg1.elements[2 * i + 1] : + srcReg2.elements[2 * i + 1 - eCount]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + else: + scalarCheck = ''' + if (i != 0) { + destReg.elements[i] = 0; + continue; + } + ''' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(scalarCheck)s + Element srcElem1 = gtoh(srcReg1.elements[i]); + Element srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode, + "scalarCheck" : scalarCheck if scalar else "", + "src2Index" : "imm" if byElem else "i" } + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX2RegImmOp" if byElem else "DataX2RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + if byElem: + header_output += NeonX2RegImmOpDeclare.subst(iop) + else: + header_output += NeonX2RegOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def threeUnequalRegInstX(name, Name, opClass, types, op, + bigSrc1, bigSrc2, bigDest, readDest, scalar=False, + byElem=False, hi=False): + assert not (scalar and hi) + global header_output, exec_output + src1Cnt = src2Cnt = destCnt = 2 + src1Prefix = src2Prefix = destPrefix = '' + if bigSrc1: + src1Cnt = 4 + src1Prefix = 'Big' + if bigSrc2: + src2Cnt = 4 + src2Prefix = 'Big' + if bigDest: + destCnt = 4 + destPrefix = 'Big' + if byElem: + 
src2Prefix = 'Full' + eWalkCode = simd64EnabledCheckCode + ''' + %sRegVect srcReg1; + %sRegVect srcReg2; + %sRegVect destReg; + ''' % (src1Prefix, src2Prefix, destPrefix) + srcReg1 = 0 + if hi and not bigSrc1: # long/widening operations + srcReg1 = 2 + for reg in range(src1Cnt): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(srcReg1)d_uw); + ''' % { "reg" : reg, "srcReg1" : srcReg1 } + srcReg1 += 1 + srcReg2 = 0 + if (not byElem) and (hi and not bigSrc2): # long/widening operations + srcReg2 = 2 + for reg in range(src2Cnt): + eWalkCode += ''' + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(srcReg2)d_uw); + ''' % { "reg" : reg, "srcReg2" : srcReg2 } + srcReg2 += 1 + if byElem: + # 2nd operand has to be read fully + for reg in range(src2Cnt, 4): + eWalkCode += ''' + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(destCnt): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + scalarCheck = ''' + if (i != 0) { + destReg.elements[i] = 0; + continue; + } + ''' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(scalarCheck)s + %(src1Prefix)sElement srcElem1 = gtoh(srcReg1.elements[i]); + %(src1Prefix)sElement srcElem2 = gtoh(srcReg2.elements[%(src2Index)s]); + %(destPrefix)sElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode, + "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix, + "destPrefix" : destPrefix, + "scalarCheck" : scalarCheck if scalar else "", + "src2Index" : "imm" if byElem else "i" } + destReg = 0 + if hi and not bigDest: + # narrowing operations + destReg = 2 + for reg in range(destCnt): + eWalkCode += ''' + AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg, "destReg": destReg } + destReg += 1 + if destCnt < 4 and not hi: # 
zero upper half + for reg in range(destCnt, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX2RegImmOp" if byElem else "DataX2RegOp", + { "code": eWalkCode, + "r_count": 2, + "op_class": opClass }, []) + if byElem: + header_output += NeonX2RegImmOpDeclare.subst(iop) + else: + header_output += NeonX2RegOpDeclare.subst(iop) + exec_output += NeonXUnequalRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False, + scalar=False, byElem=False, hi=False): + assert not byElem + threeUnequalRegInstX(name, Name, opClass, types, op, + True, True, False, readDest, scalar, byElem, hi) + + def threeRegLongInstX(name, Name, opClass, types, op, readDest=False, + scalar=False, byElem=False, hi=False): + threeUnequalRegInstX(name, Name, opClass, types, op, + False, False, True, readDest, scalar, byElem, hi) + + def threeRegWideInstX(name, Name, opClass, types, op, readDest=False, + scalar=False, byElem=False, hi=False): + assert not byElem + threeUnequalRegInstX(name, Name, opClass, types, op, + True, False, True, readDest, scalar, byElem, hi) + + def twoEqualRegInstX(name, Name, opClass, types, rCount, op, + readDest=False, scalar=False, byElem=False, + hasImm=False, isDup=False): + global header_output, exec_output + assert (not isDup) or byElem + if byElem: + hasImm = True + if isDup: + eWalkCode = simd64EnabledCheckCode + ''' + FullRegVect srcReg1; + RegVect destReg; + ''' + else: + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1, destReg; + ''' + for reg in range(4 if isDup else rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + 
readDestCode = 'destElem = gtoh(destReg.elements[i]);' + scalarCheck = ''' + if (i != 0) { + destReg.elements[i] = 0; + continue; + } + ''' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(scalarCheck)s + unsigned j = i; + Element srcElem1 = gtoh(srcReg1.elements[%(src1Index)s]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[j] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode, + "scalarCheck" : scalarCheck if scalar else "", + "src1Index" : "imm" if byElem else "i" } + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegImmOp" if hasImm else "DataX1RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + if hasImm: + header_output += NeonX1RegImmOpDeclare.subst(iop) + else: + header_output += NeonX1RegOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def twoRegLongInstX(name, Name, opClass, types, op, readDest=False, + hi=False, hasImm=False): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1; + BigRegVect destReg; + ''' + destReg = 0 if not hi else 2 + for reg in range(2): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(destReg)d_uw); + ''' % { "reg" : reg, "destReg": destReg } + destReg += 1 + destReg = 0 if not hi else 2 + if readDest: + for reg in range(4): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + destReg += 1 + readDestCode = '' + if readDest: + readDestCode = 'destReg = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + 
Element srcElem1 = gtoh(srcReg1.elements[i]); + BigElement destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegImmOp" if hasImm else "DataX1RegOp", + { "code": eWalkCode, + "r_count": 2, + "op_class": opClass }, []) + if hasImm: + header_output += NeonX1RegImmOpDeclare.subst(iop) + else: + header_output += NeonX1RegOpDeclare.subst(iop) + exec_output += NeonXUnequalRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False, + scalar=False, hi=False, hasImm=False): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + BigRegVect srcReg1; + RegVect destReg; + ''' + for reg in range(4): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + if readDest: + for reg in range(2): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + else: + eWalkCode += ''' + destReg.elements[0] = 0; + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + scalarCheck = ''' + if (i != 0) { + destReg.elements[i] = 0; + continue; + } + ''' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + %(scalarCheck)s + BigElement srcElem1 = gtoh(srcReg1.elements[i]); + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode, + "scalarCheck" : scalarCheck if scalar else "" } + destReg = 0 if not hi else 2 + for reg in range(2): + eWalkCode += ''' + AA64FpDestP%(destReg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg, "destReg": 
destReg } + destReg += 1 + if not hi: + for reg in range(2, 4): # zero upper half + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegImmOp" if hasImm else "DataX1RegOp", + { "code": eWalkCode, + "r_count": 2, + "op_class": opClass }, []) + if hasImm: + header_output += NeonX1RegImmOpDeclare.subst(iop) + else: + header_output += NeonX1RegOpDeclare.subst(iop) + exec_output += NeonXUnequalRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def threeRegScrambleInstX(name, Name, opClass, types, rCount, op): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw); + ''' % { "reg" : reg } + eWalkCode += op + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX2RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX2RegOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def insFromVecElemInstX(name, Name, opClass, types, rCount): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + FullRegVect srcReg1; + RegVect destReg; + ''' + for reg in range(4): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + for reg in range(rCount): + eWalkCode += ''' + destReg.regs[%(reg)d] = 
htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + eWalkCode += ''' + Element srcElem1 = gtoh(srcReg1.elements[imm2]); + Element destElem = srcElem1; + destReg.elements[imm1] = htog(destElem); + ''' + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1Reg2ImmOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1Reg2ImmOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + eWalkCode += ''' + Element srcElem1 = gtoh(srcReg1.elements[0]); + Element srcElem2 = gtoh(srcReg1.elements[1]); + Element destElem; + %(op)s + destReg.elements[0] = htog(destElem); + ''' % { "op" : op } + destCnt = rCount / 2 + for reg in range(destCnt): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + for reg in range(destCnt, 4): # zero upper half + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def twoRegAcrossInstX(name, Name, opClass, types, rCount, op, + doubleDest=False, long=False): + global header_output, exec_output + destPrefix = "Big" if long else "" 
+ eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1; + %sRegVect destReg; + ''' % destPrefix + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + eWalkCode += ''' + destReg.regs[0] = 0; + %(destPrefix)sElement destElem = 0; + for (unsigned i = 0; i < eCount; i++) { + Element srcElem1 = gtoh(srcReg1.elements[i]); + if (i == 0) { + destElem = srcElem1; + } else { + %(op)s + } + } + destReg.elements[0] = htog(destElem); + ''' % { "op" : op, "destPrefix" : destPrefix } + destCnt = 2 if doubleDest else 1 + for reg in range(destCnt): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + for reg in range(destCnt, 4): # zero upper half + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegOpDeclare.subst(iop) + if long: + exec_output += NeonXUnequalRegOpExecute.subst(iop) + else: + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def twoRegCondenseInstX(name, Name, opClass, types, rCount, op, + readDest=False): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcRegs; + BigRegVect destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcRegs.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + if readDest: + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount / 2; i++) { + Element srcElem1 = gtoh(srcRegs.elements[2 * i]); + Element srcElem2 = gtoh(srcRegs.elements[2 * i + 1]); + BigElement destElem; 
+ %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegOpDeclare.subst(iop) + exec_output += NeonXUnequalRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect destReg; + ''' + if readDest: + for reg in range(rCount): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + readDestCode = '' + if readDest: + readDestCode = 'destElem = gtoh(destReg.elements[i]);' + eWalkCode += ''' + for (unsigned i = 0; i < eCount; i++) { + Element destElem; + %(readDest)s + %(op)s + destReg.elements[i] = htog(destElem); + } + ''' % { "op" : op, "readDest" : readDestCode } + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataXImmOnlyOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegImmOnlyOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def 
dupGprInstX(name, Name, opClass, types, rCount, gprSpec): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect destReg; + for (unsigned i = 0; i < eCount; i++) { + destReg.elements[i] = htog((Element) %sOp1); + } + ''' % gprSpec + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def extInstX(name, Name, opClass, types, rCount, op): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + RegVect srcReg1, srcReg2, destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + srcReg1.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw); + ''' % { "reg" : reg } + eWalkCode += op + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX2RegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX2RegImmOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec): + global header_output, exec_output + eWalkCode = 
simd64EnabledCheckCode + ''' + RegVect destReg; + ''' + for reg in range(rCount): + eWalkCode += ''' + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + eWalkCode += ''' + destReg.elements[imm] = htog((Element) %sOp1); + ''' % gprSpec + for reg in range(rCount): + eWalkCode += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX1RegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegImmOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def insToGprInstX(name, Name, opClass, types, rCount, gprSpec, + signExt=False): + global header_output, exec_output + eWalkCode = simd64EnabledCheckCode + ''' + FullRegVect srcReg; + ''' + for reg in range(4): + eWalkCode += ''' + srcReg.regs[%(reg)d] = htog(AA64FpOp1P%(reg)d_uw); + ''' % { "reg" : reg } + if signExt: + eWalkCode += ''' + %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]); + ''' % gprSpec + else: + eWalkCode += ''' + %sDest = srcReg.elements[imm]; + ''' % gprSpec + iop = InstObjParams(name, Name, + "DataX1RegImmOp", + { "code": eWalkCode, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX1RegImmOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount): + global header_output, decoder_output, exec_output + code = simd64EnabledCheckCode + ''' + union + { + uint8_t bytes[64]; + FloatRegBits regs[16]; + } table; + + union + { + uint8_t bytes[%(rCount)d * 4]; + FloatRegBits regs[%(rCount)d]; + } destReg, srcReg2; + + const unsigned length = %(length)d; + const bool isTbl = 
%(isTbl)s; + ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl } + for reg in range(rCount): + code += ''' + srcReg2.regs[%(reg)d] = htog(AA64FpOp2P%(reg)d_uw); + destReg.regs[%(reg)d] = htog(AA64FpDestP%(reg)d_uw); + ''' % { "reg" : reg } + for reg in range(16): + if reg < length * 4: + code += ''' + table.regs[%(reg)d] = htog(AA64FpOp1P%(p)dV%(v)dS_uw); + ''' % { "reg" : reg, "p" : reg % 4, "v" : reg / 4 } + else: + code += ''' + table.regs[%(reg)d] = 0; + ''' % { "reg" : reg } + code += ''' + for (unsigned i = 0; i < sizeof(destReg); i++) { + uint8_t index = srcReg2.bytes[i]; + if (index < 16 * length) { + destReg.bytes[i] = table.bytes[index]; + } else { + if (isTbl) + destReg.bytes[i] = 0; + // else destReg.bytes[i] unchanged + } + } + ''' + for reg in range(rCount): + code += ''' + AA64FpDestP%(reg)d_uw = gtoh(destReg.regs[%(reg)d]); + ''' % { "reg" : reg } + if rCount < 4: # zero upper half + for reg in range(rCount, 4): + code += ''' + AA64FpDestP%(reg)d_uw = 0; + ''' % { "reg" : reg } + iop = InstObjParams(name, Name, + "DataX2RegOp", + { "code": code, + "r_count": rCount, + "op_class": opClass }, []) + header_output += NeonX2RegOpDeclare.subst(iop) + exec_output += NeonXEqualRegOpExecute.subst(iop) + for type in types: + substDict = { "targs" : type, + "class_name" : Name } + exec_output += NeonXExecDeclare.subst(substDict) + + # ABS + absCode = ''' + if (srcElem1 < 0) { + destElem = -srcElem1; + } else { + destElem = srcElem1; + } + ''' + twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode) + twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode) + # ADD + addCode = "destElem = srcElem1 + srcElem2;" + threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode) + threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode) + # ADDHN, ADDHN2 + addhnCode = ''' + destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInstX("addhn", 
"AddhnX", "SimdAddOp", smallUnsignedTypes, + addhnCode) + threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes, + addhnCode, hi=True) + # ADDP (scalar) + twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4, + addCode) + # ADDP (vector) + threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2, + addCode, pairwise=True) + threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4, + addCode, pairwise=True) + # ADDV + # Note: SimdAddOp can be a bit optimistic here + addAcrossCode = "destElem += srcElem1;" + twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"), + 2, addAcrossCode) + twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4, + addAcrossCode) + # AND + andCode = "destElem = srcElem1 & srcElem2;" + threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode) + threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode) + # BIC (immediate) + bicImmCode = "destElem &= ~imm;" + oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2, + bicImmCode, True) + oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4, + bicImmCode, True) + # BIC (register) + bicCode = "destElem = srcElem1 & ~srcElem2;" + threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode) + threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode) + # BIF + bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);" + threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode, + True) + threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode, + True) + # BIT + bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);" + threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode, + True) + threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode, + True) + # BSL + bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);" 
# NEON (AArch64) generator calls: BSL through FCVTPS (D form).  Each helper
# call expands a per-element C++ snippet into D-register and/or Q-register
# instruction classes; the snippet strings are the generated code itself and
# must not be altered.

# BSL: bitwise select, controlled by the destination register.
threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
                   True)
threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
                   True)
# CLS: count leading sign bits (the sign bit itself is not counted).
clsCode = '''
    unsigned count = 0;
    if (srcElem1 < 0) {
        srcElem1 <<= 1;
        while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
            count++;
            srcElem1 <<= 1;
        }
    } else {
        srcElem1 <<= 1;
        while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
            count++;
            srcElem1 <<= 1;
        }
    }
    destElem = count;
'''
twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
# CLZ: count leading zeros; signed element types so the top bit drives the
# ">= 0" loop test.
clzCode = '''
    unsigned count = 0;
    while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
        count++;
        srcElem1 <<= 1;
    }
    destElem = count;
'''
twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
# CMEQ (register): equal lanes become all ones, others zero.
cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
                   cmeqCode)
threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
                   cmeqCode)
# CMEQ (zero)
cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
                 cmeqZeroCode)
twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
                 cmeqZeroCode)
# CMGE (register)
cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
# CMGE (zero)
cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
                 cmgeZeroCode)
twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
                 cmgeZeroCode)
# CMGT (register)
cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
# CMGT (zero)
cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
                 cmgtZeroCode)
twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
                 cmgtZeroCode)
# CMHI (register): unsigned "higher" reuses the '>' snippet on unsigned
# element types.
threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
                   cmgtCode)
threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
                   cmgtCode)
# CMHS (register): unsigned "higher or same" reuses the '>=' snippet.
threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
                   cmgeCode)
threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
                   cmgeCode)
# CMLE (zero)
cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
                 cmleZeroCode)
twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
                 cmleZeroCode)
# CMLT (zero)
cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
                 cmltZeroCode)
twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
                 cmltZeroCode)
# CMTST (register): any common set bit yields all ones.
tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
                   tstCode)
threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
                   tstCode)
# CNT: per-byte population count.
cntCode = '''
    unsigned count = 0;
    while (srcElem1 && count < sizeof(Element) * 8) {
        count += srcElem1 & 0x1;
        srcElem1 >>= 1;
    }
    destElem = count;
'''
twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
# DUP (element)
dupCode = "destElem = srcElem1;"
twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
                 dupCode, isDup=True, byElem=True)
twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
                 dupCode, isDup=True, byElem=True)
twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
                 dupCode, isDup=True, byElem=True, scalar=True)
# DUP (general register)
dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
# EOR
eorCode = "destElem = srcElem1 ^ srcElem2;"
threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
# EXT: byte extract from the concatenation of the two source registers.
extCode = '''
    for (unsigned i = 0; i < eCount; i++) {
        unsigned index = i + imm;
        if (index < eCount) {
            destReg.elements[i] = srcReg1.elements[index];
        } else {
            index -= eCount;
            if (index >= eCount) {
                fault = new UndefinedInstruction(machInst, false, mnemonic);
            } else {
                destReg.elements[i] = srcReg2.elements[index];
            }
        }
    }
'''
extInstX("Ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
extInstX("Ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
# FABD -- fpOp wraps one per-element expression (the '%s') with the FPSCR
# save/restore boilerplate and is reused by every FP snippet below.
fpOp = '''
    FPSCR fpscr = (FPSCR) FpscrExc;
    destElem = %s;
    FpscrExc = fpscr;
'''
fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
                   fabdCode)
threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
                   fabdCode)
threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
                   fabdCode, scalar=True)
# FABS
fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
twoEqualRegInstX("Abs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
                 fabsCode)
twoEqualRegInstX("Abs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
                 fabsCode)
# FACGE: compare of absolute values; the remaining '%s' picks GE/GT below.
fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
                     " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
facgeCode = fpCmpAbsOp % "GE"
threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, facgeCode)
threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
                   facgeCode)
threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
                   facgeCode, scalar=True)
# FACGT
facgtCode = fpCmpAbsOp % "GT"
threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, facgtCode)
threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
                   facgtCode)
threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
                   facgtCode, scalar=True)
# FADD -- fpBinOp is the generic two-operand fplib call template.
fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
faddCode = fpBinOp % "Add"
threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
                   faddCode)
threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
                   faddCode)
# FADDP (scalar)
twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
                      ("uint32_t",), 2, faddCode)
twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
                      ("uint64_t",), 4, faddCode)
# FADDP (vector)
threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
                   2, faddCode, pairwise=True)
threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
                   faddCode, pairwise=True)
# FCMEQ (register)
fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
                  " -1 : 0")
fcmeqCode = fpCmpOp % "EQ"
threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fcmeqCode)
threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
                   fcmeqCode)
threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
                   fcmeqCode, scalar=True)
# FCMEQ (zero)
fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
fcmeqZeroCode = fpCmpZeroOp % "EQ"
twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                 2, fcmeqZeroCode)
twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmeqZeroCode)
twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmeqZeroCode, scalar=True)
# FCMGE (register)
fcmgeCode = fpCmpOp % "GE"
threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fcmgeCode)
threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
                   fcmgeCode)
threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
                   fcmgeCode, scalar=True)
# FCMGE (zero)
fcmgeZeroCode = fpCmpZeroOp % "GE"
twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                 2, fcmgeZeroCode)
twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmgeZeroCode)
twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmgeZeroCode, scalar=True)
# FCMGT (register)
fcmgtCode = fpCmpOp % "GT"
threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fcmgtCode)
threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
                   fcmgtCode)
threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
                   fcmgtCode, scalar=True)
# FCMGT (zero)
fcmgtZeroCode = fpCmpZeroOp % "GT"
twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                 2, fcmgtZeroCode)
twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmgtZeroCode)
twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmgtZeroCode, scalar=True)
# FCMLE (zero): expressed as 0 >= src.
fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
                         " -1 : 0")
fcmleZeroCode = fpCmpRevZeroOp % "GE"
twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                 2, fcmleZeroCode)
twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmleZeroCode)
twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmleZeroCode, scalar=True)
# FCMLT (zero): expressed as 0 > src.
fcmltZeroCode = fpCmpRevZeroOp % "GT"
twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                 2, fcmltZeroCode)
twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmltZeroCode)
twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                 fcmltZeroCode, scalar=True)
# FCVTAS -- fcvtCode is the FP->fixed template; its three '%s' slots are
# (fraction bits, unsigned?, rounding mode).
fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
                   "srcElem1, %s, %s, %s, fpscr)")
fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
twoEqualRegInstX("fcvtas", "FcvtasDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtasCode)
twoEqualRegInstX("fcvtas", "FcvtasQX", "SimdCvtOp", floatTypes, 4,
                 fcvtasCode)
twoEqualRegInstX("fcvtas", "FcvtasScX", "SimdCvtOp", floatTypes, 4,
                 fcvtasCode, scalar=True)
# FCVTAU
fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
twoEqualRegInstX("fcvtau", "FcvtauDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtauCode)
twoEqualRegInstX("fcvtau", "FcvtauQX", "SimdCvtOp", floatTypes, 4,
                 fcvtauCode)
twoEqualRegInstX("fcvtau", "FcvtauScX", "SimdCvtOp", floatTypes, 4,
                 fcvtauCode, scalar=True)
# FCVTL, FCVTL2: widen to the next-larger FP type.
fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
                    "srcElem1, FPCRRounding(fpscr), fpscr)")
twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
                fcvtlCode)
twoRegLongInstX("fcvtl", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
                fcvtlCode, hi=True)
# FCVTMS
fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
twoEqualRegInstX("fcvtms", "FcvtmsDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtmsCode)
twoEqualRegInstX("fcvtms", "FcvtmsQX", "SimdCvtOp", floatTypes, 4,
                 fcvtmsCode)
twoEqualRegInstX("fcvtms", "FcvtmsScX", "SimdCvtOp", floatTypes, 4,
                 fcvtmsCode, scalar=True)
# FCVTMU
fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
twoEqualRegInstX("fcvtmu", "FcvtmuDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtmuCode)
twoEqualRegInstX("fcvtmu", "FcvtmuQX", "SimdCvtOp", floatTypes, 4,
                 fcvtmuCode)
twoEqualRegInstX("fcvtmu", "FcvtmuScX", "SimdCvtOp", floatTypes, 4,
                 fcvtmuCode, scalar=True)
# FCVTN, FCVTN2: narrow to the next-smaller FP type.
fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
                    "srcElem1, FPCRRounding(fpscr), fpscr)")
twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
                  ("uint16_t", "uint32_t"), fcvtnCode)
twoRegNarrowInstX("fcvtn", "Fcvtn2X", "SimdCvtOp",
                  ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
# FCVTNS
fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
twoEqualRegInstX("fcvtns", "FcvtnsDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtnsCode)
twoEqualRegInstX("fcvtns", "FcvtnsQX", "SimdCvtOp", floatTypes, 4,
                 fcvtnsCode)
twoEqualRegInstX("fcvtns", "FcvtnsScX", "SimdCvtOp", floatTypes, 4,
                 fcvtnsCode, scalar=True)
# FCVTNU
fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
twoEqualRegInstX("fcvtnu", "FcvtnuDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtnuCode)
twoEqualRegInstX("fcvtnu", "FcvtnuQX", "SimdCvtOp", floatTypes, 4,
                 fcvtnuCode)
twoEqualRegInstX("fcvtnu", "FcvtnuScX", "SimdCvtOp", floatTypes, 4,
                 fcvtnuCode, scalar=True)
# FCVTPS
fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
twoEqualRegInstX("fcvtps", "FcvtpsDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtpsCode)
# NEON (AArch64) generator calls: FCVTPS (Q/Sc forms) through FMAXP
# (scalar, D form).  These reuse the fpOp / fcvtCode / fpBinOp templates
# defined earlier in this let block.

twoEqualRegInstX("fcvtps", "FcvtpsQX", "SimdCvtOp", floatTypes, 4,
                 fcvtpsCode)
twoEqualRegInstX("fcvtps", "FcvtpsScX", "SimdCvtOp", floatTypes, 4,
                 fcvtpsCode, scalar=True)
# FCVTPU
fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
twoEqualRegInstX("fcvtpu", "FcvtpuDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtpuCode)
twoEqualRegInstX("fcvtpu", "FcvtpuQX", "SimdCvtOp", floatTypes, 4,
                 fcvtpuCode)
twoEqualRegInstX("fcvtpu", "FcvtpuScX", "SimdCvtOp", floatTypes, 4,
                 fcvtpuCode, scalar=True)
# FCVTXN, FCVTXN2: narrowing conversion with round-to-odd.
fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
                     "srcElem1, FPRounding_ODD, fpscr)")
twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
                  fcvtxnCode)
twoRegNarrowInstX("fcvtxn", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
                  fcvtxnCode, hi=True)
twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
                  fcvtxnCode, scalar=True)
# FCVTZS (fixed-point): 'imm' supplies the fraction-bit count.
fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
twoEqualRegInstX("fcvtzs", "FcvtzsFixedDX", "SimdCvtOp", smallFloatTypes,
                 2, fcvtzsCode, hasImm=True)
twoEqualRegInstX("fcvtzs", "FcvtzsFixedQX", "SimdCvtOp", floatTypes, 4,
                 fcvtzsCode, hasImm=True)
twoEqualRegInstX("fcvtzs", "FcvtzsFixedScX", "SimdCvtOp", floatTypes, 4,
                 fcvtzsCode, hasImm=True, scalar=True)
# FCVTZS (integer)
fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
twoEqualRegInstX("fcvtzs", "FcvtzsIntDX", "SimdCvtOp", smallFloatTypes,
                 2, fcvtzsIntCode)
twoEqualRegInstX("fcvtzs", "FcvtzsIntQX", "SimdCvtOp", floatTypes, 4,
                 fcvtzsIntCode)
twoEqualRegInstX("fcvtzs", "FcvtzsIntScX", "SimdCvtOp", floatTypes, 4,
                 fcvtzsIntCode, scalar=True)
# FCVTZU (fixed-point)
fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
twoEqualRegInstX("fcvtzu", "FcvtzuFixedDX", "SimdCvtOp", smallFloatTypes,
                 2, fcvtzuCode, hasImm=True)
twoEqualRegInstX("fcvtzu", "FcvtzuFixedQX", "SimdCvtOp", floatTypes, 4,
                 fcvtzuCode, hasImm=True)
twoEqualRegInstX("fcvtzu", "FcvtzuFixedScX", "SimdCvtOp", floatTypes, 4,
                 fcvtzuCode, hasImm=True, scalar=True)
# FCVTZU (integer)
fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
twoEqualRegInstX("fcvtzu", "FcvtzuIntDX", "SimdCvtOp", smallFloatTypes, 2,
                 fcvtzuIntCode)
twoEqualRegInstX("fcvtzu", "FcvtzuIntQX", "SimdCvtOp", floatTypes, 4,
                 fcvtzuIntCode)
twoEqualRegInstX("fcvtzu", "FcvtzuIntScX", "SimdCvtOp", floatTypes, 4,
                 fcvtzuIntCode, scalar=True)
# FDIV
fdivCode = fpBinOp % "Div"
threeEqualRegInstX("fdiv", "FdivDX", "SimdFloatDivOp", smallFloatTypes, 2,
                   fdivCode)
threeEqualRegInstX("fdiv", "FdivQX", "SimdFloatDivOp", floatTypes, 4,
                   fdivCode)
# FMAX
fmaxCode = fpBinOp % "Max"
threeEqualRegInstX("fmax", "FmaxDX", "SimdFloatCmpOp", smallFloatTypes, 2,
                   fmaxCode)
threeEqualRegInstX("fmax", "FmaxQX", "SimdFloatCmpOp", floatTypes, 4,
                   fmaxCode)
# FMAXNM
fmaxnmCode = fpBinOp % "MaxNum"
threeEqualRegInstX("fmaxnm", "FmaxnmDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fmaxnmCode)
threeEqualRegInstX("fmaxnm", "FmaxnmQX", "SimdFloatCmpOp", floatTypes, 4,
                   fmaxnmCode)
# FMAXNMP (scalar)
twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScDX", "SimdFloatCmpOp",
                      ("uint32_t",), 2, fmaxnmCode)
twoRegPairwiseScInstX("fmaxnmp", "FmaxnmpScQX", "SimdFloatCmpOp",
                      ("uint64_t",), 4, fmaxnmCode)
# FMAXNMP (vector)
threeEqualRegInstX("fmaxnmp", "FmaxnmpDX", "SimdFloatCmpOp",
                   smallFloatTypes, 2, fmaxnmCode, pairwise=True)
threeEqualRegInstX("fmaxnmp", "FmaxnmpQX", "SimdFloatCmpOp", floatTypes, 4,
                   fmaxnmCode, pairwise=True)
# FMAXNMV
# Note: SimdFloatCmpOp can be a bit optimistic here
fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"
fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
twoRegAcrossInstX("fmaxnmv", "FmaxnmvQX", "SimdFloatCmpOp", ("uint32_t",),
                  4, fmaxnmAcrossCode)
# FMAXP (scalar)
twoRegPairwiseScInstX("fmaxp", "FmaxpScDX", "SimdFloatCmpOp",
                      ("uint32_t",), 2, fmaxCode)
# NEON (AArch64) generator calls: FMAXP (scalar, Q form) through FMUL
# (by element, D form).

twoRegPairwiseScInstX("fmaxp", "FmaxpScQX", "SimdFloatCmpOp",
                      ("uint64_t",), 4, fmaxCode)
# FMAXP (vector)
threeEqualRegInstX("fmaxp", "FmaxpDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fmaxCode, pairwise=True)
threeEqualRegInstX("fmaxp", "FmaxpQX", "SimdFloatCmpOp", floatTypes, 4,
                   fmaxCode, pairwise=True)
# FMAXV
# Note: SimdFloatCmpOp can be a bit optimistic here
fmaxAcrossCode = fpAcrossOp % "Max"
twoRegAcrossInstX("fmaxv", "FmaxvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
                  fmaxAcrossCode)
# FMIN
fminCode = fpBinOp % "Min"
threeEqualRegInstX("fmin", "FminDX", "SimdFloatCmpOp", smallFloatTypes, 2,
                   fminCode)
threeEqualRegInstX("fmin", "FminQX", "SimdFloatCmpOp", floatTypes, 4,
                   fminCode)
# FMINNM
fminnmCode = fpBinOp % "MinNum"
threeEqualRegInstX("fminnm", "FminnmDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fminnmCode)
threeEqualRegInstX("fminnm", "FminnmQX", "SimdFloatCmpOp", floatTypes, 4,
                   fminnmCode)
# FMINNMP (scalar)
twoRegPairwiseScInstX("fminnmp", "FminnmpScDX", "SimdFloatCmpOp",
                      ("uint32_t",), 2, fminnmCode)
twoRegPairwiseScInstX("fminnmp", "FminnmpScQX", "SimdFloatCmpOp",
                      ("uint64_t",), 4, fminnmCode)
# FMINNMP (vector)
threeEqualRegInstX("fminnmp", "FminnmpDX", "SimdFloatCmpOp",
                   smallFloatTypes, 2, fminnmCode, pairwise=True)
threeEqualRegInstX("fminnmp", "FminnmpQX", "SimdFloatCmpOp", floatTypes, 4,
                   fminnmCode, pairwise=True)
# FMINNMV
# Note: SimdFloatCmpOp can be a bit optimistic here
fminnmAcrossCode = fpAcrossOp % "MinNum"
twoRegAcrossInstX("fminnmv", "FminnmvQX", "SimdFloatCmpOp", ("uint32_t",),
                  4, fminnmAcrossCode)
# FMINP (scalar)
twoRegPairwiseScInstX("fminp", "FminpScDX", "SimdFloatCmpOp",
                      ("uint32_t",), 2, fminCode)
twoRegPairwiseScInstX("fminp", "FminpScQX", "SimdFloatCmpOp",
                      ("uint64_t",), 4, fminCode)
# FMINP (vector)
threeEqualRegInstX("fminp", "FminpDX", "SimdFloatCmpOp", smallFloatTypes,
                   2, fminCode, pairwise=True)
threeEqualRegInstX("fminp", "FminpQX", "SimdFloatCmpOp", floatTypes, 4,
                   fminCode, pairwise=True)
# FMINV
# Note: SimdFloatCmpOp can be a bit optimistic here
fminAcrossCode = fpAcrossOp % "Min"
twoRegAcrossInstX("fminv", "FminvQX", "SimdFloatCmpOp", ("uint32_t",), 4,
                  fminAcrossCode)
# FMLA (by element): fused multiply-add accumulating into destElem.
fmlaCode = fpOp % ("fplibMulAdd<Element>("
                   "destElem, srcElem1, srcElem2, fpscr)")
threeEqualRegInstX("fmla", "FmlaElemDX", "SimdFloatMultAccOp",
                   smallFloatTypes, 2, fmlaCode, True, byElem=True)
threeEqualRegInstX("fmla", "FmlaElemQX", "SimdFloatMultAccOp", floatTypes,
                   4, fmlaCode, True, byElem=True)
threeEqualRegInstX("fmla", "FmlaElemScX", "SimdFloatMultAccOp", floatTypes,
                   4, fmlaCode, True, byElem=True, scalar=True)
# FMLA (vector)
threeEqualRegInstX("fmla", "FmlaDX", "SimdFloatMultAccOp", smallFloatTypes,
                   2, fmlaCode, True)
threeEqualRegInstX("fmla", "FmlaQX", "SimdFloatMultAccOp", floatTypes, 4,
                   fmlaCode, True)
# FMLS (by element): fused multiply-subtract via negated first operand.
fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
                   " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
threeEqualRegInstX("fmls", "FmlsElemDX", "SimdFloatMultAccOp",
                   smallFloatTypes, 2, fmlsCode, True, byElem=True)
threeEqualRegInstX("fmls", "FmlsElemQX", "SimdFloatMultAccOp", floatTypes,
                   4, fmlsCode, True, byElem=True)
threeEqualRegInstX("fmls", "FmlsElemScX", "SimdFloatMultAccOp", floatTypes,
                   4, fmlsCode, True, byElem=True, scalar=True)
# FMLS (vector)
threeEqualRegInstX("fmls", "FmlsDX", "SimdFloatMultAccOp", smallFloatTypes,
                   2, fmlsCode, True)
threeEqualRegInstX("fmls", "FmlsQX", "SimdFloatMultAccOp", floatTypes, 4,
                   fmlsCode, True)
# FMOV
fmovCode = 'destElem = imm;'
oneRegImmInstX("fmov", "FmovDX", "SimdMiscOp", smallFloatTypes, 2,
               fmovCode)
oneRegImmInstX("fmov", "FmovQX", "SimdMiscOp", floatTypes, 4, fmovCode)
# FMUL (by element)
fmulCode = fpBinOp % "Mul"
threeEqualRegInstX("fmul", "FmulElemDX", "SimdFloatMultOp",
                   smallFloatTypes, 2, fmulCode, byElem=True)
threeEqualRegInstX("fmul", "FmulElemQX", "SimdFloatMultOp", floatTypes, 4, + fmulCode, byElem=True) + threeEqualRegInstX("fmul", "FmulElemScX", "SimdFloatMultOp", floatTypes, 4, + fmulCode, byElem=True, scalar=True) + # FMUL (vector) + threeEqualRegInstX("fmul", "FmulDX", "SimdFloatMultOp", smallFloatTypes, 2, + fmulCode) + threeEqualRegInstX("fmul", "FmulQX", "SimdFloatMultOp", floatTypes, 4, + fmulCode) + # FMULX + fmulxCode = fpBinOp % "MulX" + threeEqualRegInstX("fmulx", "FmulxDX", "SimdFloatMultOp", smallFloatTypes, + 2, fmulxCode) + threeEqualRegInstX("fmulx", "FmulxQX", "SimdFloatMultOp", floatTypes, 4, + fmulxCode) + threeEqualRegInstX("fmulx", "FmulxScX", "SimdFloatMultOp", floatTypes, 4, + fmulxCode, scalar=True) + # FMULX (by element) + threeEqualRegInstX("fmulx", "FmulxElemDX", "SimdFloatMultOp", + smallFloatTypes, 2, fmulxCode, byElem=True) + threeEqualRegInstX("fmulx", "FmulxElemQX", "SimdFloatMultOp", floatTypes, + 4, fmulxCode, byElem=True) + threeEqualRegInstX("fmulx", "FmulxElemScX", "SimdFloatMultOp", floatTypes, + 4, fmulxCode, byElem=True, scalar=True) + # FNEG + fnegCode = fpOp % "fplibNeg<Element>(srcElem1)" + twoEqualRegInstX("Neg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2, + fnegCode) + twoEqualRegInstX("Neg", "FnegQX", "SimdFloatAluOp", floatTypes, 4, + fnegCode) + # FRECPE + frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)" + twoEqualRegInstX("frecpe", "FrecpeDX", "SimdFloatMultAccOp", + smallFloatTypes, 2, frecpeCode) + twoEqualRegInstX("frecpe", "FrecpeQX", "SimdFloatMultAccOp", floatTypes, 4, + frecpeCode) + twoEqualRegInstX("frecpe", "FrecpeScX", "SimdFloatMultAccOp", floatTypes, + 4, frecpeCode, scalar=True) + # FRECPS + frecpsCode = fpBinOp % "RecipStepFused" + threeEqualRegInstX("frecps", "FrecpsDX", "SimdFloatMultAccOp", + smallFloatTypes, 2, frecpsCode) + threeEqualRegInstX("frecps", "FrecpsQX", "SimdFloatMultAccOp", floatTypes, + 4, frecpsCode) + threeEqualRegInstX("frecps", "FrecpsScX", 
"SimdFloatMultAccOp", floatTypes, + 4, frecpsCode, scalar=True) + # FRECPX + frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)" + twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4, + frecpxCode, scalar=True) + # FRINTA + frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)" + frintaCode = frintCode % ("FPRounding_TIEAWAY", "false") + twoEqualRegInstX("frinta", "FrintaDX", "SimdCvtOp", smallFloatTypes, 2, + frintaCode) + twoEqualRegInstX("frinta", "FrintaQX", "SimdCvtOp", floatTypes, 4, + frintaCode) + # FRINTI + frintiCode = frintCode % ("FPCRRounding(fpscr)", "false") + twoEqualRegInstX("frinti", "FrintiDX", "SimdCvtOp", smallFloatTypes, 2, + frintiCode) + twoEqualRegInstX("frinti", "FrintiQX", "SimdCvtOp", floatTypes, 4, + frintiCode) + # FRINTM + frintmCode = frintCode % ("FPRounding_NEGINF", "false") + twoEqualRegInstX("frintm", "FrintmDX", "SimdCvtOp", smallFloatTypes, 2, + frintmCode) + twoEqualRegInstX("frintm", "FrintmQX", "SimdCvtOp", floatTypes, 4, + frintmCode) + # FRINTN + frintnCode = frintCode % ("FPRounding_TIEEVEN", "false") + twoEqualRegInstX("frintn", "FrintnDX", "SimdCvtOp", smallFloatTypes, 2, + frintnCode) + twoEqualRegInstX("frintn", "FrintnQX", "SimdCvtOp", floatTypes, 4, + frintnCode) + # FRINTP + frintpCode = frintCode % ("FPRounding_POSINF", "false") + twoEqualRegInstX("frintp", "FrintpDX", "SimdCvtOp", smallFloatTypes, 2, + frintpCode) + twoEqualRegInstX("frintp", "FrintpQX", "SimdCvtOp", floatTypes, 4, + frintpCode) + # FRINTX + frintxCode = frintCode % ("FPCRRounding(fpscr)", "true") + twoEqualRegInstX("frintx", "FrintxDX", "SimdCvtOp", smallFloatTypes, 2, + frintxCode) + twoEqualRegInstX("frintx", "FrintxQX", "SimdCvtOp", floatTypes, 4, + frintxCode) + # FRINTZ + frintzCode = frintCode % ("FPRounding_ZERO", "false") + twoEqualRegInstX("frintz", "FrintzDX", "SimdCvtOp", smallFloatTypes, 2, + frintzCode) + twoEqualRegInstX("frintz", "FrintzQX", "SimdCvtOp", floatTypes, 4, + frintzCode) + # 
FRSQRTE + frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)" + twoEqualRegInstX("frsqrte", "FrsqrteDX", "SimdFloatSqrtOp", + smallFloatTypes, 2, frsqrteCode) + twoEqualRegInstX("frsqrte", "FrsqrteQX", "SimdFloatSqrtOp", floatTypes, 4, + frsqrteCode) + twoEqualRegInstX("frsqrte", "FrsqrteScX", "SimdFloatSqrtOp", floatTypes, 4, + frsqrteCode, scalar=True) + # FRSQRTS + frsqrtsCode = fpBinOp % "RSqrtStepFused" + threeEqualRegInstX("frsqrts", "FrsqrtsDX", "SimdFloatMiscOp", + smallFloatTypes, 2, frsqrtsCode) + threeEqualRegInstX("frsqrts", "FrsqrtsQX", "SimdFloatMiscOp", floatTypes, + 4, frsqrtsCode) + threeEqualRegInstX("frsqrts", "FrsqrtsScX", "SimdFloatMiscOp", floatTypes, + 4, frsqrtsCode, scalar=True) + # FSQRT + fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)" + twoEqualRegInstX("fsqrt", "FsqrtDX", "SimdFloatSqrtOp", smallFloatTypes, 2, + fsqrtCode) + twoEqualRegInstX("fsqrt", "FsqrtQX", "SimdFloatSqrtOp", floatTypes, 4, + fsqrtCode) + # FSUB + fsubCode = fpBinOp % "Sub" + threeEqualRegInstX("fsub", "FsubDX", "SimdFloatAddOp", smallFloatTypes, 2, + fsubCode) + threeEqualRegInstX("fsub", "FsubQX", "SimdFloatAddOp", floatTypes, 4, + fsubCode) + # INS (element) + insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4) + # INS (general register) + insFromGprInstX("ins", "InsGprWX", "SimdMiscOp", smallUnsignedTypes, 4, + 'W') + insFromGprInstX("ins", "InsGprXX", "SimdMiscOp", unsignedTypes, 4, 'X') + # MLA (by element) + mlaCode = "destElem += srcElem1 * srcElem2;" + threeEqualRegInstX("mla", "MlaElemDX", "SimdMultAccOp", + ("uint16_t", "uint32_t"), 2, mlaCode, True, byElem=True) + threeEqualRegInstX("mla", "MlaElemQX", "SimdMultAccOp", + ("uint16_t", "uint32_t"), 4, mlaCode, True, byElem=True) + # MLA (vector) + threeEqualRegInstX("mla", "MlaDX", "SimdMultAccOp", smallUnsignedTypes, 2, + mlaCode, True) + threeEqualRegInstX("mla", "MlaQX", "SimdMultAccOp", smallUnsignedTypes, 4, + mlaCode, True) + # MLS (by element) + mlsCode = 
"destElem -= srcElem1 * srcElem2;" + threeEqualRegInstX("mls", "MlsElemDX", "SimdMultAccOp", + ("uint16_t", "uint32_t"), 2, mlsCode, True, byElem=True) + threeEqualRegInstX("mls", "MlsElemQX", "SimdMultAccOp", + ("uint16_t", "uint32_t"), 4, mlsCode, True, byElem=True) + # MLS (vector) + threeEqualRegInstX("mls", "MlsDX", "SimdMultAccOp", smallUnsignedTypes, 2, + mlsCode, True) + threeEqualRegInstX("mls", "MlsQX", "SimdMultAccOp", smallUnsignedTypes, 4, + mlsCode, True) + # MOV (element) -> alias to INS (element) + # MOV (from general) -> alias to INS (general register) + # MOV (scalar) -> alias to DUP (element) + # MOV (to general) -> alias to UMOV + # MOV (vector) -> alias to ORR (register) + # MOVI + movImmCode = "destElem = imm;" + oneRegImmInstX("movi", "MoviDX", "SimdMiscOp", ("uint64_t",), 2, + movImmCode) + oneRegImmInstX("movi", "MoviQX", "SimdMiscOp", ("uint64_t",), 4, + movImmCode) + # MUL (by element) + mulCode = "destElem = srcElem1 * srcElem2;" + threeEqualRegInstX("mul", "MulElemDX", "SimdMultOp", + ("uint16_t", "uint32_t"), 2, mulCode, byElem=True) + threeEqualRegInstX("mul", "MulElemQX", "SimdMultOp", + ("uint16_t", "uint32_t"), 4, mulCode, byElem=True) + # MUL (vector) + threeEqualRegInstX("mul", "MulDX", "SimdMultOp", smallUnsignedTypes, 2, + mulCode) + threeEqualRegInstX("mul", "MulQX", "SimdMultOp", smallUnsignedTypes, 4, + mulCode) + # MVN + mvnCode = "destElem = ~srcElem1;" + twoEqualRegInstX("mvn", "MvnDX", "SimdAluOp", ("uint64_t",), 2, mvnCode) + twoEqualRegInstX("mvn", "MvnQX", "SimdAluOp", ("uint64_t",), 4, mvnCode) + # MVNI + mvniCode = "destElem = ~imm;" + oneRegImmInstX("mvni", "MvniDX", "SimdAluOp", ("uint64_t",), 2, mvniCode) + oneRegImmInstX("mvni", "MvniQX", "SimdAluOp", ("uint64_t",), 4, mvniCode) + # NEG + negCode = "destElem = -srcElem1;" + twoEqualRegInstX("neg", "NegDX", "SimdAluOp", signedTypes, 2, negCode) + twoEqualRegInstX("neg", "NegQX", "SimdAluOp", signedTypes, 4, negCode) + # NOT -> alias to MVN + # ORN + ornCode = 
"destElem = srcElem1 | ~srcElem2;" + threeEqualRegInstX("orn", "OrnDX", "SimdAluOp", ("uint64_t",), 2, ornCode) + threeEqualRegInstX("orn", "OrnQX", "SimdAluOp", ("uint64_t",), 4, ornCode) + # ORR (immediate) + orrImmCode = "destElem |= imm;" + oneRegImmInstX("orr", "OrrImmDX", "SimdAluOp", ("uint64_t",), 2, + orrImmCode, True) + oneRegImmInstX("orr", "OrrImmQX", "SimdAluOp", ("uint64_t",), 4, + orrImmCode, True) + # ORR (register) + orrCode = "destElem = srcElem1 | srcElem2;" + threeEqualRegInstX("orr", "OrrDX", "SimdAluOp", ("uint64_t",), 2, orrCode) + threeEqualRegInstX("orr", "OrrQX", "SimdAluOp", ("uint64_t",), 4, orrCode) + # PMUL + pmulCode = ''' + destElem = 0; + for (unsigned j = 0; j < sizeof(Element) * 8; j++) { + if (bits(srcElem2, j)) + destElem ^= srcElem1 << j; + } + ''' + threeEqualRegInstX("pmul", "PmulDX", "SimdMultOp", ("uint8_t",), 2, + pmulCode) + threeEqualRegInstX("pmul", "PmulQX", "SimdMultOp", ("uint8_t",), 4, + pmulCode) + # PMULL, PMULL2 + # Note: 64-bit PMULL is not available (Crypto. 
Extension) + pmullCode = ''' + destElem = 0; + for (unsigned j = 0; j < sizeof(Element) * 8; j++) { + if (bits(srcElem2, j)) + destElem ^= (BigElement)srcElem1 << j; + } + ''' + threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode) + threeRegLongInstX("pmull", "Pmull2X", "SimdMultOp", ("uint8_t",), + pmullCode, hi=True) + # RADDHN, RADDHN2 + raddhnCode = ''' + destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 + + ((BigElement)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes, + raddhnCode) + threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes, + raddhnCode, hi=True) + # RBIT + rbitCode = ''' + destElem = 0; + Element temp = srcElem1; + for (int i = 0; i < 8 * sizeof(Element); i++) { + destElem = destElem | ((temp & 0x1) << + (8 * sizeof(Element) - 1 - i)); + temp >>= 1; + } + ''' + twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode) + twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode) + # REV16 + rev16Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 1) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2, + rev16Code) + twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4, + rev16Code) + # REV32 + rev32Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 2) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"), + 2, rev32Code) + twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"), + 4, rev32Code) + # REV64 + rev64Code = ''' + destElem = srcElem1; + unsigned groupSize = ((1 << 3) / sizeof(Element)); + unsigned reverseMask = (groupSize - 1); + j = i ^ reverseMask; + ''' + twoEqualRegInstX("rev64", 
"Rev64DX", "SimdAluOp", smallUnsignedTypes, 2, + rev64Code) + twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4, + rev64Code) + # RSHRN, RSHRN2 + rshrnCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem = 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem = srcElem1; + } + ''' + twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes, + rshrnCode, hasImm=True) + twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes, + rshrnCode, hasImm=True, hi=True) + # RSUBHN, RSUBHN2 + rsubhnCode = ''' + destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 + + ((BigElement)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes, + rsubhnCode) + threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes, + rsubhnCode, hi=True) + # SABA + abaCode = ''' + destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) : + (srcElem2 - srcElem1); + ''' + threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2, + abaCode, True) + threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4, + abaCode, True) + # SABAL, SABAL2 + abalCode = ''' + destElem += (srcElem1 > srcElem2) ? + ((BigElement)srcElem1 - (BigElement)srcElem2) : + ((BigElement)srcElem2 - (BigElement)srcElem1); + ''' + threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes, + abalCode, True) + threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes, + abalCode, True, hi=True) + # SABD + abdCode = ''' + destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) : + (srcElem2 - srcElem1); + ''' + threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2, + abdCode) + threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4, + abdCode) + # SABDL, SABDL2 + abdlCode = ''' + destElem = (srcElem1 > srcElem2) ? 
+ ((BigElement)srcElem1 - (BigElement)srcElem2) : + ((BigElement)srcElem2 - (BigElement)srcElem1); + ''' + threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes, + abdlCode, True) + threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes, + abdlCode, True, hi=True) + # SADALP + adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;" + twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2, + adalpCode, True) + twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4, + adalpCode, True) + # SADDL, SADDL2 + addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;" + threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes, + addlwCode) + threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes, + addlwCode, hi=True) + # SADDLP + twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2, + addlwCode) + twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4, + addlwCode) + # SADDLV + # Note: SimdAddOp can be a bit optimistic here + addAcrossLongCode = "destElem += (BigElement)srcElem1;" + twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"), + 2, addAcrossLongCode, long=True) + twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"), + 4, addAcrossLongCode, long=True) + twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4, + addAcrossLongCode, doubleDest=True, long=True) + # SADDW, SADDW2 + threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes, + addlwCode) + threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes, + addlwCode, hi=True) + # SCVTF (fixed-point) + scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, imm," + " false, FPCRRounding(fpscr), fpscr)") + twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2, + scvtfFixedCode % 32, hasImm=True) + twoEqualRegInstX("scvtf", 
"ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4, + scvtfFixedCode % 32, hasImm=True) + twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4, + scvtfFixedCode % 64, hasImm=True) + twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes, + 4, scvtfFixedCode % 32, hasImm=True, scalar=True) + twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4, + scvtfFixedCode % 64, hasImm=True, scalar=True) + # SCVTF (integer) + scvtfIntCode = fpOp % ("fplibFixedToFP<Element>((int%d_t) srcElem1, 0," + " false, FPCRRounding(fpscr), fpscr)") + twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2, + scvtfIntCode % 32) + twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4, + scvtfIntCode % 32) + twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4, + scvtfIntCode % 64) + twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", smallFloatTypes, 4, + scvtfIntCode % 32, scalar=True) + twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4, + scvtfIntCode % 64, scalar=True) + # SHADD + haddCode = ''' + Element carryBit = + (((unsigned)srcElem1 & 0x1) + + ((unsigned)srcElem2 & 0x1)) >> 1; + // Use division instead of a shift to ensure the sign extension works + // right. The compiler will figure out if it can be a shift. Mask the + // inputs so they get truncated correctly. 
+ destElem = (((srcElem1 & ~(Element)1) / 2) + + ((srcElem2 & ~(Element)1) / 2)) + carryBit; + ''' + threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2, + haddCode) + threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4, + haddCode) + # SHL + shlCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1; + else + destElem = srcElem1 << imm; + ''' + twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode, + hasImm=True) + twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode, + hasImm=True) + # SHLL, SHLL2 + shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);" + twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode) + twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode, + hi=True) + # SHRN, SHRN2 + shrnCode = ''' + if (imm >= sizeof(srcElem1) * 8) { + destElem = 0; + } else { + destElem = srcElem1 >> imm; + } + ''' + twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes, + shrnCode, hasImm=True) + twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes, + shrnCode, hasImm=True, hi=True) + # SHSUB + hsubCode = ''' + Element borrowBit = + (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1; + // Use division instead of a shift to ensure the sign extension works + // right. The compiler will figure out if it can be a shift. Mask the + // inputs so they get truncated correctly. 
+ destElem = (((srcElem1 & ~(Element)1) / 2) - + ((srcElem2 & ~(Element)1) / 2)) - borrowBit; + ''' + threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2, + hsubCode) + threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4, + hsubCode) + # SLI + sliCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = destElem; + else + destElem = (srcElem1 << imm) | (destElem & mask(imm)); + ''' + twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode, + True, hasImm=True) + twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode, + True, hasImm=True) + # SMAX + maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;" + threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2, + maxCode) + threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4, + maxCode) + # SMAXP + threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2, + maxCode, pairwise=True) + threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4, + maxCode, pairwise=True) + # SMAXV + maxAcrossCode = ''' + if (i == 0 || srcElem1 > destElem) + destElem = srcElem1; + ''' + twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"), + 2, maxAcrossCode) + twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4, + maxAcrossCode) + # SMIN + minCode = "destElem = (srcElem1 < srcElem2) ? 
srcElem1 : srcElem2;" + threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2, + minCode) + threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4, + minCode) + # SMINP + threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2, + minCode, pairwise=True) + threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4, + minCode, pairwise=True) + # SMINV + minAcrossCode = ''' + if (i == 0 || srcElem1 < destElem) + destElem = srcElem1; + ''' + twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"), + 2, minAcrossCode) + twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4, + minAcrossCode) + # SMLAL, SMLAL2 (by element) + mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;" + threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp", + ("int16_t", "int32_t"), mlalCode, True, byElem=True) + threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp", + ("int16_t", "int32_t"), mlalCode, True, byElem=True, + hi=True) + # SMLAL, SMLAL2 (vector) + threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes, + mlalCode, True) + threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes, + mlalCode, True, hi=True) + # SMLSL, SMLSL2 (by element) + mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;" + threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes, + mlslCode, True, byElem=True) + threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp", + smallSignedTypes, mlslCode, True, byElem=True, hi=True) + # SMLSL, SMLSL2 (vector) + threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes, + mlslCode, True) + threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes, + mlslCode, True, hi=True) + # SMOV + insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4, + 'W', True) + insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X', + True) + # SMULL, 
SMULL2 (by element) + mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;" + threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes, + mullCode, byElem=True) + threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes, + mullCode, byElem=True, hi=True) + # SMULL, SMULL2 (vector) + threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes, + mullCode) + threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes, + mullCode, hi=True) + # SQABS + sqabsCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) { + fpscr.qc = 1; + destElem = ~srcElem1; + } else if (srcElem1 < 0) { + destElem = -srcElem1; + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2, + sqabsCode) + twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4, + sqabsCode) + twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4, + sqabsCode, scalar=True) + # SQADD + sqaddCode = ''' + destElem = srcElem1 + srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + bool negDest = (destElem < 0); + bool negSrc1 = (srcElem1 < 0); + bool negSrc2 = (srcElem2 < 0); + if ((negDest != negSrc1) && (negSrc1 == negSrc2)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (negDest) + destElem -= 1; + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2, + sqaddCode) + threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4, + sqaddCode) + threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4, + sqaddCode, scalar=True) + # SQDMLAL, SQDMLAL2 (by element) + qdmlalCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) 
|| + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + bool negPreDest = ltz(destElem); + destElem += midElem; + bool negDest = ltz(destElem); + bool negMid = ltz(midElem); + if (negPreDest == negMid && negMid != negDest) { + destElem = mask(sizeof(BigElement) * 8 - 1); + if (negPreDest) + destElem = ~destElem; + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlalCode, True, byElem=True) + threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlalCode, True, byElem=True, + hi=True) + threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlalCode, True, byElem=True, + scalar=True) + # SQDMLAL, SQDMLAL2 (vector) + threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlalCode, True) + threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlalCode, True, hi=True) + threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlalCode, True, scalar=True) + # SQDMLSL, SQDMLSL2 (by element) + qdmlslCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + bool negPreDest = ltz(destElem); + destElem -= midElem; + bool negDest = ltz(destElem); + bool posMid = ltz((BigElement)-midElem); + if (negPreDest == posMid && posMid != negDest) { + destElem = mask(sizeof(BigElement) * 8 - 1); + if (negPreDest) + destElem = ~destElem; + 
fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlslCode, True, byElem=True) + threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlslCode, True, byElem=True, + hi=True) + threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlslCode, True, byElem=True, + scalar=True) + # SQDMLSL, SQDMLSL2 (vector) + threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlslCode, True) + threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlslCode, True, hi=True) + threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp", + ("int16_t", "int32_t"), qdmlslCode, True, scalar=True) + # SQDMULH (by element) + sqdmulhCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >> + (sizeof(Element) * 8); + if (srcElem1 == srcElem2 && + srcElem1 == (Element)((Element)1 << + (sizeof(Element) * 8 - 1))) { + destElem = ~srcElem1; + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp", + ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True) + threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True) + threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True, + scalar=True) + # SQDMULH (vector) + threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp", + ("int16_t", "int32_t"), 2, sqdmulhCode) + threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqdmulhCode) + threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True) + # SQDMULL, SQDMULL2 (by element) + qdmullCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2); + if 
(srcElem1 == srcElem2 && + srcElem1 == (Element)((Element)1 << + (Element)(sizeof(Element) * 8 - 1))) { + destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8)); + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp", + ("int16_t", "int32_t"), qdmullCode, True, byElem=True) + threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp", + ("int16_t", "int32_t"), qdmullCode, True, byElem=True, + hi=True) + threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp", + ("int16_t", "int32_t"), qdmullCode, True, byElem=True, + scalar=True) + # SQDMULL, SQDMULL2 (vector) + threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp", + ("int16_t", "int32_t"), qdmullCode, True) + threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp", + ("int16_t", "int32_t"), qdmullCode, True, hi=True) + threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp", + ("int16_t", "int32_t"), qdmullCode, True, scalar=True) + # SQNEG + sqnegCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (srcElem1 == (Element)((Element)1 << (sizeof(Element) * 8 - 1))) { + fpscr.qc = 1; + destElem = ~srcElem1; + } else { + destElem = -srcElem1; + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2, + sqnegCode) + twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4, + sqnegCode) + twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4, + sqnegCode, scalar=True) + # SQRDMULH (by element) + sqrdmulhCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 + + ((int64_t)1 << (sizeof(Element) * 8 - 1))) >> + (sizeof(Element) * 8); + Element maxNeg = (Element)1 << (sizeof(Element) * 8 - 1); + Element halfNeg = maxNeg / 2; + if ((srcElem1 == maxNeg && srcElem2 == maxNeg) || + (srcElem1 == halfNeg && srcElem2 == maxNeg) || + (srcElem1 == maxNeg && srcElem2 == halfNeg)) { + if (destElem < 0) { + destElem = mask(sizeof(Element) * 8 - 1); + } else { + destElem = 
(Element)1 << (sizeof(Element) * 8 - 1); + } + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp", + ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True) + threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True) + threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True, + scalar=True) + # SQRDMULH (vector) + threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp", + ("int16_t", "int32_t"), 2, sqrdmulhCode) + threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqrdmulhCode) + threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp", + ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True) + # SQRSHL + sqrshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else if (shiftAmt > 0) { + bool sat = false; + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) + sat = true; + else + destElem = 0; + } else { + if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - shiftAmt) != + ((srcElem1 < 0) ? 
mask(shiftAmt + 1) : 0)) { + sat = true; + } else { + destElem = srcElem1 << shiftAmt; + } + } + if (sat) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2, + sqrshlCode) + threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4, + sqrshlCode) + threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4, + sqrshlCode, scalar=True) + # SQRSHRN, SQRSHRN2 + sqrshrnCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0 && srcElem1 != -1) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + mid += rBit; + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 != (Element)srcElem1) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = srcElem1; + } + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes, + sqrshrnCode, hasImm=True) + twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes, + sqrshrnCode, hasImm=True, hi=True) + twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes, + sqrshrnCode, hasImm=True, scalar=True) + # SQRSHRUN, SQRSHRUN2 + sqrshrunCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - 
imm))); + mid += rBit; + if (bits(mid, sizeof(BigElement) * 8 - 1, + sizeof(Element) * 8) != 0) { + if (srcElem1 < 0) { + destElem = 0; + } else { + destElem = mask(sizeof(Element) * 8); + } + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 < 0) { + fpscr.qc = 1; + destElem = 0; + } else { + destElem = srcElem1; + } + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes, + sqrshrunCode, hasImm=True) + twoRegNarrowInstX("sqrshrun", "Sqrshrun2X", "SimdShiftOp", + smallSignedTypes, sqrshrunCode, hasImm=True, hi=True) + twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp", + smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True) + # SQSHL (immediate) + sqshlImmCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (srcElem1 > 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - imm); + if (topBits != 0 && topBits != mask(imm + 1)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (srcElem1 > 0) + destElem = ~destElem; + fpscr.qc = 1; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2, + sqshlImmCode, hasImm=True) + twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4, + sqshlImmCode, hasImm=True) + twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4, + sqshlImmCode, hasImm=True, scalar=True) + # SQSHL (register) + sqshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> 
shiftAmt); + } + // Make sure the right shift sign extended when it should. + if (srcElem1 < 0 && destElem >= 0) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else if (shiftAmt > 0) { + bool sat = false; + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) + sat = true; + else + destElem = 0; + } else { + if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - 1 - shiftAmt) != + ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) { + sat = true; + } else { + destElem = srcElem1 << shiftAmt; + } + } + if (sat) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2, + sqshlCode) + threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4, + sqshlCode) + threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4, + sqshlCode, scalar=True) + # SQSHLU + sqshluCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 < 0) { + destElem = 0; + fpscr.qc = 1; + } else if (srcElem1 > 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - imm); + if (srcElem1 < 0) { + destElem = 0; + fpscr.qc = 1; + } else if (topBits != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } + } else { + if (srcElem1 < 0) { + fpscr.qc = 1; + destElem = 0; + } else { + destElem = srcElem1; + } + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2, + sqshluCode, hasImm=True) + twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4, + sqshluCode, hasImm=True) + twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4, + sqshluCode, 
hasImm=True, scalar=True) + # SQSHRN, SQSHRN2 + sqshrnCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0 && srcElem1 != -1) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + mid |= -(mid & ((BigElement)1 << + (sizeof(BigElement) * 8 - 1 - imm))); + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes, + sqshrnCode, hasImm=True) + twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes, + sqshrnCode, hasImm=True, hi=True) + twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes, + sqshrnCode, hasImm=True, scalar=True) + # SQSHRUN, SQSHRUN2 + sqshrunCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + if (bits(mid, sizeof(BigElement) * 8 - 1, + sizeof(Element) * 8) != 0) { + if (srcElem1 < 0) { + destElem = 0; + } else { + destElem = mask(sizeof(Element) * 8); + } + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes, + sqshrunCode, hasImm=True) + twoRegNarrowInstX("sqshrun", "Sqshrun2X", "SimdShiftOp", smallSignedTypes, + sqshrunCode, hasImm=True, hi=True) + twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes, + sqshrunCode, hasImm=True, scalar=True) + # SQSUB + sqsubCode = ''' + destElem = srcElem1 - srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + bool negDest = (destElem < 0); + bool negSrc1 = (srcElem1 < 0); + bool posSrc2 = (srcElem2 >= 0); + if ((negDest != negSrc1) && (negSrc1 == 
posSrc2)) { + destElem = (Element)1 << (sizeof(Element) * 8 - 1); + if (negDest) + destElem -= 1; + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2, + sqsubCode) + threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4, + sqsubCode) + threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4, + sqsubCode, scalar=True) + # SQXTN, SQXTN2 + sqxtnCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + destElem = srcElem1; + if ((BigElement)destElem != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8 - 1); + if (srcElem1 < 0) + destElem = ~destElem; + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes, + sqxtnCode) + twoRegNarrowInstX("sqxtn", "Sqxtn2X", "SimdMiscOp", smallSignedTypes, + sqxtnCode, hi=True) + twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes, + sqxtnCode, scalar=True) + # SQXTUN, SQXTUN2 + sqxtunCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + destElem = srcElem1; + if (srcElem1 < 0 || + ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8); + if (srcElem1 < 0) + destElem = ~destElem; + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes, + sqxtunCode) + twoRegNarrowInstX("sqxtun", "Sqxtun2X", "SimdMiscOp", smallSignedTypes, + sqxtunCode, hi=True) + twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes, + sqxtunCode, scalar=True) + # SRHADD + rhaddCode = ''' + Element carryBit = + (((unsigned)srcElem1 & 0x1) + + ((unsigned)srcElem2 & 0x1) + 1) >> 1; + // Use division instead of a shift to ensure the sign extension works + // right. The compiler will figure out if it can be a shift. Mask the + // inputs so they get truncated correctly. 
+ destElem = (((srcElem1 & ~(Element)1) / 2) + + ((srcElem2 & ~(Element)1) / 2)) + carryBit; + ''' + threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2, + rhaddCode) + threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4, + rhaddCode) + # SRI + sriCode = ''' + if (imm >= sizeof(Element) * 8) + destElem = destElem; + else + destElem = (srcElem1 >> imm) | + (destElem & ~mask(sizeof(Element) * 8 - imm)); + ''' + twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode, + True, hasImm=True) + twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode, + True, hasImm=True) + # SRSHL + rshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1)) + rBit = 1; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. 
+ if (ltz(srcElem1) && !ltz(destElem)) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + destElem += rBit; + } else if (shiftAmt > 0) { + if (shiftAmt >= sizeof(Element) * 8) { + destElem = 0; + } else { + destElem = srcElem1 << shiftAmt; + } + } else { + destElem = srcElem1; + } + ''' + threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2, + rshlCode) + threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4, + rshlCode) + # SRSHR + rshrCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem = 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem = srcElem1; + } + ''' + twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2, + rshrCode, hasImm=True) + twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4, + rshrCode, hasImm=True) + # SRSRA + rsraCode = ''' + if (imm > sizeof(srcElem1) * 8) { + destElem += 0; + } else if (imm) { + Element rBit = bits(srcElem1, imm - 1); + destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit; + } else { + destElem += srcElem1; + } + ''' + twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2, + rsraCode, True, hasImm=True) + twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4, + rsraCode, True, hasImm=True) + # SSHL + shlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + // Make sure the right shift sign extended when it should. 
+ if (ltz(srcElem1) && !ltz(destElem)) { + destElem |= -((Element)1 << (sizeof(Element) * 8 - + 1 - shiftAmt)); + } + } else { + if (shiftAmt >= sizeof(Element) * 8) { + destElem = 0; + } else { + destElem = srcElem1 << shiftAmt; + } + } + ''' + threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2, + shlCode) + threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4, + shlCode) + # SSHLL, SSHLL2 + shllCode = ''' + if (imm >= sizeof(destElem) * 8) { + destElem = 0; + } else { + destElem = (BigElement)srcElem1 << imm; + } + ''' + twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes, + shllCode, hasImm=True) + twoRegLongInstX("sshll", "Sshll2X", "SimdShiftOp", smallSignedTypes, + shllCode, hasImm=True, hi=True) + # SSHR + shrCode = ''' + if (imm >= sizeof(srcElem1) * 8) { + if (ltz(srcElem1)) + destElem = -1; + else + destElem = 0; + } else { + destElem = srcElem1 >> imm; + } + ''' + twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode, + hasImm=True) + twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode, + hasImm=True) + # SSRA + sraCode = ''' + Element mid;; + if (imm >= sizeof(srcElem1) * 8) { + mid = ltz(srcElem1) ? 
-1 : 0; + } else { + mid = srcElem1 >> imm; + if (ltz(srcElem1) && !ltz(mid)) { + mid |= -(mid & ((Element)1 << + (sizeof(Element) * 8 - 1 - imm))); + } + } + destElem += mid; + ''' + twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode, + True, hasImm=True) + twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode, + True, hasImm=True) + # SSUBL + sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;" + threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes, + sublwCode) + threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes, + sublwCode, hi=True) + # SSUBW + threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes, + sublwCode) + threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes, + sublwCode, hi=True) + # SUB + subCode = "destElem = srcElem1 - srcElem2;" + threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode) + threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode) + # SUBHN, SUBHN2 + subhnCode = ''' + destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >> + (sizeof(Element) * 8); + ''' + threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes, + subhnCode) + threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes, + subhnCode, hi=True) + # SUQADD + suqaddCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + Element tmp = destElem + srcElem1; + if (bits(destElem, sizeof(Element) * 8 - 1) == 0) { + if (bits(tmp, sizeof(Element) * 8 - 1) == 1 || + tmp < srcElem1 || tmp < destElem) { + destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1; + fpscr.qc = 1; + } else { + destElem = tmp; + } + } else { + Element absDestElem = (~destElem) + 1; + if (absDestElem < srcElem1) { + // Still check for positive sat., no need to check for negative sat. 
+ if (bits(tmp, sizeof(Element) * 8 - 1) == 1) { + destElem = (((Element) 1) << (sizeof(Element) * 8 - 1)) - 1; + fpscr.qc = 1; + } else { + destElem = tmp; + } + } else { + destElem = tmp; + } + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2, + suqaddCode, True) + twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4, + suqaddCode, True) + twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4, + suqaddCode, True, scalar=True) + # SXTL -> alias to SSHLL + # TBL + tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2) + tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4) + tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2) + tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4) + tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2) + tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4) + tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2) + tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4) + # TBX + tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2) + tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4) + tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2) + tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4) + tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2) + tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4) + tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2) + tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4) + # TRN1 + trnCode = ''' + unsigned part = %s; + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[2 * i] = srcReg1.elements[2 * i + part]; + destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part]; + } + ''' + threeRegScrambleInstX("trn1", 
"Trn1DX", "SimdAluOp", smallUnsignedTypes, 2, + trnCode % "0") + threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4, + trnCode % "0") + # TRN2 + threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2, + trnCode % "1") + threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4, + trnCode % "1") + # UABA + threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2, + abaCode, True) + threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4, + abaCode, True) + # UABAL, UABAL2 + threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes, + abalCode, True) + threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes, + abalCode, True, hi=True) + # UABD + threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2, + abdCode) + threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4, + abdCode) + # UABDL, UABDL2 + threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes, + abdlCode, True) + threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes, + abdlCode, True, hi=True) + # UADALP + twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes, + 2, adalpCode, True) + twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes, + 4, adalpCode, True) + # UADDL, UADDL2 + threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes, + addlwCode) + threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes, + addlwCode, hi=True) + # UADDLP + twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes, + 2, addlwCode) + twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes, + 4, addlwCode) + # UADDLV + twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp", + ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True) + twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp", + ("uint8_t", "uint16_t"), 4, addAcrossLongCode, 
long=True) + twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4, + addAcrossLongCode, doubleDest=True, long=True) + # UADDW + threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes, + addlwCode) + threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes, + addlwCode, hi=True) + # UCVTF (fixed-point) + ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true," + " FPCRRounding(fpscr), fpscr)") + twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2, + ucvtfFixedCode, hasImm=True) + twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4, + ucvtfFixedCode, hasImm=True) + twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4, + ucvtfFixedCode, hasImm=True, scalar=True) + # UCVTF (integer) + ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true," + " FPCRRounding(fpscr), fpscr)") + twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2, + ucvtfIntCode) + twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4, + ucvtfIntCode) + twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4, + ucvtfIntCode, scalar=True) + # UHADD + threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2, + haddCode) + threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4, + haddCode) + # UHSUB + threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2, + hsubCode) + threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4, + hsubCode) + # UMAX + threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2, + maxCode) + threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4, + maxCode) + # UMAXP + threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2, + maxCode, pairwise=True) + threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4, + maxCode, pairwise=True) + # UMAXV + 
twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"), + 2, maxAcrossCode) + twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4, + maxAcrossCode) + # UMIN + threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2, + minCode) + threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4, + minCode) + # UMINP + threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2, + minCode, pairwise=True) + threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4, + minCode, pairwise=True) + # UMINV + twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"), + 2, minAcrossCode) + twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4, + minAcrossCode) + # UMLAL (by element) + threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp", + smallUnsignedTypes, mlalCode, True, byElem=True) + threeRegLongInstX("umlal", "UmlalElem2X", "SimdMultAccOp", + smallUnsignedTypes, mlalCode, True, byElem=True, hi=True) + # UMLAL (vector) + threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes, + mlalCode, True) + threeRegLongInstX("umlal", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes, + mlalCode, True, hi=True) + # UMLSL (by element) + threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp", + smallUnsignedTypes, mlslCode, True, byElem=True) + threeRegLongInstX("umlsl", "UmlslElem2X", "SimdMultAccOp", + smallUnsignedTypes, mlslCode, True, byElem=True, hi=True) + # UMLSL (vector) + threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes, + mlslCode, True) + threeRegLongInstX("umlsl", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes, + mlslCode, True, hi=True) + # UMOV + insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W') + insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X') + # UMULL, UMULL2 (by element) + threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes, + 
mullCode, byElem=True) + threeRegLongInstX("umull", "UmullElem2X", "SimdMultOp", smallUnsignedTypes, + mullCode, byElem=True, hi=True) + # UMULL, UMULL2 (vector) + threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes, + mullCode) + threeRegLongInstX("umull", "Umull2X", "SimdMultOp", smallUnsignedTypes, + mullCode, hi=True) + # UQADD + uqaddCode = ''' + destElem = srcElem1 + srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + if (destElem < srcElem1 || destElem < srcElem2) { + destElem = (Element)(-1); + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2, + uqaddCode) + threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4, + uqaddCode) + threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4, + uqaddCode, scalar=True) + # UQRSHL + uqrshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + Element rBit = 0; + if (shiftAmt <= sizeof(Element) * 8) + rBit = bits(srcElem1, shiftAmt - 1); + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + destElem += rBit; + } else { + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - shiftAmt)) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = srcElem1 << shiftAmt; + } + } + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes, + 2, uqrshlCode) + threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4, + uqrshlCode) + threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4, + uqrshlCode, scalar=True) + # UQRSHRN + uqrshrnCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if 
(imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = (srcElem1 >> (imm - 1)); + uint64_t rBit = mid & 0x1; + mid >>= 1; + mid += rBit; + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + if (srcElem1 != (Element)srcElem1) { + destElem = mask(sizeof(Element) * 8 - 1); + fpscr.qc = 1; + } else { + destElem = srcElem1; + } + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes, + uqrshrnCode, hasImm=True) + twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp", + smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True) + twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp", + smallUnsignedTypes, uqrshrnCode, hasImm=True, + scalar=True) + # UQSHL (immediate) + uqshlImmCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else if (imm) { + destElem = (srcElem1 << imm); + uint64_t topBits = bits((uint64_t)srcElem1, + sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - imm); + if (topBits != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2, + uqshlImmCode, hasImm=True) + twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4, + uqshlImmCode, hasImm=True) + twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4, + uqshlImmCode, hasImm=True, scalar=True) + # UQSHL (register) + uqshlCode = ''' + int16_t shiftAmt = (int8_t)srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + if (shiftAmt < 0) { + shiftAmt = -shiftAmt; + if (shiftAmt >= sizeof(Element) * 8) { + shiftAmt = sizeof(Element) * 8 - 1; + destElem = 0; + } else { + destElem = (srcElem1 >> shiftAmt); + } + 
} else if (shiftAmt > 0) { + if (shiftAmt >= sizeof(Element) * 8) { + if (srcElem1 != 0) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = 0; + } + } else { + if (bits(srcElem1, sizeof(Element) * 8 - 1, + sizeof(Element) * 8 - shiftAmt)) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = srcElem1 << shiftAmt; + } + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2, + uqshlCode) + threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4, + uqshlCode) + threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4, + uqshlCode, scalar=True) + # UQSHRN, UQSHRN2 + uqshrnCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + if (imm > sizeof(srcElem1) * 8) { + if (srcElem1 != 0) + fpscr.qc = 1; + destElem = 0; + } else if (imm) { + BigElement mid = ((srcElem1 >> (imm - 1)) >> 1); + if (mid != (Element)mid) { + destElem = mask(sizeof(Element) * 8); + fpscr.qc = 1; + } else { + destElem = mid; + } + } else { + destElem = srcElem1; + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes, + uqshrnCode, hasImm=True) + twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes, + uqshrnCode, hasImm=True, hi=True) + twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes, + uqshrnCode, hasImm=True, scalar=True) + # UQSUB + uqsubCode = ''' + destElem = srcElem1 - srcElem2; + FPSCR fpscr = (FPSCR) FpscrQc; + if (destElem > srcElem1) { + destElem = 0; + fpscr.qc = 1; + } + FpscrQc = fpscr; + ''' + threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2, + uqsubCode) + threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4, + uqsubCode) + threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4, + uqsubCode, scalar=True) + # UQXTN + uqxtnCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + destElem = 
srcElem1; + if ((BigElement)destElem != srcElem1) { + fpscr.qc = 1; + destElem = mask(sizeof(Element) * 8); + } + FpscrQc = fpscr; + ''' + twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes, + uqxtnCode) + twoRegNarrowInstX("uqxtn", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes, + uqxtnCode, hi=True) + twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes, + uqxtnCode, scalar=True) + # URECPE + urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);" + twoEqualRegInstX("urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2, + urecpeCode) + twoEqualRegInstX("urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4, + urecpeCode) + # URHADD + threeEqualRegInstX("urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes, + 2, rhaddCode) + threeEqualRegInstX("urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes, + 4, rhaddCode) + # URSHL + threeEqualRegInstX("urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2, + rshlCode) + threeEqualRegInstX("urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4, + rshlCode) + # URSHR + twoEqualRegInstX("urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2, + rshrCode, hasImm=True) + twoEqualRegInstX("urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4, + rshrCode, hasImm=True) + # URSQRTE + ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);" + twoEqualRegInstX("ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2, + ursqrteCode) + twoEqualRegInstX("ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4, + ursqrteCode) + # URSRA + twoEqualRegInstX("ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2, + rsraCode, True, hasImm=True) + twoEqualRegInstX("ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4, + rsraCode, True, hasImm=True) + # USHL + threeEqualRegInstX("ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2, + shlCode) + threeEqualRegInstX("ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4, + shlCode) + # USHLL, USHLL2 + twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes, + shllCode, 
hasImm=True) + twoRegLongInstX("ushll", "Ushll2X", "SimdShiftOp", smallUnsignedTypes, + shllCode, hi=True, hasImm=True) + # USHR + twoEqualRegInstX("ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2, + shrCode, hasImm=True) + twoEqualRegInstX("ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4, + shrCode, hasImm=True) + # USQADD + usqaddCode = ''' + FPSCR fpscr = (FPSCR) FpscrQc; + Element tmp = destElem + srcElem1; + if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) { + if (tmp < srcElem1 || tmp < destElem) { + destElem = (Element)(-1); + fpscr.qc = 1; + } else { + destElem = tmp; + } + } else { + Element absSrcElem1 = (~srcElem1) + 1; + if (absSrcElem1 > destElem) { + destElem = 0; + fpscr.qc = 1; + } else { + destElem = tmp; + } + } + FpscrQc = fpscr; + ''' + twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2, + usqaddCode, True) + twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4, + usqaddCode, True) + twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4, + usqaddCode, True, scalar=True) + # USRA + twoEqualRegInstX("usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2, + sraCode, True, hasImm=True) + twoEqualRegInstX("usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4, + sraCode, True, hasImm=True) + # USUBL + threeRegLongInstX("usubl", "UsublX", "SimdAddOp", smallUnsignedTypes, + sublwCode) + threeRegLongInstX("usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes, + sublwCode, hi=True) + # USUBW + threeRegWideInstX("usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes, + sublwCode) + threeRegWideInstX("usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes, + sublwCode, hi=True) + # UXTL -> alias to USHLL + # UZP1 + uzpCode = ''' + unsigned part = %s; + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[i] = srcReg1.elements[2 * i + part]; + destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part]; + } + ''' + threeRegScrambleInstX("Uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2, + uzpCode % 
"0") + threeRegScrambleInstX("Uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4, + uzpCode % "0") + # UZP2 + threeRegScrambleInstX("Uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2, + uzpCode % "1") + threeRegScrambleInstX("Uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4, + uzpCode % "1") + # XTN, XTN2 + xtnCode = "destElem = srcElem1;" + twoRegNarrowInstX("Xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode) + twoRegNarrowInstX("Xtn", "Xtn2X", "SimdMiscOp", smallUnsignedTypes, + xtnCode, hi=True) + # ZIP1 + zipCode = ''' + unsigned base = %s; + for (unsigned i = 0; i < eCount / 2; i++) { + destReg.elements[2 * i] = srcReg1.elements[base + i]; + destReg.elements[2 * i + 1] = srcReg2.elements[base + i]; + } + ''' + threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2, + zipCode % "0") + threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4, + zipCode % "0") + # ZIP2 + threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2, + zipCode % "eCount / 2") + threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4, + zipCode % "eCount / 2") + +}}; |