From e385fc5267da9dee60b10091caaf484fd5bf73bb Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Fri, 30 Sep 2022 15:21:50 +0100 Subject: [PATCH] Updated synctools/tablegen/AArch64 files to LLVM 14.0.5. --- suite/synctools/tablegen/AArch64/AArch64.td | 1144 +++- .../AArch64/AArch64CallingConvention.td | 362 +- .../tablegen/AArch64/AArch64Combine.td | 233 + .../AArch64/AArch64GenRegisterBankInfo.def | 275 + .../tablegen/AArch64/AArch64InstrAtomics.td | 153 +- .../tablegen/AArch64/AArch64InstrFormats.td | 2154 ++++++-- .../tablegen/AArch64/AArch64InstrGISel.td | 287 + .../tablegen/AArch64/AArch64InstrInfo.td | 2685 +++++++-- .../tablegen/AArch64/AArch64PfmCounters.td | 18 + .../tablegen/AArch64/AArch64RegisterBanks.td | 9 +- .../tablegen/AArch64/AArch64RegisterInfo.td | 384 +- .../tablegen/AArch64/AArch64SMEInstrInfo.td | 143 + .../tablegen/AArch64/AArch64SVEInstrInfo.td | 3251 +++++++++-- .../tablegen/AArch64/AArch64SchedA53.td | 15 +- .../tablegen/AArch64/AArch64SchedA55.td | 361 ++ .../tablegen/AArch64/AArch64SchedA57.td | 80 +- .../AArch64/AArch64SchedA57WriteRes.td | 26 +- .../tablegen/AArch64/AArch64SchedA64FX.td | 3896 +++++++++++++ .../tablegen/AArch64/AArch64SchedAmpere1.td | 1136 ++++ .../tablegen/AArch64/AArch64SchedCyclone.td | 79 +- .../tablegen/AArch64/AArch64SchedExynosM1.td | 847 --- .../tablegen/AArch64/AArch64SchedExynosM3.td | 235 +- .../tablegen/AArch64/AArch64SchedExynosM4.td | 1017 ++++ .../tablegen/AArch64/AArch64SchedExynosM5.td | 1016 ++++ .../tablegen/AArch64/AArch64SchedFalkor.td | 16 +- .../AArch64/AArch64SchedFalkorDetails.td | 13 +- .../tablegen/AArch64/AArch64SchedKryo.td | 16 +- .../AArch64/AArch64SchedKryoDetails.td | 11 +- .../AArch64/AArch64SchedPredAmpere.td | 25 + .../AArch64/AArch64SchedPredExynos.td | 157 + .../AArch64/AArch64SchedPredicates.td | 441 ++ .../tablegen/AArch64/AArch64SchedTSV110.td | 747 +++ .../tablegen/AArch64/AArch64SchedThunderX.td | 16 +- .../AArch64/AArch64SchedThunderX2T99.td | 43 +- .../AArch64/AArch64SchedThunderX3T110.td | 2003 +++++++ .../tablegen/AArch64/AArch64Schedule.td | 30 +- .../tablegen/AArch64/AArch64SystemOperands.td | 635 ++- .../tablegen/AArch64/SMEInstrFormats.td | 726 +++ .../tablegen/AArch64/SVEInstrFormats.td | 4903 +++++++++++++++-- 39 files changed, 26048 insertions(+), 3540 deletions(-) create mode 100644 suite/synctools/tablegen/AArch64/AArch64Combine.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64GenRegisterBankInfo.def create mode 100644 suite/synctools/tablegen/AArch64/AArch64InstrGISel.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64PfmCounters.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SMEInstrInfo.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedA55.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedA64FX.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedAmpere1.td delete mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedExynosM1.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedExynosM4.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedExynosM5.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedPredAmpere.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedPredExynos.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedPredicates.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedTSV110.td create mode 100644 suite/synctools/tablegen/AArch64/AArch64SchedThunderX3T110.td create mode 100644 
suite/synctools/tablegen/AArch64/SMEInstrFormats.td diff --git a/suite/synctools/tablegen/AArch64/AArch64.td b/suite/synctools/tablegen/AArch64/AArch64.td index a69d3814..80e574b7 100644 --- a/suite/synctools/tablegen/AArch64/AArch64.td +++ b/suite/synctools/tablegen/AArch64/AArch64.td @@ -1,9 +1,8 @@ //=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -43,11 +42,11 @@ def FeatureAES : SubtargetFeature< "Enable AES support", [FeatureNEON]>; // Crypto has been split up and any combination is now valid (see the -// crypto defintions above). Also, crypto is now context sensitive: +// crypto definitions above). Also, crypto is now context sensitive: // it has a different meaning for e.g. Armv8.4 than it has for Armv8.2. // Therefore, we rely on Clang, the user interacing tool, to pass on the // appropriate crypto options. But here in the backend, crypto has very little -// meaning anymore. We kept the Crypto defintion here for backward +// meaning anymore. We kept the Crypto definition here for backward // compatibility, and now imply features SHA2 and AES, which was the // "traditional" meaning of Crypto. def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", @@ -62,28 +61,105 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true", "Enable ARMv8.1 Large System Extension (LSE) atomic instructions">; +def FeatureLSE2 : SubtargetFeature<"lse2", "HasLSE2", "true", + "Enable ARMv8.4 Large System Extension 2 (LSE2) atomicity rules">; + +def FeatureOutlineAtomics : SubtargetFeature<"outline-atomics", "OutlineAtomics", "true", + "Enable out of line atomics to support LSE instructions">; + def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true", "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">; +def FeaturePAN : SubtargetFeature< + "pan", "HasPAN", "true", + "Enables ARM v8.1 Privileged Access-Never extension">; + +def FeatureLOR : SubtargetFeature< + "lor", "HasLOR", "true", + "Enables ARM v8.1 Limited Ordering Regions extension">; + +def FeatureCONTEXTIDREL2 : SubtargetFeature<"CONTEXTIDREL2", "HasCONTEXTIDREL2", + "true", "Enable RW operand CONTEXTIDR_EL2" >; + +def FeatureVH : SubtargetFeature<"vh", "HasVH", "true", + "Enables ARM v8.1 Virtual Host extension", [FeatureCONTEXTIDREL2] >; + def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable ARMv8 PMUv3 Performance Monitors extension">; def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", "Full FP16", [FeatureFPARMv8]>; +def FeatureFP16FML : SubtargetFeature<"fp16fml", "HasFP16FML", "true", + "Enable FP16 FML instructions", [FeatureFullFP16]>; + def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; -def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", - "Enable Scalable Vector Extension (SVE) instructions">; +def FeaturePAN_RWV : SubtargetFeature< + "pan-rwv", "HasPAN_RWV", "true", + "Enable v8.2 PAN s1e1R and s1e1W Variants", + [FeaturePAN]>; + +// UAO PState +def FeaturePsUAO 
: SubtargetFeature< "uaops", "HasPsUAO", "true", + "Enable v8.2 UAO PState">; + +def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP", + "true", "Enable v8.2 data Cache Clean to Point of Persistence" >; + +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>; + +// This flag is currently still labeled as Experimental, but when fully +// implemented this should tell the compiler to use the zeroing pseudos to +// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive +// lanes are known to be zero. The pseudos will then be expanded using the +// MOVPRFX instruction to zero the inactive lanes. This feature should only be +// enabled if MOVPRFX instructions are known to merge with the destructive +// operations they prefix. +// +// This feature could similarly be extended to support cheap merging of _any_ +// value into the inactive lanes using the MOVPRFX instruction that uses +// merging-predication. +def FeatureExperimentalZeroingPseudos + : SubtargetFeature<"use-experimental-zeroing-pseudos", + "UseExperimentalZeroingPseudos", "true", + "Hint to the compiler that the MOVPRFX instruction is " + "merged with destructive operations", + []>; + +def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", + "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; + +def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", + "Enable Scalable Vector Extension 2 (SVE2) instructions", + [FeatureSVE, FeatureUseScalarIncVL]>; + +def FeatureSVE2AES : SubtargetFeature<"sve2-aes", "HasSVE2AES", "true", + "Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>; + +def FeatureSVE2SM4 : SubtargetFeature<"sve2-sm4", "HasSVE2SM4", "true", + "Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>; + +def FeatureSVE2SHA3 : SubtargetFeature<"sve2-sha3", "HasSVE2SHA3", "true", + "Enable SHA3 SVE2 instructions", [FeatureSVE2, FeatureSHA3]>; + +def FeatureSVE2BitPerm : SubtargetFeature<"sve2-bitperm", "HasSVE2BitPerm", "true", + "Enable bit permutation SVE2 instructions", [FeatureSVE2]>; -/// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; -/// Cyclone has instructions which zero registers for "free". +def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", + "Has zero-cycle zeroing instructions for generic registers">; + +def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", + "Has no zero-cycle zeroing instructions for FP registers">; + def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; + "Has zero-cycle zeroing instructions", + [FeatureZCZeroingGP]>; /// ... but the floating-point version doesn't quite work in rare cases on older /// CPUs. 
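The SubtargetFeature records above only declare capability bits and their implication chains (e.g. FeatureSVE2AES pulls in FeatureSVE2 and FeatureAES). Instruction availability is gated separately by Predicate records that test the matching subtarget accessor. A minimal sketch of that pattern, following the AArch64InstrInfo.td conventions this patch also updates (illustrative, not part of this hunk):

    // Predicate tied to the subtarget feature defined above.
    def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
                     AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">;

    // Instruction records are then guarded by the predicate.
    let Predicates = [HasSVE2AES] in {
      // ... SVE2 AES instruction definitions ...
    }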
@@ -96,16 +172,14 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align", "Disallow all unaligned memory " "access">; -def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", - "Reserve X18, making it unavailable " - "as a GPR">; +foreach i = {1-7,9-15,18,20-28,30} in + def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true", + "Reserve X"#i#", making it unavailable " + "as a GPR">; -def FeatureReserveX20 : SubtargetFeature<"reserve-x20", "ReserveX20", "true", - "Reserve X20, making it unavailable " - "as a GPR">; - -def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", - "Use alias analysis during codegen">; +foreach i = {8-15,18} in + def FeatureCallSavedX#i : SubtargetFeature<"call-saved-x"#i, + "CustomCallSavedXRegs["#i#"]", "true", "Make X"#i#" callee saved.">; def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps", "true", @@ -117,11 +191,11 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature< def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", "CustomAsCheapAsMove", "true", - "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">; + "Use custom handling of cheap instructions">; def FeatureExynosCheapAsMoveHandling : SubtargetFeature<"exynos-cheap-as-move", "ExynosAsCheapAsMove", "true", - "Use Exynos specific code in TargetInstrInfo::isAsCheapAsAMove()", + "Use Exynos specific handling of cheap instructions", [FeatureCustomCheapAsMoveHandling]>; def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", @@ -148,6 +222,10 @@ def FeatureArithmeticCbzFusion : SubtargetFeature< "arith-cbz-fusion", "HasArithmeticCbzFusion", "true", "CPU fuses arithmetic + cbz/cbnz operations">; +def FeatureCmpBccFusion : SubtargetFeature< + "cmp-bcc-fusion", "HasCmpBccFusion", "true", + "CPU fuses cmp+bcc operations">; + def FeatureFuseAddress : SubtargetFeature< "fuse-address", "HasFuseAddress", "true", "CPU fuses address generation and memory operations">; @@ -156,10 +234,18 @@ def FeatureFuseAES : SubtargetFeature< "fuse-aes", "HasFuseAES", "true", "CPU fuses AES crypto operations">; +def FeatureFuseArithmeticLogic : SubtargetFeature< + "fuse-arith-logic", "HasFuseArithmeticLogic", "true", + "CPU fuses arithmetic and logic operations">; + def FeatureFuseCCSelect : SubtargetFeature< "fuse-csel", "HasFuseCCSelect", "true", "CPU fuses conditional select operations">; +def FeatureFuseCryptoEOR : SubtargetFeature< + "fuse-crypto-eor", "HasFuseCryptoEOR", "true", + "CPU fuses AES/PMULL and EOR operations">; + def FeatureFuseLiterals : SubtargetFeature< "fuse-literals", "HasFuseLiterals", "true", "CPU fuses literal generation operations">; @@ -168,6 +254,10 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature< "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", "Disable latency scheduling heuristic">; +def FeatureForce32BitJumpTables + : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true", + "Force jump table entries to be 32-bits wide except at MinSize">; + def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true", "Enable support for RCPC extension">; @@ -179,6 +269,66 @@ def FeatureDotProd : SubtargetFeature< "dotprod", "HasDotProd", "true", "Enable dot product support">; +def FeaturePAuth : SubtargetFeature< + "pauth", "HasPAuth", "true", + "Enable v8.3-A Pointer Authentication extension">; + +def FeatureJS : SubtargetFeature< + "jsconv", "HasJS", "true", + "Enable v8.3-A JavaScript FP conversion 
instructions", + [FeatureFPARMv8]>; + +def FeatureCCIDX : SubtargetFeature< + "ccidx", "HasCCIDX", "true", + "Enable v8.3-A Extend of the CCSIDR number of sets">; + +def FeatureComplxNum : SubtargetFeature< + "complxnum", "HasComplxNum", "true", + "Enable v8.3-A Floating-point complex number support", + [FeatureNEON]>; + +def FeatureNV : SubtargetFeature< + "nv", "HasNV", "true", + "Enable v8.4-A Nested Virtualization Enchancement">; + +def FeatureMPAM : SubtargetFeature< + "mpam", "HasMPAM", "true", + "Enable v8.4-A Memory system Partitioning and Monitoring extension">; + +def FeatureDIT : SubtargetFeature< + "dit", "HasDIT", "true", + "Enable v8.4-A Data Independent Timing instructions">; + +def FeatureTRACEV8_4 : SubtargetFeature< + "tracev8.4", "HasTRACEV8_4", "true", + "Enable v8.4-A Trace extension">; + +def FeatureAM : SubtargetFeature< + "am", "HasAM", "true", + "Enable v8.4-A Activity Monitors extension">; + +def FeatureAMVS : SubtargetFeature< + "amvs", "HasAMVS", "true", + "Enable v8.6-A Activity Monitors Virtualization support", + [FeatureAM]>; + +def FeatureSEL2 : SubtargetFeature< + "sel2", "HasSEL2", "true", + "Enable v8.4-A Secure Exception Level 2 extension">; + +def FeatureTLB_RMI : SubtargetFeature< + "tlb-rmi", "HasTLB_RMI", "true", + "Enable v8.4-A TLB Range and Maintenance Instructions">; + +def FeatureFlagM : SubtargetFeature< + "flagm", "HasFlagM", "true", + "Enable v8.4-A Flag Manipulation Instructions">; + +// 8.4 RCPC enchancements: LDAPR & STLR instructions with Immediate Offset +def FeatureRCPC_IMMO : SubtargetFeature<"rcpc-immo", "HasRCPC_IMMO", "true", + "Enable v8.4-A RCPC instructions with Immediate Offsets", + [FeatureRCPC]>; + def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", "NegativeImmediates", "false", "Convert immediates and instructions " @@ -196,21 +346,200 @@ def FeatureAggressiveFMA : "true", "Enable Aggressive FMA for floating-point.">; +def FeatureAltFPCmp : SubtargetFeature<"altnzcv", "HasAlternativeNZCV", "true", + "Enable alternative NZCV format for floating point comparisons">; + +def FeatureFRInt3264 : SubtargetFeature<"fptoint", "HasFRInt3264", "true", + "Enable FRInt[32|64][Z|X] instructions that round a floating-point number to " + "an integer (in FP format) forcing it to fit into a 32- or 64-bit int" >; + +def FeatureSpecRestrict : SubtargetFeature<"specrestrict", "HasSpecRestrict", + "true", "Enable architectural speculation restriction" >; + +def FeatureSB : SubtargetFeature<"sb", "HasSB", + "true", "Enable v8.5 Speculation Barrier" >; + +def FeatureSSBS : SubtargetFeature<"ssbs", "HasSSBS", + "true", "Enable Speculative Store Bypass Safe bit" >; + +def FeaturePredRes : SubtargetFeature<"predres", "HasPredRes", "true", + "Enable v8.5a execution and data prediction invalidation instructions" >; + +def FeatureCacheDeepPersist : SubtargetFeature<"ccdp", "HasCCDP", + "true", "Enable v8.5 Cache Clean to Point of Deep Persistence" >; + +def FeatureBranchTargetId : SubtargetFeature<"bti", "HasBTI", + "true", "Enable Branch Target Identification" >; + +def FeatureRandGen : SubtargetFeature<"rand", "HasRandGen", + "true", "Enable Random Number generation instructions" >; + +def FeatureMTE : SubtargetFeature<"mte", "HasMTE", + "true", "Enable Memory Tagging Extension" >; + +def FeatureTRBE : SubtargetFeature<"trbe", "HasTRBE", + "true", "Enable Trace Buffer Extension">; + +def FeatureETE : SubtargetFeature<"ete", "HasETE", + "true", "Enable Embedded Trace Extension", + [FeatureTRBE]>; + +def FeatureTME : 
SubtargetFeature<"tme", "HasTME", + "true", "Enable Transactional Memory Extension" >; + +def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals", + "AllowTaggedGlobals", + "true", "Use an instruction sequence for taking the address of a global " + "that allows a memory tag in the upper address bits">; + +def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", + "true", "Enable BFloat16 Extension" >; + +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension">; + +def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32", + "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>; + +def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64", + "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; + +def FeatureXS : SubtargetFeature<"xs", "HasXS", + "true", "Enable Armv8.7-A limited-TLB-maintenance instruction">; + +def FeatureWFxT : SubtargetFeature<"wfxt", "HasWFxT", + "true", "Enable Armv8.7-A WFET and WFIT instruction">; + +def FeatureHCX : SubtargetFeature< + "hcx", "HasHCX", "true", "Enable Armv8.7-A HCRX_EL2 system register">; + +def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64", + "true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">; + +def FeatureHBC : SubtargetFeature<"hbc", "HasHBC", + "true", "Enable Armv8.8-A Hinted Conditional Branches Extension">; + +def FeatureMOPS : SubtargetFeature<"mops", "HasMOPS", + "true", "Enable Armv8.8-A memcpy and memset acceleration instructions">; + +def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE", + "true", "Enable Branch Record Buffer Extension">; + +def FeatureSPE_EEF : SubtargetFeature<"spe-eef", "HasSPE_EEF", + "true", "Enable extra register in the Statistical Profiling Extension">; + +def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", + "true", "Enable fine grained virtualization traps extension">; + +def FeatureEnhancedCounterVirtualization : + SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", + "true", "Enable enhanced counter virtualization extension">; + +def FeatureRME : SubtargetFeature<"rme", "HasRME", + "true", "Enable Realm Management Extension">; + +// A subset of SVE(2) instructions are legal in Streaming SVE execution mode +// defined by SME. 
+def FeatureStreamingSVE : SubtargetFeature<"streaming-sve", + "HasStreamingSVE", "true", + "Enable subset of SVE(2) instructions for Streaming SVE execution mode">; +def FeatureSME : SubtargetFeature<"sme", "HasSME", "true", + "Enable Scalable Matrix Extension (SME)", [FeatureStreamingSVE, FeatureBF16]>; + +def FeatureSMEF64 : SubtargetFeature<"sme-f64", "HasSMEF64", "true", + "Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>; + +def FeatureSMEI64 : SubtargetFeature<"sme-i64", "HasSMEI64", "true", + "Enable Scalable Matrix Extension (SME) I16I64 instructions", [FeatureSME]>; + +def FeatureAppleA7SysReg : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true", + "Apple A7 (the CPU formerly known as Cyclone)">; + +def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true", + "Enable Exception Level 2 Virtual Memory System Architecture">; + +def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true", + "Enable Exception Level 3">; + +def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769", + "FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">; + +def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice", + "NoBTIAtReturnTwice", "true", + "Don't place a BTI instruction " + "after a return-twice">; + //===----------------------------------------------------------------------===// // Architectures. // +def HasV8_0aOps : SubtargetFeature<"v8a", "HasV8_0aOps", "true", + "Support ARM v8.0a instructions", [FeatureEL2VMSA, FeatureEL3]>; def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", - "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>; + "Support ARM v8.1a instructions", [HasV8_0aOps, FeatureCRC, FeatureLSE, + FeatureRDM, FeaturePAN, FeatureLOR, FeatureVH]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", - "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; + "Support ARM v8.2a instructions", [HasV8_1aOps, FeaturePsUAO, + FeaturePAN_RWV, FeatureRAS, FeatureCCPP]>; def HasV8_3aOps : SubtargetFeature<"v8.3a", "HasV8_3aOps", "true", - "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC]>; + "Support ARM v8.3a instructions", [HasV8_2aOps, FeatureRCPC, FeaturePAuth, + FeatureJS, FeatureCCIDX, FeatureComplxNum]>; def HasV8_4aOps : SubtargetFeature<"v8.4a", "HasV8_4aOps", "true", - "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd]>; + "Support ARM v8.4a instructions", [HasV8_3aOps, FeatureDotProd, + FeatureNV, FeatureMPAM, FeatureDIT, + FeatureTRACEV8_4, FeatureAM, FeatureSEL2, FeatureTLB_RMI, + FeatureFlagM, FeatureRCPC_IMMO, FeatureLSE2]>; + +def HasV8_5aOps : SubtargetFeature< + "v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", + [HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict, + FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist, + FeatureBranchTargetId]>; + +def HasV8_6aOps : SubtargetFeature< + "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", + [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, + FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; + +def HasV8_7aOps : SubtargetFeature< + "v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions", + [HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>; + +def HasV8_8aOps : SubtargetFeature< + "v8.8a", "HasV8_8aOps", "true", "Support ARM v8.8a instructions", + [HasV8_7aOps, FeatureHBC, FeatureMOPS]>; + +def HasV9_0aOps : SubtargetFeature< + "v9a", "HasV9_0aOps", "true", 
"Support ARM v9a instructions", + [HasV8_5aOps, FeatureSVE2]>; + +def HasV9_1aOps : SubtargetFeature< + "v9.1a", "HasV9_1aOps", "true", "Support ARM v9.1a instructions", + [HasV8_6aOps, HasV9_0aOps]>; + +def HasV9_2aOps : SubtargetFeature< + "v9.2a", "HasV9_2aOps", "true", "Support ARM v9.2a instructions", + [HasV8_7aOps, HasV9_1aOps]>; + +def HasV9_3aOps : SubtargetFeature< + "v9.3a", "HasV9_3aOps", "true", "Support ARM v9.3a instructions", + [HasV8_8aOps, HasV9_2aOps]>; + +def HasV8_0rOps : SubtargetFeature< + "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions", + [//v8.1 + FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2, + //v8.2 + FeatureRAS, FeaturePsUAO, FeatureCCPP, FeaturePAN_RWV, + //v8.3 + FeatureComplxNum, FeatureCCIDX, FeatureJS, + FeaturePAuth, FeatureRCPC, + //v8.4 + FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI, + FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>; //===----------------------------------------------------------------------===// // Register File Description @@ -226,6 +555,10 @@ include "AArch64CallingConvention.td" include "AArch64Schedule.td" include "AArch64InstrInfo.td" +include "AArch64SchedPredicates.td" +include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" +include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -235,300 +568,639 @@ def AArch64InstrInfo : InstrInfo; include "AArch64SystemOperands.td" +//===----------------------------------------------------------------------===// +// Access to privileged registers +//===----------------------------------------------------------------------===// + +foreach i = 1-3 in +def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP", + "true", "Permit use of TPIDR_EL"#i#" for the TLS base">; + +//===----------------------------------------------------------------------===// +// Control codegen mitigation against Straight Line Speculation vulnerability. +//===----------------------------------------------------------------------===// + +def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", + "HardenSlsRetBr", "true", + "Harden against straight line speculation across RET and BR instructions">; +def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", + "HardenSlsBlr", "true", + "Harden against straight line speculation across BLR instructions">; +def FeatureHardenSlsNoComdat : SubtargetFeature<"harden-sls-nocomdat", + "HardenSlsNoComdat", "true", + "Generate thunk code for SLS mitigation in the normal text section">; + //===----------------------------------------------------------------------===// // AArch64 Processors supported. 
// + +//===----------------------------------------------------------------------===// +// Unsupported features to disable for scheduling models +//===----------------------------------------------------------------------===// + +class AArch64Unsupported { list<Predicate> F; } + +def SVEUnsupported : AArch64Unsupported { + let F = [HasSVE, HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, + HasSVE2BitPerm, HasSVEorStreamingSVE, HasSVE2orStreamingSVE]; +} + +def PAUnsupported : AArch64Unsupported { + let F = [HasPAuth]; +} + +def SMEUnsupported : AArch64Unsupported { + let F = [HasSME, HasSMEF64, HasSMEI64]; +} + include "AArch64SchedA53.td" +include "AArch64SchedA55.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" include "AArch64SchedFalkor.td" include "AArch64SchedKryo.td" -include "AArch64SchedExynosM1.td" include "AArch64SchedExynosM3.td" +include "AArch64SchedExynosM4.td" +include "AArch64SchedExynosM5.td" include "AArch64SchedThunderX.td" include "AArch64SchedThunderX2T99.td" +include "AArch64SchedA64FX.td" +include "AArch64SchedThunderX3T110.td" +include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" -def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", - "Cortex-A35 ARM processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeatureNEON, - FeaturePerfMon - ]>; +def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors">; -def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", +def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ - FeatureBalanceFPOps, - FeatureCRC, - FeatureCrypto, - FeatureCustomCheapAsMoveHandling, - FeatureFPARMv8, FeatureFuseAES, - FeatureNEON, - FeaturePerfMon, - FeaturePostRAScheduler, - FeatureUseAA - ]>; + FeatureBalanceFPOps, + FeatureCustomCheapAsMoveHandling, + FeaturePostRAScheduler]>; -def ProcA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", +def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", "Cortex-A55 ARM processors", [ - HasV8_2aOps, - FeatureCrypto, - FeatureFPARMv8, FeatureFuseAES, - FeatureNEON, - FeatureFullFP16, - FeatureDotProd, - FeatureRCPC, - FeaturePerfMon - ]>; - -def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", [ - FeatureBalanceFPOps, - FeatureCRC, - FeatureCrypto, - FeatureCustomCheapAsMoveHandling, - FeatureFPARMv8, - FeatureFuseAES, - FeatureFuseLiterals, - FeatureNEON, - FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive + FeatureFuseAddress]>; + +def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", + "Cortex-A510 ARM processors", [ + FeatureFuseAES, + FeaturePostRAScheduler + ]>; -def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", +def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", [ + FeatureFuseAES, + FeatureBalanceFPOps, + FeatureCustomCheapAsMoveHandling, + FeatureFuseLiterals, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive]>; + +def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65", + "Cortex-A65 ARM processors", [ + FeatureFuseAES, + FeatureFuseAddress, + FeatureFuseLiterals]>; + +def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, FeatureFuseAES, - FeatureNEON, - FeaturePerfMon - ]>; + FeatureFuseLiterals]>; -def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", +def
TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeatureFuseAES, - FeatureNEON, - FeaturePerfMon - ]>; + FeatureFuseAES]>; -def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", +def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", [ - HasV8_2aOps, - FeatureCrypto, - FeatureFPARMv8, + FeatureFuseAES]>; + +def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", + "Cortex-A76 ARM processors", [ + FeatureFuseAES]>; + +def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", + "Cortex-A77 ARM processors", [ + FeatureCmpBccFusion, + FeatureFuseAES]>; + +def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", + "Cortex-A78 ARM processors", [ + FeatureCmpBccFusion, + FeatureFuseAES, + FeaturePostRAScheduler]>; + +def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", + "CortexA78C", + "Cortex-A78C ARM processors", [ + FeatureCmpBccFusion, + FeatureFuseAES, + FeaturePostRAScheduler]>; + +def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", + "Cortex-A710 ARM processors", [ FeatureFuseAES, - FeatureNEON, - FeatureFullFP16, - FeatureDotProd, - FeatureRCPC, - FeaturePerfMon - ]>; + FeaturePostRAScheduler, + FeatureCmpBccFusion]>; + +def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", + "CortexR82", + "Cortex-R82 ARM processors", [ + FeaturePostRAScheduler]>; + +def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", + "Cortex-X1 ARM processors", [ + FeatureCmpBccFusion, + FeatureFuseAES, + FeaturePostRAScheduler]>; + +def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", + "Cortex-X2 ARM processors", [ + FeatureFuseAES, + FeaturePostRAScheduler, + FeatureCmpBccFusion]>; + +def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", + "Fujitsu A64FX processors", [ + FeaturePostRAScheduler, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeaturePredictableSelectIsExpensive + ]>; + +def TuneCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel", + "Nvidia Carmel processors">; // Note that cyclone does not fuse AES instructions, but newer apple chips do // perform the fusion and cyclone is used by default when targetting apple OSes. 
-def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", [ - FeatureAlternateSExtLoadCVTF32Pattern, - FeatureArithmeticBccFusion, - FeatureArithmeticCbzFusion, - FeatureCrypto, - FeatureDisableLatencySchedHeuristic, - FeatureFPARMv8, - FeatureFuseAES, - FeatureNEON, - FeaturePerfMon, - FeatureZCRegMove, - FeatureZCZeroing, - FeatureZCZeroingFPWorkaround - ]>; +def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", + "Apple A7 (the CPU formerly known as Cyclone)", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAES, FeatureFuseCryptoEOR, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureZCZeroingFPWorkaround] + >; -def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M1 processors", - [FeatureSlowPaired128, - FeatureCRC, - FeatureCrypto, - FeatureExynosCheapAsMoveHandling, - FeatureFPARMv8, - FeatureFuseAES, - FeatureNEON, - FeaturePerfMon, - FeaturePostRAScheduler, - FeatureSlowMisaligned128Store, - FeatureUseRSqrt, - FeatureZCZeroing]>; +def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", + "Apple A10", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureZCRegMove, + FeatureZCZeroing] + >; -def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M2 processors", - [FeatureSlowPaired128, - FeatureCRC, - FeatureCrypto, - FeatureExynosCheapAsMoveHandling, - FeatureFPARMv8, - FeatureFuseAES, - FeatureNEON, - FeaturePerfMon, - FeaturePostRAScheduler, - FeatureSlowMisaligned128Store, - FeatureZCZeroing]>; +def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", + "Apple A11", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureZCRegMove, + FeatureZCZeroing] + >; -def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", +def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", + "Apple A12", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureZCRegMove, + FeatureZCZeroing] + >; + +def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", + "Apple A13", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureZCRegMove, + FeatureZCZeroing] + >; + +def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", + "Apple A14", [ + FeatureAggressiveFMA, + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureDisableLatencySchedHeuristic, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseArithmeticLogic, + FeatureFuseCCSelect, + FeatureFuseCryptoEOR, + FeatureFuseLiterals, + FeatureZCRegMove, + FeatureZCZeroing]>; + +def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", - [FeatureCRC, - FeatureCrypto, - FeatureExynosCheapAsMoveHandling, - FeatureFPARMv8, + 
[FeatureExynosCheapAsMoveHandling, + FeatureForce32BitJumpTables, FeatureFuseAddress, FeatureFuseAES, FeatureFuseCCSelect, FeatureFuseLiterals, FeatureLSLFast, - FeatureNEON, - FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, + FeaturePredictableSelectIsExpensive]>; + +def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", + "Samsung Exynos-M3 processors", + [FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureExynosCheapAsMoveHandling, + FeatureForce32BitJumpTables, + FeatureFuseAddress, + FeatureFuseAES, + FeatureFuseArithmeticLogic, + FeatureFuseCCSelect, + FeatureFuseLiterals, + FeatureLSLFast, + FeaturePostRAScheduler, FeatureZCZeroing]>; -def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", +def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", "Qualcomm Kryo processors", [ - FeatureCRC, - FeatureCrypto, FeatureCustomCheapAsMoveHandling, - FeatureFPARMv8, - FeatureNEON, - FeaturePerfMon, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast - ]>; + FeatureLSLFast] + >; -def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", +def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", "Qualcomm Falkor processors", [ - FeatureCRC, - FeatureCrypto, FeatureCustomCheapAsMoveHandling, - FeatureFPARMv8, - FeatureNEON, - FeaturePerfMon, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureRDM, FeatureZCZeroing, FeatureLSLFast, FeatureSlowSTRQro ]>; -def ProcSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", +def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1", + "Neoverse E1 ARM processors", [ + FeaturePostRAScheduler, + FeatureFuseAES + ]>; + +def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1", + "Neoverse N1 ARM processors", [ + FeaturePostRAScheduler, + FeatureFuseAES + ]>; + +def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2", + "Neoverse N2 ARM processors", [ + FeaturePostRAScheduler, + FeatureFuseAES + ]>; +def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB", + "Neoverse 512-TVB ARM processors", [ + FeaturePostRAScheduler, + FeatureFuseAES + ]>; + +def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1", + "Neoverse V1 ARM processors", [ + FeatureFuseAES, + FeaturePostRAScheduler]>; + +def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ - FeatureCrypto, FeatureCustomCheapAsMoveHandling, - FeatureFPARMv8, - FeatureNEON, - FeatureSPE, - FeaturePerfMon, FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, FeatureZCZeroing, - FeatureLSLFast, - HasV8_3aOps]>; + FeatureLSLFast]>; -def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", - "ThunderX2T99", +def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99", "Cavium ThunderX2 processors", [ FeatureAggressiveFMA, - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, FeatureArithmeticBccFusion, - FeatureNEON, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - FeatureLSE, - HasV8_1aOps]>; -def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", +def TuneThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily", + "ThunderX3T110", + "Marvell ThunderX3 processors", [ + FeatureAggressiveFMA, +
FeatureArithmeticBccFusion, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureBalanceFPOps, + FeatureStrictAlign]>; + +def TuneThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", "Cavium ThunderX processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - FeatureNEON]>; + FeaturePredictableSelectIsExpensive]>; -def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily", +def TuneThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily", "ThunderXT88", "Cavium ThunderX processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - FeatureNEON]>; + FeaturePredictableSelectIsExpensive]>; -def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily", +def TuneThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily", "ThunderXT81", "Cavium ThunderX processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - FeatureNEON]>; + FeaturePredictableSelectIsExpensive]>; -def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily", +def TuneThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily", "ThunderXT83", "Cavium ThunderX processors", [ - FeatureCRC, - FeatureCrypto, - FeatureFPARMv8, - FeaturePerfMon, FeaturePostRAScheduler, - FeaturePredictableSelectIsExpensive, - FeatureNEON]>; + FeaturePredictableSelectIsExpensive]>; -def : ProcessorModel<"generic", NoSchedModel, [ - FeatureFPARMv8, - FeatureFuseAES, - FeatureNEON, - FeaturePerfMon, - FeaturePostRAScheduler - ]>; +def TuneTSV110 : SubtargetFeature<"tsv110", "ARMProcFamily", "TSV110", + "HiSilicon TS-V110 processors", [ + FeatureCustomCheapAsMoveHandling, + FeatureFuseAES, + FeaturePostRAScheduler]>; + +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; + +def ProcessorFeatures { + list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + FeatureFPARMv8, FeatureNEON, FeaturePerfMon]; + list A55 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureFullFP16, FeatureDotProd, + FeatureRCPC, FeaturePerfMon]; + list A510 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, + FeatureMatMulInt8, FeatureBF16, FeatureAM, + FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureFP16FML]; + list A65 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureFullFP16, FeatureDotProd, + FeatureRCPC, FeatureSSBS, FeatureRAS]; + list A76 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureFullFP16, FeatureDotProd, + FeatureRCPC, FeatureSSBS]; + list A77 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureFullFP16, FeatureDotProd, + FeatureRCPC, FeatureSSBS]; + list A78 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureFullFP16, FeatureDotProd, + FeatureRCPC, FeaturePerfMon, FeatureSPE, + FeatureSSBS]; + list A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureFullFP16, FeatureDotProd, + FeatureFlagM, FeatureFP16FML, FeaturePAuth, + FeaturePerfMon, FeatureRCPC, FeatureSPE, + FeatureSSBS]; + list A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, + FeatureETE, FeatureMTE, FeatureFP16FML, + 
FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8]; + list R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, + FeatureFP16FML, FeatureSSBS, FeaturePredRes, + FeatureSB, FeatureSpecRestrict]; + list X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureRCPC, FeaturePerfMon, + FeatureSPE, FeatureFullFP16, FeatureDotProd, + FeatureSSBS]; + list X1C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureRCPC, FeaturePerfMon, + FeatureSPE, FeatureFullFP16, FeatureDotProd, + FeaturePAuth, FeatureSSBS]; + list X2 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, + FeatureMatMulInt8, FeatureBF16, FeatureAM, + FeatureMTE, FeatureETE, FeatureSVE2BitPerm, + FeatureFP16FML]; + list A64FX = [HasV8_2aOps, FeatureFPARMv8, FeatureNEON, + FeatureSHA2, FeaturePerfMon, FeatureFullFP16, + FeatureSVE, FeatureComplxNum]; + list Carmel = [HasV8_2aOps, FeatureNEON, FeatureCrypto, + FeatureFullFP16]; + list AppleA7 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON,FeaturePerfMon, FeatureAppleA7SysReg]; + list AppleA10 = [HasV8_0aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureCRC, + FeatureRDM, FeaturePAN, FeatureLOR, FeatureVH]; + list AppleA11 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureFullFP16]; + list AppleA12 = [HasV8_3aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureFullFP16]; + list AppleA13 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureFullFP16, + FeatureFP16FML, FeatureSHA3]; + list AppleA14 = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureFRInt3264, + FeatureSpecRestrict, FeatureSSBS, FeatureSB, + FeaturePredRes, FeatureCacheDeepPersist, + FeatureFullFP16, FeatureFP16FML, FeatureSHA3, + FeatureAltFPCmp]; + list ExynosM3 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + FeaturePerfMon]; + list ExynosM4 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, + FeatureFullFP16, FeaturePerfMon]; + list Falkor = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + FeatureFPARMv8, FeatureNEON, FeaturePerfMon, + FeatureRDM]; + list NeoverseE1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, + FeatureFPARMv8, FeatureFullFP16, FeatureNEON, + FeatureRCPC, FeatureSSBS]; + list NeoverseN1 = [HasV8_2aOps, FeatureCrypto, FeatureDotProd, + FeatureFPARMv8, FeatureFullFP16, FeatureNEON, + FeatureRCPC, FeatureSPE, FeatureSSBS]; + list NeoverseN2 = [HasV8_5aOps, FeatureBF16, FeatureETE, + FeatureMatMulInt8, FeatureMTE, FeatureSVE2, + FeatureSVE2BitPerm, FeatureTRBE, FeatureCrypto]; + list Neoverse512TVB = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist, + FeatureCrypto, FeatureFPARMv8, FeatureFP16FML, + FeatureFullFP16, FeatureMatMulInt8, FeatureNEON, + FeaturePerfMon, FeatureRandGen, FeatureSPE, + FeatureSSBS, FeatureSVE]; + list NeoverseV1 = [HasV8_4aOps, FeatureBF16, FeatureCacheDeepPersist, + FeatureCrypto, FeatureFPARMv8, FeatureFP16FML, + FeatureFullFP16, FeatureMatMulInt8, FeatureNEON, + FeaturePerfMon, FeatureRandGen, FeatureSPE, + FeatureSSBS, FeatureSVE]; + list Saphira = [HasV8_4aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeatureSPE, FeaturePerfMon]; + list ThunderX = [HasV8_0aOps, FeatureCRC, FeatureCrypto, + FeatureFPARMv8, FeaturePerfMon, FeatureNEON]; + list ThunderX2T99 = [HasV8_1aOps, FeatureCRC, FeatureCrypto, + FeatureFPARMv8, FeatureNEON, FeatureLSE]; + list ThunderX3T110 = [HasV8_3aOps, FeatureCRC, FeatureCrypto, + FeatureFPARMv8, FeatureNEON, FeatureLSE, + FeaturePAuth, FeaturePerfMon]; + list TSV110 = 
[HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, + FeatureNEON, FeaturePerfMon, FeatureSPE, + FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureMTE, FeatureSSBS]; + + // ETE and TRBE are future architecture extensions. We temporarily enable them + // by default for users targeting generic AArch64. The extensions do not + // affect code generated by the compiler and can be used only by explicitly + // mentioning the new system register names in assembly. + list Generic = [FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureETE]; +} + + +def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic, + [FeatureFuseAES, FeaturePostRAScheduler]>; +def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53, + [TuneA35]>; +def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53, + [TuneA35]>; +def : ProcessorModel<"cortex-a53", CortexA53Model, ProcessorFeatures.A53, + [TuneA53]>; +def : ProcessorModel<"cortex-a55", CortexA55Model, ProcessorFeatures.A55, + [TuneA55]>; +def : ProcessorModel<"cortex-a510", CortexA55Model, ProcessorFeatures.A510, + [TuneA510]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, ProcessorFeatures.A53, + [TuneA57]>; +def : ProcessorModel<"cortex-a65", CortexA53Model, ProcessorFeatures.A65, + [TuneA65]>; +def : ProcessorModel<"cortex-a65ae", CortexA53Model, ProcessorFeatures.A65, + [TuneA65]>; +def : ProcessorModel<"cortex-a72", CortexA57Model, ProcessorFeatures.A53, + [TuneA72]>; +def : ProcessorModel<"cortex-a73", CortexA57Model, ProcessorFeatures.A53, + [TuneA73]>; +def : ProcessorModel<"cortex-a75", CortexA57Model, ProcessorFeatures.A55, + [TuneA75]>; +def : ProcessorModel<"cortex-a76", CortexA57Model, ProcessorFeatures.A76, + [TuneA76]>; +def : ProcessorModel<"cortex-a76ae", CortexA57Model, ProcessorFeatures.A76, + [TuneA76]>; +def : ProcessorModel<"cortex-a77", CortexA57Model, ProcessorFeatures.A77, + [TuneA77]>; +def : ProcessorModel<"cortex-a78", CortexA57Model, ProcessorFeatures.A78, + [TuneA78]>; +def : ProcessorModel<"cortex-a78c", CortexA57Model, ProcessorFeatures.A78C, + [TuneA78C]>; +def : ProcessorModel<"cortex-a710", CortexA57Model, ProcessorFeatures.A710, + [TuneA710]>; +def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82, + [TuneR82]>; +def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1, + [TuneX1]>; +def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C, + [TuneX1]>; +def : ProcessorModel<"cortex-x2", CortexA57Model, ProcessorFeatures.X2, + [TuneX2]>; +def : ProcessorModel<"neoverse-e1", CortexA53Model, + ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; +def : ProcessorModel<"neoverse-n1", CortexA57Model, + ProcessorFeatures.NeoverseN1, [TuneNeoverseN1]>; +def : ProcessorModel<"neoverse-n2", CortexA57Model, + ProcessorFeatures.NeoverseN2, [TuneNeoverseN2]>; +def : ProcessorModel<"neoverse-512tvb", CortexA57Model, + ProcessorFeatures.Neoverse512TVB, [TuneNeoverse512TVB]>; +def : ProcessorModel<"neoverse-v1", CortexA57Model, + ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>; +def : ProcessorModel<"exynos-m3", ExynosM3Model, ProcessorFeatures.ExynosM3, + [TuneExynosM3]>; +def : ProcessorModel<"exynos-m4", ExynosM4Model, ProcessorFeatures.ExynosM4, + [TuneExynosM4]>; +def : ProcessorModel<"exynos-m5", ExynosM5Model, ProcessorFeatures.ExynosM4, + [TuneExynosM4]>; +def : ProcessorModel<"falkor", FalkorModel, ProcessorFeatures.Falkor, + [TuneFalkor]>; +def : ProcessorModel<"saphira", 
FalkorModel, ProcessorFeatures.Saphira, + [TuneSaphira]>; +def : ProcessorModel<"kryo", KryoModel, ProcessorFeatures.A53, [TuneKryo]>; -// FIXME: Cortex-A35 and Cortex-A55 are currently modeled as a Cortex-A53. -def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; -def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>; -def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; -// FIXME: Cortex-A72, Cortex-A73 and Cortex-A75 are currently modeled as a Cortex-A57. -def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; -def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; -def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>; -def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; -def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; -def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>; -def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; -def : ProcessorModel<"exynos-m4", ExynosM3Model, [ProcExynosM3]>; -def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>; -def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>; -def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; // Cavium ThunderX/ThunderX T8X Processors -def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>; -def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>; -def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>; -def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>; +def : ProcessorModel<"thunderx", ThunderXT8XModel, ProcessorFeatures.ThunderX, + [TuneThunderX]>; +def : ProcessorModel<"thunderxt88", ThunderXT8XModel, + ProcessorFeatures.ThunderX, [TuneThunderXT88]>; +def : ProcessorModel<"thunderxt81", ThunderXT8XModel, + ProcessorFeatures.ThunderX, [TuneThunderXT81]>; +def : ProcessorModel<"thunderxt83", ThunderXT8XModel, + ProcessorFeatures.ThunderX, [TuneThunderXT83]>; // Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan. -def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; +def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, + ProcessorFeatures.ThunderX2T99, [TuneThunderX2T99]>; +// Marvell ThunderX3T110 Processors. +def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, + ProcessorFeatures.ThunderX3T110, [TuneThunderX3T110]>; +def : ProcessorModel<"tsv110", TSV110Model, ProcessorFeatures.TSV110, + [TuneTSV110]>; + +// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode. 
+def : ProcessorModel<"cyclone", CycloneModel, ProcessorFeatures.AppleA7, + [TuneAppleA7]>; + +// iPhone and iPad CPUs +def : ProcessorModel<"apple-a7", CycloneModel, ProcessorFeatures.AppleA7, + [TuneAppleA7]>; +def : ProcessorModel<"apple-a8", CycloneModel, ProcessorFeatures.AppleA7, + [TuneAppleA7]>; +def : ProcessorModel<"apple-a9", CycloneModel, ProcessorFeatures.AppleA7, + [TuneAppleA7]>; +def : ProcessorModel<"apple-a10", CycloneModel, ProcessorFeatures.AppleA10, + [TuneAppleA10]>; +def : ProcessorModel<"apple-a11", CycloneModel, ProcessorFeatures.AppleA11, + [TuneAppleA11]>; +def : ProcessorModel<"apple-a12", CycloneModel, ProcessorFeatures.AppleA12, + [TuneAppleA12]>; +def : ProcessorModel<"apple-a13", CycloneModel, ProcessorFeatures.AppleA13, + [TuneAppleA13]>; +def : ProcessorModel<"apple-a14", CycloneModel, ProcessorFeatures.AppleA14, + [TuneAppleA14]>; + +// Mac CPUs +def : ProcessorModel<"apple-m1", CycloneModel, ProcessorFeatures.AppleA14, + [TuneAppleA14]>; + +// watch CPUs. +def : ProcessorModel<"apple-s4", CycloneModel, ProcessorFeatures.AppleA12, + [TuneAppleA12]>; +def : ProcessorModel<"apple-s5", CycloneModel, ProcessorFeatures.AppleA12, + [TuneAppleA12]>; + +// Alias for the latest Apple processor model supported by LLVM. +def : ProcessorModel<"apple-latest", CycloneModel, ProcessorFeatures.AppleA14, + [TuneAppleA14]>; + +// Fujitsu A64FX +def : ProcessorModel<"a64fx", A64FXModel, ProcessorFeatures.A64FX, + [TuneA64FX]>; + +// Nvidia Carmel +def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, + [TuneCarmel]>; + +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; //===----------------------------------------------------------------------===// // Assembly parser @@ -577,3 +1249,9 @@ def AArch64 : Target { let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; let AllowRegisterRenaming = 1; } + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "AArch64PfmCounters.td" diff --git a/suite/synctools/tablegen/AArch64/AArch64CallingConvention.td b/suite/synctools/tablegen/AArch64/AArch64CallingConvention.td index 30492003..f2615153 100644 --- a/suite/synctools/tablegen/AArch64/AArch64CallingConvention.td +++ b/suite/synctools/tablegen/AArch64/AArch64CallingConvention.td @@ -1,9 +1,8 @@ //=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,17 +10,19 @@ // //===----------------------------------------------------------------------===// -/// CCIfAlign - Match of the original alignment of the arg -class CCIfAlign : - CCIf; /// CCIfBigEndian - Match only if we're in big endian mode. 
class CCIfBigEndian : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; +class CCIfILP32 : + CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>; + + //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// +let Entry = 1 in def CC_AArch64_AAPCS : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, @@ -29,13 +30,29 @@ def CC_AArch64_AAPCS : CallingConv<[ // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. - CCIfBigEndian>>, - CCIfBigEndian>>, - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, + // In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter. + // However, on windows, in some circumstances, the SRet is passed in X0 or X1 + // instead. The presence of the inreg attribute indicates that SRet is + // passed in the alternative register (X0 or X1), not X8: + // - X0 for non-instance methods. + // - X1 for instance methods. + + // The "sret" attribute identifies indirect returns. + // The "inreg" attribute identifies non-aggregate types. + // The position of the "sret" attribute identifies instance/non-instance + // methods. + // "sret" on argument 0 means non-instance methods. + // "sret" on argument 1 means instance methods. + + CCIfInReg>>>>, + + CCIfSRet>>, // Put ByVal arguments directly on the stack. Minimum size and alignment of a // slot is 64-bit. @@ -47,18 +64,33 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfNest>, // Pass SwiftSelf in a callee saved register. - CCIfSwiftSelf>>, + CCIfSwiftSelf>>, // A SwiftError is passed in X21. - CCIfSwiftError>>, + CCIfSwiftError>>, + + // Pass SwiftAsync in an otherwise callee saved register so that it will be + // preserved for normal function calls. + CCIfSwiftAsync>>, CCIfConsecutiveRegs>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], + CCPassIndirect>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>>, + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCPassIndirect>, + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. CCIfType<[i1, i8, i16], CCPromoteToType>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>, // i128 is split to two i64s, we can't fit half to register X7. CCIfType<[i64], CCIfSplit>>, @@ -66,129 +98,145 @@ def CC_AArch64_AAPCS : CallingConv<[ // i128 is split to two i64s, and its stack alignment is 16 bytes. 
CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, + CCIfType<[bf16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>, + CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>, CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; +let Entry = 1 in def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, - CCIfSwiftError>>, + CCIfConsecutiveRegs>, + CCIfSwiftError>>, // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. 
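// For intuition (illustrative): an f128 value in Q0 has a single fixed byte
// image, while a short vector such as v4i32 is lane-numbered, and on a
// big-endian target its in-register element order is the reverse of its
// memory order. Bitconverting the vector to one 128-bit element before
// register assignment keeps the bytes the caller reads identical to the
// bytes the callee wrote.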
- CCIfBigEndian>>, - CCIfBigEndian>>, CCIfType<[i1, i8, i16], CCPromoteToType>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> + CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, + CCIfType<[bf16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64], + CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, + + CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], + CCAssignToReg<[P0, P1, P2, P3]>> ]>; // Vararg functions on windows pass floats in integer registers +let Entry = 1 in def CC_AArch64_Win64_VarArg : CallingConv<[ - CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f16, bf16], CCBitConvertToType>, + CCIfType<[f32], CCBitConvertToType>, CCIfType<[f64], CCBitConvertToType>, CCDelegateTo ]>; +// Windows Control Flow Guard checks take a single argument (the target function +// address) and have no return value. +let Entry = 1 in +def CC_AArch64_Win64_CFGuard_Check : CallingConv<[ + CCIfType<[i64], CCAssignToReg<[X15]>> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: // + i128s (i.e. split i64s) don't need even registers. // + Stack slots are sized as needed rather than being at least 64-bit. +let Entry = 1 in def CC_AArch64_DarwinPCS : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, + CCIfSRet>>, // Put ByVal arguments directly on the stack. Minimum size and alignment of a // slot is 64-bit. CCIfByVal>, // Pass SwiftSelf in a callee saved register. - CCIfSwiftSelf>>, + CCIfSwiftSelf>>, // A SwiftError is passed in X21. - CCIfSwiftError>>, + CCIfSwiftError>>, + + // Pass SwiftAsync in an otherwise callee saved register so that it will be + // preserved for normal function calls. + CCIfSwiftAsync>>, CCIfConsecutiveRegs>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, // up to eight each of GPR and FPR. 
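// For example (illustrative): a call with nine i32 arguments places the
// first eight in W0-W7 and the ninth on the stack. FP/SIMD arguments are
// counted separately against S0-S7/D0-D7/Q0-Q7, so a function taking eight
// floats and one i32 still receives the i32 in W0.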
CCIfType<[i1, i8, i16], CCPromoteToType>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>, // i128 is split to two i64s, we can't fit half to register X7. CCIfType<[i64], - CCIfSplit>>, + CCIfSplit>>, // i128 is split to two i64s, and its stack alignment is 16 bytes. CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, + CCIfType<[bf16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, // If more than will fit in registers, pass them on the stack instead. CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, - CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, + CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16", + CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], + + // Re-demote pointers to 32-bits so we don't end up storing 64-bit + // values and clobbering neighbouring stack locations. Not very pretty. + CCIfPtr>>, + CCIfPtr>>, + + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; +let Entry = 1 in def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, @@ -198,41 +246,62 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // Handle all scalar types as either i64 or f64. CCIfType<[i8, i16, i32], CCPromoteToType>, - CCIfType<[f16, f32], CCPromoteToType>, + CCIfType<[f16, bf16, f32], CCPromoteToType>, // Everything is on the stack. // i128 is split to two i64s, and its stack alignment is 16 bytes. CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], CCAssignToStack<16, 16>> ]>; +// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the +// same as the normal Darwin VarArgs handling. 
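// For example (illustrative, arm64_32): under ILP32 a variadic call such as
// printf("%s %d", str, n) pushes two 4-byte anonymous slots, since the
// pointer is re-truncated to i32 by the CCIfPtr rule, whereas the LP64
// Darwin VarArgs convention above would have used an 8-byte slot for it.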
+let Entry = 1 in +def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i32 or f32. + CCIfType<[i8, i16], CCPromoteToType>, + CCIfType<[f16, bf16], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfPtr>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16], + CCAssignToStack<16, 16>> +]>; + + // The WebKit_JS calling convention only passes the first argument (the callee) // in register and the remaining arguments on stack. We allow 32bit stack slots, // so that WebKit can write partial values in the stack and define the other // 32bit quantity as undef. +let Entry = 1 in def CC_AArch64_WebKit_JS : CallingConv<[ // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). CCIfType<[i1, i8, i16], CCPromoteToType>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, + CCIfType<[i32], CCAssignToReg<[W0]>>, + CCIfType<[i64], CCAssignToReg<[X0]>>, // Pass the remaining arguments on the stack instead. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 8>> ]>; +let Entry = 1 in def RetCC_AArch64_WebKit_JS : CallingConv<[ - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> + CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>, + CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>> ]>; //===----------------------------------------------------------------------===// @@ -257,6 +326,7 @@ def RetCC_AArch64_WebKit_JS : CallingConv<[ // The AArch64 register mapping is under the heading "The ARMv8/AArch64 ABI // register mapping". +let Entry = 1 in def CC_AArch64_GHC : CallingConv<[ CCIfType<[iPTR], CCBitConvertToType>, @@ -275,6 +345,12 @@ def CC_AArch64_GHC : CallingConv<[ CCIfType<[i64], CCAssignToReg<[X19, X20, X21, X22, X23, X24, X25, X26, X27, X28]>> ]>; +// The order of the callee-saves in this file is important, because the +// FrameLowering code will use this order to determine the layout the +// callee-save area in the stack frame. As can be observed below, Darwin +// requires the frame-record (LR, FP) to be at the top the callee-save area, +// whereas for other platforms they are at the bottom. + // FIXME: LR is only callee-saved in the sense that *we* preserve it and are // presumably a callee to someone. 
External functions may not do so, but this // is currently safe since BL has LR as an implicit-def and what happens after a @@ -283,11 +359,45 @@ def CC_AArch64_GHC : CallingConv<[ // It would be better to model its preservation semantics properly (create a // vreg on entry, use it in RET & tail call generation; make that vreg def if we // end up saving LR as part of a call frame). Watch this space... -def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, D8, D9, D10, D11, D12, D13, D14, D15)>; +// A variant for treating X18 as callee saved, when interfacing with +// code that needs X18 to be preserved. +def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>; + +// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x. +// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs, +// and not (LR,FP) pairs. +def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, FP, LR, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// The Control Flow Guard check call uses a custom calling convention that also +// preserves X0-X8 and Q0-Q7. +def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, + (sequence "X%u", 0, 8), + (sequence "Q%u", 0, 7))>; + +// AArch64 PCS for vector functions (VPCS) +// must (additionally) preserve full Q8-Q23 registers +def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP, + (sequence "Q%u", 8, 23))>; + +// Functions taking SVE arguments or returning an SVE type +// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15 +def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23), + (sequence "P%u", 4, 15), + X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP)>; + +def CSR_AArch64_AAPCS_SwiftTail + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X20, X22)>; + // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, // this can be partially modelled by treating X0 as a callee-saved register; @@ -301,32 +411,6 @@ def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; def CSR_AArch64_AAPCS_SwiftError : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>; -// The function used by Darwin to obtain the address of a thread-local variable -// guarantees more than a normal AAPCS function. x16 and x17 are used on the -// fast path for calculation, but other registers except X0 (argument/return) -// and LR (it is a call, after all) are preserved. -def CSR_AArch64_TLS_Darwin - : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), - FP, - (sequence "Q%u", 0, 31))>; - -// We can only handle a register pair with adjacent registers, the register pair -// should belong to the same class as well. Since the access function on the -// fast path calls a function that follows CSR_AArch64_TLS_Darwin, -// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. -def CSR_AArch64_CXX_TLS_Darwin - : CalleeSavedRegs<(add CSR_AArch64_AAPCS, - (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), - (sequence "D%u", 0, 31))>; - -// CSRs that are handled by prologue, epilogue. -def CSR_AArch64_CXX_TLS_Darwin_PE - : CalleeSavedRegs<(add LR, FP)>; - -// CSRs that are handled explicitly via copies. 
-def CSR_AArch64_CXX_TLS_Darwin_ViaCopy - : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; - // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF @@ -350,17 +434,67 @@ def CSR_AArch64_StackProbe_Windows (sequence "X%u", 18, 28), FP, SP, (sequence "Q%u", 0, 31))>; +// Darwin variants of AAPCS. +// Darwin puts the frame-record at the top of the callee-save area. +def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +def CSR_Darwin_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, + X22, X23, X24, X25, X26, X27, + X28, (sequence "Q%u", 8, 23))>; +def CSR_Darwin_AArch64_AAPCS_ThisReturn + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, X0)>; + +def CSR_Darwin_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>; + +def CSR_Darwin_AArch64_AAPCS_SwiftTail + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X20, X22)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_Darwin_AArch64_TLS + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_Darwin_AArch64_TLS, +// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS. +def CSR_Darwin_AArch64_CXX_TLS + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X9, X15, X16, X17, X18, X19), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_Darwin_AArch64_CXX_TLS_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_Darwin_AArch64_CXX_TLS_ViaCopy + : CalleeSavedRegs<(sub CSR_Darwin_AArch64_CXX_TLS, LR, FP)>; + +def CSR_Darwin_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + // Variants of the standard calling conventions for shadow call stack. // These all preserve x18 in addition to any other registers. 
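// Background (illustrative): ShadowCallStack reserves X18 as the shadow
// stack pointer, and an instrumented function typically brackets its body
// with
//   str x30, [x18], #8     // push the return address
//   ldr x30, [x18, #-8]!   // pop it back before returning
// so X18 must survive every call; each *_SCS variant below therefore just
// adds X18 to the corresponding callee-saved set.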
def CSR_AArch64_NoRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>; def CSR_AArch64_AllRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>; -def CSR_AArch64_CXX_TLS_Darwin_SCS - : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>; def CSR_AArch64_AAPCS_SwiftError_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>; def CSR_AArch64_RT_MostRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>; +def CSR_AArch64_AAVPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>; +def CSR_AArch64_SVE_AAPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_SVE_AAPCS, X18)>; def CSR_AArch64_AAPCS_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>; diff --git a/suite/synctools/tablegen/AArch64/AArch64Combine.td b/suite/synctools/tablegen/AArch64/AArch64Combine.td new file mode 100644 index 00000000..1994e0eb --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64Combine.td @@ -0,0 +1,233 @@ +//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def fconstant_to_constant : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_FCONSTANT):$root, + [{ return matchFConstantToConstant(*${root}, MRI); }]), + (apply [{ applyFConstantToConstant(*${root}); }])>; + +def icmp_redundant_trunc_matchdata : GIDefMatchData<"Register">; +def icmp_redundant_trunc : GICombineRule< + (defs root:$root, icmp_redundant_trunc_matchdata:$matchinfo), + (match (wip_match_opcode G_ICMP):$root, + [{ return matchICmpRedundantTrunc(*${root}, MRI, Helper.getKnownBits(), ${matchinfo}); }]), + (apply [{ applyICmpRedundantTrunc(*${root}, MRI, B, Observer, ${matchinfo}); }])>; + +// AArch64-specific offset folding for G_GLOBAL_VALUE. +def fold_global_offset_matchdata : GIDefMatchData<"std::pair">; +def fold_global_offset : GICombineRule< + (defs root:$root, fold_global_offset_matchdata:$matchinfo), + (match (wip_match_opcode G_GLOBAL_VALUE):$root, + [{ return matchFoldGlobalOffset(*${root}, MRI, ${matchinfo}); }]), + (apply [{ return applyFoldGlobalOffset(*${root}, MRI, B, Observer, ${matchinfo});}]) +>; + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper", [all_combines, + fconstant_to_constant, + icmp_redundant_trunc, + fold_global_offset]> { + let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; + let StateClass = "AArch64PreLegalizerCombinerHelperState"; + let AdditionalArguments = []; +} + +def AArch64O0PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenO0PreLegalizerCombinerHelper", [optnone_combines]> { + let DisableRuleOption = "aarch64O0prelegalizercombiner-disable-rule"; + let StateClass = "AArch64O0PreLegalizerCombinerHelperState"; + let AdditionalArguments = []; +} + +// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a +// target-specific opcode. 
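// A GICombineRule names a root instruction, a C++ match step that may fill
// in the declared matchdata, and an apply step that rewrites the MIR. As a
// minimal hypothetical sketch (matchExample/applyExample are placeholder
// names, not functions defined by this patch):
//   def example_combine : GICombineRule<
//     (defs root:$root, shuffle_matchdata:$matchinfo),
//     (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
//            [{ return matchExample(*${root}, MRI, ${matchinfo}); }]),
//     (apply [{ applyExample(*${root}, ${matchinfo}); }])>;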
+def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">; + +def rev : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchREV(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def zip : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchZip(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def uzp : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchUZP(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def dup: GICombineRule < + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchDup(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def trn : GICombineRule< + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchTRN(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }]) +>; + +def ext: GICombineRule < + (defs root:$root, shuffle_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchEXT(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyEXT(*${root}, ${matchinfo}); }]) +>; + +def shuf_to_ins_matchdata : GIDefMatchData<"std::tuple">; +def shuf_to_ins: GICombineRule < + (defs root:$root, shuf_to_ins_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchINS(*${root}, MRI, ${matchinfo}); }]), + (apply [{ return applyINS(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">; +def vashr_vlshr_imm : GICombineRule< + (defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo), + (match (wip_match_opcode G_ASHR, G_LSHR):$root, + [{ return matchVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyVAshrLshrImm(*${root}, MRI, ${matchinfo}); }]) +>; + +def form_duplane_matchdata : + GIDefMatchData<"std::pair">; +def form_duplane : GICombineRule < + (defs root:$root, form_duplane_matchdata:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return matchDupLane(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, + form_duplane, + shuf_to_ins]>; + +def adjust_icmp_imm_matchdata : + GIDefMatchData<"std::pair">; +def adjust_icmp_imm : GICombineRule < + (defs root:$root, adjust_icmp_imm_matchdata:$matchinfo), + (match (wip_match_opcode G_ICMP):$root, + [{ return matchAdjustICmpImmAndPred(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyAdjustICmpImmAndPred(*${root}, ${matchinfo}, B, Observer); }]) +>; + +def swap_icmp_operands : GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_ICMP):$root, + [{ return trySwapICmpOperands(*${root}, MRI); }]), + (apply [{ applySwapICmpOperands(*${root}, Observer); }]) +>; + +def icmp_lowering : GICombineGroup<[adjust_icmp_imm, swap_icmp_operands]>; + +def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple">; +def extractvecelt_pairwise_add : GICombineRule< + (defs root:$root, 
extractvecelt_pairwise_add_matchdata:$matchinfo), + (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root, + [{ return matchExtractVecEltPairwiseAdd(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyExtractVecEltPairwiseAdd(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def mul_const_matchdata : GIDefMatchData<"std::function">; +def mul_const : GICombineRule< + (defs root:$root, mul_const_matchdata:$matchinfo), + (match (wip_match_opcode G_MUL):$root, + [{ return matchAArch64MulConstCombine(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }]) +>; + +def build_vector_to_dup : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_BUILD_VECTOR):$root, + [{ return matchBuildVectorToDup(*${root}, MRI); }]), + (apply [{ return applyBuildVectorToDup(*${root}, MRI, B); }]) +>; + +def build_vector_lowering : GICombineGroup<[build_vector_to_dup]>; + +def lower_vector_fcmp : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_FCMP):$root, + [{ return lowerVectorFCMP(*${root}, MRI, B); }]), + (apply [{}])>; + +def form_truncstore_matchdata : GIDefMatchData<"Register">; +def form_truncstore : GICombineRule< + (defs root:$root, form_truncstore_matchdata:$matchinfo), + (match (wip_match_opcode G_STORE):$root, + [{ return matchFormTruncstore(*${root}, MRI, ${matchinfo}); }]), + (apply [{ applyFormTruncstore(*${root}, MRI, B, Observer, ${matchinfo}); }]) +>; + +def fold_merge_to_zext : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_MERGE_VALUES):$d, + [{ return matchFoldMergeToZext(*${d}, MRI); }]), + (apply [{ applyFoldMergeToZext(*${d}, MRI, B, Observer); }]) +>; + +def mutate_anyext_to_zext : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_ANYEXT):$d, + [{ return matchMutateAnyExtToZExt(*${d}, MRI); }]), + (apply [{ applyMutateAnyExtToZExt(*${d}, MRI, B, Observer); }]) +>; + +def split_store_zero_128 : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_STORE):$d, + [{ return matchSplitStoreZero128(*${d}, MRI); }]), + (apply [{ applySplitStoreZero128(*${d}, MRI, B, Observer); }]) +>; + +// Post-legalization combines which should happen at all optimization levels. +// (E.g. ones that facilitate matching for the selector) For example, matching +// pseudos. +def AArch64PostLegalizerLoweringHelper + : GICombinerHelper<"AArch64GenPostLegalizerLoweringHelper", + [shuffle_vector_lowering, vashr_vlshr_imm, + icmp_lowering, build_vector_lowering, + lower_vector_fcmp, form_truncstore]> { + let DisableRuleOption = "aarch64postlegalizerlowering-disable-rule"; +} + +// Post-legalization combines which are primarily optimizations. 
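// Each GICombinerHelper record is expanded by tablegen into a helper class
// plus a command-line option named by DisableRuleOption, so individual
// rules can be switched off when bisecting a miscompile, e.g. (illustrative
// invocation):
//   llc -global-isel -aarch64postlegalizercombiner-disable-rule=mul_const ...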
+def AArch64PostLegalizerCombinerHelper + : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", + [copy_prop, erase_undef_store, combines_for_extload, + sext_trunc_sextload, mutate_anyext_to_zext, + hoist_logic_op_with_same_opcode_hands, + redundant_and, xor_of_and_with_same_reg, + extractvecelt_pairwise_add, redundant_or, + mul_const, redundant_sext_inreg, + form_bitfield_extract, rotate_out_of_range, + icmp_to_true_false_known_bits, merge_unmerge, + select_combines, fold_merge_to_zext, + constant_fold, identity_combines, + ptr_add_immed_chain, overlapping_and, + split_store_zero_128]> { + let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; +} diff --git a/suite/synctools/tablegen/AArch64/AArch64GenRegisterBankInfo.def b/suite/synctools/tablegen/AArch64/AArch64GenRegisterBankInfo.def new file mode 100644 index 00000000..87aef1df --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64GenRegisterBankInfo.def @@ -0,0 +1,275 @@ +//===- AArch64GenRegisterBankInfo.def ----------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file defines all the static objects used by AArch64RegisterBankInfo. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +namespace llvm { +RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{ + /* StartIdx, Length, RegBank */ + // 0: FPR 16-bit value. + {0, 16, AArch64::FPRRegBank}, + // 1: FPR 32-bit value. + {0, 32, AArch64::FPRRegBank}, + // 2: FPR 64-bit value. + {0, 64, AArch64::FPRRegBank}, + // 3: FPR 128-bit value. + {0, 128, AArch64::FPRRegBank}, + // 4: FPR 256-bit value. + {0, 256, AArch64::FPRRegBank}, + // 5: FPR 512-bit value. + {0, 512, AArch64::FPRRegBank}, + // 6: GPR 32-bit value. + {0, 32, AArch64::GPRRegBank}, + // 7: GPR 64-bit value. + {0, 64, AArch64::GPRRegBank}, + // 8: GPR 128-bit value. + {0, 128, AArch64::GPRRegBank}, +}; + +// ValueMappings. +RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{ + /* BreakDown, NumBreakDowns */ + // 0: invalid + {nullptr, 0}, + // 3-operands instructions (all binary operations should end up with one of + // those mapping). + // 1: FPR 16-bit value. <-- This must match First3OpsIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 4: FPR 32-bit value. <-- This must match First3OpsIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 7: FPR 64-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 10: FPR 128-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + // 13: FPR 256-bit value. 
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1}, + // 16: FPR 512-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1}, + // 19: GPR 32-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + // 22: GPR 64-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + // 25: GPR 128-bit value. <-- This must match Last3OpsIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR128 - PMI_Min], 1}, + // Cross register bank copies. + // 28: FPR 16-bit value to GPR 16-bit. <-- This must match + // FirstCrossRegCpyIdx. + // Note: This is the kind of copy we see with physical registers. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + // 30: FPR 32-bit value to GPR 32-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + // 32: FPR 64-bit value to GPR 64-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + // 34: FPR 128-bit value to GPR 128-bit value (invalid) + {nullptr, 1}, + {nullptr, 1}, + // 36: FPR 256-bit value to GPR 256-bit value (invalid) + {nullptr, 1}, + {nullptr, 1}, + // 38: FPR 512-bit value to GPR 512-bit value (invalid) + {nullptr, 1}, + {nullptr, 1}, + // 40: GPR 32-bit value to FPR 32-bit value. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 42: GPR 64-bit value to FPR 64-bit value. <-- This must match + // LastCrossRegCpyIdx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 44: FPExt: 16 to 32. <-- This must match FPExt16To32Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 46: FPExt: 16 to 32. <-- This must match FPExt16To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR16 - PMI_Min], 1}, + // 48: FPExt: 32 to 64. <-- This must match FPExt32To64Idx. + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1}, + // 50: FPExt vector: 64 to 128. <-- This must match FPExt64To128Idx. 
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1}, + // 52: Shift scalar with 64 bit shift imm + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1}, + {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1}, +}; + +bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx, + unsigned ValStartIdx, + unsigned ValLength, + const RegisterBank &RB) { + const PartialMapping &Map = PartMappings[Idx - PartialMappingIdx::PMI_Min]; + return Map.StartIdx == ValStartIdx && Map.Length == ValLength && + Map.RegBank == &RB; +} + +bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx, + unsigned FirstInBank, + unsigned Size, + unsigned Offset) { + unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min; + const ValueMapping &Map = + AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset]; + return Map.BreakDown == &PartMappings[PartialMapBaseIdx] && + Map.NumBreakDowns == 1; +} + +bool AArch64GenRegisterBankInfo::checkPartialMappingIdx( + PartialMappingIdx FirstAlias, PartialMappingIdx LastAlias, + ArrayRef Order) { + if (Order.front() != FirstAlias) + return false; + if (Order.back() != LastAlias) + return false; + if (Order.front() > Order.back()) + return false; + + PartialMappingIdx Previous = Order.front(); + bool First = true; + for (const auto &Current : Order) { + if (First) { + First = false; + continue; + } + if (Previous + 1 != Current) + return false; + Previous = Current; + } + return true; +} + +unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx, + unsigned Size) { + if (RBIdx == PMI_FirstGPR) { + if (Size <= 32) + return 0; + if (Size <= 64) + return 1; + if (Size <= 128) + return 2; + return -1; + } + if (RBIdx == PMI_FirstFPR) { + if (Size <= 16) + return 0; + if (Size <= 32) + return 1; + if (Size <= 64) + return 2; + if (Size <= 128) + return 3; + if (Size <= 256) + return 4; + if (Size <= 512) + return 5; + return -1; + } + return -1; +} + +const RegisterBankInfo::ValueMapping * +AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx, + unsigned Size) { + assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that"); + unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size); + if (BaseIdxOffset == -1u) + return &ValMappings[InvalidIdx]; + + unsigned ValMappingIdx = + First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) * + ValueMappingIdx::DistanceBetweenRegBanks; + assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx && + "Mapping out of bound"); + + return &ValMappings[ValMappingIdx]; +} + +AArch64GenRegisterBankInfo::PartialMappingIdx + AArch64GenRegisterBankInfo::BankIDToCopyMapIdx[]{ + PMI_None, // CCR + PMI_FirstFPR, // FPR + PMI_FirstGPR, // GPR + }; + +const RegisterBankInfo::ValueMapping * +AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID, + unsigned SrcBankID, unsigned Size) { + assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); + assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID"); + PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID]; + PartialMappingIdx SrcRBIdx = BankIDToCopyMapIdx[SrcBankID]; + assert(DstRBIdx != PMI_None && "No such mapping"); + assert(SrcRBIdx != PMI_None && "No such mapping"); + + if (DstRBIdx == SrcRBIdx) + return getValueMapping(DstRBIdx, Size); + + assert(Size <= 64 
&& "GPR cannot handle that size"); + unsigned ValMappingIdx = + FirstCrossRegCpyIdx + + (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(DstRBIdx, Size)) * + ValueMappingIdx::DistanceBetweenCrossRegCpy; + assert(ValMappingIdx >= FirstCrossRegCpyIdx && + ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound"); + return &ValMappings[ValMappingIdx]; +} + +const RegisterBankInfo::ValueMapping * +AArch64GenRegisterBankInfo::getFPExtMapping(unsigned DstSize, + unsigned SrcSize) { + // We support: + // - For Scalar: + // - 16 to 32. + // - 16 to 64. + // - 32 to 64. + // => FPR 16 to FPR 32|64 + // => FPR 32 to FPR 64 + // - For vectors: + // - v4f16 to v4f32 + // - v2f32 to v2f64 + // => FPR 64 to FPR 128 + + // Check that we have been asked sensible sizes. + if (SrcSize == 16) { + assert((DstSize == 32 || DstSize == 64) && "Unexpected half extension"); + if (DstSize == 32) + return &ValMappings[FPExt16To32Idx]; + return &ValMappings[FPExt16To64Idx]; + } + + if (SrcSize == 32) { + assert(DstSize == 64 && "Unexpected float extension"); + return &ValMappings[FPExt32To64Idx]; + } + assert((SrcSize == 64 || DstSize == 128) && "Unexpected vector extension"); + return &ValMappings[FPExt64To128Idx]; +} +} // End llvm namespace. diff --git a/suite/synctools/tablegen/AArch64/AArch64InstrAtomics.td b/suite/synctools/tablegen/AArch64/AArch64InstrAtomics.td index 35cd7735..b2209295 100644 --- a/suite/synctools/tablegen/AArch64/AArch64InstrAtomics.td +++ b/suite/synctools/tablegen/AArch64/AArch64InstrAtomics.td @@ -1,9 +1,8 @@ //=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -16,9 +15,9 @@ //===---------------------------------- let AddedComplexity = 15, Size = 0 in def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering), - [(atomic_fence imm:$ordering, 0)]>, Sched<[]>; -def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; -def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + [(atomic_fence timm:$ordering, 0)]>, Sched<[]>; +def : Pat<(atomic_fence (i64 4), (timm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>; //===---------------------------------- // Atomic loads @@ -103,6 +102,34 @@ def : Pat<(relaxed_load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (LDURXi GPR64sp:$Rn, simm9:$offset)>; +// FP 32-bit loads +def : Pat<(f32 (bitconvert (i32 (relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend))))), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(f32 (bitconvert (i32 (relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend))))), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(f32 (bitconvert (i32 (relaxed_load (am_indexed32 GPR64sp:$Rn, + uimm12s8:$offset))))), + (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(f32 (bitconvert (i32 (relaxed_load + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// FP 64-bit loads +def : Pat<(f64 (bitconvert (i64 (relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(f64 (bitconvert (i64 (relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(f64 (bitconvert (i64 (relaxed_load (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(f64 (bitconvert (i64 (relaxed_load + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + //===---------------------------------- // Atomic stores //===---------------------------------- @@ -197,6 +224,38 @@ def : Pat<(relaxed_store (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; +// FP 32-bit stores +def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + (i32 (bitconvert (f32 FPR32Op:$val)))), + (STRSroW FPR32Op:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + (i32 (bitconvert (f32 FPR32Op:$val)))), + (STRSroX FPR32Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), (i32 (bitconvert (f32 FPR32Op:$val)))), + (STRSui FPR32Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), (i32 (bitconvert (f32 FPR32Op:$val)))), + (STURSi FPR32Op:$val, GPR64sp:$Rn, simm9:$offset)>; + +// FP 64-bit stores +def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend), + (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDroW FPR64Op:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend), + (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, 
ro_Xextend64:$extend)>; +def : Pat<(relaxed_store + (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), + (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))), + (STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>; + //===---------------------------------- // Low-level exclusive operations //===---------------------------------- @@ -205,19 +264,27 @@ def : Pat<(relaxed_store def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; @@ -238,19 +305,27 @@ def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(ldaxr_1 GPR64sp:$addr), (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; @@ -272,22 +347,30 @@ def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), def stxr_1 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def stxr_2 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def stxr_4 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def stxr_8 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), @@ -318,22 +401,30 @@ def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), 
(int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i8; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 1); }]; +} def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i16; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 2); }]; +} def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 4); }]; +} def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), (int_aarch64_stlxr node:$val, node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i64; -}]>; +}]> { + let GISelPredicateCode = [{ return isLoadStoreOfNumBytes(MI, 8); }]; +} def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), @@ -398,11 +489,16 @@ def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$scratch), } let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $scratch", - mayLoad = 1, mayStore = 1 in -def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch), - (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, - GPR64:$newLo, GPR64:$newHi), []>, - Sched<[WriteAtomic]>; + mayLoad = 1, mayStore = 1 in { +class cmp_swap_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32common:$scratch), + (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, + GPR64:$newLo, GPR64:$newHi), []>, + Sched<[WriteAtomic]>; +def CMP_SWAP_128 : cmp_swap_128; +def CMP_SWAP_128_RELEASE : cmp_swap_128; +def CMP_SWAP_128_ACQUIRE : cmp_swap_128; +def CMP_SWAP_128_MONOTONIC : cmp_swap_128; +} // v8.1 Atomic instructions: let Predicates = [HasLSE] in { @@ -423,4 +519,3 @@ let Predicates = [HasLSE] in { defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">; defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">; } - diff --git a/suite/synctools/tablegen/AArch64/AArch64InstrFormats.td b/suite/synctools/tablegen/AArch64/AArch64InstrFormats.td index 7caf32db..4c1e41b7 100644 --- a/suite/synctools/tablegen/AArch64/AArch64InstrFormats.td +++ b/suite/synctools/tablegen/AArch64/AArch64InstrFormats.td @@ -1,9 +1,8 @@ //===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -21,6 +20,31 @@ class Format val> { def PseudoFrm : Format<0>; def NormalFrm : Format<1>; // Do we need any others? +// Enum describing whether an instruction is +// destructive in its first source operand. +class DestructiveInstTypeEnum val> { + bits<4> Value = val; +} +def NotDestructive : DestructiveInstTypeEnum<0>; +// Destructive in its first operand and can be MOVPRFX'd, but has no other +// special properties. 
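// Illustration (SVE assembly): "add z0.s, p0/m, z0.s, z1.s" overwrites its
// first source operand. Prefixing it with
//   movprfx z0, z2
// gives the effect of a non-destructive three-operand add into z0, which is
// why expansion passes need to know each instruction's destructive kind and
// whether a MOVPRFX may legally be fused in front of it.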
+def DestructiveOther : DestructiveInstTypeEnum<1>; +def DestructiveUnary : DestructiveInstTypeEnum<2>; +def DestructiveBinaryImm : DestructiveInstTypeEnum<3>; +def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>; +def DestructiveBinary : DestructiveInstTypeEnum<5>; +def DestructiveBinaryComm : DestructiveInstTypeEnum<6>; +def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; +def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; +def DestructiveUnaryPassthru : DestructiveInstTypeEnum<9>; + +class FalseLanesEnum val> { + bits<2> Value = val; +} +def FalseLanesNone : FalseLanesEnum<0>; +def FalseLanesZero : FalseLanesEnum<1>; +def FalseLanesUndef : FalseLanesEnum<2>; + // AArch64 Instruction Format class AArch64Inst : Instruction { field bits<32> Inst; // Instruction encoding. @@ -35,6 +59,20 @@ class AArch64Inst : Instruction { let Namespace = "AArch64"; Format F = f; bits<2> Form = F.Value; + + // Defaults + bit isWhile = 0; + bit isPTestLike = 0; + FalseLanesEnum FalseLanes = FalseLanesNone; + DestructiveInstTypeEnum DestructiveInstType = NotDestructive; + ElementSizeEnum ElementSize = ElementSizeNone; + + let TSFlags{10} = isPTestLike; + let TSFlags{9} = isWhile; + let TSFlags{8-7} = FalseLanes.Value; + let TSFlags{6-3} = DestructiveInstType.Value; + let TSFlags{2-0} = ElementSize.Value; + let Pattern = []; let Constraints = cstr; } @@ -49,6 +87,7 @@ class Pseudo pattern, string cstr = ""> dag InOperandList = iops; let Pattern = pattern; let isCodeGenOnly = 1; + let isPseudo = 1; } // Real instructions (have encoding information) @@ -57,14 +96,6 @@ class EncodedI pattern> : AArch64Inst { let Size = 4; } -// Enum describing whether an instruction is -// destructive in its first source operand. -class DestructiveInstTypeEnum val> { - bits<1> Value = val; -} -def NotDestructive : DestructiveInstTypeEnum<0>; -def Destructive : DestructiveInstTypeEnum<1>; - // Normal instructions class I pattern> @@ -72,13 +103,6 @@ class I : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; @@ -244,6 +268,7 @@ def adrplabel : Operand { let EncoderMethod = "getAdrLabelOpValue"; let PrintMethod = "printAdrpLabel"; let ParserMatchClass = AdrpOperand; + let OperandType = "OPERAND_PCREL"; } def AdrOperand : AsmOperandClass { @@ -263,6 +288,14 @@ class SImmOperand : AsmOperandClass { let PredicateMethod = "isSImm<" # width # ">"; } + +class AsmImmRange : AsmOperandClass { + let Name = "Imm" # Low # "_" # High; + let DiagnosticType = "InvalidImm" # Low # "_" # High; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isImmInRange<" # Low # "," # High # ">"; +} + // Authenticated loads for v8.3 can have scaled 10-bit immediate offsets. def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>; def simm10Scaled : Operand { @@ -271,6 +304,12 @@ def simm10Scaled : Operand { let PrintMethod = "printImmScale<8>"; } +def simm9s16 : Operand { + let ParserMatchClass = SImmScaledMemoryIndexed<9, 16>; + let DecoderMethod = "DecodeSImm<9>"; + let PrintMethod = "printImmScale<16>"; +} + // uimm6 predicate - True if the immediate is in the range [0, 63]. 
def UImm6Operand : AsmOperandClass { let Name = "UImm6"; @@ -281,6 +320,10 @@ def uimm6 : Operand, ImmLeaf= 0 && Imm < 64; }]> { let ParserMatchClass = UImm6Operand; } +def uimm16 : Operand, ImmLeaf= 0 && Imm < 65536;}]>{ + let ParserMatchClass = AsmImmRange<0, 65535>; +} + def SImm9Operand : SImmOperand<9>; def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { let ParserMatchClass = SImm9Operand; @@ -288,7 +331,7 @@ def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { } def SImm8Operand : SImmOperand<8>; -def simm8 : Operand, ImmLeaf= -128 && Imm < 127; }]> { +def simm8 : Operand, ImmLeaf= -128 && Imm < 128; }]> { let ParserMatchClass = SImm8Operand; let DecoderMethod = "DecodeSImm<8>"; } @@ -310,6 +353,18 @@ def simm5_32b : Operand, ImmLeaf= -16 && Imm < 16; }]> let DecoderMethod = "DecodeSImm<5>"; } +def simm5_8b : Operand, ImmLeaf= -16 && (int8_t)Imm < 16; }]> { + let ParserMatchClass = SImm5Operand; + let DecoderMethod = "DecodeSImm<5>"; + let PrintMethod = "printSImm<8>"; +} + +def simm5_16b : Operand, ImmLeaf= -16 && (int16_t)Imm < 16; }]> { + let ParserMatchClass = SImm5Operand; + let DecoderMethod = "DecodeSImm<5>"; + let PrintMethod = "printSImm<16>"; +} + // simm7sN predicate - True if the immediate is a multiple of N in the range // [-64 * N, 63 * N]. @@ -332,11 +387,29 @@ def simm7s16 : Operand { let PrintMethod = "printImmScale<16>"; } -def am_indexed7s8 : ComplexPattern; -def am_indexed7s16 : ComplexPattern; -def am_indexed7s32 : ComplexPattern; -def am_indexed7s64 : ComplexPattern; -def am_indexed7s128 : ComplexPattern; +def am_sve_fi : ComplexPattern; + +def am_indexed7s8 : ComplexPattern; +def am_indexed7s16 : ComplexPattern; +def am_indexed7s32 : ComplexPattern; +def am_indexed7s64 : ComplexPattern; +def am_indexed7s128 : ComplexPattern; + +def am_indexedu6s128 : ComplexPattern; +def am_indexeds9s128 : ComplexPattern; + +def UImmS1XForm : SDNodeXFormgetTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64); +}]>; +def UImmS2XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def UImmS4XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64); +}]>; +def UImmS8XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64); +}]>; // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N]. 
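// For example (illustrative): uimm5s8 below accepts 0, 8, ..., 248;
// UImmS8XForm divides the accepted value by 8 so the 5-bit field stores the
// slot index, and printImmScale<8> re-multiplies it for disassembly. A
// hypothetical multiple-of-16 flavour would follow the same recipe:
//   def ExampleUImm5s16Operand : UImmScaledMemoryIndexed<5, 16>;
//   def example_uimm5s16 : Operand<i64>, ImmLeaf<i64,
//       [{ return Imm >= 0 && Imm < (32*16) && ((Imm % 16) == 0); }],
//       UImmS16XForm> {  // UImmS16XForm is hypothetical as well
//     let ParserMatchClass = ExampleUImm5s16Operand;
//     let PrintMethod = "printImmScale<16>";
//   }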
@@ -345,17 +418,41 @@ def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
 def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
 def uimm5s2 : Operand<i64>, ImmLeaf<i64,
-    [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> {
+    [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+    UImmS2XForm> {
   let ParserMatchClass = UImm5s2Operand;
   let PrintMethod = "printImmScale<2>";
 }
 def uimm5s4 : Operand<i64>, ImmLeaf<i64,
-    [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> {
+    [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+    UImmS4XForm> {
   let ParserMatchClass = UImm5s4Operand;
   let PrintMethod = "printImmScale<4>";
 }
 def uimm5s8 : Operand<i64>, ImmLeaf<i64,
-    [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> {
+    [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+    UImmS8XForm> {
+  let ParserMatchClass = UImm5s8Operand;
+  let PrintMethod = "printImmScale<8>";
+}
+
+// tuimm5sN predicate - similar to uimm5sN, but using TImmLeaf (TargetConstant)
+// instead of ImmLeaf (Constant).
+def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
+    [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+    UImmS2XForm> {
+  let ParserMatchClass = UImm5s2Operand;
+  let PrintMethod = "printImmScale<2>";
+}
+def tuimm5s4 : Operand<i64>, TImmLeaf<i64,
+    [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+    UImmS4XForm> {
+  let ParserMatchClass = UImm5s4Operand;
+  let PrintMethod = "printImmScale<4>";
+}
+def tuimm5s8 : Operand<i64>, TImmLeaf<i64,
+    [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+    UImmS8XForm> {
   let ParserMatchClass = UImm5s8Operand;
   let PrintMethod = "printImmScale<8>";
 }
@@ -366,6 +463,7 @@ def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
 def UImm6s2Operand : UImmScaledMemoryIndexed<6, 2>;
 def UImm6s4Operand : UImmScaledMemoryIndexed<6, 4>;
 def UImm6s8Operand : UImmScaledMemoryIndexed<6, 8>;
+def UImm6s16Operand : UImmScaledMemoryIndexed<6, 16>;
 def uimm6s1 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
   let ParserMatchClass = UImm6s1Operand;
@@ -385,6 +483,27 @@ def uimm6s8 : Operand<i64>, ImmLeaf<i64,
+def uimm6s16 : Operand<i64>, ImmLeaf<i64,
+    [{ return Imm >= 0 && Imm < (64*16) && ((Imm % 16) == 0); }]> {
+  let PrintMethod = "printImmScale<16>";
+  let ParserMatchClass = UImm6s16Operand;
+}
+
+def SImmS2XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def SImmS3XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
+}]>;
+def SImmS4XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def SImmS16XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
+}]>;
+def SImmS32XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64);
+}]>;

 // simm6sN predicate - True if the immediate is a multiple of N in the range
 // [-32 * N, 31 * N].
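// Note on the two immediate-leaf flavours above: ImmLeaf matches a plain
// Constant node, whereas TImmLeaf matches a TargetConstant, i.e. an operand
// that must remain an immediate and can never be materialized into a
// register. Patterns for intrinsics whose operands are already emitted as
// TargetConstants (for example SVE shift and index amounts) therefore need
// the tuimm*/tvecshift* forms; the plain ImmLeaf versions would simply fail
// to match there.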
@@ -401,6 +520,7 @@ def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>; def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>; def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>; def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>; +def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>; def simm4s1 : Operand, ImmLeaf=-8 && Imm <= 7; }]> { @@ -409,37 +529,36 @@ def simm4s1 : Operand, ImmLeaf, ImmLeaf=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> { +[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> { let PrintMethod = "printImmScale<2>"; let ParserMatchClass = SImm4s2Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s3 : Operand, ImmLeaf=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> { +[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> { let PrintMethod = "printImmScale<3>"; let ParserMatchClass = SImm4s3Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s4 : Operand, ImmLeaf=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> { +[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> { let PrintMethod = "printImmScale<4>"; let ParserMatchClass = SImm4s4Operand; let DecoderMethod = "DecodeSImm<4>"; } def simm4s16 : Operand, ImmLeaf=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> { +[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> { let PrintMethod = "printImmScale<16>"; let ParserMatchClass = SImm4s16Operand; let DecoderMethod = "DecodeSImm<4>"; } - -class AsmImmRange : AsmOperandClass { - let Name = "Imm" # Low # "_" # High; - let DiagnosticType = "InvalidImm" # Low # "_" # High; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isImmInRange<" # Low # "," # High # ">"; +def simm4s32 : Operand, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> { + let PrintMethod = "printImmScale<32>"; + let ParserMatchClass = SImm4s32Operand; + let DecoderMethod = "DecodeSImm<4>"; } def Imm1_8Operand : AsmImmRange<1, 8>; @@ -461,76 +580,40 @@ def BranchTarget14Operand : BranchTarget<14>; def BranchTarget26Operand : BranchTarget<26>; def PCRelLabel19Operand : PCRelLabel<19>; -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; +def MovWSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG3"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g3 : Operand { - let ParserMatchClass = MovZSymbolG3AsmOperand; +def movw_symbol_g3 : Operand { + let ParserMatchClass = MovWSymbolG3AsmOperand; } -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; +def MovWSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG2"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g2 : Operand { - let ParserMatchClass = MovZSymbolG2AsmOperand; +def movw_symbol_g2 : Operand { + let ParserMatchClass = MovWSymbolG2AsmOperand; } -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; +def MovWSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG1"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g1 : Operand { - let ParserMatchClass = MovZSymbolG1AsmOperand; +def movw_symbol_g1 : Operand { + let ParserMatchClass = MovWSymbolG1AsmOperand; } -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; +def MovWSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovWSymbolG0"; let RenderMethod = "addImmOperands"; } -def movz_symbol_g0 : Operand { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG3"; - let RenderMethod = 
"addImmOperands"; -} - -def movk_symbol_g3 : Operand { - let ParserMatchClass = MovKSymbolG3AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand { - let ParserMatchClass = MovKSymbolG0AsmOperand; +def movw_symbol_g0 : Operand { + let ParserMatchClass = MovWSymbolG0AsmOperand; } class fixedpoint_i32 @@ -607,7 +690,40 @@ def vecshiftR64Narrow : Operand, ImmLeaf, TImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def tvecshiftR16 : Operand, TImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def tvecshiftR32 : Operand, TImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} +def tvecshiftR64 : Operand, TImmLeaf 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} + +def Imm0_0Operand : AsmImmRange<0, 0>; def Imm0_1Operand : AsmImmRange<0, 1>; +def Imm0_3Operand : AsmImmRange<0, 3>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; @@ -642,6 +758,36 @@ def vecshiftL64 : Operand, ImmLeaf, TImmLeaf { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def tvecshiftL16 : Operand, TImmLeaf { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def tvecshiftL32 : Operand, TImmLeaf { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def tvecshiftL64 : Operand, TImmLeaf { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; +} // Crazy immediate formats used by 32-bit and 64-bit logical immediate // instructions for splatting repeating bit patterns across the immediate. @@ -654,6 +800,11 @@ def logical_imm64_XFORM : SDNodeXFormgetTargetConstant(enc, SDLoc(N), MVT::i32); }]>; +def gi_logical_imm32_XFORM : GICustomOperandRenderer<"renderLogicalImm32">, + GISDNodeXFormEquiv; +def gi_logical_imm64_XFORM : GICustomOperandRenderer<"renderLogicalImm64">, + GISDNodeXFormEquiv; + let DiagnosticType = "LogicalSecondSource" in { def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; @@ -695,13 +846,15 @@ def logical_imm64_not : Operand { let ParserMatchClass = LogicalImm64NotOperand; } -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. 
-def Imm0_65535Operand : AsmImmRange<0, 65535>; -def imm0_65535 : Operand, ImmLeaf, PrintMethod = "printImmHex" in { +def timm32_0_65535 : Operand, TImmLeaf { - let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printImmHex"; +}]>; + +def timm64_0_65535 : Operand, TImmLeaf; } // imm0_255 predicate - True if the immediate is in the range [0,255]. @@ -723,6 +876,13 @@ def imm0_127 : Operand, ImmLeaf, ImmLeaf { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printImm"; +} + // NOTE: These imm0_N operands have to be of type i64 because i64 is the size // for all shift-amounts. @@ -733,6 +893,12 @@ def imm0_63 : Operand, ImmLeaf, TImmLeaf { + let ParserMatchClass = Imm0_63Operand; +} + // imm0_31 predicate - True if the immediate is in the range [0,31] def imm0_31 : Operand, ImmLeaf, ImmLeaf, TImmLeaf { + let ParserMatchClass = Imm0_31Operand; +} + // True if the 32-bit immediate is in the range [0,31] def imm32_0_31 : Operand, ImmLeaf, ImmLeaf, TImmLeaf { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand, ImmLeaf, ImmLeaf, ImmLeaf { + let ParserMatchClass = Imm0_3Operand; +} + +// timm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7] +def timm32_0_7 : Operand, TImmLeaf { + let ParserMatchClass = Imm0_7Operand; +} + // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand, ImmLeaf def arith_shifted_reg32 : arith_shifted_reg; def arith_shifted_reg64 : arith_shifted_reg; +def gi_arith_shifted_reg32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_arith_shifted_reg64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // An arithmetic shifter operand: // {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror // {5-0} - imm6 @@ -819,6 +1022,14 @@ class logical_shifted_reg def logical_shifted_reg32 : logical_shifted_reg; def logical_shifted_reg64 : logical_shifted_reg; +def gi_logical_shifted_reg32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_logical_shifted_reg64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // A logical vector shifter operand: // {7-6} - shift type: 00 = lsl // {5-0} - imm6: #0, #8, #16, or #24 @@ -900,6 +1111,14 @@ class neg_addsub_shifted_imm def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; +def gi_neg_addsub_shifted_imm32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def gi_neg_addsub_shifted_imm64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // An extend operand: // {5-3} - extend type // {2-0} - imm3 @@ -930,37 +1149,60 @@ class arith_extended_reg32to64 : Operand, let MIOperandInfo = (ops GPR32, arith_extend64); } +def arith_extended_reg32_i32 : arith_extended_reg32; +def gi_arith_extended_reg32_i32 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def arith_extended_reg32_i64 : arith_extended_reg32; +def gi_arith_extended_reg32_i64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + +def arith_extended_reg32to64_i64 : arith_extended_reg32to64; +def gi_arith_extended_reg32to64_i64 : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + // Floating-point immediate. 
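// For reference (editorial sketch, not from the LLVM sources): the rewrite
// below hoists the getFP16Imm/getFP32Imm/getFP64Imm encoders into named
// fpimmNNXForm records and adds exact-value matchers such as fpimm_half,
// which feed selection patterns and need no encoding transform. One more such
// matcher as a hedged sketch, assuming FPImmLeaf and fAny from LLVM's include
// context; the name is invented.
def ex_fpimm_quarter : FPImmLeaf<fAny, [{
  return Imm.isExactlyValue(+0.25);  // matches the FP constant 0.25 exactly
}]>;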
-def fpimm16 : Operand, - FPImmLeafgetValueAPF(); uint32_t enc = AArch64_AM::getFP16Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} -def fpimm32 : Operand, - FPImmLeaf; + +def fpimm32XForm : SDNodeXFormgetValueAPF(); uint32_t enc = AArch64_AM::getFP32Imm(InVal); return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); - }]>> { + }]>; + +def fpimm64XForm : SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>; + +def fpimm16 : Operand, + FPImmLeaf { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm32 : Operand, + FPImmLeaf { let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } def fpimm64 : Operand, FPImmLeafgetValueAPF(); - uint32_t enc = AArch64_AM::getFP64Imm(InVal); - return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); - }]>> { + }], fpimm64XForm> { let ParserMatchClass = FPImmOperand; let PrintMethod = "printFPImmOperand"; } @@ -974,6 +1216,25 @@ def fpimm0 : FPImmLeaf; +def fpimm_half : FPImmLeaf; + +def fpimm_one : FPImmLeaf; + +def fpimm_two : FPImmLeaf; + +def gi_fpimm16 : GICustomOperandRenderer<"renderFPImm16">, + GISDNodeXFormEquiv; +def gi_fpimm32 : GICustomOperandRenderer<"renderFPImm32">, + GISDNodeXFormEquiv; +def gi_fpimm64 : GICustomOperandRenderer<"renderFPImm64">, + GISDNodeXFormEquiv; + // Vector lane operands class AsmVectorIndex : AsmOperandClass { let Name = NamePrefix # "IndexRange" # Min # "_" # Max; @@ -982,23 +1243,47 @@ class AsmVectorIndex : AsmOperandClass { let RenderMethod = "addVectorIndexOperands"; } -class AsmVectorIndexOpnd - : Operand, ImmLeaf { +class AsmVectorIndexOpnd + : Operand { let ParserMatchClass = mc; let PrintMethod = "printVectorIndex"; } +multiclass VectorIndex { + def "" : AsmVectorIndexOpnd, ImmLeaf; + def _timm : AsmVectorIndexOpnd, TImmLeaf; +} + +def VectorIndex0Operand : AsmVectorIndex<0, 0>; def VectorIndex1Operand : AsmVectorIndex<1, 1>; def VectorIndexBOperand : AsmVectorIndex<0, 15>; def VectorIndexHOperand : AsmVectorIndex<0, 7>; def VectorIndexSOperand : AsmVectorIndex<0, 3>; def VectorIndexDOperand : AsmVectorIndex<0, 1>; -def VectorIndex1 : AsmVectorIndexOpnd; -def VectorIndexB : AsmVectorIndexOpnd; -def VectorIndexH : AsmVectorIndexOpnd; -def VectorIndexS : AsmVectorIndexOpnd; -def VectorIndexD : AsmVectorIndexOpnd; +defm VectorIndex0 : VectorIndex; +defm VectorIndex1 : VectorIndex; +defm VectorIndexB : VectorIndex; +defm VectorIndexH : VectorIndex; +defm VectorIndexS : VectorIndex; +defm VectorIndexD : VectorIndex; + +defm VectorIndex132b : VectorIndex; +defm VectorIndexB32b : VectorIndex; +defm VectorIndexH32b : VectorIndex; +defm VectorIndexS32b : VectorIndex; +defm VectorIndexD32b : VectorIndex; def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">; def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">; @@ -1006,16 +1291,52 @@ def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">; def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">; def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">; -def sve_elm_idx_extdup_b - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_h - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_s - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_d - : AsmVectorIndexOpnd; -def sve_elm_idx_extdup_q - : AsmVectorIndexOpnd; +defm sve_elm_idx_extdup_b + : VectorIndex; +defm 
sve_elm_idx_extdup_h + : VectorIndex; +defm sve_elm_idx_extdup_s + : VectorIndex; +defm sve_elm_idx_extdup_d + : VectorIndex; +defm sve_elm_idx_extdup_q + : VectorIndex; + +def sme_elm_idx0_0 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_0Operand; + let PrintMethod = "printMatrixIndex"; +} +def sme_elm_idx0_1 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_1Operand; + let PrintMethod = "printMatrixIndex"; +} +def sme_elm_idx0_3 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_3Operand; + let PrintMethod = "printMatrixIndex"; +} +def sme_elm_idx0_7 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_7Operand; + let PrintMethod = "printMatrixIndex"; +} +def sme_elm_idx0_15 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_15Operand; + let PrintMethod = "printMatrixIndex"; +} // 8-bit immediate for AdvSIMD where 64-bit values of the form: // aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh @@ -1057,13 +1378,63 @@ class SimpleSystemI - : BaseSystemI, +class RtSystemI pattern = []> + : BaseSystemI, Sched<[WriteSys]> { bits<5> Rt; let Inst{4-0} = Rt; } +// System instructions for transactional memory extension +class TMBaseSystemI CRm, bits<3> op2, dag oops, dag iops, + string asm, string operands, list pattern> + : BaseSystemI, + Sched<[WriteSys]> { + let Inst{20-12} = 0b000110011; + let Inst{11-8} = CRm; + let Inst{7-5} = op2; + let DecoderMethod = ""; + + let mayLoad = 1; + let mayStore = 1; +} + +// System instructions for transactional memory - single input operand +class TMSystemI CRm, string asm, list pattern> + : TMBaseSystemI<0b1, CRm, 0b011, + (outs GPR64:$Rt), (ins), asm, "\t$Rt", pattern> { + bits<5> Rt; + let Inst{4-0} = Rt; +} + +// System instructions that pass a register argument +// This class assumes the register is for input rather than output. +class RegInputSystemI CRm, bits<3> Op2, string asm, + list pattern = []> + : RtSystemI<0, (outs), (ins GPR64:$Rt), asm, "\t$Rt", pattern> { + let Inst{20-12} = 0b000110001; + let Inst{11-8} = CRm; + let Inst{7-5} = Op2; +} + +// System instructions for transactional memory - no operand +class TMSystemINoOperand CRm, string asm, list pattern> + : TMBaseSystemI<0b0, CRm, 0b011, (outs), (ins), asm, "", pattern> { + let Inst{4-0} = 0b11111; +} + +// System instructions for exit from transactions +class TMSystemException op1, string asm, list pattern> + : I<(outs), (ins timm64_0_65535:$imm), asm, "\t$imm", "", pattern>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-0} = 0b00000; +} + // Hint instructions that take both a CRm and a 3-bit immediate. 
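// For orientation (editorial sketch): the TMSystemI/TMSystemINoOperand/
// TMSystemException classes above carry the Transactional Memory Extension,
// and their instantiations live in AArch64InstrInfo.td. From memory they are
// approximately the following; the CRm values and intrinsic names are hedged
// and may differ upstream.
def TSTART  : TMSystemI<0b0000, "tstart",
                        [(set GPR64:$Rt, (int_aarch64_tstart))]>;
def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>;
def TCANCEL : TMSystemException<0b011, "tcancel",
                                [(int_aarch64_tcancel timm64_0_65535:$imm)]>;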
// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot // model patterns with sufficiently fine granularity @@ -1087,6 +1458,14 @@ def barrier_op : Operand { let PrintMethod = "printBarrierOption"; let ParserMatchClass = BarrierAsmOperand; } +def BarriernXSAsmOperand : AsmOperandClass { + let Name = "BarriernXS"; + let ParserMethod = "tryParseBarriernXSOperand"; +} +def barrier_nxs_op : Operand { + let PrintMethod = "printBarriernXSOption"; + let ParserMatchClass = BarriernXSAsmOperand; +} class CRmSystemI opc, string asm, list pattern = []> : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm", pattern>, @@ -1149,10 +1528,31 @@ def psbhint_op : Operand { }]; } +def BTIHintOperand : AsmOperandClass { + let Name = "BTIHint"; + let ParserMethod = "tryParseBTIHint"; +} +def btihint_op : Operand { + let ParserMatchClass = BTIHintOperand; + let PrintMethod = "printBTIHintOp"; + let MCOperandPredicate = [{ + // "bti" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + return AArch64BTIHint::lookupBTIByEncoding(MCOp.getImm() ^ 32) != nullptr; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; let Inst{20-5} = systemreg; + let DecoderNamespace = "Fallback"; + // The MRS is set as a NZCV setting instruction. Not all MRS instructions + // require doing this. The alternative was to explicitly model each one, but + // it feels like it is unnecessary because it seems there are no negative + // consequences setting these flags for all. + let Defs = [NZCV]; } // FIXME: Some of these def NZCV, others don't. Best way to model that? @@ -1162,6 +1562,7 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), "msr", "\t$systemreg, $Rt"> { bits<16> systemreg; let Inst{20-5} = systemreg; + let DecoderNamespace = "Fallback"; } def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { @@ -1173,16 +1574,23 @@ def pstatefield4_op : Operand { let PrintMethod = "printSystemPStateField"; } +// Instructions to modify PSTATE, no input reg let Defs = [NZCV] in +class PstateWriteSimple + : SimpleSystemI<0, iops, asm, operands> { + + let Inst{20-19} = 0b00; + let Inst{15-12} = 0b0100; +} + class MSRpstateImm0_15 - : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), - "msr", "\t$pstatefield, $imm">, + : PstateWriteSimple<(ins pstatefield4_op:$pstatefield, imm0_15:$imm), "msr", + "\t$pstatefield, $imm">, Sched<[WriteSys]> { + bits<6> pstatefield; bits<4> imm; - let Inst{20-19} = 0b00; let Inst{18-16} = pstatefield{5-3}; - let Inst{15-12} = 0b0100; let Inst{11-8} = imm; let Inst{7-5} = pstatefield{2-0}; @@ -1201,16 +1609,15 @@ def pstatefield1_op : Operand { let PrintMethod = "printSystemPStateField"; } -let Defs = [NZCV] in class MSRpstateImm0_1 - : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), - "msr", "\t$pstatefield, $imm">, + : PstateWriteSimple<(ins pstatefield1_op:$pstatefield, imm0_1:$imm), "msr", + "\t$pstatefield, $imm">, Sched<[WriteSys]> { + bits<6> pstatefield; bit imm; - let Inst{20-19} = 0b00; let Inst{18-16} = pstatefield{5-3}; - let Inst{15-9} = 0b0100000; + let Inst{11-9} = 0b000; let Inst{8} = imm; let Inst{7-5} = pstatefield{2-0}; @@ -1308,6 +1715,7 @@ class RCPCLoad sz, string asm, RegisterClass RC> class AuthBase M, dag oops, dag iops, string asm, string operands, list pattern> : I, Sched<[]> { + let isAuthenticated = 1; let Inst{31-25} = 0b1101011; let Inst{20-11} = 0b1111100001; let 
Inst{10} = M; @@ -1332,6 +1740,7 @@ class AuthOneOperand opc, bits<1> M, string asm> let Inst{9-5} = Rn; } +let Uses = [LR,SP] in class AuthReturn op, bits<1> M, string asm> : AuthBase { let Inst{24} = 0; @@ -1341,11 +1750,12 @@ class AuthReturn op, bits<1> M, string asm> let mayLoad = 1 in class BaseAuthLoad + string operands, string cstr> : I, Sched<[]> { bits<10> offset; bits<5> Rn; bits<5> Rt; + let isAuthenticated = 1; let Inst{31-24} = 0b11111000; let Inst{23} = M; let Inst{22} = offset{9}; @@ -1355,19 +1765,24 @@ class BaseAuthLoad { def indexed : BaseAuthLoad; + asm, "\t$Rt, [$Rn, $offset]", "">; def writeback : BaseAuthLoad; + "$Rn = $wback,@earlyclobber $wback">; def : InstAlias(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>; + + def : InstAlias(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0), 0>; } //--- @@ -1401,10 +1816,10 @@ def am_brcond : Operand { let OperandType = "OPERAND_PCREL"; } -class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), - "b", ".$cond\t$target", "", - [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, - Sched<[WriteBr]> { +class BranchCond + : I<(outs), (ins ccode:$cond, am_brcond:$target), + mnemonic, ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, Sched<[WriteBr]> { let isBranch = 1; let isTerminator = 1; let Uses = [NZCV]; @@ -1413,7 +1828,7 @@ class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), bits<19> target; let Inst{31-24} = 0b01010100; let Inst{23-5} = target; - let Inst{4} = 0; + let Inst{4} = bit4; let Inst{3-0} = cond; } @@ -1597,9 +2012,10 @@ class OneXRegData opc, string asm, SDPatternOperator node> let Inst{31} = 1; } -class SignAuthOneData opcode_prefix, bits<2> opcode, string asm> - : I<(outs GPR64:$Rd), (ins GPR64sp:$Rn), asm, "\t$Rd, $Rn", "", - []>, +class SignAuthOneData opcode_prefix, bits<2> opcode, string asm, + SDPatternOperator op> + : I<(outs GPR64:$dst), (ins GPR64:$Rd, GPR64sp:$Rn), asm, "\t$Rd, $Rn", + "$dst = $Rd", [(set GPR64:$dst, (op GPR64:$Rd, opcode, GPR64sp:$Rn))]>, Sched<[WriteI, ReadI]> { bits<5> Rd; bits<5> Rn; @@ -1610,8 +2026,11 @@ class SignAuthOneData opcode_prefix, bits<2> opcode, string asm> let Inst{4-0} = Rd; } -class SignAuthZero opcode_prefix, bits<2> opcode, string asm> - : I<(outs GPR64:$Rd), (ins), asm, "\t$Rd", "", []>, Sched<[]> { +class SignAuthZero opcode_prefix, bits<2> opcode, string asm, + SDPatternOperator op> + : I<(outs GPR64:$dst), (ins GPR64:$Rd), asm, "\t$Rd", "$dst = $Rd", + [(set GPR64:$dst, (op GPR64:$Rd, opcode, (i64 0)))]>, + Sched<[]> { bits<5> Rd; let Inst{31-15} = 0b11011010110000010; let Inst{14-12} = opcode_prefix; @@ -1637,11 +2056,21 @@ class SignAuthTwoOperand opc, string asm, let Inst{4-0} = Rd; } +class ClearAuth data, string asm> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn), asm, "\t$Rd", "$Rd = $Rn", []>, Sched<[]> { + bits<5> Rd; + let Inst{31-11} = 0b110110101100000101000; + let Inst{10} = data; + let Inst{9-5} = 0b11111; + let Inst{4-0} = Rd; +} + // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation : I<(outs), iops, asm, ops, "", []>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; + let Defs = [NZCV]; bits<5> Rn; let Inst{31} = sf; let Inst{30-15} = 0b0111010000000000; @@ -1719,10 +2148,12 @@ multiclass AddSubCarry opc, RegisterClass regtype, string asm, - SDPatternOperator OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + SDPatternOperator OpNode, + RegisterClass in1regtype = regtype, + RegisterClass in2regtype = regtype> + : I<(outs regtype:$Rd), (ins 
in1regtype:$Rn, in2regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { + [(set regtype:$Rd, (OpNode in1regtype:$Rn, in2regtype:$Rm))]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -1779,6 +2210,14 @@ multiclass Shift shift_type, string asm, SDNode OpNode> { def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast(NAME # "Xr") GPR64:$Rn, + (SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>; + + def : Pat<(i64 (OpNode GPR64:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast(NAME # "Xr") GPR64:$Rn, + (SUBREG_TO_REG (i32 0), GPR32:$Rm, sub_32))>; } class ShiftAlias @@ -1803,16 +2242,14 @@ class BaseMulAccum opc, RegisterClass multype, let Inst{4-0} = Rd; } -multiclass MulAccum { +multiclass MulAccum { // MADD/MSUB generation is decided by MachineCombiner.cpp - def Wrrr : BaseMulAccum, + def Wrrr : BaseMulAccum, Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 0; } - def Xrrr : BaseMulAccum, + def Xrrr : BaseMulAccum, Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } @@ -1902,7 +2339,7 @@ class ADRI pattern> //--- def movimm32_imm : Operand { - let ParserMatchClass = Imm0_65535Operand; + let ParserMatchClass = AsmImmRange<0, 65535>; let EncoderMethod = "getMoveWideImmOpValue"; let PrintMethod = "printImm"; } @@ -1977,23 +2414,29 @@ multiclass InsertImmediate opc, string asm> { //--- class BaseAddSubImm - : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", - [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, - Sched<[WriteI, ReadI]> { + string asm_inst, string asm_ops, + dag inputs, dag pattern> + : I<(outs dstRegtype:$Rd), inputs, asm_inst, asm_ops, "", [pattern]>, + Sched<[WriteI, ReadI]> { bits<5> Rd; bits<5> Rn; - bits<14> imm; let Inst{30} = isSub; let Inst{29} = setFlags; let Inst{28-24} = 0b10001; - let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 - let Inst{21-10} = imm{11-0}; let Inst{9-5} = Rn; let Inst{4-0} = Rd; - let DecoderMethod = "DecodeBaseAddSubImm"; +} + +class AddSubImmShift + : BaseAddSubImm { + bits<14> imm; + let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 + let Inst{21-10} = imm{11-0}; + let DecoderMethod = "DecodeAddSubImmShift"; } class BaseAddSubRegPseudo { let Inst{31} = 0; } let AddedComplexity = 6 in - def Xri : BaseAddSubImm { let Inst{31} = 1; } @@ -2125,11 +2568,11 @@ multiclass AddSub, mnemonic, OpNode> { + arith_extended_reg32_i32, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + arith_extended_reg32to64_i64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2173,11 +2616,11 @@ multiclass AddSubS { let isCompare = 1, Defs = [NZCV] in { // Add/Subtract immediate - def Wri : BaseAddSubImm { let Inst{31} = 0; } - def Xri : BaseAddSubImm { let Inst{31} = 1; } @@ -2199,11 +2642,11 @@ multiclass AddSubS, mnemonic, OpNode> { + arith_extended_reg32_i32, mnemonic, OpNode> { let Inst{31} = 0; } def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + arith_extended_reg32_i64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2271,6 +2714,27 @@ multiclass AddSubS; // UXTX #0 } +class AddSubG + : BaseAddSubImm< + isSub, 0, GPR64sp, asm_inst, "\t$Rd, $Rn, $imm6, $imm4", + (ins GPR64sp:$Rn, uimm6s16:$imm6, imm0_15:$imm4), + (set GPR64sp:$Rd, (OpNode GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4))> { + bits<6> imm6; + bits<4> imm4; + let Inst{31} = 1; + let Inst{23-22} = 0b10; + let Inst{21-16} = 
imm6; + let Inst{15-14} = 0b00; + let Inst{13-10} = imm4; + let Unpredictable{15-14} = 0b11; +} + +class SUBP + : BaseTwoOperand<0b0000, GPR64, asm_instr, OpNode, GPR64sp, GPR64sp> { + let Inst{31} = 1; + let Inst{29} = setsFlags; +} + //--- // Extract //--- @@ -2713,11 +3177,18 @@ def maski16_or_more : Operand, // (unsigned immediate) // Indexed for 8-bit registers. offset is in range [0,4095]. -def am_indexed8 : ComplexPattern; -def am_indexed16 : ComplexPattern; -def am_indexed32 : ComplexPattern; -def am_indexed64 : ComplexPattern; -def am_indexed128 : ComplexPattern; +def am_indexed8 : ComplexPattern; +def am_indexed16 : ComplexPattern; +def am_indexed32 : ComplexPattern; +def am_indexed64 : ComplexPattern; +def am_indexed128 : ComplexPattern; + +// (unsigned immediate) +// Indexed for 8-bit registers. offset is in range [0,63]. +def am_indexed8_6b : ComplexPattern", []>; +def am_indexed16_6b : ComplexPattern", []>; +def am_indexed32_6b : ComplexPattern", []>; +def am_indexed64_6b : ComplexPattern", []>; def gi_am_indexed8 : GIComplexOperandMatcher">, @@ -2782,7 +3253,7 @@ class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, let DecoderMethod = "DecodeUnsignedLdStInstruction"; } -multiclass LoadUI sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass LoadUI sz, bit V, bits<2> opc, DAGOperand regtype, Operand indextype, string asm, list pattern> { let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in def ui : BaseLoadStoreUI sz, bit V, bits<2> opc, RegisterOperand regtype, (!cast(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } -multiclass StoreUI sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass StoreUI sz, bit V, bits<2> opc, DAGOperand regtype, Operand indextype, string asm, list pattern> { let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in def ui : BaseLoadStoreUI { let OperandType = "OPERAND_PCREL"; } -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadLiteral opc, bit V, RegisterOperand regtype, string asm> +let mayLoad = 1, mayStore = 0, hasSideEffects = 0, AddedComplexity = 20 in +class LoadLiteral opc, bit V, RegisterOperand regtype, string asm, list pat> : I<(outs regtype:$Rt), (ins am_ldrlit:$label), - asm, "\t$Rt, $label", "", []>, + asm, "\t$Rt, $label", "", pat>, Sched<[WriteLD]> { bits<5> Rt; bits<19> label; @@ -2887,17 +3358,49 @@ class PrefetchLiteral opc, bit V, string asm, list pat> // Load/store register offset //--- -def ro_Xindexed8 : ComplexPattern", []>; -def ro_Xindexed16 : ComplexPattern", []>; -def ro_Xindexed32 : ComplexPattern", []>; -def ro_Xindexed64 : ComplexPattern", []>; -def ro_Xindexed128 : ComplexPattern", []>; +def ro_Xindexed8 : ComplexPattern", []>; +def ro_Xindexed16 : ComplexPattern", []>; +def ro_Xindexed32 : ComplexPattern", []>; +def ro_Xindexed64 : ComplexPattern", []>; +def ro_Xindexed128 : ComplexPattern", []>; -def ro_Windexed8 : ComplexPattern", []>; -def ro_Windexed16 : ComplexPattern", []>; -def ro_Windexed32 : ComplexPattern", []>; -def ro_Windexed64 : ComplexPattern", []>; -def ro_Windexed128 : ComplexPattern", []>; +def gi_ro_Xindexed8 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed16 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed32 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed64 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Xindexed128 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; + +def ro_Windexed8 : ComplexPattern", []>; +def 
ro_Windexed16 : ComplexPattern", []>; +def ro_Windexed32 : ComplexPattern", []>; +def ro_Windexed64 : ComplexPattern", []>; +def ro_Windexed128 : ComplexPattern", []>; + +def gi_ro_Windexed8 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Windexed16 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Windexed32 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Windexed64 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; +def gi_ro_Windexed128 : + GIComplexOperandMatcher">, + GIComplexPatternEquiv; class MemExtendOperand : AsmOperandClass { let Name = "Mem" # Reg # "Extend" # Width; @@ -2965,8 +3468,8 @@ def ro64 : ROAddrMode; def ro128 : ROAddrMode; -class LoadStore8RO sz, bit V, bits<2> opc, RegisterOperand regtype, - string asm, dag ins, dag outs, list pat> +class LoadStore8RO sz, bit V, bits<2> opc, string asm, dag ins, + dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; @@ -2987,14 +3490,14 @@ class LoadStore8RO sz, bit V, bits<2> opc, RegisterOperand regtype, let Inst{4-0} = Rt; } -class ROInstAlias +class ROInstAlias : InstAlias; -multiclass Load8RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Load8RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10 in - def roW : LoadStore8RO sz, bit V, bits<2> opc, RegisterOperand regtype, } let AddedComplexity = 10 in - def roX : LoadStore8RO sz, bit V, bits<2> opc, RegisterOperand regtype, def : ROInstAlias(NAME # "roX")>; } -multiclass Store8RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Store8RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10 in - def roW : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in - def roX : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } -class LoadStore16RO sz, bit V, bits<2> opc, RegisterOperand regtype, - string asm, dag ins, dag outs, list pat> +class LoadStore16RO sz, bit V, bits<2> opc, string asm, dag ins, + dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; @@ -3065,10 +3568,10 @@ class LoadStore16RO sz, bit V, bits<2> opc, RegisterOperand regtype, let Inst{4-0} = Rt; } -multiclass Load16RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Load16RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10 in - def roW : LoadStore16RO sz, bit V, bits<2> opc, RegisterOperand regtype, } let AddedComplexity = 10 in - def roX : LoadStore16RO sz, bit V, bits<2> opc, RegisterOperand regtype, def : ROInstAlias(NAME # "roX")>; } -multiclass Store16RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Store16RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10 in - def roW : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in - def roX : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } -class LoadStore32RO sz, bit V, bits<2> opc, RegisterOperand regtype, - string asm, dag ins, dag outs, list pat> +class LoadStore32RO sz, bit V, 
bits<2> opc, string asm, dag ins, + dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; @@ -3137,10 +3640,10 @@ class LoadStore32RO sz, bit V, bits<2> opc, RegisterOperand regtype, let Inst{4-0} = Rt; } -multiclass Load32RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Load32RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10 in - def roW : LoadStore32RO sz, bit V, bits<2> opc, RegisterOperand regtype, } let AddedComplexity = 10 in - def roX : LoadStore32RO sz, bit V, bits<2> opc, RegisterOperand regtype, def : ROInstAlias(NAME # "roX")>; } -multiclass Store32RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Store32RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10 in - def roW : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10 in - def roX : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } -class LoadStore64RO sz, bit V, bits<2> opc, RegisterOperand regtype, - string asm, dag ins, dag outs, list pat> +class LoadStore64RO sz, bit V, bits<2> opc, string asm, dag ins, + dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; @@ -3209,10 +3712,10 @@ class LoadStore64RO sz, bit V, bits<2> opc, RegisterOperand regtype, let Inst{4-0} = Rt; } -multiclass Load64RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Load64RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roW : LoadStore64RO sz, bit V, bits<2> opc, RegisterOperand regtype, } let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roX : LoadStore64RO sz, bit V, bits<2> opc, RegisterOperand regtype, def : ROInstAlias(NAME # "roX")>; } -multiclass Store64RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Store64RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator storeop> { let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roW : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b0; } let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roX : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b1; } def : ROInstAlias(NAME # "roX")>; } -class LoadStore128RO sz, bit V, bits<2> opc, RegisterOperand regtype, - string asm, dag ins, dag outs, list pat> +class LoadStore128RO sz, bit V, bits<2> opc, string asm, dag ins, + dag outs, list pat> : I { bits<5> Rt; bits<5> Rn; @@ -3281,10 +3784,10 @@ class LoadStore128RO sz, bit V, bits<2> opc, RegisterOperand regtype, let Inst{4-0} = Rt; } -multiclass Load128RO sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass Load128RO sz, bit V, bits<2> opc, DAGOperand regtype, string asm, ValueType Ty, SDPatternOperator loadop> { let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roW : LoadStore128RO sz, bit V, bits<2> opc, RegisterOperand regtype, } let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roX : LoadStore128RO sz, bit V, bits<2> opc, RegisterOperand regtype, def : 
ROInstAlias(NAME # "roX")>; } -multiclass Store128RO sz, bit V, bits<2> opc, RegisterOperand regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { +multiclass Store128RO sz, bit V, bits<2> opc, DAGOperand regtype, + string asm> { let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roW : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b0; } let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roX : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]> { + Sched<[WriteSTIdx, ReadST, ReadAdrBase]> { let Inst{13} = 0b1; } @@ -3377,11 +3880,11 @@ multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { // Load/store unscaled immediate //--- -def am_unscaled8 : ComplexPattern; -def am_unscaled16 : ComplexPattern; -def am_unscaled32 : ComplexPattern; -def am_unscaled64 : ComplexPattern; -def am_unscaled128 :ComplexPattern; +def am_unscaled8 : ComplexPattern; +def am_unscaled16 : ComplexPattern; +def am_unscaled32 : ComplexPattern; +def am_unscaled64 : ComplexPattern; +def am_unscaled128 :ComplexPattern; def gi_am_unscaled8 : GIComplexOperandMatcher, @@ -3422,7 +3925,7 @@ class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, // Armv8.4 LDAPR & STLR with Immediate Offset instruction multiclass BaseLoadUnscaleV84 sz, bits<2> opc, - RegisterOperand regtype > { + DAGOperand regtype > { def i : BaseLoadStoreUnscale, Sched<[WriteST]> { @@ -3434,7 +3937,7 @@ multiclass BaseLoadUnscaleV84 sz, bits<2> opc, } multiclass BaseStoreUnscaleV84 sz, bits<2> opc, - RegisterOperand regtype > { + DAGOperand regtype > { def i : BaseLoadStoreUnscale, @@ -3446,7 +3949,7 @@ multiclass BaseStoreUnscaleV84 sz, bits<2> opc, (!cast(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } -multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass LoadUnscaled sz, bit V, bits<2> opc, DAGOperand regtype, string asm, list pattern> { let AddedComplexity = 1 in // try this before LoadUI def i : BaseLoadStoreUnscale sz, bit V, bits<2> opc, RegisterOperand regtype, (!cast(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } -multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterOperand regtype, +multiclass StoreUnscaled sz, bit V, bits<2> opc, DAGOperand regtype, string asm, list pattern> { let AddedComplexity = 1 in // try this before StoreUI def i : BaseLoadStoreUnscale sz, bit V, bits<2> opc, RegisterOperand regtype, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, "$Rn = $wback,@earlyclobber $wback", []>, - Sched<[WriteLD, WriteAdr]>; + Sched<[WriteAdr, WriteLD]>; let mayStore = 1, mayLoad = 0 in class StorePreIdx sz, bit V, bits<2> opc, RegisterOperand regtype, @@ -3606,7 +4109,7 @@ class LoadPostIdx sz, bit V, bits<2> opc, RegisterOperand regtype, (outs GPR64sp:$wback, regtype:$Rt), (ins GPR64sp:$Rn, simm9:$offset), asm, "$Rn = $wback,@earlyclobber $wback", []>, - Sched<[WriteLD, WriteAdr]>; + Sched<[WriteAdr, WriteLD]>; let mayStore = 1, mayLoad = 0 in class StorePostIdx sz, bit V, bits<2> opc, RegisterOperand regtype, @@ -3703,7 +4206,7 @@ class LoadPairPreIdx opc, bit V, RegisterOperand regtype, : BaseLoadStorePairPreIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; + Sched<[WriteAdr, WriteLD, WriteLDHi]>; let mayStore = 1, mayLoad = 0 in class StorePairPreIdx opc, bit V, RegisterOperand regtype, @@ -3744,7 +4247,7 @@ class LoadPairPostIdx opc, bit V, RegisterOperand regtype, : BaseLoadStorePairPostIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; + Sched<[WriteAdr, WriteLD, WriteLDHi]>; 
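// For reference (editorial sketch, not from the LLVM sources): the
// register-offset LoadStore*RO families above share one convention: when the
// optional extend/shift is present, its amount equals log2 of the access
// size, so the index register counts elements rather than bytes, as in
// ldr x0, [x1, w2, sxtw #3]. Standalone TableGen; needs an llvm-tblgen new
// enough for !logtwo, and the names are invented.
class ROShiftAmount<int bytes> { int Shift = !logtwo(bytes); }
def ex_ro32_shift  : ROShiftAmount<4>;   // ldr w: shift #2
def ex_ro64_shift  : ROShiftAmount<8>;   // ldr x: shift #3
def ex_ro128_shift : ROShiftAmount<16>;  // ldr q: shift #4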
let mayStore = 1, mayLoad = 0 in class StorePairPostIdx opc, bit V, RegisterOperand regtype, @@ -3778,7 +4281,7 @@ class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, let DecoderMethod = "DecodePairLdStInstruction"; } -multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, +multiclass LoadPairNoAlloc opc, bit V, DAGOperand regtype, Operand indextype, string asm> { let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in def i : BaseLoadStorePairNoAlloc opc, bit V, RegisterClass regtype, GPR64sp:$Rn, 0)>; } -multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, +multiclass StorePairNoAlloc opc, bit V, DAGOperand regtype, Operand indextype, string asm> { let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in def i : BaseLoadStorePairNoAlloc sz, bit o2, bit L, bit o1, bit o0, let Constraints = "@earlyclobber $Ws"; } +// Armv8.5-A Memory Tagging Extension +class BaseMemTag opc1, bits<2> opc2, string asm_insn, + string asm_opnds, string cstr, dag oops, dag iops> + : I, + Sched<[]> { + bits<5> Rn; + + let Inst{31-24} = 0b11011001; + let Inst{23-22} = opc1; + let Inst{21} = 1; + // Inst{20-12} defined by subclass + let Inst{11-10} = opc2; + let Inst{9-5} = Rn; + // Inst{4-0} defined by subclass +} + +class MemTagVector + : BaseMemTag<{0b1, Load}, 0b00, asm_insn, asm_opnds, + "", oops, iops> { + bits<5> Rt; + + let Inst{20-12} = 0b000000000; + let Inst{4-0} = Rt; + + let mayLoad = Load; +} + +class MemTagLoad + : BaseMemTag<0b01, 0b00, asm_insn, asm_opnds, "$Rt = $wback", + (outs GPR64:$wback), + (ins GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)> { + bits<5> Rt; + bits<9> offset; + + let Inst{20-12} = offset; + let Inst{4-0} = Rt; + + let mayLoad = 1; +} + +class BaseMemTagStore opc1, bits<2> opc2, string asm_insn, + string asm_opnds, string cstr, dag oops, dag iops> + : BaseMemTag { + bits<5> Rt; + bits<9> offset; + + let Inst{20-12} = offset; + let Inst{4-0} = Rt; + + let mayStore = 1; +} + +multiclass MemTagStore opc1, string insn> { + def Offset : + BaseMemTagStore; + def PreIndex : + BaseMemTagStore; + def PostIndex : + BaseMemTagStore; + + def : InstAlias(NAME # "Offset") GPR64sp:$Rt, GPR64sp:$Rn, 0)>; +} + //--- // Exception generation //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + : I<(outs), (ins timm32_0_65535:$imm), asm, "\t$imm", "", []>, Sched<[WriteSys]> { bits<16> imm; let Inst{31-24} = 0b11010100; @@ -3948,6 +4523,19 @@ class ExceptionGeneration op1, bits<2> ll, string asm> let Inst{1-0} = ll; } +//--- +// UDF : Permanently UNDEFINED instructions. Format: Opc = 0x0000, 16 bit imm. 
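// (Editorial sketch, not from the LLVM sources: UDFType below is instantiated
// exactly once upstream, as far as I recall via `def UDF : UDFType<0, "udf">;`.
// The encoding is trivial: the top sixteen bits are the all-zero opcode and
// the low sixteen bits carry the payload immediate, so a udf word is just the
// immediate itself. Standalone sketch, invented names.)
class UDFWord<int imm16> {
  int Word = imm16;   // Inst{31-16} = 0x0000, Inst{15-0} = imm16
}
def ex_udf_0    : UDFWord<0>;       // udf #0      -> 0x00000000
def ex_udf_dead : UDFWord<0xdead>;  // udf #0xdead -> 0x0000dead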
+//-- +let hasSideEffects = 1, isTrap = 1, mayLoad = 0, mayStore = 0 in { +class UDFType opc, string asm> + : I<(outs), (ins uimm16:$imm), + asm, "\t$imm", "", []>, + Sched<[]> { + bits<16> imm; + let Inst{31-16} = opc; + let Inst{15-0} = imm; +} +} let Predicates = [HasFPARMv8] in { //--- @@ -3998,14 +4586,14 @@ multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { // Unscaled half-precision to 32-bit def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> { let Inst{31} = 0; // 32-bit GPR flag let Predicates = [HasFullFP16]; } // Unscaled half-precision to 64-bit def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, - [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; } @@ -4040,7 +4628,7 @@ multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, // Scaled half-precision to 32-bit def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, fixedpoint_f16_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn), fixedpoint_f16_i32:$scale)))]> { let Inst{31} = 0; // 32-bit GPR flag let scale{5} = 1; @@ -4050,7 +4638,7 @@ multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, // Scaled half-precision to 64-bit def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, fixedpoint_f16_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn), fixedpoint_f16_i64:$scale)))]> { let Inst{31} = 1; // 64-bit GPR flag let Predicates = [HasFullFP16]; @@ -4115,7 +4703,7 @@ class BaseIntegerToFP + ValueType dvt, string asm, SDPatternOperator node> : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, Sched<[WriteFCvt]> { @@ -4130,7 +4718,7 @@ class BaseIntegerToFPUnscaled { +multiclass IntegerToFP { // Unscaled def UWHri: BaseIntegerToFPUnscaled { let Inst{31} = 0; // 32-bit GPR flag @@ -4166,7 +4754,7 @@ multiclass IntegerToFP { // Scaled def SWHri: BaseIntegerToFP { let Inst{31} = 0; // 32-bit GPR flag @@ -4194,7 +4782,7 @@ multiclass IntegerToFP { } def SXHri: BaseIntegerToFP { let Inst{31} = 1; // 64-bit GPR flag @@ -4367,19 +4955,19 @@ class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, multiclass FPConversion { // Double-precision to Half-precision def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, - [(set FPR16:$Rd, (fpround FPR64:$Rn))]>; + [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>; // Double-precision to Single-precision def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fpround FPR64:$Rn))]>; + [(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>; // Half-precision to Double-precision def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>; + [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>; // Half-precision to Single-precision def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>; + [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>; // Single-precision to Double-precision def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, @@ -4387,7 +4975,7 @@ multiclass FPConversion { // Single-precision to Half-precision def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, - [(set FPR16:$Rd, (fpround FPR32:$Rn))]>; + [(set (f16 FPR16:$Rd), 
(any_fpround FPR32:$Rn))]>; } //--- @@ -4395,7 +4983,7 @@ multiclass FPConversion { //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSingleOperandFPData opcode, RegisterClass regtype, +class BaseSingleOperandFPData opcode, RegisterClass regtype, ValueType vt, string asm, SDPatternOperator node> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, @@ -4403,8 +4991,8 @@ class BaseSingleOperandFPData opcode, RegisterClass regtype, bits<5> Rd; bits<5> Rn; let Inst{31-24} = 0b00011110; - let Inst{21-19} = 0b100; - let Inst{18-15} = opcode; + let Inst{21} = 0b1; + let Inst{20-15} = opcode; let Inst{14-10} = 0b10000; let Inst{9-5} = Rn; let Inst{4-0} = Rd; @@ -4412,20 +5000,37 @@ class BaseSingleOperandFPData opcode, RegisterClass regtype, multiclass SingleOperandFPData opcode, string asm, SDPatternOperator node = null_frag> { - def Hr : BaseSingleOperandFPData { + + def Hr : BaseSingleOperandFPData<{0b00,opcode}, FPR16, f16, asm, node> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } - def Sr : BaseSingleOperandFPData { + def Sr : BaseSingleOperandFPData<{0b00,opcode}, FPR32, f32, asm, node> { let Inst{23-22} = 0b00; // 32-bit size flag } - def Dr : BaseSingleOperandFPData { + def Dr : BaseSingleOperandFPData<{0b00,opcode}, FPR64, f64, asm, node> { let Inst{23-22} = 0b01; // 64-bit size flag } } +multiclass SingleOperandFPNo16 opcode, string asm, + SDPatternOperator node = null_frag>{ + + def Sr : BaseSingleOperandFPData { + let Inst{23-22} = 0b00; // 32-bit registers + } + + def Dr : BaseSingleOperandFPData { + let Inst{23-22} = 0b01; // 64-bit registers + } +} + +// FRInt[32|64][Z|N] instructions +multiclass FRIntNNT opcode, string asm, SDPatternOperator node = null_frag> : + SingleOperandFPNo16<{0b0100,opcode}, asm, node>; + //--- // Two operand floating point data processing //--- @@ -4472,7 +5077,7 @@ multiclass TwoOperandFPData opcode, string asm, multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { def Hrr : BaseTwoOperandFPData { + [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; } @@ -4514,7 +5119,7 @@ class BaseThreeOperandFPData { def Hrrr : BaseThreeOperandFPData { let Inst{23-22} = 0b11; // 16-bit size flag let Predicates = [HasFullFP16]; @@ -4576,7 +5181,7 @@ multiclass FPComparison { let Defs = [NZCV] in { def Hrr : BaseTwoOperandFPComparison { + [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } @@ -4751,7 +5356,7 @@ class BaseSIMDThreeSameVector size, bits<5> opcode, : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -4774,7 +5379,7 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -4790,23 +5395,45 @@ class BaseSIMDThreeSameVectorTied size, bits<5> opcode, let Inst{4-0} = Rd; } -class BaseSIMDThreeSameVectorDot : - BaseSIMDThreeSameVectorTied { - let AsmString = !strconcat(asm, "{\t$Rd" # 
kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); -} +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorPseudo pattern> + : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>, + Sched<[!if(!eq(regtype, V128), WriteVq, WriteVd)]>; -multiclass SIMDThreeSameVectorDot { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64, - v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128, - v4i32, v16i8, OpNode>; +multiclass SIMDLogicalThreeVectorPseudo { + def v8i8 : BaseSIMDThreeSameVectorPseudo; + def v16i8 : BaseSIMDThreeSameVectorPseudo; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; } // All operand sizes distinguished in the encoding. @@ -4835,6 +5462,24 @@ multiclass SIMDThreeSameVector opc, string asm, [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } +multiclass SIMDThreeSameVectorExtraPatterns { + def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)), + (!cast(inst#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), + (!cast(inst#"v4i16") V64:$LHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), + (!cast(inst#"v2i32") V64:$LHS, V64:$RHS)>; + + def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)), + (!cast(inst#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), + (!cast(inst#"v8i16") V128:$LHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), + (!cast(inst#"v4i32") V128:$LHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), + (!cast(inst#"v2i64") V128:$LHS, V128:$RHS)>; +} + // As above, but D sized elements unsupported. multiclass SIMDThreeSameVectorBHS opc, string asm, SDPatternOperator OpNode> { @@ -5011,7 +5656,7 @@ multiclass SIMDLogicalThreeVector size, string asm, } multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { + string asm, SDPatternOperator OpNode = null_frag> { def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), @@ -5049,6 +5694,51 @@ multiclass SIMDLogicalThreeVectorTied size, V128:$LHS, V128:$MHS, V128:$RHS)>; } +// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract +// bytes from S-sized elements. 
+class BaseSIMDThreeSameVectorDot : + BaseSIMDThreeSameVectorTied { + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); +} + +multiclass SIMDThreeSameVectorDot { + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, + v2i32, v8i8, OpNode>; + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, + v4i32, v16i8, OpNode>; +} + +// ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions +// select inputs from 4H vectors and accumulate outputs to a 2S vector (or from +// 8H to 4S, when Q=1). +class BaseSIMDThreeSameVectorFML size, string asm, string kind1, + string kind2, RegisterOperand RegType, + ValueType AccumType, ValueType InputType, + SDPatternOperator OpNode> : + BaseSIMDThreeSameVectorTied { + let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); + let Inst{13} = b13; +} + +multiclass SIMDThreeSameVectorFML size, string asm, + SDPatternOperator OpNode> { + def v4f16 : BaseSIMDThreeSameVectorFML<0, U, b13, size, asm, ".2s", ".2h", V64, + v2f32, v4f16, OpNode>; + def v8f16 : BaseSIMDThreeSameVectorFML<1, U, b13, size, asm, ".4s", ".4h", V128, + v4f32, v8f16, OpNode>; +} + //---------------------------------------------------------------------------- // AdvSIMD two register vector instructions. @@ -5061,7 +5751,7 @@ class BaseSIMDTwoSameVector size, bits<5> opcode, : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5086,7 +5776,7 @@ class BaseSIMDTwoSameVectorTied size, bits<5> opcode, : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5132,7 +5822,7 @@ class BaseSIMDVectorLShiftLongBySize size, : I<(outs V128:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5293,7 +5983,7 @@ multiclass SIMDTwoVectorBH opc, string asm, [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } -// Supports only S and D element sizes, uses high bit of the size field +// Supports H, S and D element sizes, uses high bit of the size field // as an extra opcode bit. multiclass SIMDTwoVectorFP opc, string asm, SDPatternOperator OpNode> { @@ -5316,6 +6006,25 @@ multiclass SIMDTwoVectorFP opc, string asm, [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } +// Supports only S and D element sizes +multiclass SIMDTwoVectorSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + + def v2f32 : BaseSIMDTwoSameVector<0, U, 00, opc, 0b00, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, 00, opc, 0b00, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, 01, opc, 0b00, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +multiclass FRIntNNTVector : + SIMDTwoVectorSD; + // Supports only S element size. 
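// For orientation (editorial sketch): the dot-product and FML multiclasses
// above are instantiated in AArch64InstrInfo.td; from memory, the dot-product
// records are approximately the following, with the Mixed bit selecting the
// Armv8.6 mixed-sign usdot encoding. The operator names and predicate guards
// are hedged and may differ upstream.
defm SDOT  : SIMDThreeSameVectorDot<0, 0, "sdot",  int_aarch64_neon_sdot>;
defm UDOT  : SIMDThreeSameVectorDot<1, 0, "udot",  int_aarch64_neon_udot>;
defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;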
multiclass SIMDTwoVectorS opc, string asm, SDPatternOperator OpNode> { @@ -5370,7 +6079,7 @@ multiclass SIMDTwoVectorIntToFP opc, string asm, [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } - +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDMixedTwoVector size, bits<5> opcode, RegisterOperand inreg, RegisterOperand outreg, string asm, string outkind, string inkind, @@ -5378,7 +6087,7 @@ class BaseSIMDMixedTwoVector size, bits<5> opcode, : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, "{\t$Rd" # outkind # ", $Rn" # inkind # "|" # outkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5393,6 +6102,7 @@ class BaseSIMDMixedTwoVector size, bits<5> opcode, let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, RegisterOperand inreg, RegisterOperand outreg, string asm, string outkind, string inkind, @@ -5400,7 +6110,7 @@ class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, "{\t$Rd" # outkind # ", $Rn" # inkind # "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5452,7 +6162,7 @@ class BaseSIMDCmpTwoVector size, bits<2> size2, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5550,7 +6260,7 @@ class BaseSIMDFPCvtTwoVector size, bits<5> opcode, list pattern> : I<(outs outtype:$Rd), (ins intype:$Rn), asm, !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5565,13 +6275,14 @@ class BaseSIMDFPCvtTwoVector size, bits<5> opcode, let Inst{4-0} = Rd; } +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, RegisterOperand outtype, RegisterOperand intype, string asm, string VdTy, string VnTy, list pattern> : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -5634,7 +6345,7 @@ class BaseSIMDDifferentThreeVector size, bits<4> opcode, : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -5660,7 +6371,7 @@ class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[WriteVq]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -6000,7 +6711,7 @@ class BaseSIMDBitwiseExtract, - Sched<[WriteV]> { + Sched<[!if(size, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -6034,7 +6745,7 @@ class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # "|" # kind # "\t$Rd, $Rn, $Rm}", "", [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteV]> { + Sched<[!if(!eq(regtype, V128), WriteVq, WriteVd)]> { 
bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -6090,7 +6801,7 @@ class BaseSIMDThreeScalar size, bits<5> opcode, list pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -6110,7 +6821,7 @@ class BaseSIMDThreeScalarTied size, bit R, bits<5> opcode, dag oops, dag iops, string asm, list pattern> : I, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -6153,8 +6864,7 @@ multiclass SIMDThreeScalarHS opc, string asm, def v1i16 : BaseSIMDThreeScalar; } -multiclass SIMDThreeScalarHSTied opc, string asm, - SDPatternOperator OpNode = null_frag> { +multiclass SIMDThreeScalarHSTied opc, string asm> { def v1i32: BaseSIMDThreeScalarTied; @@ -6164,16 +6874,19 @@ multiclass SIMDThreeScalarHSTied opc, string asm, } multiclass SIMDFPThreeScalar opc, string asm, - SDPatternOperator OpNode = null_frag> { + SDPatternOperator OpNode = null_frag, + Predicate pred = HasNEON> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar; - } // Predicates = [HasNEON, HasFullFP16] + } + let Predicates = [pred, HasFullFP16] in { + def NAME#16 : BaseSIMDThreeScalar; + } } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -6183,12 +6896,12 @@ multiclass SIMDFPThreeScalar opc, string asm, multiclass SIMDThreeScalarFPCmp opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; let Predicates = [HasNEON, HasFullFP16] in { - def #NAME#16 : BaseSIMDThreeScalar; } // Predicates = [HasNEON, HasFullFP16] } @@ -6201,7 +6914,7 @@ class BaseSIMDThreeScalarMixed size, bits<5> opcode, dag oops, dag iops, string asm, string cstr, list pat> : I, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -6254,7 +6967,7 @@ class BaseSIMDTwoScalar size, bits<2> size2, bits<5> opcode, string asm, list pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, "\t$Rd, $Rn", "", pat>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; @@ -6276,7 +6989,7 @@ class BaseSIMDTwoScalarTied size, bits<5> opcode, string asm, list pat> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, "\t$Rd, $Rn", "$Rd = $dst", pat>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; @@ -6296,7 +7009,7 @@ class BaseSIMDCmpTwoScalar size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; @@ -6315,7 +7028,7 @@ class BaseSIMDCmpTwoScalar size, bits<2> size2, bits<5> opcode, class SIMDInexactCvtTwoScalar opcode, string asm> : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; let Inst{31-17} = 0b011111100110000; @@ -6363,10 +7076,13 @@ multiclass SIMDTwoScalarD opc, string asm, (!cast(NAME # "v1i64") FPR64:$Rn)>; } -multiclass SIMDFPTwoScalar opc, string asm> { +multiclass SIMDFPTwoScalar opc, string asm, + Predicate pred = HasNEON> { + let Predicates = [pred] in { def v1i64 : 
BaseSIMDTwoScalar; def v1i32 : BaseSIMDTwoScalar; - let Predicates = [HasNEON, HasFullFP16] in { + } + let Predicates = [pred, HasFullFP16] in { def v1f16 : BaseSIMDTwoScalar; } } @@ -6379,7 +7095,7 @@ multiclass SIMDFPTwoScalarCVT opc, string asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; let Predicates = [HasNEON, HasFullFP16] in { def v1i16 : BaseSIMDTwoScalar; + [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>; } } @@ -6434,7 +7150,7 @@ class BaseSIMDPairwiseScalar size, bits<5> opcode, string asm, string kind> : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; let Inst{31-30} = 0b01; @@ -6474,7 +7190,7 @@ class BaseSIMDAcrossLanes size, bits<5> opcode, string asm, string kind, list pattern> : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -6521,10 +7237,10 @@ multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, let Predicates = [HasNEON, HasFullFP16] in { def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, asm, ".4h", - [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>; def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, asm, ".8h", - [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + [(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>; } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", @@ -6540,7 +7256,7 @@ multiclass SIMDFPAcrossLanes opcode, bit sz1, string asm, class BaseSIMDInsDup pattern> : I, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -6566,7 +7282,7 @@ class SIMDDupFromMain imm5, string size, ValueType vectype, class SIMDDupFromElement + SDNode OpNode> : BaseSIMDInsDup { + VectorIndexD, AArch64duplane64> { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; @@ -6586,7 +7302,7 @@ class SIMDDup64FromElement class SIMDDup32FromElement : SIMDDupFromElement { + VectorIndexS, AArch64duplane32> { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; @@ -6595,7 +7311,7 @@ class SIMDDup32FromElement : SIMDDupFromElement { + VectorIndexH, AArch64duplane16> { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; @@ -6604,7 +7320,7 @@ class SIMDDup16FromElement : SIMDDupFromElement { + VectorIndexB, AArch64duplane8> { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; @@ -6633,6 +7349,25 @@ class SIMDMovAlias; multiclass SMov { + // SMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) + // streaming mode. 
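The *_idx0 SMOV/UMOV variants that follow take no index operand at all: each def body re-lets the Inst bit slice that is normally filled from an operand, pinning it to the index-0 encoding. A self-contained sketch of that def-level override (the field layout below is hypothetical):

class MyEnc {
  bits<32> Inst = 0;
}
class MyMov : MyEnc {
  let Inst{31-21} = 0b01001110000;   // fixed opcode bits from the class
}
// Fixed-index variant: the def body sets the slice its class left open,
// just as vi8to32_idx0 pins Inst{20-16} to 0b00001 below.
def MOVtoy_IDX0 : MyMov {
  let Inst{20-16} = 0b00001;         // index hard-wired to element 0
}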
+ let Predicates = [HasNEONorStreamingSVE] in { + def vi8to32_idx0 : SIMDSMov<0, ".b", GPR32, VectorIndex0> { + let Inst{20-16} = 0b00001; + } + def vi8to64_idx0 : SIMDSMov<1, ".b", GPR64, VectorIndex0> { + let Inst{20-16} = 0b00001; + } + def vi16to32_idx0 : SIMDSMov<0, ".h", GPR32, VectorIndex0> { + let Inst{20-16} = 0b00010; + } + def vi16to64_idx0 : SIMDSMov<1, ".h", GPR64, VectorIndex0> { + let Inst{20-16} = 0b00010; + } + def vi32to64_idx0 : SIMDSMov<1, ".s", GPR64, VectorIndex0> { + let Inst{20-16} = 0b00100; + } + } def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; @@ -6661,6 +7396,28 @@ multiclass SMov { } multiclass UMov { + // UMOV with vector index of 0 are legal in Scalable Matrix Extension (SME) + // streaming mode. + let Predicates = [HasNEONorStreamingSVE] in { + def vi8_idx0 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndex0> { + let Inst{20-16} = 0b00001; + } + def vi16_idx0 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndex0> { + let Inst{20-16} = 0b00010; + } + def vi32_idx0 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndex0> { + let Inst{20-16} = 0b00100; + } + def vi64_idx0 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndex0> { + let Inst{20-16} = 0b01000; + } + def : SIMDMovAlias<"mov", ".s", + !cast(NAME # vi32_idx0), + GPR32, VectorIndex0>; + def : SIMDMovAlias<"mov", ".d", + !cast(NAME # vi64_idx0), + GPR64, VectorIndex0>; + } def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { bits<4> idx; let Inst{20-17} = idx; @@ -6721,7 +7478,7 @@ class SIMDInsMainMovAlias; class SIMDInsElementMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -6811,7 +7568,7 @@ class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, RegisterOperand listtype, string asm, string kind> : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Vd; bits<5> Vn; bits<5> Vm; @@ -6832,7 +7589,7 @@ class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectyp RegisterOperand listtype, string asm, string kind> : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Vd; bits<5> Vn; bits<5> Vm; @@ -6943,15 +7700,15 @@ multiclass SIMDTableLookupTied { //---------------------------------------------------------------------------- -// AdvSIMD scalar CPY +// AdvSIMD scalar DUP //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDScalarCPY - : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", +class BaseSIMDScalarDUP + : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), asm, "{\t$dst, $src" # kind # "$idx" # "|\t$dst, $src$idx}", "", []>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> dst; bits<5> src; let Inst{31-21} = 0b01011110000; @@ -6960,30 +7717,30 @@ class BaseSIMDScalarCPY - : InstAlias; -multiclass SIMDScalarCPY { - def i8 : BaseSIMDScalarCPY { +multiclass SIMDScalarDUP { + def i8 : BaseSIMDScalarDUP { bits<4> idx; let Inst{20-17} = idx; let Inst{16} = 1; } - def i16 : BaseSIMDScalarCPY { + def i16 : BaseSIMDScalarDUP { bits<3> idx; let Inst{20-18} = idx; let Inst{17-16} = 0b10; } - def i32 
: BaseSIMDScalarCPY { + def i32 : BaseSIMDScalarDUP { bits<2> idx; let Inst{20-19} = idx; let Inst{18-16} = 0b100; } - def i64 : BaseSIMDScalarCPY { + def i64 : BaseSIMDScalarDUP { bits<1> idx; let Inst{20} = idx; let Inst{19-16} = 0b1000; @@ -6994,16 +7751,16 @@ multiclass SIMDScalarCPY { (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; // 'DUP' mnemonic aliases. - def : SIMDScalarCPYAlias<"dup", ".b", + def : SIMDScalarDUPAlias<"dup", ".b", !cast(NAME#"i8"), FPR8, V128, VectorIndexB>; - def : SIMDScalarCPYAlias<"dup", ".h", + def : SIMDScalarDUPAlias<"dup", ".h", !cast(NAME#"i16"), FPR16, V128, VectorIndexH>; - def : SIMDScalarCPYAlias<"dup", ".s", + def : SIMDScalarDUPAlias<"dup", ".s", !cast(NAME#"i32"), FPR32, V128, VectorIndexS>; - def : SIMDScalarCPYAlias<"dup", ".d", + def : SIMDScalarDUPAlias<"dup", ".d", !cast(NAME#"i64"), FPR64, V128, VectorIndexD>; } @@ -7016,7 +7773,7 @@ class BaseSIMDModifiedImm pattern> : I, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<8> imm8; let Inst{31} = 0; @@ -7186,7 +7943,7 @@ class BaseSIMDIndexed size, bits<4> opc, asm, "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -7216,7 +7973,7 @@ class BaseSIMDIndexedTied size, bits<4> opc, (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -7236,13 +7993,152 @@ class BaseSIMDIndexedTied size, bits<4> opc, let Inst{4-0} = Rd; } -// ARMv8.2 Index Dot product instructions -class BaseSIMDThreeSameVectorDotIndex + : BaseSIMDThreeSameVectorTied { + let AsmString = !strconcat(asm, + "{\t$Rd" # kind1 # ", $Rn" # kind2 # + ", $Rm" # kind2 # "}"); +} + +multiclass SIMDThreeSameVectorBFDot { + def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, + v2f32, v4bf16>; + def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, + v4f32, v8bf16>; +} + +class BaseSIMDThreeSameVectorBF16DotI + : BaseSIMDIndexedTied { + + bits<2> idx; + let Inst{21} = idx{0}; // L + let Inst{11} = idx{1}; // H +} + +multiclass SIMDThreeSameVectorBF16DotI { + + def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", + ".2h", V64, v2f32, v4bf16>; + def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", + ".2h", V128, v4f32, v8bf16>; +} + +class SIMDBF16MLAL + : BaseSIMDThreeSameVectorTied { + let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); +} + +class SIMDBF16MLALIndex + : I<(outs V128:$dst), + (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm, + "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", + [(set (v4f32 V128:$dst), + (v4f32 (OpNode (v4f32 V128:$Rd), + (v8bf16 V128:$Rn), + (v8bf16 + (AArch64duplane16 (v8bf16 V128_lo:$Rm), + VectorIndexH:$idx)))))]>, + Sched<[WriteVq]> { + bits<5> Rd; + bits<5> Rn; + bits<4> Rm; + bits<3> idx; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-22} = 0b00111111; + let Inst{21-20} = idx{1-0}; + let Inst{19-16} = Rm; + let Inst{15-12} = 0b1111; + let Inst{11} = idx{2}; // H + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDThreeSameVectorBF16MatrixMul + : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101, + V128, asm, ".4s", + [(set (v4f32 V128:$dst), + 
(int_aarch64_neon_bfmmla (v4f32 V128:$Rd), + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { + let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", + ", $Rm", ".8h", "}"); +} + +class SIMD_BFCVTN + : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, + "bfcvtn", ".4h", ".4s", + [(set (v8bf16 V128:$Rd), + (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; + +class SIMD_BFCVTN2 + : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, + "bfcvtn2", ".8h", ".4s", + [(set (v8bf16 V128:$dst), + (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; + +class BF16ToSinglePrecision + : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", + [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-10} = 0b0001111001100011010000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} +} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0 + +//---------------------------------------------------------------------------- +// Armv8.6 Matrix Multiply Extension +//---------------------------------------------------------------------------- + +class SIMDThreeSameVectorMatMul + : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; +} + +//---------------------------------------------------------------------------- +// ARMv8.2-A Dot Product Instructions (Indexed) +class BaseSIMDThreeSameVectorDotIndex size, string asm, + string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", V64, - v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", V128, - v4i32, v16i8, OpNode>; + def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", + V64, v2i32, v8i8, OpNode>; + def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", + V128, v4i32, v16i8, OpNode>; +} + +// ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed) +class BaseSIMDThreeSameVectorFMLIndex opc, string asm, + string dst_kind, string lhs_kind, + string rhs_kind, RegisterOperand RegType, + ValueType AccumType, ValueType InputType, + SDPatternOperator OpNode> : + BaseSIMDIndexedTied { + // idx = H:L:M + bits<3> idx; + let Inst{11} = idx{2}; // H + let Inst{21} = idx{1}; // L + let Inst{20} = idx{0}; // M +} + +multiclass SIMDThreeSameVectorFMLIndex opc, string asm, + SDPatternOperator OpNode> { + def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h", + V64, v2f32, v4f16, OpNode>; + def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h", + V128, v4f32, v8f16, OpNode>; } multiclass SIMDFPIndexed opc, string asm, @@ -7370,6 +8294,34 @@ multiclass SIMDFPIndexed opc, string asm, } multiclass SIMDFPIndexedTiedPatterns { + let Predicates = [HasNEON, HasFullFP16] in { + // Patterns for f16: DUPLANE, DUP scalar and vector_extract. 
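The f16 patterns that follow rely on the same two devices as the f32/f64 ones further down: the multiclass receives the instruction stem as a string (INST) and rebuilds record names with !cast(INST # suffix), and a multiply-by-broadcast is matched on the duplane/dup node itself rather than needing a distinct instruction. A toy of the string-stem lookup (every name here is hypothetical):

class MyFmaInst<string asm> {
  string AsmString = asm;
}
class MyIdxPat<dag from, dag to> {
  dag From = from;
  dag To = to;
}
def FMLAtoy_v8i16_indexed : MyFmaInst<"fmla.8h">;
def fma_bcast;   // stand-in for fma-of-a-broadcast-lane
def Rd; def Rn; def RmLane;
multiclass MyFmaPatterns<string INST> {
  // The stem is a plain string parameter, so one multiclass can serve
  // FMLA, FMLS, FMUL, ...; the suffix selects the concrete variant.
  def : MyIdxPat<(fma_bcast Rd, Rn, RmLane),
                 (!cast<MyFmaInst>(INST # "_v8i16_indexed") Rd, Rn, RmLane)>;
}
defm : MyFmaPatterns<"FMLAtoy">;   // anonymous defm, instantiates the pattern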
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64duplane16 (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))), + (!cast(INST # "v8i16_indexed") + V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>; + def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), + (AArch64dup (f16 FPR16Op_lo:$Rm)))), + (!cast(INST # "v8i16_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>; + + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64duplane16 (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))), + (!cast(INST # "v4i16_indexed") + V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>; + def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), + (AArch64dup (f16 FPR16Op_lo:$Rm)))), + (!cast(INST # "v4i16_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>; + + def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn), + (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))), + (!cast(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn, + V128_lo:$Rm, VectorIndexH:$idx)>; + } // Predicates = [HasNEON, HasFullFP16] + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -7404,15 +8356,11 @@ multiclass SIMDFPIndexedTiedPatterns { (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - // 2 variants for 32-bit scalar version: extract from .2s or from .4s + // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; // 1 variant for 64-bit scalar version: extract from .1d or from .2d def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), @@ -7497,6 +8445,64 @@ multiclass SIMDFPIndexedTied opc, string asm> { } } +multiclass SIMDIndexedHSPatterns { + + def : Pat<(v4i16 (OpNodeLane + (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v4i16_indexed) $Rn, + (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v4i16 (OpNodeLaneQ + (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm), + VectorIndexH32b:$idx)), + (!cast(NAME # v4i16_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v8i16 (OpNodeLane + (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v8i16_indexed) $Rn, + (SUBREG_TO_REG (i32 0), $Rm, dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v8i16 (OpNodeLaneQ + (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm), + VectorIndexH32b:$idx)), + (!cast(NAME # v8i16_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v2i32 (OpNodeLane + (v2i32 V64:$Rn), (v2i32 V64:$Rm), + VectorIndexD32b:$idx)), + (!cast(NAME # v2i32_indexed) $Rn, + (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v2i32 (OpNodeLaneQ + (v2i32 V64:$Rn), (v4i32 V128:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v2i32_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + + def : Pat<(v4i32 (OpNodeLane + (v4i32 V128:$Rn), (v2i32 V64:$Rm), + VectorIndexD32b:$idx)), + (!cast(NAME # v4i32_indexed) $Rn, + (SUBREG_TO_REG (i32 0), $Rm, 
dsub), + (UImmS1XForm $idx))>; + + def : Pat<(v4i32 (OpNodeLaneQ + (v4i32 V128:$Rn), + (v4i32 V128:$Rm), + VectorIndexS32b:$idx)), + (!cast(NAME # v4i32_indexed) $Rn, $Rm, + (UImmS1XForm $idx))>; + +} + multiclass SIMDIndexedHS opc, string asm, SDPatternOperator OpNode> { def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, @@ -7981,7 +8987,7 @@ class BaseSIMDScalarShift opc, bits<7> fixed_imm, Operand immtype, string asm, list pattern> : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<7> imm; @@ -8001,7 +9007,7 @@ class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, Operand immtype, string asm, list pattern> : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[WriteVd]> { bits<5> Rd; bits<5> Rn; bits<7> imm; @@ -8065,10 +9071,13 @@ multiclass SIMDScalarLShiftD opc, string asm, SDPatternOperator OpNode> { def d : BaseSIMDScalarShift { + [(set (i64 FPR64:$Rd), + (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> { let Inst{21-16} = imm{5-0}; } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; } let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in @@ -8162,7 +9171,7 @@ class BaseSIMDVectorShift opc, bits<7> fixed_imm, : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -8185,7 +9194,7 @@ class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; let Inst{31} = 0; @@ -9547,40 +10556,30 @@ class BaseSIMDThreeSameVectorTiedR0 size, bits<5> opcode, pattern> { } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS opc, string asm, - SDPatternOperator Accum> { + SDPatternOperator op> { def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), - (Accum (v4i16 V64:$Rd), - (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), - (v4i16 V64:$Rm)))))]>; + (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), - (Accum (v8i16 V128:$Rd), - (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), - (v8i16 V128:$Rm)))))]>; + (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), - (Accum (v2i32 V64:$Rd), - (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), - (v2i32 V64:$Rm)))))]>; + (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), - (v4i32 V128:$Rm)))))]>; + (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, - SDPatternOperator Accum> { + SDPatternOperator op> { def v4i16_indexed : 
BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, V128_lo, VectorIndexH, asm, ".4h", ".4h", ".4h", ".h", [(set (v4i16 V64:$dst), - (Accum (v4i16 V64:$Rd), - (v4i16 (int_aarch64_neon_sqrdmulh - (v4i16 V64:$Rn), - (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -9591,11 +10590,9 @@ multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, V128, V128, V128_lo, VectorIndexH, asm, ".8h", ".8h", ".8h", ".h", [(set (v8i16 V128:$dst), - (Accum (v8i16 V128:$Rd), - (v8i16 (int_aarch64_neon_sqrdmulh - (v8i16 V128:$Rn), - (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -9606,75 +10603,26 @@ multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", [(set (v2i32 V64:$dst), - (Accum (v2i32 V64:$Rd), - (v2i32 (int_aarch64_neon_sqrdmulh - (v2i32 V64:$Rn), - (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but - // an intermediate EXTRACT_SUBREG would be untyped. - // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we - // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) - def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract - (v4i32 (insert_subvector - (undef), - (v2i32 (int_aarch64_neon_sqrdmulh - (v2i32 V64:$Rn), - (v2i32 (AArch64duplane32 - (v4i32 V128:$Rm), - VectorIndexS:$idx)))), - (i32 0))), - (i64 0))))), - (EXTRACT_SUBREG - (v2i32 (!cast(NAME # v2i32_indexed) - (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - FPR32Op:$Rd, - ssub)), - V64:$Rn, - V128:$Rm, - VectorIndexS:$idx)), - ssub)>; - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, V128, V128, V128, VectorIndexS, asm, ".4s", ".4s", ".4s", ".s", [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_aarch64_neon_sqrdmulh - (v4i32 V128:$Rn), - (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; } - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but - // an intermediate EXTRACT_SUBREG would be untyped. 
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract - (v4i32 (int_aarch64_neon_sqrdmulh - (v4i32 V128:$Rn), - (v4i32 (AArch64duplane32 - (v4i32 V128:$Rm), - VectorIndexS:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (v4i32 (!cast(NAME # v4i32_indexed) - (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - FPR32Op:$Rd, - ssub)), - V128:$Rn, - V128:$Rm, - VectorIndexS:$idx)), - ssub)>; - def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, FPR16Op, FPR16Op, V128_lo, VectorIndexH, asm, ".h", "", "", ".h", @@ -9689,11 +10637,9 @@ multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", [(set (i32 FPR32Op:$dst), - (Accum (i32 FPR32Op:$Rd), - (i32 (int_aarch64_neon_sqrdmulh - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (i32 (op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -9711,15 +10657,20 @@ class ComplexRotationOperand let DiagnosticType = "InvalidComplexRotation" # Type; let Name = "ComplexRotation" # Type; } -def complexrotateop : Operand { +def complexrotateop : Operand, TImmLeaf= 0 && Imm <= 270; }], + SDNodeXFormgetTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32); +}]>> { let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">; let PrintMethod = "printComplexRotationOp<90, 0>"; } -def complexrotateopodd : Operand { +def complexrotateopodd : Operand, TImmLeaf= 0 && Imm <= 270; }], + SDNodeXFormgetTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32); +}]>> { let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; let PrintMethod = "printComplexRotationOp<180, 90>"; } - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDThreeSameVectorComplex size, bits<3> opcode, RegisterOperand regtype, Operand rottype, @@ -9727,7 +10678,7 @@ class BaseSIMDThreeSameVectorComplex size, bits<3> opcode, : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, rottype:$rot), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot" "|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -9748,45 +10699,46 @@ class BaseSIMDThreeSameVectorComplex size, bits<3> opcode, let Inst{4-0} = Rd; } +//8.3 CompNum - Floating-point complex number support multiclass SIMDThreeSameVectorComplexHSD opcode, Operand rottype, string asm, SDPatternOperator OpNode>{ - let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { + let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorComplex<0, U, 0b01, opcode, V64, rottype, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; def v8f16 : BaseSIMDThreeSameVectorComplex<1, U, 0b01, opcode, V128, rottype, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; } - let Predicates = [HasV8_3a, HasNEON] in { + let Predicates = [HasComplxNum, HasNEON] in { def v2f32 : BaseSIMDThreeSameVectorComplex<0, U, 0b10, opcode, V64, rottype, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; def v4f32 : BaseSIMDThreeSameVectorComplex<1, U, 0b10, opcode, V128, rottype, asm, ".4s", [(set 
(v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; def v2f64 : BaseSIMDThreeSameVectorComplex<1, U, 0b11, opcode, V128, rottype, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; } } @@ -9800,7 +10752,7 @@ class BaseSIMDThreeSameVectorTiedComplex size, (ins regtype:$Rd, regtype:$Rn, regtype:$Rm, rottype:$rot), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $rot" "|" # kind # "\t$Rd, $Rn, $Rm, $rot}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -9822,43 +10774,43 @@ class BaseSIMDThreeSameVectorTiedComplex size, multiclass SIMDThreeSameVectorTiedComplexHSD opcode, Operand rottype, string asm, SDPatternOperator OpNode> { - let Predicates = [HasV8_3a, HasNEON, HasFullFP16] in { + let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def v4f16 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b01, opcode, V64, rottype, asm, ".4h", [(set (v4f16 V64:$dst), (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; def v8f16 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b01, opcode, V128, rottype, asm, ".8h", [(set (v8f16 V128:$dst), (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; } - let Predicates = [HasV8_3a, HasNEON] in { + let Predicates = [HasComplxNum, HasNEON] in { def v2f32 : BaseSIMDThreeSameVectorTiedComplex<0, U, 0b10, opcode, V64, rottype, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; def v4f32 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b10, opcode, V128, rottype, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; def v2f64 : BaseSIMDThreeSameVectorTiedComplex<1, U, 0b11, opcode, V128, rottype, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm), - (rottype i32:$rot)))]>; + (i32 rottype:$rot)))]>; } } @@ -9876,7 +10828,7 @@ class BaseSIMDIndexedTiedComplex size, "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx, $rot" # "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx, $rot}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { + Sched<[!if(Q, WriteVq, WriteVd)]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -9902,9 +10854,9 @@ class BaseSIMDIndexedTiedComplex size, // The complex instructions index by pairs of elements, so the VectorIndexes // don't match the lane types, and the index bits are different to the other // classes. 
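One detail of the complex-number support worth spelling out: the complexrotateop operands above now carry an immediate-legality test plus a transform from the written rotation to the encoded field; the even form encodes rot/90 and the odd form (rot-90)/180. The same arithmetic evaluated at TableGen time with toy records (assumes an llvm-tblgen recent enough to provide !div):

class MyRotEnc<int rot, bit odd> {
  // Even: 0/90/180/270 -> 0..3; odd: 90/270 -> 0..1, matching the
  // SDNodeXForms added to complexrotateop/complexrotateopodd above.
  int enc = !if(odd, !div(!sub(rot, 90), 180), !div(rot, 90));
}
def RotEven180 : MyRotEnc<180, 0>;   // enc = 2
def RotOdd270  : MyRotEnc<270, 1>;   // enc = 1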
-multiclass SIMDIndexedTiedComplexHSD { - let Predicates = [HasV8_3a,HasNEON,HasFullFP16] in { +multiclass SIMDIndexedTiedComplexHSD { + let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { def v4f16_indexed : BaseSIMDIndexedTiedComplex<0, 1, 0, 0b01, opc1, opc2, V64, V64, V128, VectorIndexD, rottype, asm, ".4h", ".4h", ".4h", ".h", []> { @@ -9920,9 +10872,9 @@ multiclass SIMDIndexedTiedComplexHSD { @@ -9930,7 +10882,7 @@ multiclass SIMDIndexedTiedComplexHSD opc, string asm, dag outs, dag ins, string cstr, list pat> : I, - Sched<[WriteV]>{ + Sched<[WriteVq]>{ bits<5> Rd; bits<5> Rn; let Inst{31-16} = 0b0100111000101000; @@ -9967,7 +10919,7 @@ class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, : I, - Sched<[WriteV]>{ + Sched<[WriteVq]>{ bits<5> Rd; bits<5> Rn; bits<5> Rm; @@ -10007,7 +10959,7 @@ class SHA2OpInst opc, string asm, string kind, list pat> : I, - Sched<[WriteV]>{ + Sched<[WriteVq]>{ bits<5> Rd; bits<5> Rn; let Inst{31-16} = 0b0101111000101000; @@ -10030,7 +10982,7 @@ class SHAInstSS opc, string asm, Intrinsic OpNode> // Armv8.2-A Crypto extensions class BaseCryptoV82 pattern> - : I , Sched<[WriteV]> { + : I , Sched<[WriteVq]> { bits<5> Vd; bits<5> Vn; let Inst{31-25} = 0b1100111; @@ -10039,8 +10991,8 @@ class BaseCryptoV82op0, bits<2>op1, string asm, string asmops> - : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, asmops, - "$Vm = $Vd", []> { + : BaseCryptoV82<(outs V128:$Vdst), (ins V128:$Vd, V128:$Vn), asm, asmops, + "$Vd = $Vdst", []> { let Inst{31-25} = 0b1100111; let Inst{24-21} = 0b0110; let Inst{20-15} = 0b000001; @@ -10049,9 +11001,9 @@ class CryptoRRTiedop0, bits<2>op1, string asm, string asmops> let Inst{11-10} = op1; } class CryptoRRTied_2Dop0, bits<2>op1, string asm> - : CryptoRRTied; + : CryptoRRTied; class CryptoRRTied_4Sop0, bits<2>op1, string asm> - : CryptoRRTied; + : CryptoRRTied; class CryptoRRR op0, bits<2>op1, dag oops, dag iops, string asm, string asmops, string cst> @@ -10066,19 +11018,19 @@ class CryptoRRR op0, bits<2>op1, dag oops, dag iops, string asm, } class CryptoRRR_2D op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "">; class CryptoRRRTied_2D op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRR_4S op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "">; class CryptoRRRTied_4S op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRRTied op0, bits<2>op1, string asm> : CryptoRRR; + asm, "{\t$Vd, $Vn, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRRRop0, string asm, string asmops> : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm, @@ -10092,15 +11044,18 @@ class CryptoRRRRop0, string asm, string asmops> let Inst{14-10} = Va; } class CryptoRRRR_16Bop0, string asm> - : CryptoRRRR { + : CryptoRRRR { } class CryptoRRRR_4Sop0, string asm> - : CryptoRRRR { + : CryptoRRRR { } class CryptoRRRi6 : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm, - "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> { + "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm" # + "|.2d\t$Vd, $Vn, $Vm, $imm}", "", []> { bits<6> imm; bits<5> Vm; let Inst{24-21} = 0b0100; @@ -10113,7 +11068,8 @@ class CryptoRRRi6 class CryptoRRRi2Tiedop0, bits<2>op1, string asm> : BaseCryptoV82<(outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm), - asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = 
$Vdst", []> { + asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm" # + "|.4s\t$Vd, $Vn, $Vm$imm}", "$Vd = $Vdst", []> { bits<2> imm; bits<5> Vm; let Inst{24-21} = 0b0010; @@ -10382,6 +11338,152 @@ multiclass STOPregister { !cast(instr # "X")>; } +class LoadStore64B_base opc, string asm_inst, string asm_ops, + dag iops, dag oops, list pat> + : I, + Sched<[]> /* FIXME: fill in scheduling details once known */ { + bits<5> Rt; + bits<5> Rn; + let Inst{31-21} = 0b11111000001; + let Inst{15} = 1; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Predicates = [HasV8_7a]; +} + +class LoadStore64B opc, string asm_inst, dag iops, dag oops, + list pat = []> + : LoadStore64B_base { + let Inst{20-16} = 0b11111; +} + +class Store64BV opc, string asm_inst, list pat = []> + : LoadStore64B_base { + bits<5> Rs; + let Inst{20-16} = Rs; +} + +class MOPSMemoryCopyMoveBase opcode, bits<2> op1, + bits<2> op2, string asm> + : I<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn), + asm, "\t[$Rd]!, [$Rs]!, $Rn!", + "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb", []>, + Sched<[]> { + bits<5> Rd; + bits<5> Rs; + bits<5> Rn; + let Inst{31-27} = 0b00011; + let Inst{26} = isMove; + let Inst{25-24} = 0b01; + let Inst{23-22} = opcode; + let Inst{21} = 0b0; + let Inst{20-16} = Rs; + let Inst{15-14} = op2; + let Inst{13-12} = op1; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeCPYMemOpInstruction"; + let mayLoad = 1; + let mayStore = 1; +} + +class MOPSMemoryCopy opcode, bits<2> op1, bits<2> op2, string asm> + : MOPSMemoryCopyMoveBase<0, opcode, op1, op2, asm>; + +class MOPSMemoryMove opcode, bits<2> op1, bits<2> op2, string asm> + : MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>; + +class MOPSMemorySetBase opcode, bit op1, bit op2, + string asm> + : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + asm, "\t[$Rd]!, $Rn!, $Rm", + "$Rd = $Rd_wb,$Rn = $Rn_wb", []>, + Sched<[]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-27} = 0b00011; + let Inst{26} = isTagging; + let Inst{25-21} = 0b01110; + let Inst{20-16} = Rm; + let Inst{15-14} = opcode; + let Inst{13} = op2; + let Inst{12} = op1; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeSETMemOpInstruction"; + let mayLoad = 0; + let mayStore = 1; +} + +class MOPSMemorySet opcode, bit op1, bit op2, string asm> + : MOPSMemorySetBase<0, opcode, op1, op2, asm>; + +class MOPSMemorySetTagging opcode, bit op1, bit op2, string asm> + : MOPSMemorySetBase<1, opcode, op1, op2, asm>; + +multiclass MOPSMemoryCopyInsns opcode, string asm> { + def "" : MOPSMemoryCopy; + def WN : MOPSMemoryCopy; + def RN : MOPSMemoryCopy; + def N : MOPSMemoryCopy; + def WT : MOPSMemoryCopy; + def WTWN : MOPSMemoryCopy; + def WTRN : MOPSMemoryCopy; + def WTN : MOPSMemoryCopy; + def RT : MOPSMemoryCopy; + def RTWN : MOPSMemoryCopy; + def RTRN : MOPSMemoryCopy; + def RTN : MOPSMemoryCopy; + def T : MOPSMemoryCopy; + def TWN : MOPSMemoryCopy; + def TRN : MOPSMemoryCopy; + def TN : MOPSMemoryCopy; +} + +multiclass MOPSMemoryMoveInsns opcode, string asm> { + def "" : MOPSMemoryMove; + def WN : MOPSMemoryMove; + def RN : MOPSMemoryMove; + def N : MOPSMemoryMove; + def WT : MOPSMemoryMove; + def WTWN : MOPSMemoryMove; + def WTRN : MOPSMemoryMove; + def WTN : MOPSMemoryMove; + def RT : MOPSMemoryMove; + def RTWN : MOPSMemoryMove; + def RTRN : MOPSMemoryMove; + def RTN 
: MOPSMemoryMove; + def T : MOPSMemoryMove; + def TWN : MOPSMemoryMove; + def TRN : MOPSMemoryMove; + def TN : MOPSMemoryMove; +} + +multiclass MOPSMemorySetInsns opcode, string asm> { + def "" : MOPSMemorySet; + def T : MOPSMemorySet; + def N : MOPSMemorySet; + def TN : MOPSMemorySet; +} + +multiclass MOPSMemorySetTaggingInsns opcode, string asm> { + def "" : MOPSMemorySetTagging; + def T : MOPSMemorySetTagging; + def N : MOPSMemorySetTagging; + def TN : MOPSMemorySetTagging; +} + //---------------------------------------------------------------------------- // Allow the size specifier tokens to be upper case, not just lower. def : TokenAlias<".4B", ".4b">; // Add dot product diff --git a/suite/synctools/tablegen/AArch64/AArch64InstrGISel.td b/suite/synctools/tablegen/AArch64/AArch64InstrGISel.td new file mode 100644 index 00000000..58b6dcad --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64InstrGISel.td @@ -0,0 +1,287 @@ +//=----- AArch64InstrGISel.td - AArch64 GISel target pseudos -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AArch64 GlobalISel target pseudo instruction definitions. This is kept +// separately from the other tablegen files for organizational purposes, but +// share the same infrastructure. +// +//===----------------------------------------------------------------------===// + + +class AArch64GenericInstruction : GenericInstruction { + let Namespace = "AArch64"; +} + +// A pseudo to represent a relocatable add instruction as part of address +// computation. +def G_ADD_LOW : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src, type2:$imm); + let hasSideEffects = 0; +} + +// Pseudo for a rev16 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV16 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Pseudo for a rev32 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV32 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Pseudo for a rev64 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_REV64 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +// Represents an uzp1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_UZP1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents an uzp2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_UZP2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a zip1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. 
+def G_ZIP1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a zip2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_ZIP2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a dup instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_DUP: AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$lane); + let hasSideEffects = 0; +} + +// Represents a lane duplicate operation. +def G_DUPLANE8 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} +def G_DUPLANE16 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} +def G_DUPLANE32 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} +def G_DUPLANE64 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type1:$lane); + let hasSideEffects = 0; +} + +// Represents a trn1 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_TRN1 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents a trn2 instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_TRN2 : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2); + let hasSideEffects = 0; +} + +// Represents an ext instruction. Produced post-legalization from +// G_SHUFFLE_VECTORs with appropriate masks. +def G_EXT: AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm); + let hasSideEffects = 0; +} + +// Represents a vector G_ASHR with an immediate. +def G_VASHR : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, untyped_imm_0:$imm); + let hasSideEffects = 0; +} + +// Represents a vector G_LSHR with an immediate. +def G_VLSHR : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, untyped_imm_0:$imm); + let hasSideEffects = 0; +} + +// Represents an integer to FP conversion on the FPR bank. 
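Every G_* pseudo in this new file has the same shape: an AArch64-namespace base class, out/in operand lists written over generic type variables (type0, type1, ...), and hasSideEffects = 0 so post-legalization combines may hoist or delete the node freely. A stripped-down standalone model of that shape (toy stand-ins, not the real GenericInstruction or operand-list operators):

def outs;    // dag operators; the real ones come from Target.td
def ins;
def type0;   // generic, register-bank-agnostic type variable
class MyGenericInst {
  string Namespace = "AArch64";
  dag OutOperandList = (outs);
  dag InOperandList = (ins);
  bit hasSideEffects = 0;
}
def G_MY_REV16 : MyGenericInst {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src);   // result type tied to source type
}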
+def G_SITOF : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} +def G_UITOF : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +def G_FCMEQ : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type1:$src2); + let hasSideEffects = 0; +} + +def G_FCMGE : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type1:$src2); + let hasSideEffects = 0; +} + +def G_FCMGT : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type1:$src2); + let hasSideEffects = 0; +} + +def G_FCMEQZ : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +def G_FCMGEZ : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +def G_FCMGTZ : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +def G_FCMLEZ : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +def G_FCMLTZ : AArch64GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; + +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; + +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; + +def : GINodeEquiv; + +// These are patterns that we only use for GlobalISel via the importer. 
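The patterns that follow are the second half of the GlobalISel story: where no custom C++ selection is needed, an ordinary Pat teaches the imported selector to expand one generic node into a short instruction sequence, for instance a v2i32-to-v2f64 sint_to_fp becoming a widening shift feeding a convert. A schematic of such a two-instruction output dag (all defs below are placeholders):

def my_sint_to_fp;   // stand-in for the matched node
def MY_SSHLL;        // stand-ins for the emitted machine instructions
def MY_SCVTF;
def Vsrc;            // stand-in operand
class MySelPat<dag match, dag emit> {
  dag Match = match;
  dag Emit = emit;
}
// The inner dag's result feeds the outer instruction, mirroring
// (SCVTFv2f64 (SSHLLv2i32_shift V64:$src, 0)) below.
def : MySelPat<(my_sint_to_fp Vsrc), (MY_SCVTF (MY_SSHLL Vsrc))>;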
+def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)), + (vector_extract (v2f32 FPR64:$Rn), (i64 1)))), + (f32 (FADDPv2i32p (v2f32 FPR64:$Rn)))>; + +let Predicates = [HasNEON] in { + def : Pat<(v2f64 (sint_to_fp v2i32:$src)), + (SCVTFv2f64 (SSHLLv2i32_shift V64:$src, 0))>; + def : Pat<(v2f64 (uint_to_fp v2i32:$src)), + (UCVTFv2f64 (USHLLv2i32_shift V64:$src, 0))>; + def : Pat<(v2f32 (sint_to_fp v2i64:$src)), + (FCVTNv2i32 (SCVTFv2f64 V128:$src))>; + def : Pat<(v2f32 (uint_to_fp v2i64:$src)), + (FCVTNv2i32 (UCVTFv2f64 V128:$src))>; + + def : Pat<(v2i64 (fp_to_sint v2f32:$src)), + (FCVTZSv2f64 (FCVTLv2i32 V64:$src))>; + def : Pat<(v2i64 (fp_to_uint v2f32:$src)), + (FCVTZUv2f64 (FCVTLv2i32 V64:$src))>; + def : Pat<(v2i32 (fp_to_sint v2f64:$src)), + (XTNv2i32 (FCVTZSv2f64 V128:$src))>; + def : Pat<(v2i32 (fp_to_uint v2f64:$src)), + (XTNv2i32 (FCVTZUv2f64 V128:$src))>; + +} + +let Predicates = [HasNoLSE] in { +def : Pat<(atomic_cmp_swap_8 GPR64:$addr, GPR32:$desired, GPR32:$new), + (CMP_SWAP_8 GPR64:$addr, GPR32:$desired, GPR32:$new)>; + +def : Pat<(atomic_cmp_swap_16 GPR64:$addr, GPR32:$desired, GPR32:$new), + (CMP_SWAP_16 GPR64:$addr, GPR32:$desired, GPR32:$new)>; + +def : Pat<(atomic_cmp_swap_32 GPR64:$addr, GPR32:$desired, GPR32:$new), + (CMP_SWAP_32 GPR64:$addr, GPR32:$desired, GPR32:$new)>; + +def : Pat<(atomic_cmp_swap_64 GPR64:$addr, GPR64:$desired, GPR64:$new), + (CMP_SWAP_64 GPR64:$addr, GPR64:$desired, GPR64:$new)>; +} + +def : Pat<(int_aarch64_stlxp GPR64:$lo, GPR64:$hi, GPR64:$addr), + (STLXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>; +def : Pat<(int_aarch64_stxp GPR64:$lo, GPR64:$hi, GPR64:$addr), + (STXPX GPR64:$lo, GPR64:$hi, GPR64:$addr)>; diff --git a/suite/synctools/tablegen/AArch64/AArch64InstrInfo.td b/suite/synctools/tablegen/AArch64/AArch64InstrInfo.td index d6b8bb5d..2680b5ac 100644 --- a/suite/synctools/tablegen/AArch64/AArch64InstrInfo.td +++ b/suite/synctools/tablegen/AArch64/AArch64InstrInfo.td @@ -1,9 +1,8 @@ //=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -15,59 +14,214 @@ // ARM Instruction Predicate Definitions. 
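Before the predicate block below, note its one systematic change: AssemblerPredicate now takes a condition dag instead of a bare feature string, which is what makes composite predicates such as HasSVEorStreamingSVE (any_of over two features) expressible at all. A toy model of the dag form (placeholder classes; the real all_of/any_of operators live in Target.td):

class MyFeature;
def MyFeatureSVE : MyFeature;
def MyFeatureStreamingSVE : MyFeature;
def all_of;   // bodyless defs usable as dag operators
def any_of;
class MyAsmPred<dag cond, string name> {
  dag Cond = cond;
  string Name = name;
}
def MyHasSVE : MyAsmPred<(all_of MyFeatureSVE), "sve">;
// Either feature enables the instruction; this is the shape behind
// HasSVEorStreamingSVE further down.
def MyHasSVEorSME : MyAsmPred<(any_of MyFeatureSVE, MyFeatureStreamingSVE),
                              "sve or sme">;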
// def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; + AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; + AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">; def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; + AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; + AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">; +def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, + AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; +def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, + AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; +def HasV8_7a : Predicate<"Subtarget->hasV8_7aOps()">, + AssemblerPredicate<(all_of HasV8_7aOps), "armv8.7a">; +def HasV9_0a : Predicate<"Subtarget->hasV9_0aOps()">, + AssemblerPredicate<(all_of HasV9_0aOps), "armv9-a">; +def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">, + AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">; +def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">, + AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">; +def HasV9_3a : Predicate<"Subtarget->hasV9_3aOps()">, + AssemblerPredicate<(all_of HasV9_3aOps), "armv9.3a">; +def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">, + AssemblerPredicate<(all_of HasV8_0rOps), "armv8-r">; + +def HasEL2VMSA : Predicate<"Subtarget->hasEL2VMSA()">, + AssemblerPredicate<(all_of FeatureEL2VMSA), "el2vmsa">; + +def HasEL3 : Predicate<"Subtarget->hasEL3()">, + AssemblerPredicate<(all_of FeatureEL3), "el3">; + +def HasVH : Predicate<"Subtarget->hasVH()">, + AssemblerPredicate<(all_of FeatureVH), "vh">; + +def HasLOR : Predicate<"Subtarget->hasLOR()">, + AssemblerPredicate<(all_of FeatureLOR), "lor">; + +def HasPAuth : Predicate<"Subtarget->hasPAuth()">, + AssemblerPredicate<(all_of FeaturePAuth), "pauth">; + +def HasJS : Predicate<"Subtarget->hasJS()">, + AssemblerPredicate<(all_of FeatureJS), "jsconv">; + +def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">, + AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">; + +def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">, + AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">; + +def HasNV : Predicate<"Subtarget->hasNV()">, + AssemblerPredicate<(all_of FeatureNV), "nv">; + +def HasMPAM : Predicate<"Subtarget->hasMPAM()">, + AssemblerPredicate<(all_of FeatureMPAM), "mpam">; + +def HasDIT : Predicate<"Subtarget->hasDIT()">, + AssemblerPredicate<(all_of FeatureDIT), "dit">; + +def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">, + AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">; + +def HasAM : Predicate<"Subtarget->hasAM()">, + AssemblerPredicate<(all_of FeatureAM), "am">; + +def HasSEL2 : Predicate<"Subtarget->hasSEL2()">, + AssemblerPredicate<(all_of FeatureSEL2), "sel2">; + +def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">, + AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">; + +def HasFlagM : Predicate<"Subtarget->hasFlagM()">, + AssemblerPredicate<(all_of FeatureFlagM), "flagm">; + +def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">, + AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">; + def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; + AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">; def HasNEON : 
Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "neon">; + AssemblerPredicate<(all_of FeatureNEON), "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; + AssemblerPredicate<(all_of FeatureCrypto), "crypto">; def HasSM4 : Predicate<"Subtarget->hasSM4()">, - AssemblerPredicate<"FeatureSM4", "sm4">; + AssemblerPredicate<(all_of FeatureSM4), "sm4">; def HasSHA3 : Predicate<"Subtarget->hasSHA3()">, - AssemblerPredicate<"FeatureSHA3", "sha3">; + AssemblerPredicate<(all_of FeatureSHA3), "sha3">; def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<"FeatureSHA2", "sha2">; + AssemblerPredicate<(all_of FeatureSHA2), "sha2">; def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<"FeatureAES", "aes">; + AssemblerPredicate<(all_of FeatureAES), "aes">; def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<"FeatureDotProd", "dotprod">; + AssemblerPredicate<(all_of FeatureDotProd), "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; + AssemblerPredicate<(all_of FeatureCRC), "crc">; def HasLSE : Predicate<"Subtarget->hasLSE()">, - AssemblerPredicate<"FeatureLSE", "lse">; + AssemblerPredicate<(all_of FeatureLSE), "lse">; +def HasNoLSE : Predicate<"!Subtarget->hasLSE()">; def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<"FeatureRAS", "ras">; + AssemblerPredicate<(all_of FeatureRAS), "ras">; def HasRDM : Predicate<"Subtarget->hasRDM()">, - AssemblerPredicate<"FeatureRDM", "rdm">; + AssemblerPredicate<(all_of FeatureRDM), "rdm">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<"FeatureFullFP16", "fullfp16">; + AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">; +def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, + AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">; def HasSPE : Predicate<"Subtarget->hasSPE()">, - AssemblerPredicate<"FeatureSPE", "spe">; + AssemblerPredicate<(all_of FeatureSPE), "spe">; def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, - AssemblerPredicate<"FeatureFuseAES", + AssemblerPredicate<(all_of FeatureFuseAES), "fuse-aes">; def HasSVE : Predicate<"Subtarget->hasSVE()">, - AssemblerPredicate<"FeatureSVE", "sve">; + AssemblerPredicate<(all_of FeatureSVE), "sve">; +def HasSVE2 : Predicate<"Subtarget->hasSVE2()">, + AssemblerPredicate<(all_of FeatureSVE2), "sve2">; +def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">, + AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">; +def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">, + AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">; +def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">, + AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">; +def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">, + AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; +def HasSME : Predicate<"Subtarget->hasSME()">, + AssemblerPredicate<(all_of FeatureSME), "sme">; +def HasSMEF64 : Predicate<"Subtarget->hasSMEF64()">, + AssemblerPredicate<(all_of FeatureSMEF64), "sme-f64">; +def HasSMEI64 : Predicate<"Subtarget->hasSMEI64()">, + AssemblerPredicate<(all_of FeatureSMEI64), "sme-i64">; +def HasStreamingSVE : Predicate<"Subtarget->hasStreamingSVE()">, + AssemblerPredicate<(all_of FeatureStreamingSVE), "sme">; +// A subset of SVE(2) instructions are legal in Streaming SVE execution mode, +// they should be enabled if either 
has been specified. +def HasSVEorStreamingSVE + : Predicate<"Subtarget->hasSVE() || Subtarget->hasStreamingSVE()">, + AssemblerPredicate<(any_of FeatureSVE, FeatureStreamingSVE), + "sve or sme">; +def HasSVE2orStreamingSVE + : Predicate<"Subtarget->hasSVE2() || Subtarget->hasStreamingSVE()">, + AssemblerPredicate<(any_of FeatureSVE2, FeatureStreamingSVE), + "sve2 or sme">; +// A subset of NEON instructions are legal in Streaming SVE execution mode, +// they should be enabled if either has been specified. +def HasNEONorStreamingSVE + : Predicate<"Subtarget->hasNEON() || Subtarget->hasStreamingSVE()">, + AssemblerPredicate<(any_of FeatureNEON, FeatureStreamingSVE), + "neon or sme">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, - AssemblerPredicate<"FeatureRCPC", "rcpc">; - + AssemblerPredicate<(all_of FeatureRCPC), "rcpc">; +def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, + AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">; +def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">, + AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">; +def HasSB : Predicate<"Subtarget->hasSB()">, + AssemblerPredicate<(all_of FeatureSB), "sb">; +def HasPredRes : Predicate<"Subtarget->hasPredRes()">, + AssemblerPredicate<(all_of FeaturePredRes), "predres">; +def HasCCDP : Predicate<"Subtarget->hasCCDP()">, + AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">; +def HasBTI : Predicate<"Subtarget->hasBTI()">, + AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">; +def HasMTE : Predicate<"Subtarget->hasMTE()">, + AssemblerPredicate<(all_of FeatureMTE), "mte">; +def HasTME : Predicate<"Subtarget->hasTME()">, + AssemblerPredicate<(all_of FeatureTME), "tme">; +def HasETE : Predicate<"Subtarget->hasETE()">, + AssemblerPredicate<(all_of FeatureETE), "ete">; +def HasTRBE : Predicate<"Subtarget->hasTRBE()">, + AssemblerPredicate<(all_of FeatureTRBE), "trbe">; +def HasBF16 : Predicate<"Subtarget->hasBF16()">, + AssemblerPredicate<(all_of FeatureBF16), "bf16">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; +def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, + AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; +def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, + AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; +def HasXS : Predicate<"Subtarget->hasXS()">, + AssemblerPredicate<(all_of FeatureXS), "xs">; +def HasWFxT : Predicate<"Subtarget->hasWFxT()">, + AssemblerPredicate<(all_of FeatureWFxT), "wfxt">; +def HasLS64 : Predicate<"Subtarget->hasLS64()">, + AssemblerPredicate<(all_of FeatureLS64), "ls64">; +def HasBRBE : Predicate<"Subtarget->hasBRBE()">, + AssemblerPredicate<(all_of FeatureBRBE), "brbe">; +def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">, + AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">; +def HasHBC : Predicate<"Subtarget->hasHBC()">, + AssemblerPredicate<(all_of FeatureHBC), "hbc">; +def HasMOPS : Predicate<"Subtarget->hasMOPS()">, + AssemblerPredicate<(all_of FeatureMOPS), "mops">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; +def IsWindows : Predicate<"Subtarget->isTargetWindows()">; +def UseExperimentalZeroingPseudos + : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; def UseNegativeImmediates - : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", + : 
Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), "NegativeImmediates">; +def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">; + +def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -126,6 +280,7 @@ def SDT_AArch64FCmp : SDTypeProfile<0, 2, SDTCisSameAs<0, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>; def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>; @@ -137,6 +292,12 @@ def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDT_AArch64Dot: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>, SDTCisSameAs<2,3>]>; + +def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; @@ -154,6 +315,12 @@ def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; +def SDT_AArch64uaddlp : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; + +def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; + // Generates the general dynamic sequences, i.e. // adrp x0, :tlsdesc:var // ldr x1, [x0, #:tlsdesc_lo12:var] @@ -171,9 +338,126 @@ def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisSameAs<1, 4>]>; +def SDT_AArch64TBL : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2> +]>; + +// non-extending masked load fragment. +def nonext_masked_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast(N)->isUnindexed() && + !cast(N)->isNonTemporal(); +}]>; +// sign extending masked load fragments. 
+def asext_masked_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def),[{
+  return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD ||
+          cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed();
+}]>;
+def asext_masked_load_i8 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def asext_masked_load_i16 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def asext_masked_load_i32 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+// zero extending masked load fragments.
+def zext_masked_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed();
+}]>;
+def zext_masked_load_i8 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def zext_masked_load_i16 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def zext_masked_load_i32 :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def non_temporal_load :
+  PatFrag<(ops node:$ptr, node:$pred, node:$def),
+          (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+  return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+         cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+         cast<MaskedLoadSDNode>(N)->isNonTemporal();
+}]>;
+
+// non-truncating masked store fragment.
+def nontrunc_masked_store :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+         !cast<MaskedStoreSDNode>(N)->isNonTemporal();
+}]>;
+// truncating masked store fragments.
+def trunc_masked_store :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
+}]>;
+def trunc_masked_store_i8 :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def trunc_masked_store_i16 :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def trunc_masked_store_i32 :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def non_temporal_store :
+  PatFrag<(ops node:$val, node:$ptr, node:$pred),
+          (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+         cast<MaskedStoreSDNode>(N)->isNonTemporal();
+}]>;
+
+// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 GPR32:$src), [{
+  return SDValue(N,0)->getValueType(0) == MVT::i32 &&
+         CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+  }]>;
+
+// top32Zero - answer true if the upper 32 bits of $src are 0, false otherwise
+def top32Zero: PatLeaf<(i64 GPR64:$src), [{
+  return SDValue(N,0)->getValueType(0) == MVT::i64 &&
+         CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(64, 32));
+  }]>;
 
 // Node definitions.
 def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
+def AArch64adr           : SDNode<"AArch64ISD::ADR", SDTIntUnaryOp, []>;
 def AArch64addlow        : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
 def AArch64LOADgot       : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
 def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
@@ -188,6 +472,17 @@ def AArch64call          : SDNode<"AArch64ISD::CALL",
                                 SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
                                 [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
                                  SDNPVariadic]>;
+
+def AArch64call_bti      : SDNode<"AArch64ISD::CALL_BTI",
+                                SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                 SDNPVariadic]>;
+
+def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER",
+                             SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+                             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                              SDNPVariadic]>;
+
 def AArch64brcond        : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
                                 [SDNPHasChain]>;
 def AArch64cbz           : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
@@ -222,7 +517,14 @@ def AArch64fccmp         : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
 
 def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
 
-def AArch64fcmp      : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+def AArch64fcmp         : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+def AArch64strict_fcmp  : SDNode<"AArch64ISD::STRICT_FCMP", SDT_AArch64FCmp,
+                                 [SDNPHasChain]>;
+def AArch64strict_fcmpe : SDNode<"AArch64ISD::STRICT_FCMPE", SDT_AArch64FCmp,
+                                 [SDNPHasChain]>;
+def AArch64any_fcmp     : PatFrags<(ops node:$lhs, node:$rhs),
+                                   [(AArch64strict_fcmp node:$lhs, node:$rhs),
+                                    (AArch64fcmp node:$lhs, node:$rhs)]>;
 
 def AArch64dup       : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
 def AArch64duplane8  : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
@@ -230,6 +532,8 @@ def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
 def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
 def AArch64duplane64 :
SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; +def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>; + def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; @@ -258,10 +562,11 @@ def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; +def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>; +def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>; -def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; -def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; +def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; @@ -279,7 +584,7 @@ def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>; def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; @@ -290,8 +595,6 @@ def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; -def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; - def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; @@ -322,6 +625,9 @@ def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>; def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>; +def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>; +def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -329,6 +635,45 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>; def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>; def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; +def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>; +def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>; +def AArch64shadd : SDNode<"AArch64ISD::SHADD", SDT_AArch64binvec>; +def AArch64uhadd : SDNode<"AArch64ISD::UHADD", SDT_AArch64binvec>; + +def AArch64uabd : PatFrags<(ops node:$lhs, node:$rhs), + [(abdu node:$lhs, node:$rhs), + (int_aarch64_neon_uabd node:$lhs, node:$rhs)]>; +def AArch64sabd : PatFrags<(ops node:$lhs, node:$rhs), + [(abds node:$lhs, node:$rhs), + (int_aarch64_neon_sabd node:$lhs, node:$rhs)]>; + +def AArch64uaddlp_n : SDNode<"AArch64ISD::UADDLP", SDT_AArch64uaddlp>; +def AArch64uaddlp : PatFrags<(ops node:$src), + [(AArch64uaddlp_n node:$src), + (int_aarch64_neon_uaddlp 
node:$src)]>;
+
+def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64st2g : SDNode<"AArch64ISD::ST2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stz2g : SDNode<"AArch64ISD::STZ2G", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def SDT_AArch64unpk : SDTypeProfile<1, 1, [
+    SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+]>;
+def AArch64sunpkhi : SDNode<"AArch64ISD::SUNPKHI", SDT_AArch64unpk>;
+def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
+def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
+def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
+
+def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+def AArch64mrs : SDNode<"AArch64ISD::MRS",
+                        SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
+                        [SDNPHasChain, SDNPOutGlue]>;
 
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
@@ -338,14 +683,27 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
 // the Function object through the Subtarget and objections were raised
 // to that (see post-commit review comments for r301750).
 let RecomputePerFunction = 1 in {
-  def ForCodeSize   : Predicate<"MF->getFunction().optForSize()">;
-  def NotForCodeSize   : Predicate<"!MF->getFunction().optForSize()">;
+  def ForCodeSize   : Predicate<"shouldOptForSize(MF)">;
+  def NotForCodeSize   : Predicate<"!shouldOptForSize(MF)">;
   // Avoid generating STRQro if it is slow, unless we're optimizing for code size.
-  def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().optForSize()">;
+  def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">;
+
+  def UseBTI : Predicate<[{ MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>;
+  def NotUseBTI : Predicate<[{ !MF->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement() }]>;
+
+  def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
+  def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
+  // Toggles patterns which aren't beneficial in GlobalISel when we aren't
+  // optimizing. This allows us to selectively use patterns without impacting
+  // SelectionDAG's behaviour.
+  // FIXME: One day there will probably be a nicer way to check for this, but
+  // today is not that day.
+  def OptimizedGISelOrOtherSelector : Predicate<"!MF->getFunction().hasOptNone() || MF->getProperties().hasProperty(MachineFunctionProperties::Property::FailedISel) || !MF->getProperties().hasProperty(MachineFunctionProperties::Property::Legalized)">;
 }
 
 include "AArch64InstrFormats.td"
 include "SVEInstrFormats.td"
+include "SMEInstrFormats.td"
 
 //===----------------------------------------------------------------------===//
 
@@ -370,40 +728,40 @@ let isReMaterializable = 1, isCodeGenOnly = 1 in {
 // removed, along with the AArch64Wrapper node.
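+// For reference, on ELF targets LOADgot ultimately becomes a GOT-relative
+// adrp/ldr pair, e.g.:
+//
+//   adrp x0, :got:var
+//   ldr  x0, [x0, :got_lo12:var]
+//
+// which is why it can be modelled as a single rematerializable pseudo.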
let AddedComplexity = 10 in -def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), - [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, +def LOADgot : Pseudo<(outs GPR64common:$dst), (ins i64imm:$addr), + [(set GPR64common:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, Sched<[WriteLDAdr]>; // The MOVaddr instruction should match only when the add is not folded // into a load or store address. def MOVaddr - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), + : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), tglobaladdr:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrJT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), + : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), tjumptable:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrCP - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), + : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), tconstpool:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrBA - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), + : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), tblockaddress:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrTLS - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), + : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64common:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), tglobaltlsaddr:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrEXT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), + : Pseudo<(outs GPR64common:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64common:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), texternalsym:$low))]>, Sched<[WriteAdrAdr]>; // Normally AArch64addlow either gets folded into a following ldr/str, @@ -411,8 +769,8 @@ def MOVaddrEXT // might appear without either of them, so allow lowering it into a plain // add. def ADDlowTLS - : Pseudo<(outs GPR64:$dst), (ins GPR64:$src, i64imm:$low), - [(set GPR64:$dst, (AArch64addlow GPR64:$src, + : Pseudo<(outs GPR64sp:$dst), (ins GPR64sp:$src, i64imm:$low), + [(set GPR64sp:$dst, (AArch64addlow GPR64sp:$src, tglobaltlsaddr:$low))]>, Sched<[WriteAdr]>; @@ -427,6 +785,52 @@ def : Pat<(AArch64LOADgot texternalsym:$addr), def : Pat<(AArch64LOADgot tconstpool:$addr), (LOADgot tconstpool:$addr)>; +// In general these get lowered into a sequence of three 4-byte instructions. +// 32-bit jump table destination is actually only 2 instructions since we can +// use the table itself as a PC-relative base. But optimization occurs after +// branch relaxation so be pessimistic. 
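+// Illustrative expansion of the 32-bit case (register choices are up to the
+// allocator; the 8/16-bit variants load with ldrb/ldrh and additionally
+// scale the loaded offset, which is what brings them to three instructions):
+//
+//   ldrsw $scratch, [$table, $entry, lsl #2]
+//   add   $dst, $table, $scratch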
+let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch",
+    isNotDuplicable = 1 in {
+def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest16 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                            (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                     Sched<[]>;
+}
+
+// Space-consuming pseudo to aid testing of placement and reachability
+// algorithms. Immediate operand is the number of bytes this "instruction"
+// occupies; register operands can be used to enforce dependency and constrain
+// the scheduler.
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+def SPACE : Pseudo<(outs GPR64:$Rd), (ins i32imm:$size, GPR64:$Rn),
+                   [(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
+            Sched<[]>;
+
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+  def SpeculationSafeValueX
+      : Pseudo<(outs GPR64:$dst), (ins GPR64:$src), []>, Sched<[]>;
+  def SpeculationSafeValueW
+      : Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
+}
+
+// SpeculationBarrierEndBB must only be used after an unconditional control
+// flow, i.e. after a terminator for which isBarrier is True.
+let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+  // This gets lowered to a pair of 4-byte instructions.
+  let Size = 8 in
+  def SpeculationBarrierISBDSBEndBB
+      : Pseudo<(outs), (ins), []>, Sched<[]>;
+  // This gets lowered to a 4-byte instruction.
+  let Size = 4 in
+  def SpeculationBarrierSBEndBB
+      : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
+
 //===----------------------------------------------------------------------===//
 // System instructions.
 //===----------------------------------------------------------------------===//
@@ -438,8 +842,17 @@ def : InstAlias<"wfe", (HINT 0b010)>;
 def : InstAlias<"wfi", (HINT 0b011)>;
 def : InstAlias<"sev", (HINT 0b100)>;
 def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"dgh", (HINT 0b110)>;
 def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
 def : InstAlias<"csdb", (HINT 20)>;
+// In order to be able to write readable assembly, LLVM should accept assembly
+// inputs that use Branch Target Identification mnemonics, even with BTI disabled.
+// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
+// should not emit these mnemonics unless BTI is enabled.
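+// The trailing "0" on the first two aliases below is the InstAlias emit
+// priority: a zero-priority alias is accepted by the parser but never
+// selected by the printer, while the Requires<[HasBTI]> forms use the
+// default priority and therefore also print:
+//
+//   def : InstAlias<"bti", (HINT 32), 0>;                   // parse-only
+//   def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>;  // parse + print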
+def : InstAlias<"bti", (HINT 32), 0>; +def : InstAlias<"bti $op", (HINT btihint_op:$op), 0>; +def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>; +def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>; // v8.2a Statistical Profiling extension def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; @@ -463,16 +876,122 @@ def ISB : CRmSystemI { let CRm = 0b0010; let Inst{12} = 0; - let Predicates = [HasV8_4a]; -} + let Predicates = [HasTRACEV8_4]; } -// ARMv8.2 Dot Product +def DSBnXS : CRmSystemI { + let CRm{1-0} = 0b11; + let Inst{9-8} = 0b10; + let Predicates = [HasXS]; +} + +let Predicates = [HasWFxT] in { +def WFET : RegInputSystemI<0b0000, 0b000, "wfet">; +def WFIT : RegInputSystemI<0b0000, 0b001, "wfit">; +} + +// Branch Record Buffer two-word mnemonic instructions +class BRBEI op2, string keyword> + : SimpleSystemI<0, (ins), "brb", keyword>, Sched<[WriteSys]> { + let Inst{31-8} = 0b110101010000100101110010; + let Inst{7-5} = op2; + let Predicates = [HasBRBE]; +} +def BRB_IALL: BRBEI<0b100, "\tiall">; +def BRB_INJ: BRBEI<0b101, "\tinj">; + +} + +// Allow uppercase and lowercase keyword arguments for BRB IALL and BRB INJ +def : TokenAlias<"INJ", "inj">; +def : TokenAlias<"IALL", "iall">; + +// ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { -defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>; -defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>; -defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>; -defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>; +defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", AArch64sdot>; +defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", AArch64udot>; +defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", AArch64sdot>; +defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", AArch64udot>; +} + +// ARMv8.6-A BFloat +let Predicates = [HasNEON, HasBF16] in { +defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">; +defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">; +def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">; +def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFCVTN : SIMD_BFCVTN; +def BFCVTN2 : SIMD_BFCVTN2; + +// Vector-scalar BFDOT: +// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit +// register (the instruction uses a single 32-bit lane from it), so the pattern +// is a bit tricky. 
+def : Pat<(v2f32 (int_aarch64_neon_bfdot + (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (v4bf16 (bitconvert + (v2i32 (AArch64duplane32 + (v4i32 (bitconvert + (v8bf16 (insert_subvector undef, + (v4bf16 V64:$Rm), + (i64 0))))), + VectorIndexS:$idx)))))), + (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; +} + +let Predicates = [HasNEONorStreamingSVE, HasBF16] in { +def BFCVT : BF16ToSinglePrecision<"bfcvt">; +} + +// ARMv8.6A AArch64 matrix multiplication +let Predicates = [HasMatMulInt8] in { +def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; +def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; +def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; + +// sudot lane has a pattern where usdot is expected (there is no sudot). +// The second operand is used in the dup operation to repeat the indexed +// element. +class BaseSIMDSUDOTIndex + : BaseSIMDThreeSameVectorDotIndex { + let Pattern = [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (InputType RegType:$Rn))))]; +} + +multiclass SIMDSUDOTIndex { + def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>; + def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>; +} + +defm SUDOTlane : SIMDSUDOTIndex; + +} + +// ARMv8.2-A FP16 Fused Multiply-Add Long +let Predicates = [HasNEON, HasFP16FML] in { +defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>; +defm FMLSL : SIMDThreeSameVectorFML<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>; +defm FMLAL2 : SIMDThreeSameVectorFML<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>; +defm FMLSL2 : SIMDThreeSameVectorFML<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>; +defm FMLALlane : SIMDThreeSameVectorFMLIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>; +defm FMLSLlane : SIMDThreeSameVectorFMLIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>; +defm FMLAL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>; +defm FMLSL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>; } // Armv8.2-A Crypto extensions @@ -485,6 +1004,54 @@ def RAX1 : CryptoRRR_2D<0b0,0b11, "rax1">; def EOR3 : CryptoRRRR_16B<0b00, "eor3">; def BCAX : CryptoRRRR_16B<0b01, "bcax">; def XAR : CryptoRRRi6<"xar">; + +class SHA3_pattern + : Pat<(VecTy (OpNode (VecTy V128:$Vd), (VecTy V128:$Vn), (VecTy V128:$Vm))), + (INST (VecTy V128:$Vd), (VecTy V128:$Vn), (VecTy V128:$Vm))>; + +def : Pat<(v2i64 (int_aarch64_crypto_sha512su0 (v2i64 V128:$Vn), (v2i64 V128:$Vm))), + (SHA512SU0 (v2i64 V128:$Vn), (v2i64 V128:$Vm))>; + +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; + +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; + +class EOR3_pattern + : Pat<(xor (xor (VecTy V128:$Vn), (VecTy V128:$Vm)), (VecTy V128:$Va)), + (EOR3 (VecTy V128:$Vn), (VecTy V128:$Vm), (VecTy V128:$Va))>; + +def : EOR3_pattern; +def : EOR3_pattern; +def : EOR3_pattern; +def : EOR3_pattern; + +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; + +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; + 
+def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; +def : SHA3_pattern; + +def : Pat<(v2i64 (int_aarch64_crypto_rax1 (v2i64 V128:$Vn), (v2i64 V128:$Vm))), + (RAX1 (v2i64 V128:$Vn), (v2i64 V128:$Vm))>; + +def : Pat<(v2i64 (int_aarch64_crypto_xar (v2i64 V128:$Vn), (v2i64 V128:$Vm), (i64 timm0_63:$imm))), + (XAR (v2i64 V128:$Vn), (v2i64 V128:$Vm), (timm0_63:$imm))>; + + } // HasSHA3 let Predicates = [HasSM4] in { @@ -497,6 +1064,32 @@ def SM3PARTW1 : CryptoRRRTied_4S<0b1, 0b00, "sm3partw1">; def SM3PARTW2 : CryptoRRRTied_4S<0b1, 0b01, "sm3partw2">; def SM4ENCKEY : CryptoRRR_4S<0b1, 0b10, "sm4ekey">; def SM4E : CryptoRRTied_4S<0b0, 0b01, "sm4e">; + +def : Pat<(v4i32 (int_aarch64_crypto_sm3ss1 (v4i32 V128:$Vn), (v4i32 V128:$Vm), (v4i32 V128:$Va))), + (SM3SS1 (v4i32 V128:$Vn), (v4i32 V128:$Vm), (v4i32 V128:$Va))>; + +class SM3PARTW_pattern + : Pat<(v4i32 (OpNode (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm))), + (INST (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm))>; + +class SM3TT_pattern + : Pat<(v4i32 (OpNode (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm), (i64 VectorIndexS_timm:$imm) )), + (INST (v4i32 V128:$Vd), (v4i32 V128:$Vn), (v4i32 V128:$Vm), (VectorIndexS_timm:$imm))>; + +class SM4_pattern + : Pat<(v4i32 (OpNode (v4i32 V128:$Vn), (v4i32 V128:$Vm))), + (INST (v4i32 V128:$Vn), (v4i32 V128:$Vm))>; + +def : SM3PARTW_pattern; +def : SM3PARTW_pattern; + +def : SM3TT_pattern; +def : SM3TT_pattern; +def : SM3TT_pattern; +def : SM3TT_pattern; + +def : SM4_pattern; +def : SM4_pattern; } // HasSM4 let Predicates = [HasRCPC] in { @@ -513,65 +1106,181 @@ defm FCMLA : SIMDThreeSameVectorTiedComplexHSD<1, 0b110, complexrotateop, "fcmla", null_frag>; defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd, "fcadd", null_frag>; -defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla", - null_frag>; +defm FCMLA : SIMDIndexedTiedComplexHSD<0, 1, complexrotateop, "fcmla">; + +let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { + def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>; + def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 1))>; + def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot90 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 0))>; + def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>; +} + +let Predicates = [HasComplxNum, HasNEON] in { + def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>; + def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 1))>; + foreach Ty = [v4f32, v2f64] in { + def : Pat<(Ty (int_aarch64_neon_vcadd_rot90 (Ty V128:$Rn), (Ty V128:$Rm))), + (!cast("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 0))>; + def : Pat<(Ty (int_aarch64_neon_vcadd_rot270 (Ty V128:$Rn), (Ty V128:$Rm))), + (!cast("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 1))>; + } +} + +multiclass FCMLA_PATS { + def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast("FCMLA" # ty) $Rd, $Rn, $Rm, 0)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast("FCMLA" # ty) $Rd, $Rn, $Rm, 
1)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast("FCMLA" # ty) $Rd, $Rn, $Rm, 2)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), (ty Reg:$Rm))), + (!cast("FCMLA" # ty) $Rd, $Rn, $Rm, 3)>; +} + +multiclass FCMLA_LANE_PATS { + def : Pat<(ty (int_aarch64_neon_vcmla_rot0 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 0)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot90 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 1)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot180 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 2)>; + def : Pat<(ty (int_aarch64_neon_vcmla_rot270 (ty Reg:$Rd), (ty Reg:$Rn), RHSDup)), + (!cast("FCMLA" # ty # "_indexed") $Rd, $Rn, $Rm, VectorIndexS:$idx, 3)>; +} + + +let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { + defm : FCMLA_PATS; + defm : FCMLA_PATS; + + defm : FCMLA_LANE_PATS; + defm : FCMLA_LANE_PATS; +} +let Predicates = [HasComplxNum, HasNEON] in { + defm : FCMLA_PATS; + defm : FCMLA_PATS; + defm : FCMLA_PATS; + + defm : FCMLA_LANE_PATS; +} // v8.3a Pointer Authentication // These instructions inhabit part of the hint space and so can be used for -// armv8 targets +// armv8 targets. Keeping the old HINT mnemonic when compiling without PA is +// important for compatibility with other assemblers (e.g. GAS) when building +// software compatible with both CPUs that do or don't implement PA. let Uses = [LR], Defs = [LR] in { - def PACIAZ : SystemNoOperands<0b000, "paciaz">; - def PACIBZ : SystemNoOperands<0b010, "pacibz">; - def AUTIAZ : SystemNoOperands<0b100, "autiaz">; - def AUTIBZ : SystemNoOperands<0b110, "autibz">; + def PACIAZ : SystemNoOperands<0b000, "hint\t#24">; + def PACIBZ : SystemNoOperands<0b010, "hint\t#26">; + let isAuthenticated = 1 in { + def AUTIAZ : SystemNoOperands<0b100, "hint\t#28">; + def AUTIBZ : SystemNoOperands<0b110, "hint\t#30">; + } } let Uses = [LR, SP], Defs = [LR] in { - def PACIASP : SystemNoOperands<0b001, "paciasp">; - def PACIBSP : SystemNoOperands<0b011, "pacibsp">; - def AUTIASP : SystemNoOperands<0b101, "autiasp">; - def AUTIBSP : SystemNoOperands<0b111, "autibsp">; + def PACIASP : SystemNoOperands<0b001, "hint\t#25">; + def PACIBSP : SystemNoOperands<0b011, "hint\t#27">; + let isAuthenticated = 1 in { + def AUTIASP : SystemNoOperands<0b101, "hint\t#29">; + def AUTIBSP : SystemNoOperands<0b111, "hint\t#31">; + } } let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in { - def PACIA1716 : SystemNoOperands<0b000, "pacia1716">; - def PACIB1716 : SystemNoOperands<0b010, "pacib1716">; - def AUTIA1716 : SystemNoOperands<0b100, "autia1716">; - def AUTIB1716 : SystemNoOperands<0b110, "autib1716">; + def PACIA1716 : SystemNoOperands<0b000, "hint\t#8">; + def PACIB1716 : SystemNoOperands<0b010, "hint\t#10">; + let isAuthenticated = 1 in { + def AUTIA1716 : SystemNoOperands<0b100, "hint\t#12">; + def AUTIB1716 : SystemNoOperands<0b110, "hint\t#14">; + } } let Uses = [LR], Defs = [LR], CRm = 0b0000 in { - def XPACLRI : SystemNoOperands<0b111, "xpaclri">; + def XPACLRI : SystemNoOperands<0b111, "hint\t#7">; } -// These pointer authentication isntructions require armv8.3a -let Predicates = [HasV8_3a] in { - multiclass SignAuth prefix, bits<3> prefix_z, string asm> { - def IA : SignAuthOneData; - def IB : SignAuthOneData; - def DA : SignAuthOneData; - def DB : SignAuthOneData; - 
def IZA : SignAuthZero; - def DZA : SignAuthZero; - def IZB : SignAuthZero; - def DZB : SignAuthZero; +// In order to be able to write readable assembly, LLVM should accept assembly +// inputs that use pointer authentication mnemonics, even with PA disabled. +// However, in order to be compatible with other assemblers (e.g. GAS), LLVM +// should not emit these mnemonics unless PA is enabled. +def : InstAlias<"paciaz", (PACIAZ), 0>; +def : InstAlias<"pacibz", (PACIBZ), 0>; +def : InstAlias<"autiaz", (AUTIAZ), 0>; +def : InstAlias<"autibz", (AUTIBZ), 0>; +def : InstAlias<"paciasp", (PACIASP), 0>; +def : InstAlias<"pacibsp", (PACIBSP), 0>; +def : InstAlias<"autiasp", (AUTIASP), 0>; +def : InstAlias<"autibsp", (AUTIBSP), 0>; +def : InstAlias<"pacia1716", (PACIA1716), 0>; +def : InstAlias<"pacib1716", (PACIB1716), 0>; +def : InstAlias<"autia1716", (AUTIA1716), 0>; +def : InstAlias<"autib1716", (AUTIB1716), 0>; +def : InstAlias<"xpaclri", (XPACLRI), 0>; + +// These pointer authentication instructions require armv8.3a +let Predicates = [HasPAuth] in { + + // When PA is enabled, a better mnemonic should be emitted. + def : InstAlias<"paciaz", (PACIAZ), 1>; + def : InstAlias<"pacibz", (PACIBZ), 1>; + def : InstAlias<"autiaz", (AUTIAZ), 1>; + def : InstAlias<"autibz", (AUTIBZ), 1>; + def : InstAlias<"paciasp", (PACIASP), 1>; + def : InstAlias<"pacibsp", (PACIBSP), 1>; + def : InstAlias<"autiasp", (AUTIASP), 1>; + def : InstAlias<"autibsp", (AUTIBSP), 1>; + def : InstAlias<"pacia1716", (PACIA1716), 1>; + def : InstAlias<"pacib1716", (PACIB1716), 1>; + def : InstAlias<"autia1716", (AUTIA1716), 1>; + def : InstAlias<"autib1716", (AUTIB1716), 1>; + def : InstAlias<"xpaclri", (XPACLRI), 1>; + + multiclass SignAuth prefix, bits<3> prefix_z, string asm, + SDPatternOperator op> { + def IA : SignAuthOneData; + def IB : SignAuthOneData; + def DA : SignAuthOneData; + def DB : SignAuthOneData; + def IZA : SignAuthZero; + def DZA : SignAuthZero; + def IZB : SignAuthZero; + def DZB : SignAuthZero; } - defm PAC : SignAuth<0b000, 0b010, "pac">; - defm AUT : SignAuth<0b001, 0b011, "aut">; + defm PAC : SignAuth<0b000, 0b010, "pac", int_ptrauth_sign>; + defm AUT : SignAuth<0b001, 0b011, "aut", null_frag>; - def XPACI : SignAuthZero<0b100, 0b00, "xpaci">; - def XPACD : SignAuthZero<0b100, 0b01, "xpacd">; - def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>; + def XPACI : ClearAuth<0, "xpaci">; + def XPACD : ClearAuth<1, "xpacd">; + + def PACGA : SignAuthTwoOperand<0b1100, "pacga", int_ptrauth_sign_generic>; // Combined Instructions - def BRAA : AuthBranchTwoOperands<0, 0, "braa">; - def BRAB : AuthBranchTwoOperands<0, 1, "brab">; - def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; - def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def BRAA : AuthBranchTwoOperands<0, 0, "braa">; + def BRAB : AuthBranchTwoOperands<0, 1, "brab">; + } + let isCall = 1, Defs = [LR], Uses = [SP] in { + def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">; + def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">; + } - def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; - def BRABZ : AuthOneOperand<0b000, 1, "brabz">; - def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">; - def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def BRAAZ : AuthOneOperand<0b000, 0, "braaz">; + def BRABZ : AuthOneOperand<0b000, 1, "brabz">; + } + let isCall = 1, Defs = [LR], Uses = [SP] in { + def BLRAAZ : 
AuthOneOperand<0b001, 0, "blraaz">; + def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">; + } let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def RETAA : AuthReturn<0b010, 0, "retaa">; @@ -583,17 +1292,19 @@ let Predicates = [HasV8_3a] in { defm LDRAA : AuthLoad<0, "ldraa", simm10Scaled>; defm LDRAB : AuthLoad<1, "ldrab", simm10Scaled>; - // v8.3a floating point conversion for javascript - let Predicates = [HasV8_3a, HasFPARMv8] in - def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, - "fjcvtzs", []> { - let Inst{31} = 0; - } +} -} // HasV8_3a +// v8.3a floating point conversion for javascript +let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in +def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, + "fjcvtzs", + [(set GPR32:$Rd, + (int_aarch64_fjcvtzs FPR64:$Rn))]> { + let Inst{31} = 0; +} // HasJS, HasFPARMv8 // v8.4 Flag manipulation instructions -let Predicates = [HasV8_4a] in { +let Predicates = [HasFlagM], Defs = [NZCV], Uses = [NZCV] in { def CFINV : SimpleSystemI<0, (ins), "cfinv", "">, Sched<[WriteSys]> { let Inst{20-5} = 0b0000001000000000; } @@ -601,27 +1312,75 @@ def SETF8 : BaseFlagManipulation<0, 0, (ins GPR32:$Rn), "setf8", "{\t$Rn}">; def SETF16 : BaseFlagManipulation<0, 1, (ins GPR32:$Rn), "setf16", "{\t$Rn}">; def RMIF : FlagRotate<(ins GPR64:$Rn, uimm6:$imm, imm0_15:$mask), "rmif", "{\t$Rn, $imm, $mask}">; -} // HasV8_4a +} // HasFlagM + +// v8.5 flag manipulation instructions +let Predicates = [HasAltNZCV], Uses = [NZCV], Defs = [NZCV] in { + +def XAFLAG : PstateWriteSimple<(ins), "xaflag", "">, Sched<[WriteSys]> { + let Inst{18-16} = 0b000; + let Inst{11-8} = 0b0000; + let Unpredictable{11-8} = 0b1111; + let Inst{7-5} = 0b001; +} + +def AXFLAG : PstateWriteSimple<(ins), "axflag", "">, Sched<[WriteSys]> { + let Inst{18-16} = 0b000; + let Inst{11-8} = 0b0000; + let Unpredictable{11-8} = 0b1111; + let Inst{7-5} = 0b010; +} +} // HasAltNZCV + + +// Armv8.5-A speculation barrier +def SB : SimpleSystemI<0, (ins), "sb", "">, Sched<[]> { + let Inst{20-5} = 0b0001100110000111; + let Unpredictable{11-8} = 0b1111; + let Predicates = [HasSB]; + let hasSideEffects = 1; +} def : InstAlias<"clrex", (CLREX 0xf)>; def : InstAlias<"isb", (ISB 0xf)>; +def : InstAlias<"ssbb", (DSB 0)>; +def : InstAlias<"pssbb", (DSB 4)>; +def : InstAlias<"dfb", (DSB 0b1100)>, Requires<[HasV8_0r]>; def MRS : MRSI; def MSR : MSRI; def MSRpstateImm1 : MSRpstateImm0_1; def MSRpstateImm4 : MSRpstateImm0_15; +def : Pat<(AArch64mrs imm:$id), + (MRS imm:$id)>; + // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; +let Uses = [ X9 ], Defs = [ X16, X17, LR, NZCV ] in { +def HWASAN_CHECK_MEMACCESS : Pseudo< + (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), + [(int_hwasan_check_memaccess X9, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, + Sched<[]>; +} + +let Uses = [ X20 ], Defs = [ X16, X17, LR, NZCV ] in { +def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo< + (outs), (ins GPR64noip:$ptr, i32imm:$accessinfo), + [(int_hwasan_check_memaccess_shortgranules X20, GPR64noip:$ptr, (i32 timm:$accessinfo))]>, + Sched<[]>; +} + // The cycle counter PMC register is PMCCNTR_EL0. 
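+// The MRS/MSR immediates used below pack op0:op1:CRn:CRm:op2 into 16 bits:
+//
+//   imm = (op0 << 14) | (op1 << 11) | (CRn << 7) | (CRm << 3) | op2
+//
+// PMCCNTR_EL0 is S3_3_C9_C13_0, i.e. 0b11_011_1001_1101_000 = 0xdce8, and
+// FPCR (further down) is S3_3_C4_C4_0 = 0xda20.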
let Predicates = [HasPerfMon] in def : Pat<(readcyclecounter), (MRS 0xdce8)>; // FPCR register def : Pat<(i64 (int_aarch64_get_fpcr)), (MRS 0xda20)>; +def : Pat<(int_aarch64_set_fpcr i64:$val), (MSR 0xda20, GPR64:$val)>; // Generic system instructions def SYSxt : SystemXtI<0, "sys">; @@ -631,6 +1390,23 @@ def : InstAlias<"sys $op1, $Cn, $Cm, $op2", (SYSxt imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, XZR)>; + +let Predicates = [HasTME] in { + +def TSTART : TMSystemI<0b0000, "tstart", + [(set GPR64:$Rt, (int_aarch64_tstart))]>; + +def TCOMMIT : TMSystemINoOperand<0b0000, "tcommit", [(int_aarch64_tcommit)]>; + +def TCANCEL : TMSystemException<0b011, "tcancel", + [(int_aarch64_tcancel timm64_0_65535:$imm)]>; + +def TTEST : TMSystemI<0b0001, "ttest", [(set GPR64:$Rt, (int_aarch64_ttest))]> { + let mayLoad = 0; + let mayStore = 0; +} +} // HasTME + //===----------------------------------------------------------------------===// // Move immediate instructions. //===----------------------------------------------------------------------===// @@ -642,37 +1418,37 @@ let PostEncoderMethod = "fixMOVZ" in defm MOVZ : MoveImmediate<0b10, "movz">; // First group of aliases covers an implicit "lsl #0". -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0), 0>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, timm32_0_65535:$imm, 0), 0>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, timm32_0_65535:$imm, 0), 0>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, timm32_0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, timm32_0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, timm32_0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, timm32_0_65535:$imm, 0)>; // Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. 
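+// For example, a full 64-bit absolute address is built as:
+//
+//   movz x0, #:abs_g3:sym       // bits [63:48]
+//   movk x0, #:abs_g2_nc:sym    // bits [47:32]
+//   movk x0, #:abs_g1_nc:sym    // bits [31:16]
+//   movk x0, #:abs_g0_nc:sym    // bits [15:0]
+//
+// Each alias below ties one such relocation flavour to the matching 16-bit
+// shift amount.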
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g3:$sym, 48), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g2:$sym, 32), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movw_symbol_g0:$sym, 0), 0>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movw_symbol_g0:$sym, 0)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16), 0>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g1:$sym, 16), 0>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movw_symbol_g0:$sym, 0), 0>; // Final group of aliases covers true "mov $Rd, $imm" cases. multiclass movw_mov_alias, GISDNodeXFormEquiv; +let Predicates = [OptimizedGISelOrOtherSelector] in { +// The SUBREG_TO_REG isn't eliminated at -O0, which can result in pointless +// copies. def : Pat<(i64 i64imm_32bit:$src), (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; +} // Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model). 
def bitcast_fpimm_to_i32 : SDNodeXForm; let AddedComplexity = 1 in { -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>; } // Because of the immediate format for add/sub-imm instructions, the @@ -914,8 +1694,8 @@ def : ShiftAlias<"rorv", RORVXr, GPR64>; // Multiply-add let AddedComplexity = 5 in { -defm MADD : MulAccum<0, "madd", add>; -defm MSUB : MulAccum<1, "msub", sub>; +defm MADD : MulAccum<0, "madd">; +defm MSUB : MulAccum<1, "msub">; def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; @@ -938,8 +1718,16 @@ def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext_inreg GPR64:$Rm, i32))), + (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (sext GPR32:$Rm))), + (SMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>; def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (and GPR64:$Rm, 0xFFFFFFFF))), + (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), (EXTRACT_SUBREG $Rm, sub_32), XZR)>; +def : Pat<(i64 (mul (and GPR64:$Rn, 0xFFFFFFFF), (zext GPR32:$Rm))), + (UMADDLrrr (EXTRACT_SUBREG $Rn, sub_32), $Rm, XZR)>; def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; @@ -1076,6 +1864,112 @@ defm : STOPregister<"stsmin","LDSMIN">;// STSMINx defm : STOPregister<"stumax","LDUMAX">;// STUMAXx defm : STOPregister<"stumin","LDUMIN">;// STUMINx +// v8.5 Memory Tagging Extension +let Predicates = [HasMTE] in { + +def IRG : BaseTwoOperand<0b0100, GPR64sp, "irg", int_aarch64_irg, GPR64sp, GPR64>, + Sched<[]>{ + let Inst{31} = 1; +} +def GMI : BaseTwoOperand<0b0101, GPR64, "gmi", int_aarch64_gmi, GPR64sp>, Sched<[]>{ + let Inst{31} = 1; + let isNotDuplicable = 1; +} +def ADDG : AddSubG<0, "addg", null_frag>; +def SUBG : AddSubG<1, "subg", null_frag>; + +def : InstAlias<"irg $dst, $src", (IRG GPR64sp:$dst, GPR64sp:$src, XZR), 1>; + +def SUBP : SUBP<0, "subp", int_aarch64_subp>, Sched<[]>; +def SUBPS : SUBP<1, "subps", null_frag>, Sched<[]>{ + let Defs = [NZCV]; +} + +def : InstAlias<"cmpp $lhs, $rhs", (SUBPS XZR, GPR64sp:$lhs, GPR64sp:$rhs), 0>; + +def LDG : MemTagLoad<"ldg", "\t$Rt, [$Rn, $offset]">; + +def : Pat<(int_aarch64_addg (am_indexedu6s128 GPR64sp:$Rn, uimm6s16:$imm6), imm0_15:$imm4), + (ADDG GPR64sp:$Rn, imm0_63:$imm6, imm0_15:$imm4)>; +def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)), + (LDG GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>; + +def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>; + +def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]", + (outs GPR64:$Rt), (ins GPR64sp:$Rn)>; +def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]", + (outs), (ins GPR64:$Rt, GPR64sp:$Rn)>; +def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]", + (outs), (ins GPR64:$Rt, GPR64sp:$Rn)> { + let Inst{23} = 0; +} + +defm STG : MemTagStore<0b00, 
"stg">; +defm STZG : MemTagStore<0b01, "stzg">; +defm ST2G : MemTagStore<0b10, "st2g">; +defm STZ2G : MemTagStore<0b11, "stz2g">; + +def : Pat<(AArch64stg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (STGOffset $Rn, $Rm, $imm)>; +def : Pat<(AArch64stzg GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (STZGOffset $Rn, $Rm, $imm)>; +def : Pat<(AArch64st2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (ST2GOffset $Rn, $Rm, $imm)>; +def : Pat<(AArch64stz2g GPR64sp:$Rn, (am_indexeds9s128 GPR64sp:$Rm, simm9s16:$imm)), + (STZ2GOffset $Rn, $Rm, $imm)>; + +defm STGP : StorePairOffset <0b01, 0, GPR64z, simm7s16, "stgp">; +def STGPpre : StorePairPreIdx <0b01, 0, GPR64z, simm7s16, "stgp">; +def STGPpost : StorePairPostIdx<0b01, 0, GPR64z, simm7s16, "stgp">; + +def : Pat<(int_aarch64_stg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$offset)), + (STGOffset GPR64:$Rt, GPR64sp:$Rn, simm9s16:$offset)>; + +def : Pat<(int_aarch64_stgp (am_indexed7s128 GPR64sp:$Rn, simm7s16:$imm), GPR64:$Rt, GPR64:$Rt2), + (STGPi $Rt, $Rt2, $Rn, $imm)>; + +def IRGstack + : Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rsp, GPR64:$Rm), []>, + Sched<[]>; +def TAGPstack + : Pseudo<(outs GPR64sp:$Rd), (ins GPR64sp:$Rn, uimm6s16:$imm6, GPR64sp:$Rm, imm0_15:$imm4), []>, + Sched<[]>; + +// Explicit SP in the first operand prevents ShrinkWrap optimization +// from leaving this instruction out of the stack frame. When IRGstack +// is transformed into IRG, this operand is replaced with the actual +// register / expression for the tagged base pointer of the current function. +def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; + +// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. $Rm is the loop counter. +let isCodeGenOnly=1, mayStore=1 in { +def STGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +def STZGloop_wback + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +// A variant of the above where $Rn2 is an independent register not tied to the input register $Rn. +// Their purpose is to use a FrameIndex operand as $Rn (which of course can not be written back). +def STGloop + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; + +def STZGloop + : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn), + [], "@earlyclobber $Rn2,@earlyclobber $Rm" >, + Sched<[WriteAdr, WriteST]>; +} + +} // Predicates = [HasMTE] + //===----------------------------------------------------------------------===// // Logical instructions. 
//===----------------------------------------------------------------------===//
@@ -1162,6 +2056,8 @@ def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
                    (i64 1))),
          (CLSXr GPR64:$Rn)>;
+def : Pat<(int_aarch64_cls GPR32:$Rn), (CLSWr GPR32:$Rn)>;
+def : Pat<(int_aarch64_cls64 GPR64:$Rm), (EXTRACT_SUBREG (CLSXr GPR64:$Rm), sub_32)>;

// Unlike the other one-operand instructions, the instructions with the "rev"
// mnemonic do *not* just differ in the size bit, but actually use different
@@ -1178,6 +2074,10 @@ def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;
def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;

+// Match (srl (bswap x), C) -> revC if the upper bswap bits are known zero.
+def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
+def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
+
//===----------------------------------------------------------------------===//
// Bitfield immediate extraction instruction.
//===----------------------------------------------------------------------===//
@@ -1350,6 +2250,11 @@ def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
          (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
+def : Pat<(add GPR32:$val, (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV)),
+          (CSINCWr GPR32:$val, GPR32:$val, (i32 imm:$cc))>;
+def : Pat<(add GPR64:$val, (zext (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV))),
+          (CSINCXr GPR64:$val, GPR64:$val, (i32 imm:$cc))>;
+
// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser already inverts the condition code
// for these aliases.
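Worth noting what the two CSINC patterns added above buy: an add of a materialized boolean now folds into a single conditional increment instead of a cset-plus-add pair. A minimal C sketch of the shape they match (function name illustrative, assuming clang -O2 targeting aarch64):

/* cmp w1, w2 ; csinc x0, x0, x0, ge
 * CSINC yields x0 when the inverted condition (ge) holds,
 * and x0 + 1 otherwise. */
long add_flag(long x, int a, int b) {
    return x + (a < b);
}
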
@@ -1383,7 +2288,8 @@ def : InstAlias<"cneg $dst, $src, $cc", //===----------------------------------------------------------------------===// let isReMaterializable = 1 in { let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { -def ADR : ADRI<0, "adr", adrlabel, []>; +def ADR : ADRI<0, "adr", adrlabel, + [(set GPR64:$Xd, (AArch64adr tglobaladdr:$label))]>; } // hasSideEffects = 0 def ADRP : ADRI<1, "adrp", adrplabel, @@ -1391,6 +2297,10 @@ def ADRP : ADRI<1, "adrp", adrplabel, } // isReMaterializable = 1 // page address of a constant pool entry, block address +def : Pat<(AArch64adr tconstpool:$cp), (ADR tconstpool:$cp)>; +def : Pat<(AArch64adr tblockaddress:$cp), (ADR tblockaddress:$cp)>; +def : Pat<(AArch64adr texternalsym:$sym), (ADR texternalsym:$sym)>; +def : Pat<(AArch64adr tjumptable:$sym), (ADR tjumptable:$sym)>; def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; def : Pat<(AArch64adrp texternalsym:$sym), (ADRP texternalsym:$sym)>; @@ -1409,9 +2319,31 @@ def ERET : SpecialReturn<0b0100, "eret">; def : InstAlias<"ret", (RET LR)>; let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; + def BLR : BranchReg<0b0001, "blr", []>; + def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>, + Sched<[WriteBrReg]>, + PseudoInstExpansion<(BLR GPR64:$Rn)>; + def BLR_RVMARKER : Pseudo<(outs), (ins variable_ops), []>, + Sched<[WriteBrReg]>; + def BLR_BTI : Pseudo<(outs), (ins variable_ops), []>, + Sched<[WriteBrReg]>; } // isCall +def : Pat<(AArch64call GPR64:$Rn), + (BLR GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; +def : Pat<(AArch64call GPR64noip:$Rn), + (BLRNoIP GPR64noip:$Rn)>, + Requires<[SLSBLRMitigation]>; + +def : Pat<(AArch64call_rvmarker (i64 tglobaladdr:$rvfunc), GPR64:$Rn), + (BLR_RVMARKER tglobaladdr:$rvfunc, GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; + +def : Pat<(AArch64call_bti GPR64:$Rn), + (BLR_BTI GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch @@ -1434,9 +2366,14 @@ def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { let AsmString = ".tlsdesccall $sym"; } +// Pseudo instruction to tell the streamer to emit a 'B' character into the +// augmentation string. +def EMITBKEY : Pseudo<(outs), (ins), []>, Sched<[]> {} + // FIXME: maybe the scratch register used shouldn't be fixed to X1? // FIXME: can "hasSideEffects be dropped? -let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, +// This gets lowered to an instruction sequence which takes 16 bytes +let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, Size = 16, isCodeGenOnly = 1 in def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), @@ -1448,7 +2385,12 @@ def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), //===----------------------------------------------------------------------===// // Conditional branch (immediate) instruction. //===----------------------------------------------------------------------===// -def Bcc : BranchCond; +def Bcc : BranchCond<0, "b">; + +// Armv8.8-A variant form which hints to the branch predictor that +// this branch is very likely to go the same way nearly all the time +// (even though it is not known at compile time _which_ way that is). 
+def BCcc : BranchCond<1, "bc">, Requires<[HasHBC]>; //===----------------------------------------------------------------------===// // Compare-and-branch instructions. @@ -1482,16 +2424,18 @@ def BRK : ExceptionGeneration<0b001, 0b00, "brk">; } def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; -def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; +def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">, Requires<[HasEL3]>; def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; -def SMC : ExceptionGeneration<0b000, 0b11, "smc">; +def SMC : ExceptionGeneration<0b000, 0b11, "smc">, Requires<[HasEL3]>; def SVC : ExceptionGeneration<0b000, 0b01, "svc">; // DCPSn defaults to an immediate operand of zero if unspecified. def : InstAlias<"dcps1", (DCPS1 0)>; def : InstAlias<"dcps2", (DCPS2 0)>; -def : InstAlias<"dcps3", (DCPS3 0)>; +def : InstAlias<"dcps3", (DCPS3 0)>, Requires<[HasEL3]>; + +def UDF : UDFType<0, "udf">; //===----------------------------------------------------------------------===// // Load instructions. @@ -1532,6 +2476,9 @@ defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">; defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">; defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">; +def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (LDPXi GPR64sp:$Rn, simm7s8:$offset)>; + //--- // (register offset) //--- @@ -1635,6 +2582,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; + defm : VecROLoadPat; } defm : VecROLoadPat; @@ -1649,6 +2597,7 @@ let Predicates = [IsLE] in { defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; + defm : VecROLoadPat; defm : VecROLoadPat; } } // AddedComplexity = 10 @@ -1731,6 +2680,10 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr", [(set (f128 FPR128Op:$Rt), (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; +// bf16 load pattern +def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; + // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. 
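The AArch64ldp selection pattern above complements the later machine load/store optimizer; one way or the other, two adjacent 64-bit loads typically end up as a single ldp. A hypothetical C shape (name illustrative, clang -O2 --target=aarch64):

/* Typically:  ldp x8, x9, [x0] ; add x0, x8, x9 */
long sum_pair(const long *p) {
    return p[0] + p[1];
}
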
@@ -1780,6 +2733,8 @@ let Predicates = [IsLE] in { (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; @@ -1803,6 +2758,8 @@ let Predicates = [IsLE] in { (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; @@ -1883,14 +2840,37 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; //--- // (literal) -def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr">; -def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr">; -def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr">; -def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr">; -def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr">; + +def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{ + if (auto *G = dyn_cast(N)) { + const DataLayout &DL = MF->getDataLayout(); + Align Align = G->getGlobal()->getPointerAlignment(DL); + return Align >= 4 && G->getOffset() % 4 == 0; + } + if (auto *C = dyn_cast(N)) + return C->getAlign() >= 4 && C->getOffset() % 4 == 0; + return false; +}]>; + +def LDRWl : LoadLiteral<0b00, 0, GPR32z, "ldr", + [(set GPR32z:$Rt, (load (AArch64adr alignedglobal:$label)))]>; +def LDRXl : LoadLiteral<0b01, 0, GPR64z, "ldr", + [(set GPR64z:$Rt, (load (AArch64adr alignedglobal:$label)))]>; +def LDRSl : LoadLiteral<0b00, 1, FPR32Op, "ldr", + [(set (f32 FPR32Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>; +def LDRDl : LoadLiteral<0b01, 1, FPR64Op, "ldr", + [(set (f64 FPR64Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>; +def LDRQl : LoadLiteral<0b10, 1, FPR128Op, "ldr", + [(set (f128 FPR128Op:$Rt), (load (AArch64adr alignedglobal:$label)))]>; // load sign-extended word -def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw">; +def LDRSWl : LoadLiteral<0b10, 0, GPR64z, "ldrsw", + [(set GPR64z:$Rt, (sextloadi32 (AArch64adr alignedglobal:$label)))]>; + +let AddedComplexity = 20 in { +def : Pat<(i64 (zextloadi32 (AArch64adr alignedglobal:$label))), + (SUBREG_TO_REG (i64 0), (LDRWl $label), sub_32)>; +} // prefetch def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; @@ -1908,7 +2888,7 @@ defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur", [(set FPR8Op:$Rt, (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur", - [(set FPR16Op:$Rt, + [(set (f16 FPR16Op:$Rt), (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur", [(set (f32 FPR32Op:$Rt), @@ -2202,6 +3182,13 @@ defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">; defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">; defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">; +def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>; + +def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)), + (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, 
simm7s16:$offset)>; + + //--- // (Register offset) @@ -2217,7 +3204,7 @@ defm STRB : Store8RO< 0b00, 1, 0b00, FPR8Op, "str", untyped, store>; defm STRH : Store16RO<0b01, 1, 0b00, FPR16Op, "str", f16, store>; defm STRS : Store32RO<0b10, 1, 0b00, FPR32Op, "str", f32, store>; defm STRD : Store64RO<0b11, 1, 0b00, FPR64Op, "str", f64, store>; -defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str", f128, store>; +defm STRQ : Store128RO<0b00, 1, 0b10, FPR128Op, "str">; let Predicates = [UseSTRQro], AddedComplexity = 10 in { def : Pat<(store (f128 FPR128:$Rt), @@ -2271,6 +3258,7 @@ let Predicates = [IsLE] in { defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; + defm : VecROStorePat; } defm : VecROStorePat; @@ -2286,6 +3274,7 @@ let Predicates = [IsLE, UseSTRQro] in { defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; + defm : VecROStorePat; } } // AddedComplexity = 10 @@ -2346,6 +3335,11 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb", (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +// bf16 store pattern +def : Pat<(store (bf16 FPR16Op:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>; + let AddedComplexity = 10 in { // Match all store 64 bits width whose type is compatible with FPR64 @@ -2373,6 +3367,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v4f16 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -2403,6 +3400,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v8f16 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } // truncstore i64 @@ -2418,7 +3418,7 @@ def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), } // AddedComplexity = 10 // Match stores from lane 0 to the appropriate subreg's store. 
-multiclass VecStoreLane0Pat { @@ -2467,8 +3467,9 @@ defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32z, "sturb", [(truncstorei8 GPR32z:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; -// Armv8.4 LDAPR & STLR with Immediate Offset instruction -let Predicates = [HasV8_4a] in { +// Armv8.4 Weaker Release Consistency enhancements +// LDAPR & STLR with Immediate Offset instructions +let Predicates = [HasRCPC_IMMO] in { defm STLURB : BaseStoreUnscaleV84<"stlurb", 0b00, 0b00, GPR32>; defm STLURH : BaseStoreUnscaleV84<"stlurh", 0b01, 0b00, GPR32>; defm STLURW : BaseStoreUnscaleV84<"stlur", 0b10, 0b00, GPR32>; @@ -2509,6 +3510,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v4f16 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -2541,6 +3545,9 @@ let Predicates = [IsLE] in { def : Pat<(store (v8f16 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } } // AddedComplexity = 10 @@ -2677,6 +3684,9 @@ def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (bf16 FPR16:$Rt), GPR64sp:$addr, simm9:$off), + (STRHpost FPR16:$Rt, GPR64sp:$addr, simm9:$off)>; + def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), @@ -2691,6 +3701,8 @@ def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; @@ -2706,6 +3718,8 @@ def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; //===----------------------------------------------------------------------===// // Load/store exclusive instructions. 
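As a rough reminder of where the exclusive-access family below surfaces: without FEAT_LSE, C11 atomic read-modify-write operations are lowered to ldaxr/stlxr loops. A sketch, assuming clang -O2 --target=aarch64 with -mno-outline-atomics:

#include <stdatomic.h>

/* Roughly:
 *   .retry: ldaxr w8, [x0]        ; load-acquire exclusive
 *           cmp   w8, w1
 *           b.ne  .fail
 *           stlxr w9, w2, [x0]    ; store-release exclusive
 *           cbnz  w9, .retry
 */
_Bool cas_int(_Atomic int *p, int expected, int desired) {
    return atomic_compare_exchange_strong(p, &expected, desired);
}
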
@@ -2753,7 +3767,7 @@ def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; -let Predicates = [HasV8_1a] in { +let Predicates = [HasLOR] in { // v8.1a "Limited Order Region" extension load-acquire instructions def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">; def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">; @@ -2779,23 +3793,63 @@ defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; -defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; + +// AArch64's FCVT instructions saturate when out of range. +multiclass FPToIntegerSatPats { + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat f16:$Rn, i32)), + (!cast(INST # UWHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat f16:$Rn, i64)), + (!cast(INST # UXHr) f16:$Rn)>; + } + def : Pat<(i32 (to_int_sat f32:$Rn, i32)), + (!cast(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int_sat f32:$Rn, i64)), + (!cast(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat f64:$Rn, i32)), + (!cast(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int_sat f64:$Rn, i64)), + (!cast(INST # UXDr) f64:$Rn)>; + + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), + (!cast(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), + (!cast(INST # SXHri) $Rn, $scale)>; + } + def : Pat<(i32 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i32:$scale), i32)), + (!cast(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat (fmul f32:$Rn, fixedpoint_f32_i64:$scale), i64)), + (!cast(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i32:$scale), i32)), + (!cast(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), + (!cast(INST # SXDri) $Rn, $scale)>; +} + +defm : FPToIntegerSatPats; +defm : FPToIntegerSatPats; multiclass FPToIntegerIntPats { + let Predicates = [HasFullFP16] in { def : Pat<(i32 (round f16:$Rn)), (!cast(INST # UWHr) $Rn)>; def : Pat<(i64 (round f16:$Rn)), (!cast(INST # UXHr) $Rn)>; + } def : Pat<(i32 (round f32:$Rn)), (!cast(INST # UWSr) $Rn)>; def : Pat<(i64 (round f32:$Rn)), (!cast(INST # UXSr) $Rn)>; def : Pat<(i32 (round f64:$Rn)), (!cast(INST # UWDr) $Rn)>; def : Pat<(i64 (round f64:$Rn)), (!cast(INST # UXDr) $Rn)>; + let Predicates = [HasFullFP16] in { def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), (!cast(INST # SWHri) $Rn, $scale)>; def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), (!cast(INST # SXHri) $Rn, $scale)>; + } def : Pat<(i32 (round (fmul f32:$Rn, 
fixedpoint_f32_i32:$scale))), (!cast(INST # SWSri) $Rn, $scale)>; def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), @@ -2809,7 +3863,7 @@ multiclass FPToIntegerIntPats { defm : FPToIntegerIntPats; defm : FPToIntegerIntPats; -multiclass FPToIntegerPats { +multiclass FPToIntegerPats { def : Pat<(i32 (to_int (round f32:$Rn))), (!cast(INST # UWSr) f32:$Rn)>; def : Pat<(i64 (to_int (round f32:$Rn))), @@ -2818,23 +3872,62 @@ multiclass FPToIntegerPats { (!cast(INST # UWDr) f64:$Rn)>; def : Pat<(i64 (to_int (round f64:$Rn))), (!cast(INST # UXDr) f64:$Rn)>; + + // These instructions saturate like fp_to_[su]int_sat. + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)), + (!cast(INST # UWHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat (round f16:$Rn), i64)), + (!cast(INST # UXHr) f16:$Rn)>; + } + def : Pat<(i32 (to_int_sat (round f32:$Rn), i32)), + (!cast(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int_sat (round f32:$Rn), i64)), + (!cast(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat (round f64:$Rn), i32)), + (!cast(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)), + (!cast(INST # UXDr) f64:$Rn)>; } -defm : FPToIntegerPats; -defm : FPToIntegerPats; -defm : FPToIntegerPats; -defm : FPToIntegerPats; -defm : FPToIntegerPats; -defm : FPToIntegerPats; -defm : FPToIntegerPats; -defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; +defm : FPToIntegerPats; + + + +let Predicates = [HasFullFP16] in { + def : Pat<(i32 (lround f16:$Rn)), + (!cast(FCVTASUWHr) f16:$Rn)>; + def : Pat<(i64 (lround f16:$Rn)), + (!cast(FCVTASUXHr) f16:$Rn)>; + def : Pat<(i64 (llround f16:$Rn)), + (!cast(FCVTASUXHr) f16:$Rn)>; +} +def : Pat<(i32 (lround f32:$Rn)), + (!cast(FCVTASUWSr) f32:$Rn)>; +def : Pat<(i32 (lround f64:$Rn)), + (!cast(FCVTASUWDr) f64:$Rn)>; +def : Pat<(i64 (lround f32:$Rn)), + (!cast(FCVTASUXSr) f32:$Rn)>; +def : Pat<(i64 (lround f64:$Rn)), + (!cast(FCVTASUXDr) f64:$Rn)>; +def : Pat<(i64 (llround f32:$Rn)), + (!cast(FCVTASUXSr) f32:$Rn)>; +def : Pat<(i64 (llround f64:$Rn)), + (!cast(FCVTASUXDr) f64:$Rn)>; //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// -defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; -defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; +defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>; //===----------------------------------------------------------------------===// // Unscaled integer to floating point conversion instruction. 
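A quick illustration of the lround/llround patterns above: the calls select to single fcvtas instructions rather than libcalls, while the new to_int_sat patterns serve frontends that emit llvm.fptosi.sat. A sketch, assuming clang -O2 -fno-math-errno:

#include <math.h>

/* fcvtas x0, d0 : round to nearest, ties away, as lround requires */
long nearest(double x) {
    return lround(x);
}

/* fcvtzs w0, s0 : fcvtzs saturates in hardware, which is exactly
 * what the saturating conversion patterns above rely on */
int toward_zero(float x) {
    return (int)x;
}
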
@@ -2873,12 +3966,9 @@ defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>; defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; -defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", froundeven>; defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; -def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), - (FRINTNDr FPR64:$Rn)>; - defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; @@ -2886,6 +3976,34 @@ let SchedRW = [WriteFDiv] in { defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; } +let Predicates = [HasFRInt3264] in { + defm FRINT32Z : FRIntNNT<0b00, "frint32z", int_aarch64_frint32z>; + defm FRINT64Z : FRIntNNT<0b10, "frint64z", int_aarch64_frint64z>; + defm FRINT32X : FRIntNNT<0b01, "frint32x", int_aarch64_frint32x>; + defm FRINT64X : FRIntNNT<0b11, "frint64x", int_aarch64_frint64x>; +} // HasFRInt3264 + +let Predicates = [HasFullFP16] in { + def : Pat<(i32 (lrint f16:$Rn)), + (FCVTZSUWHr (!cast(FRINTXHr) f16:$Rn))>; + def : Pat<(i64 (lrint f16:$Rn)), + (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; + def : Pat<(i64 (llrint f16:$Rn)), + (FCVTZSUXHr (!cast(FRINTXHr) f16:$Rn))>; +} +def : Pat<(i32 (lrint f32:$Rn)), + (FCVTZSUWSr (!cast(FRINTXSr) f32:$Rn))>; +def : Pat<(i32 (lrint f64:$Rn)), + (FCVTZSUWDr (!cast(FRINTXDr) f64:$Rn))>; +def : Pat<(i64 (lrint f32:$Rn)), + (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; +def : Pat<(i64 (lrint f64:$Rn)), + (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; +def : Pat<(i64 (llrint f32:$Rn)), + (FCVTZSUXSr (!cast(FRINTXSr) f32:$Rn))>; +def : Pat<(i64 (llrint f64:$Rn)), + (FCVTZSUXDr (!cast(FRINTXDr) f64:$Rn))>; + //===----------------------------------------------------------------------===// // Floating point two operand instructions. //===----------------------------------------------------------------------===// @@ -2895,18 +4013,18 @@ let SchedRW = [WriteFDiv] in { defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; } defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaximum>; defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", fminimum>; let SchedRW = [WriteFMul] in { defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; } defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; -def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminimum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINDrr FPR64:$Rn, FPR64:$Rm)>; def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; @@ -2930,32 +4048,37 @@ defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", // N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike // the NEON variant. 
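// For orientation, the C shapes (under -ffp-contract=fast, or with explicit
// fma() calls) that reach the negated-fma patterns below; a sketch, function
// names illustrative:
//
//   double msub(double a, double b, double c)  { return c - a * b; }
//     -> fmsub  d0, d0, d1, d2   computing d2 - d0*d1
//   double nmadd(double a, double b, double c) { return -(a * b + c); }
//     -> fnmadd d0, d0, d1, d2   computing -(d0*d1) - d2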
+ +// Here we handle first -(a + b*c) for FNMADD: + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, FPR16:$Ra)), + (FMSUBHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and -// "(-a) + b*(-c)". +// Now it's time for "(-a) + (-b)*c" + +let Predicates = [HasNEON, HasFullFP16] in +def : Pat<(f16 (fma (fneg FPR16:$Rn), FPR16:$Rm, (fneg FPR16:$Ra))), + (FNMADDHrrr FPR16:$Rn, FPR16:$Rm, FPR16:$Ra)>; + def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; -def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), - (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), - (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - //===----------------------------------------------------------------------===// // Floating point comparison instructions. //===----------------------------------------------------------------------===// -defm FCMPE : FPComparison<1, "fcmpe">; -defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; +defm FCMPE : FPComparison<1, "fcmpe", AArch64strict_fcmpe>; +defm FCMP : FPComparison<0, "fcmp", AArch64any_fcmp>; //===----------------------------------------------------------------------===// // Floating point conditional comparison instructions. @@ -2983,12 +4106,52 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd), let hasNoSchedulingInfo = 1; } +//===----------------------------------------------------------------------===// +// Instructions used for emitting unwind opcodes on ARM64 Windows. 
+//===----------------------------------------------------------------------===// +let isPseudo = 1 in { + def SEH_StackAlloc : Pseudo<(outs), (ins i32imm:$size), []>, Sched<[]>; + def SEH_SaveFPLR : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>; + def SEH_SaveFPLR_X : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>; + def SEH_SaveReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveFReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveFReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveFRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveFRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; + def SEH_SetFP : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_AddFP : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>; + def SEH_Nop : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_PrologEnd : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>; +} + +// Pseudo instructions for Windows EH +//===----------------------------------------------------------------------===// +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in { + def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>; + let usesCustomInserter = 1 in + def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>, + Sched<[]>; +} + +// Pseudo instructions for homogeneous prolog/epilog +let isPseudo = 1 in { + // Save CSRs in order, {FPOffset} + def HOM_Prolog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>; + // Restore CSRs in order + def HOM_Epilog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>; +} //===----------------------------------------------------------------------===// // Floating point immediate move. //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm FMOV : FPMoveImmediate<"fmov">; } @@ -2997,7 +4160,7 @@ defm FMOV : FPMoveImmediate<"fmov">; //===----------------------------------------------------------------------===// defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_aarch64_neon_uabd>; + AArch64uabd>; // Match UABDL in log2-shuffle patterns. 
def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), (zext (v8i8 V64:$opB))))), @@ -3039,6 +4202,21 @@ defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; +def : Pat<(v8i8 (AArch64vashr (v8i8 V64:$Rn), (i32 7))), + (CMLTv8i8rz V64:$Rn)>; +def : Pat<(v4i16 (AArch64vashr (v4i16 V64:$Rn), (i32 15))), + (CMLTv4i16rz V64:$Rn)>; +def : Pat<(v2i32 (AArch64vashr (v2i32 V64:$Rn), (i32 31))), + (CMLTv2i32rz V64:$Rn)>; +def : Pat<(v16i8 (AArch64vashr (v16i8 V128:$Rn), (i32 7))), + (CMLTv16i8rz V128:$Rn)>; +def : Pat<(v8i16 (AArch64vashr (v8i16 V128:$Rn), (i32 15))), + (CMLTv8i16rz V128:$Rn)>; +def : Pat<(v4i32 (AArch64vashr (v4i32 V128:$Rn), (i32 31))), + (CMLTv4i32rz V128:$Rn)>; +def : Pat<(v2i64 (AArch64vashr (v2i64 V128:$Rn), (i32 63))), + (CMLTv2i64rz V128:$Rn)>; + defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; @@ -3053,14 +4231,8 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn) (i64 4)))), (FCVTLv8i16 V128:$Rn)>; def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn), - (i64 2))))), - (FCVTLv4i32 V128:$Rn)>; def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn), - (i64 4))))), - (FCVTLv8i16 V128:$Rn)>; defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; @@ -3083,6 +4255,22 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +// AArch64's FCVT instructions saturate when out of range. 
+multiclass SIMDTwoVectorFPToIntSatPats { + def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)), + (!cast(INST # v4f16) v4f16:$Rn)>; + def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)), + (!cast(INST # v8f16) v8f16:$Rn)>; + def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)), + (!cast(INST # v2f32) v2f32:$Rn)>; + def : Pat<(v4i32 (to_int_sat v4f32:$Rn, i32)), + (!cast(INST # v4f32) v4f32:$Rn)>; + def : Pat<(v2i64 (to_int_sat v2f64:$Rn, i64)), + (!cast(INST # v2f64) v2f64:$Rn)>; +} +defm : SIMDTwoVectorFPToIntSatPats; +defm : SIMDTwoVectorFPToIntSatPats; + def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>; def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>; def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>; @@ -3100,10 +4288,18 @@ defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>; defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", froundeven>; defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; + +let Predicates = [HasFRInt3264] in { + defm FRINT32Z : FRIntNNTVector<0, 0, "frint32z", int_aarch64_neon_frint32z>; + defm FRINT64Z : FRIntNNTVector<0, 1, "frint64z", int_aarch64_neon_frint64z>; + defm FRINT32X : FRIntNNTVector<1, 0, "frint32x", int_aarch64_neon_frint32x>; + defm FRINT64X : FRIntNNTVector<1, 1, "frint64x", int_aarch64_neon_frint64x>; +} // HasFRInt3264 + defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", @@ -3115,30 +4311,14 @@ def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", (NOTv16i8 V128:$Vd, V128:$Vn)>; -def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; -def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; -def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; -def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; -def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; -def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; -def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; - -def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; +defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, 
"rbit", bitreverse>; defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; @@ -3153,9 +4333,8 @@ defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", - BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; -defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", - int_aarch64_neon_uaddlp>; + BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >; +defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>; defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; @@ -3163,12 +4342,16 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte> defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; -def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; -def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; -def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; -def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; -def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; // Patterns for vector long shift (by element width). These need to match all // three of zext, sext and anyext so it's easier to pull the patterns out of the @@ -3192,6 +4375,63 @@ defm : SIMDVectorLShiftLongBySizeBHSPats; defm : SIMDVectorLShiftLongBySizeBHSPats; defm : SIMDVectorLShiftLongBySizeBHSPats; +// Constant vector values, used in the S/UQXTN patterns below. 
+def VImmFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 85))))>; +def VImmFFFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 51))))>; +def VImm7F: PatLeaf<(AArch64movi_shift (i32 127), (i32 0))>; +def VImm80: PatLeaf<(AArch64mvni_shift (i32 127), (i32 0))>; +def VImm7FFF: PatLeaf<(AArch64movi_msl (i32 127), (i32 264))>; +def VImm8000: PatLeaf<(AArch64mvni_msl (i32 127), (i32 264))>; + +// trunc(umin(X, 255)) -> UQXTRN v8i8 +def : Pat<(v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))), + (UQXTNv8i8 V128:$Vn)>; +// trunc(umin(X, 65535)) -> UQXTRN v4i16 +def : Pat<(v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))), + (UQXTNv4i16 V128:$Vn)>; +// trunc(smin(smax(X, -128), 128)) -> SQXTRN +// with reversed min/max +def : Pat<(v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)), + (v8i16 VImm7F)))), + (SQXTNv8i8 V128:$Vn)>; +def : Pat<(v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), + (v8i16 VImm80)))), + (SQXTNv8i8 V128:$Vn)>; +// trunc(smin(smax(X, -32768), 32767)) -> SQXTRN +// with reversed min/max +def : Pat<(v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), + (v4i32 VImm7FFF)))), + (SQXTNv4i16 V128:$Vn)>; +def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), + (v4i32 VImm8000)))), + (SQXTNv4i16 V128:$Vn)>; + +// concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn) +// with reversed min/max +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)), + (v8i16 VImm7F)))))), + (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), + (v8i16 VImm80)))))), + (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + +// concat_vectors(Vd, trunc(smin(smax Vm, -32768), 32767) ~> SQXTN2(Vd, Vn) +// with reversed min/max +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), + (v4i32 VImm7FFF)))))), + (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), + (v4i32 VImm8000)))))), + (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + //===----------------------------------------------------------------------===// // Advanced SIMD three vector instructions. 
//===----------------------------------------------------------------------===// @@ -3204,6 +4444,9 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; +foreach VT = [ v8i8, v16i8, v4i16, v8i16, v2i32, v4i32, v2i64 ] in { +def : Pat<(vnot (AArch64cmeqz VT:$Rn)), (!cast("CMTST"#VT) VT:$Rn, VT:$Rn)>; +} defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; let Predicates = [HasNEON] in { foreach VT = [ v2f32, v4f32, v2f64 ] in @@ -3215,7 +4458,7 @@ def : Pat<(fabs (fsub VT:$Rn, VT:$Rm)), (!cast("FABD"#VT) VT:$Rn, V } defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_faddp>; defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; @@ -3224,11 +4467,11 @@ defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>; defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. @@ -3237,32 +4480,22 @@ defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -// The following def pats catch the case where the LHS of an FMA is negated. -// The TriOpFrag above catches the case where the middle operand is negated. 
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), - (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; - defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; -defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; + +// MLA and MLS are generated in MachineCombine +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; + defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; + TriOpFrag<(add node:$LHS, (AArch64sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", AArch64sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", AArch64shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; @@ -3274,14 +4507,14 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>; defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; -defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; + TriOpFrag<(add node:$LHS, (AArch64uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", AArch64uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", AArch64uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; @@ -3291,44 +4524,53 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", 
int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", - int_aarch64_neon_sqadd>; + int_aarch64_neon_sqrdmlah>; defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", - int_aarch64_neon_sqsub>; + int_aarch64_neon_sqrdmlsh>; + +// Extra saturate patterns, other than the intrinsics matches above +defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>; +defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>; +defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>; +defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; +// Pseudo bitwise select pattern BSP. +// It is expanded into BSL/BIT/BIF after register allocation. 
+defm BSP : SIMDLogicalThreeVectorPseudo>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">; -def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; @@ -3534,9 +4776,9 @@ defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONorStreamingSVE>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONorStreamingSVE>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONorStreamingSVE>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3555,15 +4797,11 @@ defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; let Predicates = [HasRDM] in { defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; - def : Pat<(i32 (int_aarch64_neon_sqadd - (i32 FPR32:$Rd), - (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), + def : Pat<(i32 (int_aarch64_neon_sqrdmlah (i32 FPR32:$Rd), (i32 FPR32:$Rn), + (i32 
FPR32:$Rm))), (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(i32 (int_aarch64_neon_sqsub - (i32 FPR32:$Rd), - (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), + def : Pat<(i32 (int_aarch64_neon_sqrdmlsh (i32 FPR32:$Rd), (i32 FPR32:$Rn), + (i32 FPR32:$Rm))), (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; } @@ -3635,9 +4873,9 @@ defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe", HasNEONorStreamingSVE>; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx", HasNEONorStreamingSVE>; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte", HasNEONorStreamingSVE>; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; @@ -3652,7 +4890,8 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; -def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; +def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))), + (CMLTv1i64rz V64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), (FCVTASv1i64 FPR64:$Rn)>; @@ -3670,6 +4909,10 @@ def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), (FCVTPSv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), (FCVTPUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtzs (v1f64 FPR64:$Rn))), + (FCVTZSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtzu (v1f64 FPR64:$Rn))), + (FCVTZUv1i64 FPR64:$Rn)>; def : Pat<(f16 (int_aarch64_neon_frecpe (f16 FPR16:$Rn))), (FRECPEv1f16 FPR16:$Rn)>; @@ -3744,6 +4987,27 @@ def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))), def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>; +// Some float -> int -> float conversion patterns for which we want to keep the +// int values in FP registers using the corresponding NEON instructions to +// avoid more costly int <-> fp register transfers. +let Predicates = [HasNEON] in { +def : Pat<(f64 (sint_to_fp (i64 (fp_to_sint f64:$Rn)))), + (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; +def : Pat<(f32 (sint_to_fp (i32 (fp_to_sint f32:$Rn)))), + (SCVTFv1i32 (i32 (FCVTZSv1i32 f32:$Rn)))>; +def : Pat<(f64 (uint_to_fp (i64 (fp_to_uint f64:$Rn)))), + (UCVTFv1i64 (i64 (FCVTZUv1i64 f64:$Rn)))>; +def : Pat<(f32 (uint_to_fp (i32 (fp_to_uint f32:$Rn)))), + (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; + +let Predicates = [HasFullFP16] in { +def : Pat<(f16 (sint_to_fp (i32 (fp_to_sint f16:$Rn)))), + (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; +def : Pat<(f16 (uint_to_fp (i32 (fp_to_uint f16:$Rn)))), + (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; +} +} + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. 
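The float -> int -> float patterns just above keep the intermediate integer in a SIMD/FP register. A sketch of the C shape they target (clang -O2):

/* With those patterns: fcvtzs s0, s0 ; scvtf s0, s0
 * instead of a round trip through a GPR (fcvtzs w8, s0 ; scvtf s0, w8). */
float trunc_value(float x) {
    return (float)(int)x;
}
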
@@ -3841,9 +5105,9 @@ defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>; defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - int_aarch64_neon_sabd>; + AArch64sabd>; defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - int_aarch64_neon_sabd>; + AArch64sabd>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", @@ -3864,20 +5128,58 @@ defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - int_aarch64_neon_uabd>; + AArch64uabd>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", - BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; + BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", - BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; + BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", - BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; + BinOpFrag<(sub (zanyext node:$LHS), (zanyext node:$RHS))>>; defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", - BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; + BinOpFrag<(sub node:$LHS, (zanyext node:$RHS))>>; + +// Additional patterns for [SU]ML[AS]L +multiclass Neon_mul_acc_widen_patterns { + def : Pat<(v4i16 (opnode + V64:$Ra, + (v4i16 (extract_subvector + (vecopnode (v8i8 V64:$Rn),(v8i8 V64:$Rm)), + (i64 0))))), + (EXTRACT_SUBREG (v8i16 (INST8B + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), V64:$Ra, dsub), + V64:$Rn, V64:$Rm)), dsub)>; + def : Pat<(v2i32 (opnode + V64:$Ra, + (v2i32 (extract_subvector + (vecopnode (v4i16 V64:$Rn),(v4i16 V64:$Rm)), + (i64 0))))), + (EXTRACT_SUBREG (v4i32 (INST4H + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), V64:$Ra, dsub), + V64:$Rn, V64:$Rm)), dsub)>; + def : Pat<(v1i64 (opnode + V64:$Ra, + (v1i64 (extract_subvector + (vecopnode (v2i32 V64:$Rn),(v2i32 V64:$Rm)), + (i64 0))))), + (EXTRACT_SUBREG (v2i64 (INST2S + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), V64:$Ra, dsub), + V64:$Rn, V64:$Rm)), dsub)>; +} + +defm : Neon_mul_acc_widen_patterns; +defm : Neon_mul_acc_widen_patterns; +defm : Neon_mul_acc_widen_patterns; +defm : Neon_mul_acc_widen_patterns; // Additional patterns for SMULL and UMULL multiclass Neon_mul_widen_patterns; -def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def 
: Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-
-// We use EXT to handle extract_subvector to copy the upper 64-bits of a
-// 128-bit vector.
-def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def AdjustExtImm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 + N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
+  def : Pat<(VT64 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+            (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+  def : Pat<(VT128 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+            (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+  // We use EXT to handle extract_subvector to copy the upper 64-bits of a
+  // 128-bit vector.
+  def : Pat<(VT64 (extract_subvector V128:$Rn, (i64 N))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+  // A 64-bit EXT of two halves of the same 128-bit register can be done as a
+  // single 128-bit EXT.
+  def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 0)),
+                              (extract_subvector V128:$Rn, (i64 N)),
+                              (i32 imm:$imm))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, imm:$imm), dsub)>;
+  // A 64-bit EXT of the high half of a 128-bit register can be done using a
+  // 128-bit EXT of the whole register with an adjustment to the immediate. The
+  // top half of the other operand will be unset, but that doesn't matter as it
+  // will not be used.
+  def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 N)),
+                              V64:$Rm,
+                              (i32 imm:$imm))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn,
+                                      (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                                      (AdjustExtImm imm:$imm)), dsub)>;
+}
+defm : ExtPat<v8i8, v16i8, 8>;
+defm : ExtPat<v4i16, v8i16, 4>;
+defm : ExtPat<v4f16, v8f16, 4>;
+defm : ExtPat<v4bf16, v8bf16, 4>;
+defm : ExtPat<v2i32, v4i32, 2>;
+defm : ExtPat<v2f32, v4f32, 2>;
+defm : ExtPat<v1i64, v2i64, 1>;
+defm : ExtPat<v1f64, v2f64, 1>;
 //----------------------------------------------------------------------------
 // AdvSIMD zip vector
@@ -4054,6 +5356,16 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
 defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
 defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
+def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))),
+                                 (v8i8 (trunc (v8i16 V128:$Vm))))),
+          (UZP1v16i8 V128:$Vn, V128:$Vm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
+                                 (v4i16 (trunc (v4i32 V128:$Vm))))),
+          (UZP1v8i16 V128:$Vn, V128:$Vm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
+                                 (v2i32 (trunc (v2i64 V128:$Vm))))),
+          (UZP1v4i32 V128:$Vn, V128:$Vm)>;
+
 //----------------------------------------------------------------------------
 // AdvSIMD TBL/TBX instructions
 //----------------------------------------------------------------------------
@@ -4075,10 +5387,10 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
 //----------------------------------------------------------------------------
-// AdvSIMD scalar CPY instruction
+// AdvSIMD scalar DUP instruction
 //----------------------------------------------------------------------------
-defm CPY : SIMDScalarCPY<"cpy">;
+defm DUP : SIMDScalarDUP<"mov">;
 //----------------------------------------------------------------------------
 // AdvSIMD scalar pairwise instructions
 //----------------------------------------------------------------------------
@@ -4090,6 +5402,26 @@ defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
 defm FMAXP   : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
 defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
 defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
+
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (vecreduce_fadd (v8f16 V128:$Rn))),
+          (FADDPv2i16p
+            (EXTRACT_SUBREG
+               (FADDPv8f16 (FADDPv8f16 V128:$Rn, (v8f16 (IMPLICIT_DEF))), (v8f16 (IMPLICIT_DEF))),
+            dsub))>;
+def : Pat<(f16 (vecreduce_fadd (v4f16 V64:$Rn))),
+          (FADDPv2i16p (FADDPv4f16 V64:$Rn, (v4f16 (IMPLICIT_DEF))))>;
+}
+def : Pat<(f32 (vecreduce_fadd (v4f32 V128:$Rn))),
+          (FADDPv2i32p
+            (EXTRACT_SUBREG
+              (FADDPv4f32 V128:$Rn, (v4f32 (IMPLICIT_DEF))),
+            dsub))>;
+def : Pat<(f32 (vecreduce_fadd (v2f32 V64:$Rn))),
+          (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f64 (vecreduce_fadd (v2f64 V128:$Rn))),
+          (FADDPv2i64p V128:$Rn)>;
+
 def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
 def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
@@ -4137,6 +5469,12 @@ def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
 def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
 def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+// DUP from a 64-bit register to a 64-bit register is just a copy
+def : Pat<(v1i64 (AArch64dup (i64 GPR64:$Rn))),
+          (COPY_TO_REGCLASS GPR64:$Rn, FPR64)>;
+def : Pat<(v1f64 (AArch64dup (f64 FPR64:$Rn))),
+          (COPY_TO_REGCLASS FPR64:$Rn, FPR64)>;
+
 def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
           (v2f32 (DUPv2i32lane
             (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
@@ -4153,16 +5491,29 @@ def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
           (v4f16 (DUPv4i16lane
             (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
             (i64 0)))>;
+def :
Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v4bf16 (DUPv4i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), (v8f16 (DUPv8i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; +def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))), + (v8bf16 (DUPv8i16lane + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), + (i64 0)))>; def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)), (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>; +def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)), + (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>; + def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), @@ -4257,6 +5608,13 @@ def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), (i32 0xffff)), (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn), + VectorIndexB:$idx)))), (i64 0xff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx)), sub_32)>; +def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), + VectorIndexH:$idx)))), (i64 0xffff))), + (SUBREG_TO_REG (i64 0), (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx)), sub_32)>; + defm INS : SIMDIns; def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), @@ -4278,6 +5636,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; @@ -4294,6 +5657,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -4312,6 +5680,16 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; +def : Pat<(vector_insert (v8f16 v8f16:$Rn), (f16 fpimm0), + (i64 VectorIndexH:$imm)), + (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; +def : Pat<(vector_insert v4f32:$Rn, (f32 fpimm0), + (i64 VectorIndexS:$imm)), + (INSvi32gpr V128:$Rn, VectorIndexS:$imm, WZR)>; +def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), + (i64 VectorIndexD:$imm)), + (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; + def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), (INSvi16lane @@ -4319,6 +5697,23 @@ def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (v8f16 
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), (i64 0))>; +def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi16lane + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0)), + dsub)>; + +def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn), + (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))), + (INSvi16lane + V128:$Rn, VectorIndexH:$imm, + (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), + (i64 0))>; + def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG @@ -4400,12 +5795,13 @@ multiclass Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; defm : Neon_INS_elt_pattern; // Floating point vector extractions are codegen'd as either a sequence of -// subregister extractions, or a MOV (aka CPY here, alias for DUP) if +// subregister extractions, or a MOV (aka DUP here) if // the lane number is anything other than zero. def : Pat<(vector_extract (v2f64 V128:$Rn), 0), (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; @@ -4413,13 +5809,18 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), 0), (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; def : Pat<(vector_extract (v8f16 V128:$Rn), 0), (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), 0), + (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>; + def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), - (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>; + (f64 (DUPi64 V128:$Rn, VectorIndexD:$idx))>; def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), - (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>; + (f32 (DUPi32 V128:$Rn, VectorIndexS:$idx))>; def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx), - (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>; + (f16 (DUPi16 V128:$Rn, VectorIndexH:$idx))>; +def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), + (bf16 (DUPi16 V128:$Rn, VectorIndexH:$idx))>; // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. 
In the general case we need an instruction, which had just as well be @@ -4435,6 +5836,7 @@ def : ConcatPat; def : ConcatPat; def : ConcatPat; def : ConcatPat; +def : ConcatPat; def : ConcatPat; // If the high lanes are undef, though, we can just ignore them: @@ -4465,6 +5867,25 @@ defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +// Patterns for uaddv(uaddlp(x)) ==> uaddlv +def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef, + (v4i16 (AArch64uaddv (v4i16 (AArch64uaddlp (v8i8 V64:$op))))), + (i64 0))), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (UADDLVv8i8v V64:$op), hsub), ssub)>; +def : Pat<(i32 (vector_extract (v8i16 (AArch64uaddv (v8i16 (AArch64uaddlp + (v16i8 V128:$op))))), (i64 0))), + (EXTRACT_SUBREG (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (UADDLVv16i8v V128:$op), hsub), ssub)>; +def : Pat<(v4i32 (AArch64uaddv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (UADDLVv8i16v V128:$op), ssub)>; + +// Patterns for addp(uaddlp(x))) ==> uaddlv +def : Pat<(v2i32 (AArch64uaddv (v2i32 (AArch64uaddlp (v4i16 V64:$op))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (UADDLVv4i16v V64:$op), ssub)>; +def : Pat<(v2i64 (AArch64uaddv (v2i64 (AArch64uaddlp (v4i32 V128:$op))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLVv4i32v V128:$op), dsub)>; + // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. // In effect, opNode is the same as (scalar_to_vector (IntNode)). @@ -4491,7 +5912,7 @@ def : Pat<(v4i32 (opNode V128:$Rn)), // If none did, fallback to the explicit patterns, consuming the vector_extract. 
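// Illustrative sketch (not from the patch): the uaddv(uaddlp(x)) folds above
// merge a widening pairwise add plus a horizontal add into a single
// across-lanes instruction, e.g. for a v8i8 input:
//   uaddlp v0.4h, v0.8b ; addv h0, v0.4h   ==>   uaddlv h0, v0.8b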
def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)), - (i32 0)), (i64 0))), + (i64 0)), (i64 0))), (EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), ssub)>; @@ -4500,7 +5921,7 @@ def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))), (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), ssub)>; def : Pat<(i32 (vector_extract (insert_subvector undef, - (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))), + (v4i16 (opNode V64:$Rn)), (i64 0)), (i64 0))), (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), ssub)>; @@ -4521,7 +5942,7 @@ multiclass SIMDAcrossLanesSignedIntrinsic(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), @@ -4533,7 +5954,7 @@ def : Pat<(i32 (sext_inreg (i32 (vector_extract (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), (i64 0)))>; def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef, - (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)), + (opNode (v4i16 V64:$Rn)), (i64 0)), (i64 0))), i16)), (i32 (SMOVvi16to32 (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), @@ -4552,7 +5973,7 @@ multiclass SIMDAcrossLanesUnsignedIntrinsic(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), @@ -4564,7 +5985,7 @@ def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))), (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), ssub))>; def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef, - (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)), + (opNode (v4i16 V64:$Rn)), (i64 0)), (i64 0))), maski16_or_more)), (i32 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), @@ -4739,16 +6160,6 @@ def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), (MOVID imm0_255:$shift)>; -def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; - -def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; - // EDIT byte mask: 2d // The movi_edit node has the immediate value already encoded, so we use @@ -4769,6 +6180,18 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; +// Set 64-bit vectors to all 0/1 by extracting from a 128-bit register as the +// extract is free and this gives better MachineCSE results. 
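// Illustrative sketch (not from the patch): after this change a v8i8 and a
// v16i8 all-ones constant both come from the same rematerializable
//   movi v0.2d, #0xffffffffffffffff
// so MachineCSE can keep one copy, and the 64-bit user simply reads the dsub
// half of it at no extra cost.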
+def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v8i8 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+
+def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in
 defm MOVI  : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
@@ -4885,7 +6308,7 @@ multiclass FMLSIndexedAfterNegPatterns {
                            (v2f32 (AArch64duplane32
                                      (v4f32 (insert_subvector undef,
                                                               (v2f32 (fneg V64:$Rm)),
-                                                              (i32 0))),
+                                                              (i64 0))),
                                      VectorIndexS:$idx)))),
            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
@@ -4906,7 +6329,7 @@ multiclass FMLSIndexedAfterNegPatterns {
                            (v4f32 (AArch64duplane32
                                      (v4f32 (insert_subvector undef,
                                                               (v2f32 (fneg V64:$Rm)),
-                                                              (i32 0))),
+                                                              (i64 0))),
                                      VectorIndexS:$idx)))),
            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
@@ -4937,7 +6360,7 @@ multiclass FMLSIndexedAfterNegPatterns {
   def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
                          (vector_extract (v4f32 (insert_subvector undef,
                                                                   (v2f32 (fneg V64:$Rm)),
-                                                                  (i32 0))),
+                                                                  (i64 0))),
                                          VectorIndexS:$idx))),
             (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
                                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
@@ -4973,10 +6396,16 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
 defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
 defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
-defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
-              TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
-defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
-              TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+                                     int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+                                      int_aarch64_neon_sqrdmulh_laneq>;
+
+// Generated by MachineCombine
+defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
+defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
+
 defm MUL   : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
 defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
     TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
@@ -4989,9 +6418,9 @@ defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
 defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
                                            int_aarch64_neon_sqsub>;
 defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
-                                          int_aarch64_neon_sqadd>;
+                                          int_aarch64_neon_sqrdmlah>;
 defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
-                                          int_aarch64_neon_sqsub>;
+                                          int_aarch64_neon_sqrdmlsh>;
 defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
 defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
     TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
@@ -5007,6 +6436,22 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
                                             VectorIndexS:$idx)),
           (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+// Match add node and also treat an 'or' node as an 'add' if the or'ed operands
+// have no common bits.
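// Illustrative sketch (not from the patch): with this fragment the
// shift-accumulate patterns further down (SSRA/USRA) also match a disjoint OR,
// so
//   r = x | (y >> 3)   // x and (y >> 3) share no set bits
// selects "usra v0.8b, v1.8b, #3" exactly as "r = x + (y >> 3)" would.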
+def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), + [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{ + if (N->getOpcode() == ISD::ADD) + return true; + return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1)); +}]> { + let GISelPredicateCode = [{ + // Only handle G_ADD for now. FIXME. build capability to compute whether + // operands of G_OR have common bits set or not. + return MI.getOpcode() == TargetOpcode::G_ADD; + }]; +} + + //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- @@ -5052,6 +6497,8 @@ def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 (sext_inreg FPR32:$Rn, i16)), v (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>; def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i32 FPR32:$Rn), vecshiftR16:$imm)), (SCVTFh (EXTRACT_SUBREG FPR32:$Rn, hsub), vecshiftR16:$imm)>; +def : Pat<(f16 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR16:$imm)), + (SCVTFh (EXTRACT_SUBREG FPR64:$Rn, hsub), vecshiftR16:$imm)>; def : Pat<(f16 (int_aarch64_neon_vcvtfxu2fp (and FPR32:$Rn, (i32 65535)), vecshiftR16:$imm)), @@ -5080,6 +6527,16 @@ def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f16 FPR16:$Rn), vecshiftR64:$imm)), (i64 (IMPLICIT_DEF)), (FCVTZUh FPR16:$Rn, vecshiftR64:$imm), hsub))>; +def : Pat<(i32 (int_aarch64_neon_facge (f16 FPR16:$Rn), (f16 FPR16:$Rm))), + (i32 (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), + (FACGE16 FPR16:$Rn, FPR16:$Rm), + hsub))>; +def : Pat<(i32 (int_aarch64_neon_facgt (f16 FPR16:$Rn), (f16 FPR16:$Rm))), + (i32 (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), + (FACGT16 FPR16:$Rn, FPR16:$Rm), + hsub))>; defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; @@ -5100,7 +6557,7 @@ defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", (AArch64srshri node:$MHS, node:$RHS))>>; defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; @@ -5113,7 +6570,7 @@ defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", (AArch64urshri node:$MHS, node:$RHS))>>; defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))>>; //---------------------------------------------------------------------------- @@ -5128,8 +6585,8 @@ defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; -def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>; +def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", @@ -5142,8 +6599,8 @@ defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", int_aarch64_neon_sqshrn>; defm SQSHRUN : 
SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", int_aarch64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; -def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; +def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; @@ -5155,7 +6612,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", @@ -5171,7 +6628,35 @@ defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>; defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + +// RADDHN patterns for when RSHRN shifts by half the size of the vector element +def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))), + (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))), + (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))), + (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>; + +// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))), + (RADDHNv8i16_v16i8 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v8i16 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))), + (RADDHNv4i32_v8i16 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v4i32 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v4i32 (concat_vectors + (v2i32 V64:$Vd), + (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))), + (RADDHNv2i64_v4i32 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v2i64 (MOVIv2d_ns (i32 0))))>; // SHRN patterns for when a logical right shift was used instead of arithmetic // (the immediate guarantees no sign bits actually end up in the result so it @@ -5495,6 +6980,10 @@ def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv4h GPR64sp:$Rn)>; def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; class Ld1Lane128Pat @@ -5509,6 +6998,47 @@ def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; +def : Ld1Lane128Pat; + +// Generate LD1 for extload if memory type does not match the +// destination type, for example: +// +// (v4i32 (insert_vector_elt (load anyext from i8) idx)) +// +// In this case, the 
index must be adjusted to match LD1 type. +// +class Ld1Lane128IdxOpPat + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, (IdxOp VecIndex:$idx), GPR64sp:$Rn)>; + +def VectorIndexStoH : SDNodeXFormgetTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64); +}]>; +def VectorIndexStoB : SDNodeXFormgetTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i64); +}]>; +def VectorIndexHtoB : SDNodeXFormgetTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i64); +}]>; + +def : Ld1Lane128IdxOpPat; +def : Ld1Lane128IdxOpPat; +def : Ld1Lane128IdxOpPat; + +// Same as above, but the first element is populated using +// scalar_to_vector + insert_subvector instead of insert_vector_elt. +class Ld1Lane128FirstElm + : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))), + (ResultTy (EXTRACT_SUBREG + (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>; + +def : Ld1Lane128FirstElm; +def : Ld1Lane128FirstElm; +def : Ld1Lane128FirstElm; class Ld1Lane64Pat @@ -5524,6 +7054,7 @@ def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; +def : Ld1Lane64Pat; defm LD1 : SIMDLdSt1SingleAliases<"ld1">; @@ -5552,6 +7083,7 @@ def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; +def : St1Lane128Pat; let AddedComplexity = 19 in class St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; +def : St1Lane64Pat; multiclass St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; +defm : St1LanePost64Pat; multiclass St1LanePost128Pat; defm : St1LanePost128Pat; defm : St1LanePost128Pat; defm : St1LanePost128Pat; +defm : St1LanePost128Pat; let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; @@ -5652,9 +7187,9 @@ def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>; // for AES fusion on some CPUs. let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in { def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">, - Sched<[WriteV]>; + Sched<[WriteVq]>; def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">, - Sched<[WriteV]>; + Sched<[WriteVq]>; } // Only use constrained versions of AES(I)MC instructions if they are paired with @@ -5769,6 +7304,52 @@ def : Pat<(i32 (trunc GPR64sp:$src)), // __builtin_trap() uses the BRK instruction on AArch64. def : Pat<(trap), (BRK 1)>; +def : Pat<(debugtrap), (BRK 0xF000)>; + +def ubsan_trap_xform : SDNodeXFormgetTargetConstant(N->getZExtValue() | ('U' << 8), SDLoc(N), MVT::i32); +}]>; + +def ubsan_trap_imm : TImmLeaf(Imm); +}], ubsan_trap_xform>; + +def : Pat<(ubsantrap ubsan_trap_imm:$kind), (BRK ubsan_trap_imm:$kind)>; + +// Multiply high patterns which multiply the lower subvector using smull/umull +// and the upper subvector with smull2/umull2. Then shuffle the high the high +// part of both results together. 
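// Illustrative sketch (not from the patch), for the v8i16 mulhs pattern below:
//   smull  v2.4s, v0.4h, v1.4h   // full products of the low halves
//   smull2 v3.4s, v0.8h, v1.8h   // full products of the high halves
//   uzp2   v0.8h, v2.8h, v3.8h   // keep the top 16 bits of each product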
+def : Pat<(v16i8 (mulhs V128:$Rn, V128:$Rm)), + (UZP2v16i8 + (SMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (SMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>; +def : Pat<(v8i16 (mulhs V128:$Rn, V128:$Rm)), + (UZP2v8i16 + (SMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (SMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i32 (mulhs V128:$Rn, V128:$Rm)), + (UZP2v4i32 + (SMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (SMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; + +def : Pat<(v16i8 (mulhu V128:$Rn, V128:$Rm)), + (UZP2v16i8 + (UMULLv8i8_v8i16 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (UMULLv16i8_v8i16 V128:$Rn, V128:$Rm))>; +def : Pat<(v8i16 (mulhu V128:$Rn, V128:$Rm)), + (UZP2v8i16 + (UMULLv4i16_v4i32 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (UMULLv8i16_v4i32 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), + (UZP2v4i32 + (UMULLv2i32_v2i64 (EXTRACT_SUBREG V128:$Rn, dsub), + (EXTRACT_SUBREG V128:$Rm, dsub)), + (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; // Conversions within AdvSIMD types in the same register size are free. // But because we need a consistent lane ordering, in big endian many @@ -5820,6 +7401,7 @@ def : Pat<(trap), (BRK 1)>; def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -5827,12 +7409,14 @@ def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; @@ -5840,6 +7424,7 @@ def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>; @@ 
-5850,11 +7435,13 @@ def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; // Natural vector casts (128 bit) def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -5863,6 +7450,7 @@ def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -5871,6 +7459,7 @@ def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -5879,6 +7468,7 @@ def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -5890,6 +7480,7 @@ def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -5898,6 +7489,7 @@ def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 
FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { @@ -5905,6 +7497,7 @@ def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), @@ -5915,6 +7508,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), @@ -5929,6 +7524,8 @@ def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; @@ -5940,6 +7537,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } @@ -5964,11 +7563,15 @@ def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(f16 (bitconvert (bf16 FPR16:$src))), (f16 FPR16:$src)>; +def : Pat<(bf16 (bitconvert (f16 FPR16:$src))), (bf16 FPR16:$src)>; + let Predicates = [IsLE] in { def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { @@ -5980,6 +7583,8 @@ def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } @@ -5993,6 +7598,7 @@ def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), 
(v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), @@ -6007,6 +7613,8 @@ def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; @@ -6033,6 +7641,7 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; @@ -6041,6 +7650,13 @@ def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), @@ -6055,8 +7671,22 @@ def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; + +def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), + (v4bf16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), + (v4bf16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), + (v4bf16 (REV64v4i16 FPR64:$src))>; } def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -6066,6 +7696,7 @@ def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), @@ -6082,6 +7713,8 @@ def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { @@ -6090,6 +7723,7 
@@ def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), (f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), @@ -6102,6 +7736,8 @@ def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 (REV64v8i8 FPR64:$src))>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; @@ -6112,6 +7748,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), @@ -6124,6 +7761,8 @@ def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; @@ -6135,6 +7774,7 @@ def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), @@ -6149,6 +7789,8 @@ def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; @@ -6159,6 +7801,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { @@ -6173,6 +7816,9 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), @@ -6188,6 +7834,7 @@ def : 
Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; } @@ -6201,6 +7848,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), @@ -6212,6 +7861,7 @@ let Predicates = [IsLE] in { def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; @@ -6224,6 +7874,8 @@ def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), @@ -6240,6 +7892,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), @@ -6255,6 +7908,8 @@ def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; @@ -6265,6 +7920,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), @@ -6281,6 +7937,8 @@ def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; } def : 
Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; @@ -6309,6 +7967,7 @@ def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; } def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; @@ -6317,6 +7976,13 @@ def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; + +def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), @@ -6333,8 +7999,24 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 (REV32v8i16 FPR128:$src))>; + +def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), + (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), + (v8bf16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), + (v8bf16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), + (v8bf16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), + (v8bf16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), + (v8bf16 (REV32v8i16 FPR128:$src))>; } def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; @@ -6344,6 +8026,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), @@ -6362,6 +8045,8 @@ def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; } def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), @@ -6372,6 +8057,8 @@ def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), (EXTRACT_SUBREG V128:$Rn, 
dsub)>; def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), @@ -6403,6 +8090,8 @@ multiclass InsertSubvectorUndef { (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)), + (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; } @@ -6424,6 +8113,9 @@ def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; +def : Pat<(fadd (vector_extract (v8f16 FPR128:$Rn), (i64 0)), + (vector_extract (v8f16 FPR128:$Rn), (i64 1))), + (f16 (FADDPv2i16p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), @@ -6444,7 +8136,7 @@ class NTStore128Pat : Pat<(nontemporalstore (VT FPR128:$Rt), (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), - (CPYi64 FPR128:$Rt, (i64 1)), + (DUPi64 FPR128:$Rt, (i64 1)), GPR64sp:$Rn, simm7s8:$offset)>; def : NTStore128Pat; @@ -6456,7 +8148,7 @@ class NTStore64Pat : Pat<(nontemporalstore (VT FPR64:$Rt), (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), - (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + (DUPi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), GPR64sp:$Rn, simm7s4:$offset)>; // FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? @@ -6481,14 +8173,253 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { Sched<[WriteBrReg]>; def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>, Sched<[WriteBrReg]>; + // Indirect tail-call with any register allowed, used by MachineOutliner when + // this is proven safe. + // FIXME: If we have to add any more hacks like this, we should instead relax + // some verifier checks for outlined functions. + def TCRETURNriALL : Pseudo<(outs), (ins GPR64:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + // Indirect tail-call limited to only use registers (x16 and x17) which are + // allowed to tail-call a "BTI c" instruction. + def TCRETURNriBTI : Pseudo<(outs), (ins rtcGPR64:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), - (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; + (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>, + Requires<[NotUseBTI]>; +def : Pat<(AArch64tcret rtcGPR64:$dst, (i32 timm:$FPDiff)), + (TCRETURNriBTI rtcGPR64:$dst, imm:$FPDiff)>, + Requires<[UseBTI]>; def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; +def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>; +def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>; + +// Extracting lane zero is a special case where we can just use a plain +// EXTRACT_SUBREG instruction, which will become FMOV. This is easier for the +// rest of the compiler, especially the register allocator and copy propagation, +// to reason about, so is preferred when it's possible to use it. 
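For intuition, here is a minimal source-level sketch of the idiom these extract-lane-zero patterns catch (an illustrative example, not part of this patch; assumes an AArch64 compiler providing <arm_neon.h>):

    #include <arm_neon.h>

    // Lane 0 already occupies the low 32 bits of the vector register, so the
    // extract can be selected as a plain subregister copy (FMOV) instead of a
    // lane-indexed UMOV.
    int32_t lane0(int32x4_t v) {
      return vgetq_lane_s32(v, 0); // expected to assemble to: fmov w0, s0
    }

The AddedComplexity = 10 on the block below makes these subregister forms win over the generic lane-extract patterns.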
+let AddedComplexity = 10 in { + def : Pat<(i64 (extractelt (v2i64 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, dsub)>; + def : Pat<(i32 (extractelt (v4i32 V128:$V), (i64 0))), (EXTRACT_SUBREG V128:$V, ssub)>; + def : Pat<(i32 (extractelt (v2i32 V64:$V), (i64 0))), (EXTRACT_SUBREG V64:$V, ssub)>; +} + +// dot_v4i8 +class mul_v4i8 : + PatFrag<(ops node:$Rn, node:$Rm, node:$offset), + (mul (ldop (add node:$Rn, node:$offset)), + (ldop (add node:$Rm, node:$offset)))>; +class mulz_v4i8 : + PatFrag<(ops node:$Rn, node:$Rm), + (mul (ldop node:$Rn), (ldop node:$Rm))>; + +def load_v4i8 : + OutPatFrag<(ops node:$R), + (INSERT_SUBREG + (v2i32 (IMPLICIT_DEF)), + (i32 (COPY_TO_REGCLASS (LDRWui node:$R, (i64 0)), FPR32)), + ssub)>; + +class dot_v4i8 : + Pat<(i32 (add (mul_v4i8 GPR64sp:$Rn, GPR64sp:$Rm, (i64 3)), + (add (mul_v4i8 GPR64sp:$Rn, GPR64sp:$Rm, (i64 2)), + (add (mul_v4i8 GPR64sp:$Rn, GPR64sp:$Rm, (i64 1)), + (mulz_v4i8 GPR64sp:$Rn, GPR64sp:$Rm))))), + (EXTRACT_SUBREG (i64 (DOT (DUPv2i32gpr WZR), + (load_v4i8 GPR64sp:$Rn), + (load_v4i8 GPR64sp:$Rm))), + sub_32)>, Requires<[HasDotProd]>; + +// dot_v8i8 +class ee_v8i8 : + PatFrag<(ops node:$V, node:$K), + (v4i16 (extract_subvector (v8i16 (extend node:$V)), node:$K))>; + +class mul_v8i8 : + PatFrag<(ops node:$M, node:$N, node:$K), + (mulop (v4i16 (ee_v8i8 node:$M, node:$K)), + (v4i16 (ee_v8i8 node:$N, node:$K)))>; + +class idot_v8i8 : + PatFrag<(ops node:$M, node:$N), + (i32 (extractelt + (v4i32 (AArch64uaddv + (add (mul_v8i8 node:$M, node:$N, (i64 0)), + (mul_v8i8 node:$M, node:$N, (i64 4))))), + (i64 0)))>; + +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def VADDV_32 : OutPatFrag<(ops node:$R), (ADDPv2i32 node:$R, node:$R)>; + +class odot_v8i8 : + OutPatFrag<(ops node:$Vm, node:$Vn), + (EXTRACT_SUBREG + (VADDV_32 + (i64 (DOT (DUPv2i32gpr WZR), + (v8i8 node:$Vm), + (v8i8 node:$Vn)))), + sub_32)>; + +class dot_v8i8 : + Pat<(idot_v8i8 V64:$Vm, V64:$Vn), + (odot_v8i8 V64:$Vm, V64:$Vn)>, + Requires<[HasDotProd]>; + +// dot_v16i8 +class ee_v16i8 : + PatFrag<(ops node:$V, node:$K1, node:$K2), + (v4i16 (extract_subvector + (v8i16 (extend + (v8i8 (extract_subvector node:$V, node:$K1)))), node:$K2))>; + +class mul_v16i8 : + PatFrag<(ops node:$M, node:$N, node:$K1, node:$K2), + (v4i32 + (mulop (v4i16 (ee_v16i8 node:$M, node:$K1, node:$K2)), + (v4i16 (ee_v16i8 node:$N, node:$K1, node:$K2))))>; + +class idot_v16i8 : + PatFrag<(ops node:$M, node:$N), + (i32 (extractelt + (v4i32 (AArch64uaddv + (add + (add (mul_v16i8 node:$M, node:$N, (i64 0), (i64 0)), + (mul_v16i8 node:$M, node:$N, (i64 8), (i64 0))), + (add (mul_v16i8 node:$M, node:$N, (i64 0), (i64 4)), + (mul_v16i8 node:$M, node:$N, (i64 8), (i64 4)))))), + (i64 0)))>; + +class odot_v16i8 : + OutPatFrag<(ops node:$Vm, node:$Vn), + (i32 (ADDVv4i32v + (DOT (DUPv4i32gpr WZR), node:$Vm, node:$Vn)))>; + +class dot_v16i8 : + Pat<(idot_v16i8 V128:$Vm, V128:$Vn), + (odot_v16i8 V128:$Vm, V128:$Vn)>, + Requires<[HasDotProd]>; + +let AddedComplexity = 10 in { + def : dot_v4i8; + def : dot_v4i8; + def : dot_v8i8; + def : dot_v8i8; + def : dot_v16i8; + def : dot_v16i8; + + // FIXME: add patterns to generate vector by element dot product. + // FIXME: add SVE dot-product patterns. +} + +// Custom DAG nodes and isel rules to make a 64-byte block out of eight GPRs, +// so that it can be used as input to inline asm, and vice versa. 
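As a hedged sketch of the use case behind the LS64 nodes defined next: the Arm ACLE specifies 64-byte single-copy-atomic load/store intrinsics in <arm_acle.h>; a compiler with LS64 support (for example, clang with -march=armv8.7-a+ls64) is assumed, and the function name is made up:

    #include <arm_acle.h>

    // Moves one 64-byte block through eight consecutive GPRs, which is what
    // the LS64_BUILD/LS64_EXTRACT nodes model for isel and inline asm.
    void copy_block(void *dst, const void *src) {
      data512_t v = __arm_ld64b(src); // LD64B: single-copy-atomic 64-byte load
      __arm_st64b(dst, v);            // ST64B: single-copy-atomic 64-byte store
    }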
+def LS64_BUILD : SDNode<"AArch64ISD::LS64_BUILD", SDTypeProfile<1, 8, []>>; +def LS64_EXTRACT : SDNode<"AArch64ISD::LS64_EXTRACT", SDTypeProfile<1, 2, []>>; +def : Pat<(i64x8 (LS64_BUILD GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, + GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7)), + (REG_SEQUENCE GPR64x8Class, + $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, + $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7)>; +foreach i = 0-7 in { + def : Pat<(i64 (LS64_EXTRACT (i64x8 GPR64x8:$val), (i32 i))), + (EXTRACT_SUBREG $val, !cast("x8sub_"#i))>; +} + +let Predicates = [HasLS64] in { + def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn), + (outs GPR64x8:$Rt)>; + def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn), + (outs)>; + def ST64BV: Store64BV<0b011, "st64bv">; + def ST64BV0: Store64BV<0b010, "st64bv0">; + + class ST64BPattern + : Pat<(intrinsic GPR64sp:$addr, GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7), + (instruction (REG_SEQUENCE GPR64x8Class, $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7), $addr)>; + + def : ST64BPattern; + def : ST64BPattern; + def : ST64BPattern; +} + +let Predicates = [HasMOPS] in { + let Defs = [NZCV] in { + defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">; + + defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">; + + defm SETP : MOPSMemorySetInsns<0b00, "setp">; + } + let Uses = [NZCV] in { + defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">; + defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">; + + defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">; + defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">; + + defm SETM : MOPSMemorySetInsns<0b01, "setm">; + defm SETE : MOPSMemorySetInsns<0b10, "sete">; + } +} +let Predicates = [HasMOPS, HasMTE] in { + let Defs = [NZCV] in { + defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">; + } + let Uses = [NZCV] in { + defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">; + // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td + defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">; + } +} + +// MOPS Node operands: 0: Dst, 1: Src or Value, 2: Size, 3: Chain +// MOPS Node results: 0: Dst writeback, 1: Size writeback, 2: Chain +def SDT_AArch64mops : SDTypeProfile<2, 3, [ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2> ]>; +def AArch64mops_memset : SDNode<"AArch64ISD::MOPS_MEMSET", SDT_AArch64mops>; +def AArch64mops_memset_tagging : SDNode<"AArch64ISD::MOPS_MEMSET_TAGGING", SDT_AArch64mops>; +def AArch64mops_memcopy : SDNode<"AArch64ISD::MOPS_MEMCOPY", SDT_AArch64mops>; +def AArch64mops_memmove : SDNode<"AArch64ISD::MOPS_MEMMOVE", SDT_AArch64mops>; + +// MOPS operations always contain three 4-byte instructions +let Predicates = [HasMOPS], Defs = [NZCV], Size = 12, mayStore = 1 in { + let mayLoad = 1 in { + def MOPSMemoryCopyPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn), + [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>; + def MOPSMemoryMovePseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn), + [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>; + } + let mayLoad = 0 in { + def MOPSMemorySetPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>; + } +} +let Predicates = [HasMOPS, HasMTE], Defs = [NZCV], Size = 12, mayLoad = 0, 
mayStore = 1 in { + def MOPSMemorySetTaggingPseudo : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb), + (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm), + [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>; +} + +// This gets lowered into an instruction sequence of 20 bytes +let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1, Size = 20 in +def StoreSwiftAsyncContext + : Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset), + []>, Sched<[]>; + +def AArch64AssertZExtBool : SDNode<"AArch64ISD::ASSERT_ZEXT_BOOL", SDT_assert>; +def : Pat<(AArch64AssertZExtBool GPR32:$op), + (i32 GPR32:$op)>; + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" +include "AArch64SMEInstrInfo.td" +include "AArch64InstrGISel.td" diff --git a/suite/synctools/tablegen/AArch64/AArch64PfmCounters.td b/suite/synctools/tablegen/AArch64/AArch64PfmCounters.td new file mode 100644 index 00000000..b1d1664e --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64PfmCounters.td @@ -0,0 +1,18 @@ +//===-- AArch64PfmCounters.td - AArch64 Hardware Counters --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for AArch64. +// +//===----------------------------------------------------------------------===// + +def CpuCyclesPfmCounter : PfmCounter<"CPU_CYCLES">; + +def DefaultPfmCounters : ProcPfmCounters { + let CycleCounter = CpuCyclesPfmCounter; +} +def : PfmCountersDefaultBinding; diff --git a/suite/synctools/tablegen/AArch64/AArch64RegisterBanks.td b/suite/synctools/tablegen/AArch64/AArch64RegisterBanks.td index eee58470..615ce7d5 100644 --- a/suite/synctools/tablegen/AArch64/AArch64RegisterBanks.td +++ b/suite/synctools/tablegen/AArch64/AArch64RegisterBanks.td @@ -1,9 +1,8 @@ //=- AArch64RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,7 +10,7 @@ //===----------------------------------------------------------------------===// /// General Purpose Registers: W, X. -def GPRRegBank : RegisterBank<"GPR", [GPR64all]>; +def GPRRegBank : RegisterBank<"GPR", [XSeqPairsClass]>; /// Floating Point/Vector Registers: B, H, S, D, Q. def FPRRegBank : RegisterBank<"FPR", [QQQQ]>; diff --git a/suite/synctools/tablegen/AArch64/AArch64RegisterInfo.td b/suite/synctools/tablegen/AArch64/AArch64RegisterInfo.td index bbf401b4..70daf5ab 100644 --- a/suite/synctools/tablegen/AArch64/AArch64RegisterInfo.td +++ b/suite/synctools/tablegen/AArch64/AArch64RegisterInfo.td @@ -1,9 +1,8 @@ //=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,11 +24,9 @@ let Namespace = "AArch64" in { def bsub : SubRegIndex<8>; def hsub : SubRegIndex<16>; def ssub : SubRegIndex<32>; - def dsub : SubRegIndex<32>; + def dsub : SubRegIndex<64>; def sube32 : SubRegIndex<32>; def subo32 : SubRegIndex<32>; - def qhisub : SubRegIndex<64>; - def qsub : SubRegIndex<64>; def sube64 : SubRegIndex<64>; def subo64 : SubRegIndex<64>; // SVE @@ -48,6 +45,16 @@ let Namespace = "AArch64" in { def qsub1 : SubRegIndex<128>; def qsub2 : SubRegIndex<128>; def qsub3 : SubRegIndex<128>; + // Note: Code depends on these having consecutive numbers + def zasubb : SubRegIndex<2048>; // (16 x 16)/1 bytes = 2048 bits + def zasubh0 : SubRegIndex<1024>; // (16 x 16)/2 bytes = 1024 bits + def zasubh1 : SubRegIndex<1024>; // (16 x 16)/2 bytes = 1024 bits + def zasubs0 : SubRegIndex<512>; // (16 x 16)/4 bytes = 512 bits + def zasubs1 : SubRegIndex<512>; // (16 x 16)/4 bytes = 512 bits + def zasubd0 : SubRegIndex<256>; // (16 x 16)/8 bytes = 256 bits + def zasubd1 : SubRegIndex<256>; // (16 x 16)/8 bytes = 256 bits + def zasubq0 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits + def zasubq1 : SubRegIndex<128>; // (16 x 16)/16 bytes = 128 bits } let Namespace = "AArch64" in { @@ -134,6 +141,9 @@ def NZCV : AArch64Reg<0, "nzcv">; // First fault status register def FFR : AArch64Reg<0, "ffr">, DwarfRegNum<[47]>; +// Purely virtual Vector Granule (VG) Dwarf register +def VG : AArch64Reg<0, "vg">, DwarfRegNum<[46]>; + // GPR register classes with the intersections of GPR32/GPR32sp and // GPR64/GPR64sp for use by the coalescer. def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { @@ -188,6 +198,10 @@ def GPR64z : RegisterOperand { let GIZeroRegister = XZR; } +// GPR argument registers. +def GPR32arg : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 7)>; +def GPR64arg : RegisterClass<"AArch64", [i64], 64, (sequence "X%u", 0, 7)>; + // GPR register classes which include WZR/XZR AND SP/WSP. This is not a // constraint used by any instructions, it is used as a common super-class. def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; @@ -200,6 +214,17 @@ def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X2 X22, X23, X24, X25, X26, X27, X28, FP, LR)>; +// Restricted set of tail call registers, for use when branch target +// enforcement is enabled. These are the only registers which can be used to +// indirectly branch (not call) to the "BTI c" instruction at the start of a +// BTI-protected function. +def rtcGPR64 : RegisterClass<"AArch64", [i64], 64, (add X16, X17)>; + +// Register set that excludes registers that are reserved for procedure calls. +// This is used for pseudo-instructions that are actually implemented using a +// procedure call. +def GPR64noip : RegisterClass<"AArch64", [i64], 64, (sub GPR64, X16, X17, LR)>; + // GPR register classes for post increment amount of vector load/store that // has alternate printing when Rm=31 and prints a constant immediate value // equal to the total number of bytes transferred. 
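A short illustration of why rtcGPR64 above contains only x16 and x17 (assuming clang with -mbranch-protection=bti): a "BTI c" landing pad accepts an indirect call (BLR) from any register, but an indirect branch (BR) only from x16 or x17, and a tail call lowers to BR:

    // The tail call becomes BR, so the register holding fn must be x16 or x17
    // (TCRETURNriBTI / rtcGPR64); a normal call could use BLR with any Xn.
    int tail_call(int (*fn)(int), int x) {
      return fn(x); // expected: br x16 (or br x17)
    }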
@@ -408,25 +433,35 @@ def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { let Size = 8; } -def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { +def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} + +def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> { let Size = 16; } def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64, v4f16], - 64, (sequence "D%u", 0, 31)>; + v1i64, v4f16, v4bf16], + 64, (sequence "D%u", 0, 31)>; +def FPR64_lo : RegisterClass<"AArch64", + [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32, + v1f64], + 64, (trunc FPR64, 16)>; + // We don't (yet) have an f128 legal type, so don't use that here. We // normalize 128-bit vectors to v2f64 for arg passing and such, so use // that here. def FPR128 : RegisterClass<"AArch64", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128, - v8f16], + v8f16, v8bf16], 128, (sequence "Q%u", 0, 31)>; // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, + v8bf16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. @@ -467,7 +502,7 @@ def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { // Vector operand versions of the FP registers. Alternate name printing and -// assmebler matching. +// assembler matching. def VectorReg64AsmOperand : AsmOperandClass { let Name = "VectorReg64"; let PredicateMethod = "isNeonVectorReg"; @@ -489,6 +524,9 @@ def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; let PredicateMethod = "isNeonVectorRegLo"; } +def V64_lo : RegisterOperand { + let ParserMatchClass = VectorRegLoAsmOperand; +} def V128_lo : RegisterOperand { let ParserMatchClass = VectorRegLoAsmOperand; } @@ -627,6 +665,10 @@ def FPR16Op : RegisterOperand { let ParserMatchClass = FPRAsmOperand<"FPR16">; } +def FPR16Op_lo : RegisterOperand { + let ParserMatchClass = FPRAsmOperand<"FPR16_lo">; +} + def FPR32Op : RegisterOperand { let ParserMatchClass = FPRAsmOperand<"FPR32">; } @@ -643,16 +685,18 @@ def FPR128Op : RegisterOperand { // ARMv8.1a atomic CASP register operands -def WSeqPairs : RegisterTuples<[sube32, subo32], - [(rotl GPR32, 0), (rotl GPR32, 1)]>; -def XSeqPairs : RegisterTuples<[sube64, subo64], - [(rotl GPR64, 0), (rotl GPR64, 1)]>; +def WSeqPairs : RegisterTuples<[sube32, subo32], + [(decimate (rotl GPR32, 0), 2), + (decimate (rotl GPR32, 1), 2)]>; +def XSeqPairs : RegisterTuples<[sube64, subo64], + [(decimate (rotl GPR64, 0), 2), + (decimate (rotl GPR64, 1), 2)]>; -def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, +def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32, (add WSeqPairs)>{ let Size = 64; } -def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, +def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64, (add XSeqPairs)>{ let Size = 128; } @@ -675,6 +719,34 @@ def XSeqPairClassOperand : //===----- END: v8.1a atomic CASP register operands -----------------------===// +//===----------------------------------------------------------------------===// +// Armv8.7a accelerator extension register operands: 8 consecutive GPRs +// starting with an even one + +let Namespace = 
"AArch64" in { + foreach i = 0-7 in + def "x8sub_"#i : SubRegIndex<64, !mul(64, i)>; +} + +def Tuples8X : RegisterTuples< + !foreach(i, [0,1,2,3,4,5,6,7], !cast("x8sub_"#i)), + !foreach(i, [0,1,2,3,4,5,6,7], (trunc (decimate (rotl GPR64, i), 2), 12))>; + +def GPR64x8Class : RegisterClass<"AArch64", [i64x8], 512, (trunc Tuples8X, 12)> { + let Size = 512; +} +def GPR64x8AsmOp : AsmOperandClass { + let Name = "GPR64x8"; + let ParserMethod = "tryParseGPR64x8"; + let RenderMethod = "addRegOperands"; +} +def GPR64x8 : RegisterOperand { + let ParserMatchClass = GPR64x8AsmOp; + let PrintMethod = "printGPR64x8"; +} + +//===----- END: v8.7a accelerator extension register operands -------------===// + // SVE predicate registers def P0 : AArch64Reg<0, "p0">, DwarfRegNum<[48]>; def P1 : AArch64Reg<1, "p1">, DwarfRegNum<[49]>; @@ -764,7 +836,7 @@ def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>; def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>; } -// Enum descibing the element size for destructive +// Enum describing the element size for destructive // operations. class ElementSizeEnum val> { bits<3> Value = val; @@ -829,48 +901,25 @@ def PPR32 : PPRRegOp<"s", PPRAsmOp32, ElementSizeS, PPR>; def PPR64 : PPRRegOp<"d", PPRAsmOp64, ElementSizeD, PPR>; def PPRAsmOp3bAny : PPRAsmOperand<"Predicate3bAny", "PPR_3b", 0>; -def PPRAsmOp3b8 : PPRAsmOperand<"Predicate3bB", "PPR_3b", 8>; -def PPRAsmOp3b16 : PPRAsmOperand<"Predicate3bH", "PPR_3b", 16>; -def PPRAsmOp3b32 : PPRAsmOperand<"Predicate3bS", "PPR_3b", 32>; -def PPRAsmOp3b64 : PPRAsmOperand<"Predicate3bD", "PPR_3b", 64>; def PPR3bAny : PPRRegOp<"", PPRAsmOp3bAny, ElementSizeNone, PPR_3b>; -def PPR3b8 : PPRRegOp<"b", PPRAsmOp3b8, ElementSizeB, PPR_3b>; -def PPR3b16 : PPRRegOp<"h", PPRAsmOp3b16, ElementSizeH, PPR_3b>; -def PPR3b32 : PPRRegOp<"s", PPRAsmOp3b32, ElementSizeS, PPR_3b>; -def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>; //****************************************************************************** -// SVE vector register class -def ZPR : RegisterClass<"AArch64", - [nxv16i8, nxv8i16, nxv4i32, nxv2i64, - nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, - nxv1f64, nxv2f64], - 128, (sequence "Z%u", 0, 31)> { +// SVE vector register classes +class ZPRClass : RegisterClass<"AArch64", + [nxv16i8, nxv8i16, nxv4i32, nxv2i64, + nxv2f16, nxv4f16, nxv8f16, + nxv2bf16, nxv4bf16, nxv8bf16, + nxv2f32, nxv4f32, + nxv2f64], + 128, (sequence "Z%u", 0, lastreg)> { let Size = 128; } -// SVE restricted 4 bit scalable vector register class -def ZPR_4b : RegisterClass<"AArch64", - [nxv16i8, nxv8i16, nxv4i32, nxv2i64, - nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, - nxv1f64, nxv2f64], - 128, (sequence "Z%u", 0, 15)> { - let Size = 128; -} - -// SVE restricted 3 bit scalable vector register class -def ZPR_3b : RegisterClass<"AArch64", - [nxv16i8, nxv8i16, nxv4i32, nxv2i64, - nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, - nxv1f64, nxv2f64], - 128, (sequence "Z%u", 0, 7)> { - let Size = 128; -} +def ZPR : ZPRClass<31>; +def ZPR_4b : ZPRClass<15>; // Restricted 4 bit SVE vector register class. +def ZPR_3b : ZPRClass<7>; // Restricted 3 bit SVE vector register class. 
class ZPRAsmOperand : AsmOperandClass { @@ -1104,10 +1153,235 @@ class GPR64ExtendRegisterOperand let PrintMethod = "printRegWithShiftExtend"; } -foreach Scale = [8, 16, 32, 64] in { +foreach Scale = [8, 16, 32, 64, 128] in { def GPR64shiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64shifted", Scale, "GPR64">; def GPR64shifted # Scale : GPR64ExtendRegisterOperand<"GPR64shiftedAsmOpnd" # Scale, Scale, GPR64>; def GPR64NoXZRshiftedAsmOpnd # Scale : GPR64ShiftExtendAsmOperand<"GPR64NoXZRshifted", Scale, "GPR64common">; def GPR64NoXZRshifted # Scale : GPR64ExtendRegisterOperand<"GPR64NoXZRshiftedAsmOpnd" # Scale, Scale, GPR64common>; } + +// Accumulator array tiles. +def ZAQ0 : AArch64Reg<0, "za0.q">; +def ZAQ1 : AArch64Reg<1, "za1.q">; +def ZAQ2 : AArch64Reg<2, "za2.q">; +def ZAQ3 : AArch64Reg<3, "za3.q">; +def ZAQ4 : AArch64Reg<4, "za4.q">; +def ZAQ5 : AArch64Reg<5, "za5.q">; +def ZAQ6 : AArch64Reg<6, "za6.q">; +def ZAQ7 : AArch64Reg<7, "za7.q">; +def ZAQ8 : AArch64Reg<8, "za8.q">; +def ZAQ9 : AArch64Reg<9, "za9.q">; +def ZAQ10 : AArch64Reg<10, "za10.q">; +def ZAQ11 : AArch64Reg<11, "za11.q">; +def ZAQ12 : AArch64Reg<12, "za12.q">; +def ZAQ13 : AArch64Reg<13, "za13.q">; +def ZAQ14 : AArch64Reg<14, "za14.q">; +def ZAQ15 : AArch64Reg<15, "za15.q">; + +let SubRegIndices = [zasubq0, zasubq1] in { + def ZAD0 : AArch64Reg<0, "za0.d", [ZAQ0, ZAQ8]>; + def ZAD1 : AArch64Reg<1, "za1.d", [ZAQ1, ZAQ9]>; + def ZAD2 : AArch64Reg<2, "za2.d", [ZAQ2, ZAQ10]>; + def ZAD3 : AArch64Reg<3, "za3.d", [ZAQ3, ZAQ11]>; + def ZAD4 : AArch64Reg<4, "za4.d", [ZAQ4, ZAQ12]>; + def ZAD5 : AArch64Reg<5, "za5.d", [ZAQ5, ZAQ13]>; + def ZAD6 : AArch64Reg<6, "za6.d", [ZAQ6, ZAQ14]>; + def ZAD7 : AArch64Reg<7, "za7.d", [ZAQ7, ZAQ15]>; +} + +let SubRegIndices = [zasubd0, zasubd1] in { + def ZAS0 : AArch64Reg<0, "za0.s", [ZAD0, ZAD4]>; + def ZAS1 : AArch64Reg<1, "za1.s", [ZAD1, ZAD5]>; + def ZAS2 : AArch64Reg<2, "za2.s", [ZAD2, ZAD6]>; + def ZAS3 : AArch64Reg<3, "za3.s", [ZAD3, ZAD7]>; +} + +let SubRegIndices = [zasubs0, zasubs1] in { + def ZAH0 : AArch64Reg<0, "za0.h", [ZAS0, ZAS2]>; + def ZAH1 : AArch64Reg<1, "za1.h", [ZAS1, ZAS3]>; +} + +let SubRegIndices = [zasubh0, zasubh1] in { + def ZAB0 : AArch64Reg<0, "za0.b", [ZAH0, ZAH1]>; +} + +let SubRegIndices = [zasubb] in { + def ZA : AArch64Reg<0, "za", [ZAB0]>; +} + +// SME Register Classes + +// Accumulator array +def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> { + let Size = 2048; +} + +// Accumulator array as single tiles +def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> { + let Size = 2048; +} +def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> { + let Size = 1024; +} +def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> { + let Size = 512; +} +def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> { + let Size = 256; +} +def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> { + let Size = 128; +} + +// SME Register Operands +// There are three types of SME matrix register operands: +// * Tiles: +// +// These tiles make up the larger accumulator matrix. The tile representation +// has an element type suffix, e.g. 
za0.b or za15.q and can be any of the +// registers: +// ZAQ0..ZAQ15 +// ZAD0..ZAD7 +// ZAS0..ZAS3 +// ZAH0..ZAH1 +// or ZAB0 +// +// * Tile vectors: +// +// Their representation is similar to regular tiles, but they have an extra +// 'h' or 'v' to tell how the vector at [reg+offset] is laid out in the tile, +// horizontally or vertically. +// +// e.g. za1h.h or za15v.q, which correspond to vectors in registers ZAH1 and +// ZAQ15, respectively. The horizontal/vertical is more a property of the +// instruction than of the asm-operand itself or its register. +// The distinction is required for the parsing/printing of the operand, +// as from a compiler's perspective, the whole tile is read/written. +// +// * Accumulator matrix: +// +// This is the entire matrix accumulator register ZA (<=> ZAB0), printed as +// 'za'. + +// +// Tiles +// + +class MatrixTileAsmOperand : AsmOperandClass { + let Name = "MatrixTile" # EltSize; + let DiagnosticType = "Invalid" # Name; + let ParserMethod = "tryParseMatrixRegister"; + let RenderMethod = "addMatrixOperands"; + let PredicateMethod = "isMatrixRegOperand<" + # "MatrixKind::Tile" # ", " + # EltSize # ", AArch64::" # RC # "RegClassID>"; +} + +class MatrixTileOperand + : RegisterOperand { + let ParserMatchClass = MatrixTileAsmOperand(RC), EltSize>; + let DecoderMethod = "DecodeMatrixTile<" # NumBitsForTile # ">"; + let PrintMethod = "printMatrixTile"; +} + +def TileOp32 : MatrixTileOperand<32, 2, MPR32>; +def TileOp64 : MatrixTileOperand<64, 3, MPR64>; + +// +// Tile vectors (horizontal and vertical) +// + +class MatrixTileVectorAsmOperand + : AsmOperandClass { + let Name = "MatrixTileVector" # !if(IsVertical, "V", "H") # EltSize; + let DiagnosticType = "Invalid" # Name; + let ParserMethod = "tryParseMatrixRegister"; + let RenderMethod = "addMatrixOperands"; + let PredicateMethod = "isMatrixRegOperand<" + # "MatrixKind::" + # !if(IsVertical, "Col", "Row") # ", " + # EltSize # ", AArch64::" # RC # "RegClassID>"; +} + +class MatrixTileVectorOperand + : RegisterOperand { + let ParserMatchClass = MatrixTileVectorAsmOperand(RC), EltSize, + IsVertical>; + let DecoderMethod = "DecodeMatrixTile<" # NumBitsForTile # ">"; + let PrintMethod = "printMatrixTileVector<" # IsVertical # ">"; +} + +def TileVectorOpH8 : MatrixTileVectorOperand< 8, 0, MPR8, 0>; +def TileVectorOpH16 : MatrixTileVectorOperand< 16, 1, MPR16, 0>; +def TileVectorOpH32 : MatrixTileVectorOperand< 32, 2, MPR32, 0>; +def TileVectorOpH64 : MatrixTileVectorOperand< 64, 3, MPR64, 0>; +def TileVectorOpH128 : MatrixTileVectorOperand<128, 4, MPR128, 0>; + +def TileVectorOpV8 : MatrixTileVectorOperand< 8, 0, MPR8, 1>; +def TileVectorOpV16 : MatrixTileVectorOperand< 16, 1, MPR16, 1>; +def TileVectorOpV32 : MatrixTileVectorOperand< 32, 2, MPR32, 1>; +def TileVectorOpV64 : MatrixTileVectorOperand< 64, 3, MPR64, 1>; +def TileVectorOpV128 : MatrixTileVectorOperand<128, 4, MPR128, 1>; + +// +// Accumulator matrix +// + +class MatrixAsmOperand : AsmOperandClass { + let Name = "Matrix"; + let DiagnosticType = "Invalid" # Name; + let ParserMethod = "tryParseMatrixRegister"; + let RenderMethod = "addMatrixOperands"; + let PredicateMethod = "isMatrixRegOperand<" + # "MatrixKind::Array" # ", " + # EltSize # ", AArch64::" # RC # "RegClassID>"; +} + +class MatrixOperand : RegisterOperand { + let ParserMatchClass = MatrixAsmOperand(RC), EltSize>; + let PrintMethod = "printMatrix<" # EltSize # ">"; +} + +def MatrixOp : MatrixOperand; + +class MatrixTileListAsmOperand : AsmOperandClass { + let Name =
"MatrixTileList"; + let ParserMethod = "tryParseMatrixTileList"; + let RenderMethod = "addMatrixTileListOperands"; + let PredicateMethod = "isMatrixTileList"; +} + +class MatrixTileListOperand : Operand { + let ParserMatchClass = MatrixTileListAsmOperand<>; + let DecoderMethod = "DecodeMatrixTileListRegisterClass"; + let EncoderMethod = "EncodeMatrixTileListRegisterClass"; + let PrintMethod = "printMatrixTileList"; +} + +def MatrixTileList : MatrixTileListOperand<>; + +def MatrixIndexGPR32_12_15 : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 12, 15)> { + let DiagnosticType = "InvalidMatrixIndexGPR32_12_15"; +} +def MatrixIndexGPR32Op12_15 : RegisterOperand { + let EncoderMethod = "encodeMatrixIndexGPR32"; +} + +def SVCROperand : AsmOperandClass { + let Name = "SVCR"; + let ParserMethod = "tryParseSVCR"; + let DiagnosticType = "Invalid" # Name; +} + +def svcr_op : Operand { + let ParserMatchClass = SVCROperand; + let PrintMethod = "printSVCROp"; + let DecoderMethod = "DecodeSVCROp"; + let MCOperandPredicate = [{ + if (!MCOp.isImm()) + return false; + return AArch64SVCR::lookupSVCRByEncoding(MCOp.getImm()) != nullptr; + }]; +} diff --git a/suite/synctools/tablegen/AArch64/AArch64SMEInstrInfo.td b/suite/synctools/tablegen/AArch64/AArch64SMEInstrInfo.td new file mode 100644 index 00000000..aacace64 --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SMEInstrInfo.td @@ -0,0 +1,143 @@ +//=- AArch64SMEInstrInfo.td - AArch64 SME Instructions -*- tablegen -*-----=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AArch64 Scalable Matrix Extension (SME) Instruction definitions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Add vector elements horizontally or vertically to ZA tile. 
+//===----------------------------------------------------------------------===// + +let Predicates = [HasSME] in { +def ADDHA_MPPZ_S : sme_add_vector_to_tile_u32<0b0, "addha">; +def ADDVA_MPPZ_S : sme_add_vector_to_tile_u32<0b1, "addva">; +} + +let Predicates = [HasSMEI64] in { +def ADDHA_MPPZ_D : sme_add_vector_to_tile_u64<0b0, "addha">; +def ADDVA_MPPZ_D : sme_add_vector_to_tile_u64<0b1, "addva">; +} + +let Predicates = [HasSME] in { +//===----------------------------------------------------------------------===// +// Outer products +//===----------------------------------------------------------------------===// + +defm BFMOPA_MPPZZ : sme_bf16_outer_product<0b0, "bfmopa">; +defm BFMOPS_MPPZZ : sme_bf16_outer_product<0b1, "bfmops">; + +def FMOPA_MPPZZ_S : sme_outer_product_fp32<0b0, "fmopa">; +def FMOPS_MPPZZ_S : sme_outer_product_fp32<0b1, "fmops">; +} + +let Predicates = [HasSMEF64] in { +def FMOPA_MPPZZ_D : sme_outer_product_fp64<0b0, "fmopa">; +def FMOPS_MPPZZ_D : sme_outer_product_fp64<0b1, "fmops">; +} + +let Predicates = [HasSME] in { +defm FMOPAL_MPPZZ : sme_f16_outer_product<0b0, "fmopa">; +defm FMOPSL_MPPZZ : sme_f16_outer_product<0b1, "fmops">; + +def SMOPA_MPPZZ_S : sme_int_outer_product_i32<0b000, "smopa">; +def SMOPS_MPPZZ_S : sme_int_outer_product_i32<0b001, "smops">; +def UMOPA_MPPZZ_S : sme_int_outer_product_i32<0b110, "umopa">; +def UMOPS_MPPZZ_S : sme_int_outer_product_i32<0b111, "umops">; +def SUMOPA_MPPZZ_S : sme_int_outer_product_i32<0b010, "sumopa">; +def SUMOPS_MPPZZ_S : sme_int_outer_product_i32<0b011, "sumops">; +def USMOPA_MPPZZ_S : sme_int_outer_product_i32<0b100, "usmopa">; +def USMOPS_MPPZZ_S : sme_int_outer_product_i32<0b101, "usmops">; +} + +let Predicates = [HasSMEI64] in { +def SMOPA_MPPZZ_D : sme_int_outer_product_i64<0b000, "smopa">; +def SMOPS_MPPZZ_D : sme_int_outer_product_i64<0b001, "smops">; +def UMOPA_MPPZZ_D : sme_int_outer_product_i64<0b110, "umopa">; +def UMOPS_MPPZZ_D : sme_int_outer_product_i64<0b111, "umops">; +def SUMOPA_MPPZZ_D : sme_int_outer_product_i64<0b010, "sumopa">; +def SUMOPS_MPPZZ_D : sme_int_outer_product_i64<0b011, "sumops">; +def USMOPA_MPPZZ_D : sme_int_outer_product_i64<0b100, "usmopa">; +def USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops">; +} + +let Predicates = [HasSME] in { +//===----------------------------------------------------------------------===// +// Loads and stores +//===----------------------------------------------------------------------===// + +defm LD1_MXIPXX : sme_mem_ld_ss<"ld1">; +defm ST1_MXIPXX : sme_mem_st_ss<"st1">; + +//===----------------------------------------------------------------------===// +// Spill + fill +//===----------------------------------------------------------------------===// + +defm LDR_ZA : sme_fill<"ldr">; +defm STR_ZA : sme_spill<"str">; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +defm INSERT_MXIPZ : sme_vector_to_tile<"mova">; +defm EXTRACT_ZPMXI : sme_tile_to_vector<"mova">; + +//===----------------------------------------------------------------------===// +// Zero instruction +//===----------------------------------------------------------------------===// + +defm ZERO_M : sme_zero<"zero">; + +//===----------------------------------------------------------------------===// +// Mode selection and state access instructions +//===----------------------------------------------------------------------===// + +// SME 
defines three pstate fields to set or clear PSTATE.SM, PSTATE.ZA, or +// both fields: +// +// MSR SVCRSM, #<imm> +// MSR SVCRZA, #<imm> +// MSR SVCRSMZA, #<imm> +// +// It's tricky to use the existing pstate operand defined in +// AArch64SystemOperands.td since it only encodes 5 bits including op1;op2, +// when these fields are also encoded in CRm[3:1]. +class MSRpstatesvcrImm0_1 + : PstateWriteSimple<(ins svcr_op:$pstatefield, imm0_1:$imm), "msr", + "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<3> pstatefield; + bit imm; + let Inst{18-16} = 0b011; // op1 + let Inst{11-9} = pstatefield; + let Inst{8} = imm; + let Inst{7-5} = 0b011; // op2 +} + +def MSRpstatesvcrImm1 : MSRpstatesvcrImm0_1; +def : InstAlias<"smstart", (MSRpstatesvcrImm1 0b011, 0b1)>; +def : InstAlias<"smstart sm", (MSRpstatesvcrImm1 0b001, 0b1)>; +def : InstAlias<"smstart za", (MSRpstatesvcrImm1 0b010, 0b1)>; + +def : InstAlias<"smstop", (MSRpstatesvcrImm1 0b011, 0b0)>; +def : InstAlias<"smstop sm", (MSRpstatesvcrImm1 0b001, 0b0)>; +def : InstAlias<"smstop za", (MSRpstatesvcrImm1 0b010, 0b0)>; + +//===----------------------------------------------------------------------===// +// SVE2 instructions +//===----------------------------------------------------------------------===// + +def REVD_ZPmZ : sve2_int_perm_revd<"revd">; + +defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>; +defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>; + +defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel">; + +} // End let Predicates = [HasSME] diff --git a/suite/synctools/tablegen/AArch64/AArch64SVEInstrInfo.td b/suite/synctools/tablegen/AArch64/AArch64SVEInstrInfo.td index 0fde6801..2397a6d3 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SVEInstrInfo.td +++ b/suite/synctools/tablegen/AArch64/AArch64SVEInstrInfo.td @@ -1,9 +1,8 @@ //=- AArch64SVEInstrInfo.td - AArch64 SVE Instructions -*- tablegen -*-----=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,175 +10,601 @@ // //===----------------------------------------------------------------------===// +// For predicated nodes where the entire operation is controlled by a governing +// predicate, please stick to a similar naming convention as used for the +// ISD nodes: +// +// SDNode <=> AArch64ISD +// ------------------------------- +// _m<n> <=> _MERGE_OP<n> +// _mt <=> _MERGE_PASSTHRU +// _z <=> _MERGE_ZERO +// _p <=> _PRED +// +// Given the context of this file, it is not strictly necessary to use _p to +// distinguish predicated from unpredicated nodes, since most SVE +// instructions are predicated.
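For intuition, the suffixes line up with the predication forms of the SVE ACLE intrinsics; a minimal sketch, assuming <arm_sve.h> and SVE support (for example, clang -march=armv8.2-a+sve):

    #include <arm_sve.h>

    svfloat32_t forms(svbool_t pg, svfloat32_t a, svfloat32_t b) {
      svfloat32_t z = svadd_f32_z(pg, a, b); // inactive lanes zeroed   <=> _MERGE_ZERO
      svfloat32_t m = svadd_f32_m(pg, a, b); // inactive lanes keep 'a' <=> _MERGE_OP1
      return svadd_f32_x(pg, z, m);          // inactive lanes undef    <=> _PRED
    }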
+ +// Contiguous loads - node definitions +// +def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +// Non-faulting & first-faulting loads - node definitions +// +def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +// Contiguous load and replicate - node definitions +// + +def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>; + +// Gather loads - node definitions +// +def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_sxtw_scaled_z : 
SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>; + +def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + +// Contiguous stores - node definitions +// +def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [ + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, + SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2> +]>; + +def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>; + +// Scatter stores - node definitions +// +def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", 
SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>; +def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + +// AArch64 SVE/SVE2 - the remaining node definitions +// + +// SVE CNT/INC/RDVL +def sve_rdvl_imm : ComplexPattern">; +def sve_cnth_imm : ComplexPattern">; +def sve_cntw_imm : ComplexPattern">; +def sve_cntd_imm : ComplexPattern">; + +// SVE DEC +def sve_cnth_imm_neg : ComplexPattern">; +def sve_cntw_imm_neg : ComplexPattern">; +def sve_cntd_imm_neg : ComplexPattern">; + +def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>; +def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>; +def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>; +def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>; +def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>; +def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>; +def AArch64saddv_p : SDNode<"AArch64ISD::SADDV_PRED", SDT_AArch64Reduce>; +def AArch64uaddv_p : SDNode<"AArch64ISD::UADDV_PRED", SDT_AArch64Reduce>; +def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; +def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; +def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; +def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>; +def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>; +def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>; +def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>; +def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; +def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; + +def SDT_AArch64Arith : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3> +]>; + +def SDT_AArch64FMA : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4> +]>; + +// Predicated operations with the result of inactive lanes being unspecified. 
+def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; +def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; +def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; +def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; +def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>; +def AArch64fmax_p : SDNode<"AArch64ISD::FMAX_PRED", SDT_AArch64Arith>; +def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>; +def AArch64fmin_p : SDNode<"AArch64ISD::FMIN_PRED", SDT_AArch64Arith>; +def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>; +def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>; +def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>; +def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>; +def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>; +def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>; +def AArch64sabd_p : SDNode<"AArch64ISD::ABDS_PRED", SDT_AArch64Arith>; +def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; +def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; +def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; +def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>; +def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; +def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>; +def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; +def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; +def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>; +def AArch64umulh_p : SDNode<"AArch64ISD::MULHU_PRED", SDT_AArch64Arith>; + +def SDT_AArch64Arith_Imm : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3,i32>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2> +]>; + +def AArch64asrd_m1 : SDNode<"AArch64ISD::SRAD_MERGE_OP1", SDT_AArch64Arith_Imm>; + +def SDT_AArch64IntExtend : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVT<3, OtherVT>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1>, SDTCisSameAs<0,2>, SDTCisVTSmallerThanOp<3, 2>, SDTCisSameAs<0,4> +]>; + +// Predicated operations with the result of inactive lanes provided by the last operand. 
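A hedged ACLE-level sketch of the operand order used by the _MERGE_PASSTHRU nodes defined next: the node carries the inactive-lane value as its last operand, whereas the ACLE _m form takes it first (assumes <arm_sve.h>; the function name is made up):

    #include <arm_sve.h>

    svfloat32_t ceil_merge(svbool_t pg, svfloat32_t a, svfloat32_t inactive) {
      // FRINTP; corresponds to AArch64frintp_mt (FCEIL_MERGE_PASSTHRU):
      // inactive lanes of the result take the value of 'inactive'.
      return svrintp_f32_m(inactive, pg, a);
    }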
+def AArch64clz_mt : SDNode<"AArch64ISD::CTLZ_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64cnt_mt : SDNode<"AArch64ISD::CTPOP_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fneg_mt : SDNode<"AArch64ISD::FNEG_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fabs_mt : SDNode<"AArch64ISD::FABS_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64abs_mt : SDNode<"AArch64ISD::ABS_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64neg_mt : SDNode<"AArch64ISD::NEG_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64sxt_mt : SDNode<"AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64uxt_mt : SDNode<"AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU", SDT_AArch64IntExtend>; +def AArch64frintp_mt : SDNode<"AArch64ISD::FCEIL_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintm_mt : SDNode<"AArch64ISD::FFLOOR_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frinti_mt : SDNode<"AArch64ISD::FNEARBYINT_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintx_mt : SDNode<"AArch64ISD::FRINT_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frinta_mt : SDNode<"AArch64ISD::FROUND_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintn_mt : SDNode<"AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frintz_mt : SDNode<"AArch64ISD::FTRUNC_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>; + +// These are like the above but we don't yet have need for ISD nodes. They allow +// a single pattern to match intrinsic and ISD operand layouts. 
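For example (a sketch assuming <arm_sve.h>; the function name is made up), the cnot intrinsic shows the (passthru, pg, op) intrinsic layout that the PatFrags defined next re-order into the node-style (pg, op, passthru) layout:

    #include <arm_sve.h>

    // CNOT sets each active lane to 1 where the input element is 0, else 0;
    // inactive lanes take the corresponding element of 'inactive'.
    svint32_t cnot_merge(svbool_t pg, svint32_t a, svint32_t inactive) {
      return svcnot_s32_m(inactive, pg, a); // CNOT Zd.s, Pg/M, Zn.s
    }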
+def AArch64cls_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cls node:$pt, node:$pg, node:$op)]>; +def AArch64cnot_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_cnot node:$pt, node:$pg, node:$op)]>; +def AArch64not_mt : PatFrags<(ops node:$pg, node:$op, node:$pt), [(int_aarch64_sve_not node:$pt, node:$pg, node:$op)]>; + +def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<1,i1> +]>; + +def SDT_AArch64FCVTR : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1> +]>; + +def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64FCVTR>; +def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; +def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>; + +def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; +def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; +def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; +def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>; + +def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; + +def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>; +def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>; + +def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>; + +def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; + +def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), + (AArch64mul_p node:$pred, node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + +def AArch64fabd_p : PatFrag<(ops node:$pg, node:$op1, node:$op2), + (AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)>; + +def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt), + (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ + return N->getFlags().hasNoSignedZeros(); +}]>; + +def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, + SDTCisSameAs<0,1>, SDTCisSameAs<1,2> +]>; + +def AArch64bic_node : SDNode<"AArch64ISD::BIC", SDT_AArch64Arith_Unpred>; + +def AArch64bic : PatFrags<(ops node:$op1, node:$op2), + [(and node:$op1, (xor node:$op2, (AArch64dup (i32 -1)))), + (and node:$op1, (xor node:$op2, (AArch64dup (i64 -1)))), + (and node:$op1, (xor node:$op2, (SVEAllActive))), + (AArch64bic_node node:$op1, node:$op2)]>; + let Predicates = [HasSVE] in { + defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; + def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; + defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>; + def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>; + def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>; +} // End HasSVE - def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; - def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; - def RDFFR_P : sve_int_rdffr_unpred<"rdffr">; - def SETFFR : 
sve_int_setffr<"setffr">; - def WRFFR : sve_int_wrffr<"wrffr">; +let Predicates = [HasSVEorStreamingSVE] in { + defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; + defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; + defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; + defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>; + defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>; + defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>; - defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">; - defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">; - defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">; - defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">; - defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">; - defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">; + defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>; + defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>; + defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>; + defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", AArch64bic>; - def AND_ZZZ : sve_int_bin_cons_log<0b00, "and">; - def ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">; - def EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">; - def BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">; + defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>; + defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; + defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; - defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">; - defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">; - defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">; + defm ADD_ZPZZ : sve_int_bin_pred_bhsd; + defm SUB_ZPZZ : sve_int_bin_pred_bhsd; +} // End HasSVEorStreamingSVE - defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">; - defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">; - defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">; - defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">; +let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { + defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd; + defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd; +} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos - defm ADD_ZI : sve_int_arith_imm0<0b000, "add">; - defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">; - defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">; - defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">; - defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">; - defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">; - defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">; +let Predicates = [HasSVEorStreamingSVE] in { + defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; + defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; + defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; + defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; - defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">; - defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">; - defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">; - defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">; + defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; + defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; + defm SUBR_ZI 
: sve_int_arith_imm0_subr<0b011, "subr", sub>;
+  defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>;
+  defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
+  defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>;
+  defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>;
+
+  defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
+  defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
+  defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>;
+  defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>;

   // SVE predicated integer reductions.
-  defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
-  defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
-  defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
-  defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">;
-  defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">;
-  defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">;
-  defm ORV_VPZ   : sve_int_reduce_2<0b000, "orv">;
-  defm EORV_VPZ  : sve_int_reduce_2<0b001, "eorv">;
-  defm ANDV_VPZ  : sve_int_reduce_2<0b010, "andv">;
+  defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", AArch64saddv_p>;
+  defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", AArch64uaddv_p>;
+  defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
+  defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
+  defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
+  defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
+  defm ORV_VPZ   : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
+  defm EORV_VPZ  : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
+  defm ANDV_VPZ  : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
-  defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">;
-  defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">;
-  defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">;
+  defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
+  defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
+  defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
+  defm BIC_ZI : sve_int_log_imm_bic<AArch64bic>;
-  defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>;
-  defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>;
-  defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>;
-  defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>;
+  defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_p>;
+  defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_p>;
+  defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_p>;
+  defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_p>;
-  defm MUL_ZI     : sve_int_arith_imm2<"mul">;
-  defm MUL_ZPmZ   : sve_int_bin_pred_arit_2<0b000, "mul">;
-  defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">;
-  defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">;
+  defm MUL_ZI     : sve_int_arith_imm2<"mul", AArch64mul_p>;
+  defm MUL_ZPmZ   : sve_int_bin_pred_arit_2<0b000, "mul", "MUL_ZPZZ", int_aarch64_sve_mul, DestructiveBinaryComm>;
+  defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", "SMULH_ZPZZ", int_aarch64_sve_smulh, DestructiveBinaryComm>;
+  defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", "UMULH_ZPZZ", int_aarch64_sve_umulh, DestructiveBinaryComm>;
-  defm SDIV_ZPmZ  : sve_int_bin_pred_arit_2_div<0b100, "sdiv">;
-  defm UDIV_ZPmZ  : sve_int_bin_pred_arit_2_div<0b101, "udiv">;
-  defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
-  defm UDIVR_ZPmZ : 
sve_int_bin_pred_arit_2_div<0b111, "udivr">;
+  defm MUL_ZPZZ   : sve_int_bin_pred_bhsd<AArch64mul_p>;
+  defm SMULH_ZPZZ : sve_int_bin_pred_bhsd<AArch64smulh_p>;
+  defm UMULH_ZPZZ : sve_int_bin_pred_bhsd<AArch64umulh_p>;
-  defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot">;
-  defm UDOT_ZZZ : sve_intx_dot<0b1, "udot">;
+  defm SDIV_ZPmZ  : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
+  defm UDIV_ZPmZ  : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
+  defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>;
+  defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>;
-  defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot">;
-  defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot">;
+  defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
+  defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;
-  defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
-  defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
-  defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">;
-  defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
-  defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
-  defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
-  defm ABS_ZPmZ  : sve_int_un_pred_arit_0< 0b110, "abs">;
-  defm NEG_ZPmZ  : sve_int_un_pred_arit_0< 0b111, "neg">;
+  defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>;
+  defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>;
-  defm CLS_ZPmZ  : sve_int_un_pred_arit_1< 0b000, "cls">;
-  defm CLZ_ZPmZ  : sve_int_un_pred_arit_1< 0b001, "clz">;
-  defm CNT_ZPmZ  : sve_int_un_pred_arit_1< 0b010, "cnt">;
-  defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot">;
-  defm NOT_ZPmZ  : sve_int_un_pred_arit_1< 0b110, "not">;
-  defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
-  defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
+  defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
+  defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
-  defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">;
-  defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">;
-  defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">;
-  defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">;
-  defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">;
-  defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">;
+  defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", AArch64sxt_mt>;
+  defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", AArch64uxt_mt>;
+  defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", AArch64sxt_mt>;
+  defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", AArch64uxt_mt>;
+  defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", AArch64sxt_mt>;
+  defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", AArch64uxt_mt>;
+  defm ABS_ZPmZ  : sve_int_un_pred_arit_0< 0b110, "abs", AArch64abs_mt>;
+  defm NEG_ZPmZ  : sve_int_un_pred_arit_0< 0b111, "neg", AArch64neg_mt>;
-  defm FRECPE_ZZ  : sve_fp_2op_u_zd<0b110, "frecpe">;
-  defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">;
+  defm CLS_ZPmZ  : sve_int_un_pred_arit_1< 0b000, "cls", AArch64cls_mt>;
+  defm CLZ_ZPmZ  : sve_int_un_pred_arit_1< 0b001, "clz", AArch64clz_mt>;
+  defm CNT_ZPmZ  : sve_int_un_pred_arit_1< 0b010, "cnt", AArch64cnt_mt>;
+  defm CNOT_ZPmZ : 
sve_int_un_pred_arit_1< 0b011, "cnot", AArch64cnot_mt>;
+  defm NOT_ZPmZ  : sve_int_un_pred_arit_1< 0b110, "not", AArch64not_mt>;
+  defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
+  defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
-  defm FADD_ZPmI   : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
-  defm FSUB_ZPmI   : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
-  defm FMUL_ZPmI   : sve_fp_2op_i_p_zds<0b010, "fmul", sve_fpimm_half_two>;
-  defm FSUBR_ZPmI  : sve_fp_2op_i_p_zds<0b011, "fsubr", sve_fpimm_half_one>;
-  defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", sve_fpimm_zero_one>;
-  defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", sve_fpimm_zero_one>;
-  defm FMAX_ZPmI   : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
-  defm FMIN_ZPmI   : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
+  defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>;
+  defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>;
+  defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>;
+  defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", "UMIN_ZPZZ", int_aarch64_sve_umin, DestructiveBinaryComm>;
+  defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", "SABD_ZPZZ", int_aarch64_sve_sabd, DestructiveBinaryComm>;
+  defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", "UABD_ZPZZ", int_aarch64_sve_uabd, DestructiveBinaryComm>;
-  defm FADD_ZPmZ   : sve_fp_2op_p_zds<0b0000, "fadd">;
-  defm FSUB_ZPmZ   : sve_fp_2op_p_zds<0b0001, "fsub">;
-  defm FMUL_ZPmZ   : sve_fp_2op_p_zds<0b0010, "fmul">;
-  defm FSUBR_ZPmZ  : sve_fp_2op_p_zds<0b0011, "fsubr">;
-  defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
-  defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
-  defm FMAX_ZPmZ   : sve_fp_2op_p_zds<0b0110, "fmax">;
-  defm FMIN_ZPmZ   : sve_fp_2op_p_zds<0b0111, "fmin">;
-  defm FABD_ZPmZ   : sve_fp_2op_p_zds<0b1000, "fabd">;
-  defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
-  defm FMULX_ZPmZ  : sve_fp_2op_p_zds<0b1010, "fmulx">;
-  defm FDIVR_ZPmZ  : sve_fp_2op_p_zds<0b1100, "fdivr">;
-  defm FDIV_ZPmZ   : sve_fp_2op_p_zds<0b1101, "fdiv">;
+  defm SMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64smax_p>;
+  defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>;
+  defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>;
+  defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>;
+  defm SABD_ZPZZ : sve_int_bin_pred_bhsd<AArch64sabd_p>;
+  defm UABD_ZPZZ : sve_int_bin_pred_bhsd<AArch64uabd_p>;
-  defm FADD_ZZZ    : sve_fp_3op_u_zd<0b000, "fadd">;
-  defm FSUB_ZZZ    : sve_fp_3op_u_zd<0b001, "fsub">;
-  defm FMUL_ZZZ    : sve_fp_3op_u_zd<0b010, "fmul">;
-  defm FTSMUL_ZZZ  : sve_fp_3op_u_zd<0b011, "ftsmul">;
-  defm FRECPS_ZZZ  : sve_fp_3op_u_zd<0b110, "frecps">;
-  defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
+  defm FRECPE_ZZ  : sve_fp_2op_u_zd<0b110, "frecpe", AArch64frecpe>;
+  defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", AArch64frsqrte>;
-  defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
+  defm FADD_ZPmI   : sve_fp_2op_i_p_zds<0b000, "fadd", "FADD_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fadd>;
+  defm FSUB_ZPmI   : sve_fp_2op_i_p_zds<0b001, "fsub", "FSUB_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fsub>;
+  defm FMUL_ZPmI   : sve_fp_2op_i_p_zds<0b010, "fmul", "FMUL_ZPZI", sve_fpimm_half_two, fpimm_half, fpimm_two, int_aarch64_sve_fmul>;
+  defm FSUBR_ZPmI  : sve_fp_2op_i_p_zds<0b011, "fsubr", "FSUBR_ZPZI", sve_fpimm_half_one, fpimm_half, 
fpimm_one, int_aarch64_sve_fsubr>;
+  defm FMAXNM_ZPmI : sve_fp_2op_i_p_zds<0b100, "fmaxnm", "FMAXNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmaxnm>;
+  defm FMINNM_ZPmI : sve_fp_2op_i_p_zds<0b101, "fminnm", "FMINNM_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fminnm>;
+  defm FMAX_ZPmI   : sve_fp_2op_i_p_zds<0b110, "fmax", "FMAX_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmax>;
+  defm FMIN_ZPmI   : sve_fp_2op_i_p_zds<0b111, "fmin", "FMIN_ZPZI", sve_fpimm_zero_one, fpimm0, fpimm_one, int_aarch64_sve_fmin>;
+
+  defm FADD_ZPZI   : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fadd_p>;
+  defm FSUB_ZPZI   : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one, AArch64fsub_p>;
+  defm FMUL_ZPZI   : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_two, fpimm_half, fpimm_two, AArch64fmul_p>;
+  defm FSUBR_ZPZI  : sve_fp_2op_i_p_zds_hfd<sve_fpimm_half_one, fpimm_half, fpimm_one>;
+  defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmaxnm_p>;
+  defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fminnm_p>;
+  defm FMAX_ZPZI   : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmax_p>;
+  defm FMIN_ZPZI   : sve_fp_2op_i_p_zds_hfd<sve_fpimm_zero_one, fpimm0, fpimm_one, AArch64fmin_p>;
-  defm FCADD_ZPmZ  : sve_fp_fcadd<"fcadd">;
-  defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">;
+  let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+    defm FADD_ZPZI   : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fadd>;
+    defm FSUB_ZPZI   : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fsub>;
+    defm FMUL_ZPZI   : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fmul>;
+    defm FSUBR_ZPZI  : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fsubr>;
+    defm FMAXNM_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fmaxnm>;
+    defm FMINNM_ZPZI : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fminnm>;
+    defm FMAX_ZPZI   : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fmax>;
+    defm FMIN_ZPZI   : sve_fp_2op_i_p_zds_zeroing_hfd<int_aarch64_sve_fmin>;
+  }
-  defm FMLA_ZPmZZ  : sve_fp_3op_p_zds_a<0b00, "fmla">;
-  defm FMLS_ZPmZZ  : sve_fp_3op_p_zds_a<0b01, "fmls">;
-  defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">;
-  defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">;
+  defm FADD_ZPmZ   : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
+  defm FSUB_ZPmZ   : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
+  defm FMUL_ZPmZ   : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
+  defm FSUBR_ZPmZ  : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
+  defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
+  defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
+  defm FMAX_ZPmZ   : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
+  defm FMIN_ZPmZ   : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
+  defm FABD_ZPmZ   : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
+  defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
+  defm FMULX_ZPmZ  : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
+  defm FDIVR_ZPmZ  : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>;
+  defm FDIV_ZPmZ   : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;
-  defm FMAD_ZPmZZ  : sve_fp_3op_p_zds_b<0b00, "fmad">;
-  defm FMSB_ZPmZZ  : sve_fp_3op_p_zds_b<0b01, "fmsb">;
-  defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">;
-  defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">;
+  defm FADD_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fadd_p>;
+  defm FSUB_ZPZZ   : 
sve_fp_bin_pred_hfd<AArch64fsub_p>;
+  defm FMUL_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fmul_p>;
+  defm FMAXNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmaxnm_p>;
+  defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>;
+  defm FMAX_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fmax_p>;
+  defm FMIN_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fmin_p>;
+  defm FABD_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fabd_p>;
+  defm FDIV_ZPZZ   : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
+} // End HasSVEorStreamingSVE
-  defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">;
+let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in {
+  defm FADD_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
+  defm FSUB_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
+  defm FMUL_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
+  defm FSUBR_ZPZZ  : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
+  defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
+  defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
+  defm FMAX_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
+  defm FMIN_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
+  defm FABD_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
+  defm FMULX_ZPZZ  : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
+  defm FDIVR_ZPZZ  : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
+  defm FDIV_ZPZZ   : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
+} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos
-  defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">;
-  defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">;
+let Predicates = [HasSVEorStreamingSVE] in {
+  defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd, AArch64fadd_p>;
+  defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub, AArch64fsub_p>;
+  defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul, AArch64fmul_p>;
+} // End HasSVEorStreamingSVE
-  defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">;
-  defm FMUL_ZZZI  : sve_fp_fmul_by_indexed_elem<"fmul">;
+let Predicates = [HasSVE] in {
+  defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
+} // End HasSVE
+let Predicates = [HasSVEorStreamingSVE] in {
+  defm FRECPS_ZZZ  : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>;
+  defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>;
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVE] in {
+  defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;
+} // End HasSVE
+
+let Predicates = [HasSVEorStreamingSVE] in {
+  defm FCADD_ZPmZ  : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
+  defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
+
+  defm FMLA_ZPmZZ  : sve_fp_3op_p_zds_a<0b00, "fmla", "FMLA_ZPZZZ", int_aarch64_sve_fmla, "FMAD_ZPmZZ">;
+  defm FMLS_ZPmZZ  : sve_fp_3op_p_zds_a<0b01, "fmls", "FMLS_ZPZZZ", int_aarch64_sve_fmls, "FMSB_ZPmZZ">;
+  defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", "FNMLA_ZPZZZ", int_aarch64_sve_fnmla, "FNMAD_ZPmZZ">;
+  defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", "FNMLS_ZPZZZ", int_aarch64_sve_fnmls, "FNMSB_ZPmZZ">;
+
+  defm FMAD_ZPmZZ  : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad, "FMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+  defm FMSB_ZPmZZ  : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb, "FMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+  defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad, "FNMLA_ZPmZZ", /*isReverseInstr*/ 1>;
+  defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb, "FNMLS_ZPmZZ", /*isReverseInstr*/ 1>;
+
+  defm FMLA_ZPZZZ  : sve_fp_3op_p_zds_zx;
+  defm FMLS_ZPZZZ  : sve_fp_3op_p_zds_zx;
+  defm FNMLA_ZPZZZ : sve_fp_3op_p_zds_zx;
+  defm FNMLS_ZPZZZ : sve_fp_3op_p_zds_zx;
+
+  multiclass fma<ValueType Ty, ValueType PredTy, string Suffix> {
+    // Zd = Za + Zn * Zm
+    def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za)),
+              (!cast<Instruction>("FMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+    // Zd = 
Za + -Zn * Zm
+    def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, Ty:$Za)),
+              (!cast<Instruction>("FMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+    // Zd = -Za + Zn * Zm
+    def : Pat<(Ty (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+              (!cast<Instruction>("FNMLS_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+    // Zd = -Za + -Zn * Zm
+    def : Pat<(Ty (AArch64fma_p PredTy:$P, (AArch64fneg_mt PredTy:$P, Ty:$Zn, (Ty (undef))), Ty:$Zm, (AArch64fneg_mt PredTy:$P, Ty:$Za, (Ty (undef))))),
+              (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+    // Zd = -(Za + Zn * Zm)
+    // (with nsz neg.)
+    def : Pat<(AArch64fneg_mt_nsz PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))),
+              (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+    // Zda = Zda + Zn * Zm
+    def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), ZPR:$Zn, ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+              (!cast<Instruction>("FMLA_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+
+    // Zda = Zda + -Zn * Zm
+    def : Pat<(vselect (PredTy PPR:$Pg), (Ty (AArch64fma_p (PredTy (AArch64ptrue 31)), (AArch64fneg_mt (PredTy (AArch64ptrue 31)), Ty:$Zn, (Ty (undef))), ZPR:$Zm, ZPR:$Za)), ZPR:$Za),
+              (!cast<Instruction>("FMLS_ZPmZZ_"#Suffix) PPR:$Pg, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>;
+  }
+
+  defm : fma<nxv8f16, nxv8i1, "H">;
+  defm : fma<nxv4f16, nxv4i1, "H">;
+  defm : fma<nxv2f16, nxv2i1, "H">;
+  defm : fma<nxv4f32, nxv4i1, "S">;
+  defm : fma<nxv2f32, nxv2i1, "S">;
+  defm : fma<nxv2f64, nxv2i1, "D">;
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVE] in {
+  defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
+} // End HasSVE
+
+let Predicates = [HasSVEorStreamingSVE] in {
+  defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
+  defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;
+
+  defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
+  defm FMUL_ZZZI  : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
+} // End HasSVEorStreamingSVE
+
+let Predicates = [HasSVE] in {
   // SVE floating point reductions.
- defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">; - defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">; - defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">; - defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">; - defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">; - defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">; + defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>; +} // End HasSVE + +let Predicates = [HasSVEorStreamingSVE] in { + defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>; + defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>; + defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>; + defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>; + defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>; // Splat immediate (unpredicated) - defm DUP_ZI : sve_int_dup_imm<"dup">; - defm FDUP_ZI : sve_int_dup_fpimm<"fdup">; + defm DUP_ZI : sve_int_dup_imm<"dup">; + defm FDUP_ZI : sve_int_dup_fpimm<"fdup">; defm DUPM_ZI : sve_int_dup_mask_imm<"dupm">; // Splat immediate (predicated) @@ -188,92 +613,160 @@ let Predicates = [HasSVE] in { defm FCPY_ZPmI : sve_int_dup_fpimm_pred<"fcpy">; // Splat scalar register (unpredicated, GPR or vector + element index) - defm DUP_ZR : sve_int_perm_dup_r<"dup">; + defm DUP_ZR : sve_int_perm_dup_r<"dup", AArch64dup>; defm DUP_ZZI : sve_int_perm_dup_i<"dup">; // Splat scalar register (predicated) - defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">; - defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">; + defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>; + defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>; + + // Duplicate FP scalar into all vector elements + def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))), + (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; + def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))), + (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>; + def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), + (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>; + def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))), + (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>; + + // Duplicate +0.0 into all vector elements + def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>; + def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>; + def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>; + def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; + + // Duplicate Int immediate into all vector elements + def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + (DUP_ZI_B $a, $b)>; + def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + (DUP_ZI_H $a, $b)>; + def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), + (DUP_ZI_S $a, $b)>; + def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))), + (DUP_ZI_D $a, $b)>; + + // Duplicate immediate FP into all vector 
elements. + def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))), + (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; + def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))), + (DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>; + def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))), + (DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>; + + // Duplicate FP immediate into all vector elements + let AddedComplexity = 2 in { + def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)), + (FDUP_ZI_H fpimm16:$imm8)>; + def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)), + (FDUP_ZI_S fpimm32:$imm8)>; + def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)), + (FDUP_ZI_S fpimm32:$imm8)>; + def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)), + (FDUP_ZI_D fpimm64:$imm8)>; + } // Select elements from either vector (predicated) - defm SEL_ZPZZ : sve_int_sel_vvv<"sel">; + defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; - defm SPLICE_ZPZ : sve_int_perm_splice<"splice">; - defm COMPACT_ZPZ : sve_int_perm_compact<"compact">; - defm INSR_ZR : sve_int_perm_insrs<"insr">; - defm INSR_ZV : sve_int_perm_insrv<"insr">; - def EXT_ZZI : sve_int_perm_extract_i<"ext">; + defm SPLICE_ZPZ : sve_int_perm_splice<"splice", AArch64splice>; +} // End HasSVEorStreamingSVE - defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">; - defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">; - defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">; - defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">; +let Predicates = [HasSVE] in { + defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>; +} // End HasSVE - defm REV_PP : sve_int_perm_reverse_p<"rev">; - defm REV_ZZ : sve_int_perm_reverse_z<"rev">; +let Predicates = [HasSVEorStreamingSVE] in { + defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; + defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; + defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; - defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo">; - defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi">; - defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo">; - defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi">; + defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; + defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; + defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", AArch64revh_mt>; + defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", AArch64revw_mt>; - def PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo">; - def PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi">; + defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse>; + defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>; + + defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; + defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; + defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>; + defm UUNPKHI_ZZ : sve_int_perm_unpk<0b11, "uunpkhi", AArch64uunpkhi>; + + defm PUNPKLO_PP : sve_int_perm_punpk<0b0, "punpklo", int_aarch64_sve_punpklo>; + defm PUNPKHI_PP : sve_int_perm_punpk<0b1, "punpkhi", int_aarch64_sve_punpkhi>; defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; - def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>; - def FEXPA_ZZ_S : 
sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>; - def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>; +} // End HasSVEorStreamingSVE - def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">; - def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">; - def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">; - def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">; +let Predicates = [HasSVE] in { + defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; +} // End HasSVE - def BRKN_PPzP : sve_int_brkn<0b0, "brkn">; - def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">; +let Predicates = [HasSVEorStreamingSVE] in { + defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>; + defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>; + defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>; + defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>; - defm BRKA_PPzP : sve_int_break_z<0b000, "brka">; - defm BRKA_PPmP : sve_int_break_m<0b001, "brka">; - defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">; - defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">; - defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">; - defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">; + defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>; + defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>; + + defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>; + defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>; + defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>; + defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>; + defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>; + defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>; def PTEST_PP : sve_int_ptest<0b010000, "ptest">; - def PFALSE : sve_int_pfalse<0b000000, "pfalse">; - defm PFIRST : sve_int_pfirst<0b00000, "pfirst">; - defm PNEXT : sve_int_pnext<0b00110, "pnext">; + defm PFALSE : sve_int_pfalse<0b000000, "pfalse">; + defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; + defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; - def AND_PPzPP : sve_int_pred_log<0b0000, "and">; - def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">; - def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">; - def SEL_PPPP : sve_int_pred_log<0b0011, "sel">; - def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">; - def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">; - def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">; - def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">; - def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">; - def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">; - def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">; - def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">; - def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">; - def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">; - def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">; + defm AND_PPzPP : sve_int_pred_log_v2<0b0000, "and", int_aarch64_sve_and_z, and>; + defm BIC_PPzPP : sve_int_pred_log_v2<0b0001, "bic", int_aarch64_sve_bic_z, AArch64bic>; + defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>; + defm SEL_PPPP : sve_int_pred_log_v2<0b0011, "sel", vselect, or>; + defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>; + defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>; + defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>; + defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>; + defm 
ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>; + defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>; + defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>; + defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", null_frag>; + defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", null_frag>; + defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", null_frag>; + defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", null_frag>; - defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">; - defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">; - defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">; - defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">; - defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">; - defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">; + defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta", AArch64clasta_n>; + defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb", AArch64clastb_n>; + defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta", AArch64clasta_n>; + defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb", AArch64clastb_n>; + defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>; + defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>; - defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">; - defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">; - defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">; - defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">; + defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>; + defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>; + defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>; + defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>; // continuous load with reg+immediate defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>; @@ -338,7 +831,9 @@ let Predicates = [HasSVE] in { defm LD1SB_S : sve_mem_cld_ss<0b1101, "ld1sb", Z_s, ZPR32, GPR64NoXZRshifted8>; defm LD1SB_H : sve_mem_cld_ss<0b1110, "ld1sb", Z_h, ZPR16, GPR64NoXZRshifted8>; defm LD1D : sve_mem_cld_ss<0b1111, "ld1d", Z_d, ZPR64, GPR64NoXZRshifted64>; +} // End HasSVEorStreamingSVE +let Predicates = [HasSVE] in { // non-faulting continuous load with reg+immediate defm LDNF1B_IMM : sve_mem_cldnf_si<0b0000, "ldnf1b", Z_b, ZPR8>; defm LDNF1B_H_IMM : sve_mem_cldnf_si<0b0001, "ldnf1b", Z_h, ZPR16>; @@ -374,7 +869,9 @@ let Predicates = [HasSVE] in { defm LDFF1SB_S : sve_mem_cldff_ss<0b1101, "ldff1sb", Z_s, ZPR32, GPR64shifted8>; defm LDFF1SB_H : sve_mem_cldff_ss<0b1110, "ldff1sb", Z_h, ZPR16, GPR64shifted8>; defm LDFF1D : sve_mem_cldff_ss<0b1111, "ldff1d", Z_d, ZPR64, GPR64shifted64>; +} // End HasSVE +let Predicates = [HasSVEorStreamingSVE] in { // LD(2|3|4) structured loads with reg+immediate defm LD2B_IMM : sve_mem_eld_si<0b00, 0b01, ZZ_b, "ld2b", simm4s2>; defm LD3B_IMM : sve_mem_eld_si<0b00, 0b10, ZZZ_b, "ld3b", simm4s3>; @@ -402,119 +899,123 @@ let Predicates = [HasSVE] in { def LD2D : sve_mem_eld_ss<0b11, 0b01, ZZ_d, "ld2d", GPR64NoXZRshifted64>; def LD3D : sve_mem_eld_ss<0b11, 0b10, ZZZ_d, "ld3d", GPR64NoXZRshifted64>; def LD4D : sve_mem_eld_ss<0b11, 0b11, ZZZZ_d, "ld4d", GPR64NoXZRshifted64>; +} // End HasSVEorStreamingSVE +let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw] - defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; + defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; // Gathers using scaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1] - defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; + defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; - // Gathers using scaled 32-bit pointers with offset, e.g. + // Gathers using 32-bit pointers with scaled offset, e.g. 
// ld1h z0.s, p0/z, [z0.s, #16] - defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>; - defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>; - defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>; - defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>; - defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>; - defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>; - defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>; - defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>; - defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>; - defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>; + defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>; - // Gathers using scaled 64-bit pointers with offset, e.g. + // Gathers using 64-bit pointers with scaled offset, e.g. 
// ld1h z0.d, p0/z, [z0.d, #16] - defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>; - defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>; - defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>; - defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>; - defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>; - defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>; - defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>; - defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>; - defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>; - defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>; - defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>; - defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>; - defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>; - defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>; + defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>; // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] - defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; + defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, 
nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; // Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g. // ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] - defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh",ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw",ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; - defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; +} // End HasSVE +let Predicates = [HasSVEorStreamingSVE] in { // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; defm LDNT1H_ZRI : sve_mem_cldnt_si<0b01, "ldnt1h", Z_h, ZPR16>; @@ -550,53 +1051,61 @@ let Predicates = [HasSVE] in { defm ST1W : sve_mem_cst_ss<0b1010, "st1w", Z_s, ZPR32, GPR64NoXZRshifted32>; defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>; defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>; +} // End 
HasSVEorStreamingSVE - // Scatters using unscaled 32-bit offsets, e.g. - // st1h z0.s, p0, [x0, z0.s, uxtw] - // and unpacked: +let Predicates = [HasSVE] in { + // Scatters using unpacked, unscaled 32-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, uxtw] - defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; + defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; - // Scatters using scaled 32-bit offsets, e.g. + // Scatters using packed, unscaled 32-bit offsets, e.g. + // st1h z0.s, p0, [x0, z0.s, uxtw] + defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + + // Scatters using packed, scaled 32-bit offsets, e.g. // st1h z0.s, p0, [x0, z0.s, uxtw #1] - // and unpacked: + defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + + // Scatters using unpacked, scaled 32-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d, uxtw #1] - defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.s, p0, [z0.s, #16] + defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>; + defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>; + + // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.d, p0, [z0.d, #16] - defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>; - defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>; - defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>; - defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>; - defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>; - defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>; - defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>; + defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>; + defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>; // Scatters using unscaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d] - defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">; - defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">; - defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">; - defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">; + defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>; + defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>; + defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>; + defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>; // Scatters using scaled 64-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d, lsl #1] - defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>; - defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>; - defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>; + defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; + defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; + defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; +} // End HasSVE +let Predicates = [HasSVEorStreamingSVE] in { // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; defm ST3B_IMM : sve_mem_est_si<0b00, 0b10, ZZZ_b, "st3b", simm4s3>; @@ -655,114 +1164,338 @@ let Predicates = [HasSVE] in { def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>; def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>; + multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, ComplexPattern AddrCP> { + // reg + imm + let AddedComplexity = 2 in { + def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)), + (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>; + } + + // reg + reg + let AddedComplexity = 1 in { + def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)), + (RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>; + } + + // default fallback + def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)), + (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>; + } + + defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, am_sve_regreg_lsl0>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, am_sve_regreg_lsl1>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFW_PRR, am_sve_regreg_lsl2>; + defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, am_sve_regreg_lsl3>; +} // End HasSVEorStreamingSVE + +let Predicates = [HasSVE] in { // Gather prefetch using scaled 32-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.s, uxtw #1] - defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>; + defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>; + defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>; + defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>; + defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>; // Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1] - defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>; + defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>; + defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>; + defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>; // Gather prefetch using scaled 64-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, lsl #1] - defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>; - defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>; - defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>; - defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>; + defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>; + defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>; + defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>; + defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>; // Gather prefetch using 32/64-bit pointers with offset, e.g. 
// prfh pldl1keep, p0, [z0.s, #16] // prfh pldl1keep, p0, [z0.d, #16] - defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>; - defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>; - defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>; - defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>; + defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>; + defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>; + defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>; + defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>; - defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>; - defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>; - defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>; - defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>; + defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>; + defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>; + defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>; + defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>; defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">; defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">; defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">; defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">; - defm TBL_ZZZ : sve_int_perm_tbl<"tbl">; + def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_0 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_1 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_2 $Op1, $Op2)>; + def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)), + (ADR_LSL_ZZZ_S_3 $Op1, $Op2)>; - defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">; - defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">; - defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">; - defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">; - defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">; - defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">; + def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_0 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_1 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_2 $Op1, $Op2)>; + def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)), + (ADR_LSL_ZZZ_D_3 $Op1, $Op2)>; - defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">; - defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">; - defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">; - defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">; - defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">; - defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">; + // Patterns to generate adr instruction. 
+ // adr z0.d, [z0.d, z0.d, uxtw] + def : Pat<(add nxv2i64:$Op1, + (nxv2i64 (and nxv2i64:$Op2, (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))))), + (ADR_UXTW_ZZZ_D_0 $Op1, $Op2)>; + // adr z0.d, [z0.d, z0.d, sxtw] + def : Pat<(add nxv2i64:$Op1, + (nxv2i64 (sext_inreg nxv2i64:$Op2, nxv2i32))), + (ADR_SXTW_ZZZ_D_0 $Op1, $Op2)>; - defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">; - defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">; - defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">; - defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">; - defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">; - defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">; + // adr z0.s, [z0.s, z0.s, lsl #<shift>] + // adr z0.d, [z0.d, z0.d, lsl #<shift>] + multiclass adrShiftPat<ValueType Ty, ValueType PredTy, ValueType ShiftTy, Instruction DestAdrIns, int ShiftAmt> { + def : Pat<(add Ty:$Op1, + (Ty (AArch64lsl_p (PredTy (SVEAllActive)), + Ty:$Op2, + (Ty (AArch64dup (ShiftTy ShiftAmt)))))), + (DestAdrIns $Op1, $Op2)>; + } + defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_1, 1>; + defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_2, 2>; + defm : adrShiftPat<nxv2i64, nxv2i1, i64, ADR_LSL_ZZZ_D_3, 3>; + defm : adrShiftPat<nxv4i32, nxv4i1, i32, ADR_LSL_ZZZ_S_1, 1>; + defm : adrShiftPat<nxv4i32, nxv4i1, i32, ADR_LSL_ZZZ_S_2, 2>; + defm : adrShiftPat<nxv4i32, nxv4i1, i32, ADR_LSL_ZZZ_S_3, 3>; - defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">; - defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">; - defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">; - defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">; - defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">; - defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">; - defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">; - defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">; - defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">; - defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">; + // adr z0.d, [z0.d, z0.d, uxtw #<shift>] + // adr z0.d, [z0.d, z0.d, sxtw #<shift>] + multiclass adrXtwShiftPat<ValueType Ty, ValueType PredTy, int ShiftAmt> { + def : Pat<(add Ty:$Op1, + (Ty (AArch64lsl_p (PredTy (SVEAllActive)), + (Ty (and Ty:$Op2, (Ty (AArch64dup (i64 0xFFFFFFFF))))), + (Ty (AArch64dup (i64 ShiftAmt)))))), + (!cast<Instruction>("ADR_UXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>; - defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">; - defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">; - defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">; - defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">; - defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">; - defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">; - defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">; - defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">; - defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">; - defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">; + def : Pat<(add Ty:$Op1, + (Ty (AArch64lsl_p (PredTy (SVEAllActive)), + (Ty (sext_inreg Ty:$Op2, nxv2i32)), + (Ty (AArch64dup (i64 ShiftAmt)))))), + (!cast<Instruction>("ADR_SXTW_ZZZ_D_"#ShiftAmt) $Op1, $Op2)>; + } + defm : adrXtwShiftPat<nxv2i64, nxv2i1, 1>; + defm : adrXtwShiftPat<nxv2i64, nxv2i1, 2>; + defm : adrXtwShiftPat<nxv2i64, nxv2i1, 3>; +} // End HasSVE - defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">; - defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">; - defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">; +let Predicates = [HasSVEorStreamingSVE] in { + defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>; - defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">; - defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">; - defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt">; - defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle">; - defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">; - defm
FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">; + defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; + defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>; + defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>; + defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2", AArch64uzp2>; + defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>; + defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>; - defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">; - defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">; - defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">; - defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">; + defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>; + defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>; + defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>; + defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2", AArch64uzp2>; + defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>; + defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; - defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">; - defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">; - defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">; - defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">; + // Extract lo/hi halves of legal predicate types. + def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP PPR:$Ps)>; + def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), + (PUNPKHI_PP PPR:$Ps)>; + def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP PPR:$Ps)>; + def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (PUNPKHI_PP PPR:$Ps)>; + def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP PPR:$Ps)>; + def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (PUNPKHI_PP PPR:$Ps)>; + + def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), + (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; + def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), + (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; + + def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), + (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; + def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), + (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; + + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP 
PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + + // Extract subvectors from FP SVE vectors + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_S ZPR:$Zs)>; + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), + (UUNPKHI_ZZ_S ZPR:$Zs)>; + def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2f32 (extract_subvector (nxv4f32 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + + def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_S ZPR:$Zs)>; + def : Pat<(nxv4bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))), + (UUNPKHI_ZZ_S ZPR:$Zs)>; + + def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>; + def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>; + def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), + (UUNPKLO_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; + def : Pat<(nxv2f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 6))), + (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; + + def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>; + def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D (UUNPKLO_ZZ_S ZPR:$Zs))>; + def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 4))), + (UUNPKLO_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; + def : Pat<(nxv2bf16 (extract_subvector (nxv8bf16 ZPR:$Zs), (i64 6))), + (UUNPKHI_ZZ_D (UUNPKHI_ZZ_S ZPR:$Zs))>; + + // Concatenate two predicates. + def : Pat<(nxv4i1 (concat_vectors nxv2i1:$p1, nxv2i1:$p2)), + (UZP1_PPP_S $p1, $p2)>; + def : Pat<(nxv8i1 (concat_vectors nxv4i1:$p1, nxv4i1:$p2)), + (UZP1_PPP_H $p1, $p2)>; + def : Pat<(nxv16i1 (concat_vectors nxv8i1:$p1, nxv8i1:$p2)), + (UZP1_PPP_B $p1, $p2)>; + + // Concatenate two floating point vectors. 
+ def : Pat<(nxv4f16 (concat_vectors nxv2f16:$v1, nxv2f16:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv8f16 (concat_vectors nxv4f16:$v1, nxv4f16:$v2)), + (UZP1_ZZZ_H $v1, $v2)>; + def : Pat<(nxv4f32 (concat_vectors nxv2f32:$v1, nxv2f32:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv4bf16 (concat_vectors nxv2bf16:$v1, nxv2bf16:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv8bf16 (concat_vectors nxv4bf16:$v1, nxv4bf16:$v2)), + (UZP1_ZZZ_H $v1, $v2)>; + + // Splice with lane equal to -1 + def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 -1))), + (INSR_ZV_B ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF), + (LASTB_VPZ_B (PTRUE_B 31), ZPR:$Z1), bsub))>; + def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 -1))), + (INSR_ZV_H ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF), + (LASTB_VPZ_H (PTRUE_H 31), ZPR:$Z1), hsub))>; + def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 -1))), + (INSR_ZV_S ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF), + (LASTB_VPZ_S (PTRUE_S 31), ZPR:$Z1), ssub))>; + def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 -1))), + (INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF), + (LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>; + + // Splice with lane greater than or equal to 0 + def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))), + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; + + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; + defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; + defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; + defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>; + defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>; + defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>; + + defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>; + defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>; + defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge", int_aarch64_sve_cmpge_wide>; + defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt", int_aarch64_sve_cmpgt_wide>; + defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt", int_aarch64_sve_cmplt_wide>; + defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple", int_aarch64_sve_cmple_wide>; + defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs", int_aarch64_sve_cmphs_wide>; + defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi", int_aarch64_sve_cmphi_wide>; + defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>; + defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>; + + defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>; + defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>; + defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>; + defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>; + defm CMPEQ_PPzZI :
sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>; + defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>; + defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>; + defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>; + defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>; + defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>; + + defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", SETONE, SETNE, SETONE, SETNE>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", SETUO, SETUO, SETUO, SETUO>; + defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; + defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; + + defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge", SETOGE, SETGE, SETOLE, SETLE>; + defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt", SETOGT, SETGT, SETOLT, SETLT>; + defm FCMLT_PPzZ0 : sve_fp_2op_p_pd<0b010, "fcmlt", SETOLT, SETLT, SETOGT, SETGT>; + defm FCMLE_PPzZ0 : sve_fp_2op_p_pd<0b011, "fcmle", SETOLE, SETLE, SETOGE, SETGE>; + defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq", SETOEQ, SETEQ, SETOEQ, SETEQ>; + defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne", SETONE, SETNE, SETONE, SETNE>; + + defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; + defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; + defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>; + defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>; + + defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; + defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>; + defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>; + defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>; def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>; def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>; @@ -773,170 +1506,265 @@ let Predicates = [HasSVE] in { def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">; def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">; - defm CNTB_XPiI : sve_int_count<0b000, "cntb">; - defm CNTH_XPiI : sve_int_count<0b010, "cnth">; - defm CNTW_XPiI : sve_int_count<0b100, "cntw">; - defm CNTD_XPiI : sve_int_count<0b110, "cntd">; - defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">; + defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>; + defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>; + defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>; + defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>; + defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>; +} - defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">; - defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">; - defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch">; - defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech">; - defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw">; - defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw">; - defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">; - defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">; + defm INCB_XPiI : sve_int_pred_pattern_a<0b000, 
"incb", add, int_aarch64_sve_cntb>; + defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb", sub, int_aarch64_sve_cntb>; + defm INCH_XPiI : sve_int_pred_pattern_a<0b010, "inch", add, int_aarch64_sve_cnth>; + defm DECH_XPiI : sve_int_pred_pattern_a<0b011, "dech", sub, int_aarch64_sve_cnth>; + defm INCW_XPiI : sve_int_pred_pattern_a<0b100, "incw", add, int_aarch64_sve_cntw>; + defm DECW_XPiI : sve_int_pred_pattern_a<0b101, "decw", sub, int_aarch64_sve_cntw>; + defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd", add, int_aarch64_sve_cntd>; + defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd", sub, int_aarch64_sve_cntd>; - defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">; - defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">; - defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">; - defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">; - defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">; - defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">; - defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">; - defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">; +let Predicates = [HasSVEorStreamingSVE] in { + defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>; + defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>; + defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>; + defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb", int_aarch64_sve_uqdecb_n32>; + defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb", int_aarch64_sve_sqincb_n64>; + defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb", int_aarch64_sve_uqincb_n64>; + defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb", int_aarch64_sve_sqdecb_n64>; + defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb", int_aarch64_sve_uqdecb_n64>; - defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">; - defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">; - defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">; - defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech">; - defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">; - defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">; - defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">; - defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">; + defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch", int_aarch64_sve_sqinch_n32>; + defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch", int_aarch64_sve_uqinch_n32>; + defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech", int_aarch64_sve_sqdech_n32>; + defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech", int_aarch64_sve_uqdech_n32>; + defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch", int_aarch64_sve_sqinch_n64>; + defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch", int_aarch64_sve_uqinch_n64>; + defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech", int_aarch64_sve_sqdech_n64>; + defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech", int_aarch64_sve_uqdech_n64>; - defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">; - defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">; - defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">; - defm UQDECW_WPiI : 
sve_int_pred_pattern_b_u32<0b10011, "uqdecw">; - defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">; - defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">; - defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">; - defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">; + defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw", int_aarch64_sve_sqincw_n32>; + defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw", int_aarch64_sve_uqincw_n32>; + defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw", int_aarch64_sve_sqdecw_n32>; + defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw", int_aarch64_sve_uqdecw_n32>; + defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw", int_aarch64_sve_sqincw_n64>; + defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw", int_aarch64_sve_uqincw_n64>; + defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw", int_aarch64_sve_sqdecw_n64>; + defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw", int_aarch64_sve_uqdecw_n64>; - defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">; - defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">; - defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">; - defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">; - defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">; - defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">; - defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">; - defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">; + defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd", int_aarch64_sve_sqincd_n32>; + defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd", int_aarch64_sve_uqincd_n32>; + defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd", int_aarch64_sve_sqdecd_n32>; + defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd", int_aarch64_sve_uqdecd_n32>; + defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd", int_aarch64_sve_sqincd_n64>; + defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd", int_aarch64_sve_uqincd_n64>; + defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd", int_aarch64_sve_sqdecd_n64>; + defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd", int_aarch64_sve_uqdecd_n64>; - defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>; - defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>; - defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>; - defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>; + defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16, int_aarch64_sve_sqinch, nxv8i16>; + defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16, int_aarch64_sve_uqinch, nxv8i16>; + defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16, int_aarch64_sve_sqdech, nxv8i16>; + defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16, int_aarch64_sve_uqdech, nxv8i16>; defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>; defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>; - defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>; - defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>; - defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>; - defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>; + defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32, int_aarch64_sve_sqincw, nxv4i32>; + defm 
UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32, int_aarch64_sve_uqincw, nxv4i32>; + defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32, int_aarch64_sve_sqdecw, nxv4i32>; + defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32, int_aarch64_sve_uqdecw, nxv4i32>; defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>; defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>; - defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>; - defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>; - defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>; - defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>; + defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64, int_aarch64_sve_sqincd, nxv2i64>; + defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64, int_aarch64_sve_uqincd, nxv2i64>; + defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64, int_aarch64_sve_sqdecd, nxv2i64>; + defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64, int_aarch64_sve_uqdecd, nxv2i64>; defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>; defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>; - defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">; - defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">; - defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">; - defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">; - defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">; - defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp">; - defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">; - defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">; - defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">; - defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">; + defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp", int_aarch64_sve_sqincp_n32>; + defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp", int_aarch64_sve_sqincp_n64>; + defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp", int_aarch64_sve_uqincp_n32>; + defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp", int_aarch64_sve_uqincp_n64>; + defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp", int_aarch64_sve_sqdecp_n32>; + defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>; + defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>; + defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>; + defm INCP_XP : sve_int_count_r_x64<0b10000, "incp", null_frag, add>; + defm DECP_XP : sve_int_count_r_x64<0b10100, "decp", null_frag, sub>; - defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">; - defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">; - defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">; - defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">; + defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>; + defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>; + defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp", int_aarch64_sve_sqdecp>; + defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp", int_aarch64_sve_uqdecp>; defm INCP_ZP : sve_int_count_v<0b10000, "incp">; defm DECP_ZP : sve_int_count_v<0b10100, "decp">; - defm INDEX_RR : sve_int_index_rr<"index">; - defm INDEX_IR : sve_int_index_ir<"index">; + defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>; + defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>; defm INDEX_RI : sve_int_index_ri<"index">; defm INDEX_II : 
sve_int_index_ii<"index">; // Unpredicated shifts - defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">; - defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">; - defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">; + defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_p>; + defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_p>; + defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_p>; - defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; - defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; - defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">; + defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr", int_aarch64_sve_asr_wide>; + defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr", int_aarch64_sve_lsr_wide>; + defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl", int_aarch64_sve_lsl_wide>; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b000, "asr">; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b001, "lsr">; - defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b100, "asrd">; + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0000, "asr", "ASR_ZPZI", int_aarch64_sve_asr>; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right_dup<0b0001, "lsr", "LSR_ZPZI", int_aarch64_sve_lsr>; + defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left_dup< 0b0011, "lsl", "LSL_ZPZI", int_aarch64_sve_lsl>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right< 0b0100, "asrd", "ASRD_ZPZI", AArch64asrd_m1>; - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">; + defm ASR_ZPZI : sve_int_shift_pred_bhsd<AArch64asr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; + defm LSR_ZPZI : sve_int_shift_pred_bhsd<AArch64lsr_p, SVEShiftImmR8, SVEShiftImmR16, SVEShiftImmR32, SVEShiftImmR64>; + defm LSL_ZPZI : sve_int_shift_pred_bhsd<AArch64lsl_p, SVEShiftImmL8, SVEShiftImmL16, SVEShiftImmL32, SVEShiftImmL64>; +} // End HasSVEorStreamingSVE - defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">; - defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">; - defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">; +let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { + defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_asr>; + defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsr>; + defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_lsl>; + defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<AArch64asrd_m1>; +} // End HasSVEorStreamingSVE, UseExperimentalZeroingPseudos - def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>; - def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>; - def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>; - def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>; - def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>; - def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>; - def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>; - def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>; - def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>; - def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu",
ZPR32, ZPR32, ElementSizeS>; - def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>; - def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>; - def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>; - def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>; - def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>; - def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>; - def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>; - def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>; - def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>; - def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>; - def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>; - def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>; - def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>; - def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>; - def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>; +let Predicates = [HasSVEorStreamingSVE] in { + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ">; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ">; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ">; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>; - defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">; - defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">; - defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">; - defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">; - defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">; - defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">; - defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">; - defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">; - defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">; + defm ASR_ZPZZ : sve_int_bin_pred_bhsd<AArch64asr_p>; + defm LSR_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsr_p>; + defm LSL_ZPZZ : sve_int_bin_pred_bhsd<AArch64lsl_p>; + defm
ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; + defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; + defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; + + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zdr<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, AArch64fcvtr_mt, nxv4f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, AArch64fcvte_mt, nxv4f32, nxv4i1, nxv4f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zdr<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, AArch64fcvtr_mt, nxv2f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, 
ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + + // These patterns exist to improve the code quality of conversions on unpacked types. + def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), + (FCVT_ZPmZ_HtoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // FP_ROUND has an additional 'precise' flag which indicates the type of rounding.
+ // This is ignored by the pattern below where it is matched by (i64 timm0_1) + def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive):$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), + (FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // Signed integer -> Floating-point + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (nxv2f16 ZPR:$Zd))), + (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv4f16 (AArch64scvtf_mt (nxv4i1 (SVEAllActive):$Pg), + (sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (nxv4f16 ZPR:$Zd))), + (SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f16 ZPR:$Zd))), + (SCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f32 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f32 ZPR:$Zd))), + (SCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f64 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (nxv2f64 ZPR:$Zd))), + (SCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // Unsigned integer -> Floating-point + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f16 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f16 ZPR:$Zd))), + (UCVTF_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv4f16 (AArch64ucvtf_mt (nxv4i1 (SVEAllActive):$Pg), + (and (nxv4i32 ZPR:$Zs), + (nxv4i32 (AArch64dup (i32 0xFFFF)))), (nxv4f16 ZPR:$Zd))), + (UCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f32 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f32 ZPR:$Zd))), + (UCVTF_ZPmZ_StoS_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + def : Pat<(nxv2f64 (AArch64ucvtf_mt (nxv2i1 (SVEAllActive):$Pg), + (and (nxv2i64 ZPR:$Zs), + (nxv2i64 (AArch64dup (i64 0xFFFFFFFF)))), (nxv2f64 ZPR:$Zd))), + (UCVTF_ZPmZ_StoD_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", AArch64frintn_mt>; + defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", AArch64frintp_mt>; + defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", AArch64frintm_mt>; + defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", AArch64frintz_mt>; + defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", AArch64frinta_mt>; + defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", AArch64frintx_mt>; + defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", AArch64frinti_mt>; + defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", AArch64frecpx_mt>; + defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", AArch64fsqrt_mt>; +} // End HasSVEorStreamingSVE + +let Predicates = [HasBF16, HasSVEorStreamingSVE] in { + defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>; + defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>; +} // End HasBF16, HasSVEorStreamingSVE + +let Predicates = [HasBF16, HasSVE] in { + defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>; +} // End HasBF16, HasSVE + +let Predicates = [HasBF16, HasSVEorStreamingSVE] in { + defm BFMMLA_B_ZZZ : 
sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>; + defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>; + defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>; + defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>; + defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; + defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; +} // End HasBF16, HasSVEorStreamingSVE + +let Predicates = [HasSVEorStreamingSVE] in { // InstAliases def : InstAlias<"mov $Zd, $Zn", (ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>; @@ -1021,4 +1849,1347 @@ let Predicates = [HasSVE] in { (FCMGT_PPzZZ_S PPR32:$Zd, PPR3bAny:$Pg, ZPR32:$Zn, ZPR32:$Zm), 0>; def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; -} + + // Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4. + // These get expanded to individual LDR_ZXI/STR_ZXI instructions in + // AArch64ExpandPseudoInsts. + let mayLoad = 1, hasSideEffects = 0 in { + def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + } + let mayStore = 1, hasSideEffects = 0 in { + def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + } + + def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + + let AddedComplexity = 1 in { + class LD1RPat : + Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))), + (load (ptrue 31), GPR64:$base, $offset)>; + } + + // LDR1 of 8-bit data + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + + // LDR1 of 16-bit data + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + + // LDR1 of 32-bit data + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + + // LDR1 of 64-bit data + def : LD1RPat; + + // LD1R of FP data + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + def : LD1RPat; + + // LD1R of 128-bit masked data + def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_B_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_H_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_W_IMM $gp, $base, (i64 0))>; + def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)), + (LD1RQ_D_IMM $gp, $base, (i64 0))>; + + def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_H_IMM $gp, $base, 
simm4s16:$imm)>; + def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>; + def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))), + (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>; + + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_UNDEF_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_UNDEF_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_UNDEF_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_UNDEF_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>; + + // General case that we ideally never want to match. + def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>; + + let AddedComplexity = 5 in { + def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>; + def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>; + + def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>; + def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>; + def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>; + def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>; + + def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>; + def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>; + def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>; + } + + let AddedComplexity = 5 in { + def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))), + (ADDVL_XXI GPR64:$op, $imm)>; + + def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))), + (i32 (EXTRACT_SUBREG (ADDVL_XXI (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GPR32:$op, sub_32), $imm), + sub_32))>; + + def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), + (INCH_ZPiI ZPR:$op, 31, $imm)>; + def : Pat<(nxv4i32 (add ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), + (INCW_ZPiI ZPR:$op, 31, $imm)>; + def : Pat<(nxv2i64 (add ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))), + (INCD_ZPiI ZPR:$op, 31, $imm)>; + + def : Pat<(nxv8i16 (sub ZPR:$op, (nxv8i16 (AArch64dup (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))), + (DECH_ZPiI ZPR:$op, 31, $imm)>; + def : Pat<(nxv4i32 (sub ZPR:$op, (nxv4i32 (AArch64dup (i32 (trunc (vscale (sve_cntw_imm i32:$imm)))))))), + (DECW_ZPiI ZPR:$op, 31, $imm)>; + def : Pat<(nxv2i64 (sub ZPR:$op, (nxv2i64 (AArch64dup (i64 (vscale (sve_cntd_imm i32:$imm))))))), + (DECD_ZPiI ZPR:$op, 31, $imm)>; + } + + let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL], AddedComplexity = 5 in { + def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm i32:$imm))), + (INCH_XPiI GPR64:$op, 31, $imm)>; + def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm i32:$imm))), + (INCW_XPiI GPR64:$op, 31, $imm)>; + def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm i32:$imm))), + (INCD_XPiI GPR64:$op, 31, $imm)>; + + def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))), + (DECH_XPiI GPR64:$op, 31, $imm)>; + def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))), + (DECW_XPiI 
GPR64:$op, 31, $imm)>;
+    def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
+              (DECD_XPiI GPR64:$op, 31, $imm)>;
+
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (INCH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                    GPR32:$op, sub_32), 31, $imm),
+                    sub_32))>;
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (INCW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                    GPR32:$op, sub_32), 31, $imm),
+                    sub_32))>;
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (INCD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                    GPR32:$op, sub_32), 31, $imm),
+                    sub_32))>;
+
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (DECH_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                    GPR32:$op, sub_32), 31, $imm),
+                    sub_32))>;
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (DECW_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                    GPR32:$op, sub_32), 31, $imm),
+                    sub_32))>;
+    def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
+              (i32 (EXTRACT_SUBREG (DECD_XPiI (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                    GPR32:$op, sub_32), 31, $imm),
+                    sub_32))>;
+  }
+
+  def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+            (ADDVL_XXI GPR64:$op, $imm)>;
+
+  // FIXME: BigEndian requires an additional REV instruction to satisfy the
+  // constraint that none of the bits change when stored to memory as one
+  // type, and reloaded as another type.
+  let Predicates = [IsLE] in {
+    def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+    def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+
+    def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+
+    def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+    def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+
+    def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+    def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+
+    def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+    def : 
Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>; + + def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>; + + def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + + def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>; + + def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>; + def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>; + def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>; + def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>; + def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>; + def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>; + def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>; + } + + // These allow casting from/to unpacked predicate types. 
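+  // All predicate types share the PPR register file, so a reinterpret_cast is
+  // free: it lowers to a plain register-class copy and relies on consumers
+  // only reading the lanes that are meaningful for the destination type, e.g.
+  //   def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))),
+  //             (COPY_TO_REGCLASS PPR:$src, PPR)>;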
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + + // These allow casting from/to unpacked floating-point types. + def : Pat<(nxv2f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8f16 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4f16 (reinterpret_cast (nxv8f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8f16 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv2f32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4f32 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv2bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8bf16 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4bf16 (reinterpret_cast (nxv8bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv8bf16 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + + def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>; + def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)), + (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>; + + // Add more complex addressing modes here as required + multiclass pred_load { + let AddedComplexity = 1 in { + def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { + def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } + def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous loads + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + + // 4-element contiguous 
loads + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + + // 8-element contiguous loads + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + + // 16-element contiguous loads + defm : pred_load; + + multiclass pred_store { + let AddedComplexity = 1 in { + def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { + def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>; + } + def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous stores + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + + // 4-element contiguous stores + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + + // 8-element contiguous stores + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + + // 16-element contiguous stores + defm : pred_store; + + defm : pred_load; + defm : pred_load; + defm : pred_load; + defm : pred_load; + + defm : pred_store; + defm : pred_store; + defm : pred_store; + defm : pred_store; + + multiclass unpred_store { + let AddedComplexity = 1 in { + def _reg : Pat<(Store (Ty ZPR:$val), (AddrCP GPR64sp:$base, GPR64:$offset)), + (RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { + def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)), + (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + def : Pat<(Store (Ty ZPR:$val), GPR64:$base), + (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>; + } + + defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D, ST1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_store< store, nxv8i16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store; + defm : unpred_store; + defm : unpred_store< store, nxv4i32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_store; + defm : unpred_store< store, nxv2i64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + defm : unpred_store< store, nxv8f16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv8bf16, ST1H, ST1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4f16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4bf16, ST1H_S, ST1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv2f16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv2bf16, ST1H_D, ST1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_store< store, nxv4f32, ST1W, ST1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, 
am_sve_regreg_lsl2>; + defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + + multiclass unpred_load { + let AddedComplexity = 1 in { + def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))), + (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>; + } + let AddedComplexity = 2 in { + def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))), + (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>; + } + + def : Pat<(Ty (Load GPR64:$base)), + (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>; + } + + defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< extloadvi8, nxv2i64, LD1B_D, LD1B_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H, LD1SB_H_IMM, PTRUE_H, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S, LD1SB_S_IMM, PTRUE_S, am_sve_regreg_lsl0>; + defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D, LD1SB_D_IMM, PTRUE_D, am_sve_regreg_lsl0>; + defm : unpred_load< load, nxv8i16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< extloadvi16, nxv4i32, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< extloadvi16, nxv2i64, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load; + defm : unpred_load; + defm : unpred_load< load, nxv4i32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_load; + defm : unpred_load< extloadvi32, nxv2i64, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_load; + defm : unpred_load< load, nxv2i64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + defm : unpred_load< load, nxv8f16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv8bf16, LD1H, LD1H_IMM, PTRUE_H, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4f16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4bf16, LD1H_S, LD1H_S_IMM, PTRUE_S, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv2f16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv2bf16, LD1H_D, LD1H_D_IMM, PTRUE_D, am_sve_regreg_lsl1>; + defm : unpred_load< load, nxv4f32, LD1W, LD1W_IMM, PTRUE_S, am_sve_regreg_lsl2>; + defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>; + defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>; + + // Allow using the reg+reg form of ld1b/st1b for memory accesses with the + // same width as nxv16i8. This saves an add in cases where we would + // otherwise compute the address separately. 
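+  // For example (illustrative, little-endian only): an nxv8i16 load from
+  // base+offset can be selected as
+  //   ld1b { z0.b }, p0/z, [x0, x1]
+  // rather than materialising base+offset with a separate ADD just to use the
+  // halfword form; the byte-granular load is bit-identical on little-endian.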
+ multiclass unpred_loadstore_bitcast { + let Predicates = [IsLE] in { + def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))), + (LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>; + def : Pat<(store (Ty ZPR:$val), (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)), + (ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>; + } + } + defm : unpred_loadstore_bitcast; + defm : unpred_loadstore_bitcast; + defm : unpred_loadstore_bitcast; + defm : unpred_loadstore_bitcast; + defm : unpred_loadstore_bitcast; + defm : unpred_loadstore_bitcast; + defm : unpred_loadstore_bitcast; + + multiclass unpred_store_predicate { + def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)), + (Store PPR:$val, GPR64sp:$base, simm9:$offset)>; + + def _default : Pat<(store (Ty PPR:$Val), GPR64:$base), + (Store PPR:$Val, GPR64:$base, (i64 0))>; + } + + defm Pat_Store_P16 : unpred_store_predicate; + + multiclass unpred_load_predicate { + def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))), + (Load GPR64sp:$base, simm9:$offset)>; + + def _default : Pat<(Ty (load GPR64:$base)), + (Load GPR64:$base, (i64 0))>; + } + + defm Pat_Load_P16 : unpred_load_predicate; + + multiclass ld1 { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)), + (RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)), + (RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous loads + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + + // 4-element contiguous loads + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + + // 8-element contiguous loads + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + defm : ld1; + + // 16-element contiguous loads + defm : ld1; +} // End HasSVEorStreamingSVE + +let Predicates = [HasSVE] in { + multiclass ldnf1 { + // scalar + immediate (mul vl) + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)), + (I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous non-faulting loads + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + + // 4-element contiguous non-faulting loads + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + + // 8-element contiguous non-faulting loads + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + defm : ldnf1; + + // 16-element contiguous non-faulting loads + defm : ldnf1; + + multiclass ldff1 { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)), + (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // Base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, XZR)>; + } + + // 2-element contiguous first faulting loads + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : 
ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + + // 4-element contiguous first faulting loads + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + + // 8-element contiguous first faulting loads + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + defm : ldff1; + + // 16-element contiguous first faulting loads + defm : ldff1; +} // End HasSVE + +let Predicates = [HasSVEorStreamingSVE] in { + multiclass st1 { + // reg + reg + let AddedComplexity = 1 in { + def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT), + (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>; + } + + // scalar + immediate (mul vl) + let AddedComplexity = 2 in { + def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>; + } + + // base + def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous store + defm : st1; + defm : st1; + defm : st1; + defm : st1; + + // 4-element contiguous store + defm : st1; + defm : st1; + defm : st1; + + // 8-element contiguous store + defm : st1; + defm : st1; + + // 16-element contiguous store + defm : st1; + + // Insert scalar into undef[0] + def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + + def : Pat<(nxv8f16 (vector_insert (nxv8f16 (undef)), (f16 FPR16:$src), 0)), + (INSERT_SUBREG (nxv8f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; + def : Pat<(nxv4f16 (vector_insert (nxv4f16 (undef)), (f16 FPR16:$src), 0)), + (INSERT_SUBREG (nxv4f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; + def : Pat<(nxv2f16 (vector_insert (nxv2f16 (undef)), (f16 FPR16:$src), 0)), + (INSERT_SUBREG (nxv2f16 (IMPLICIT_DEF)), FPR16:$src, hsub)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 (undef)), (f32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv4f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2f32 (vector_insert (nxv2f32 (undef)), (f32 FPR32:$src), 0)), + (INSERT_SUBREG (nxv2f32 (IMPLICIT_DEF)), FPR32:$src, ssub)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 (undef)), (f64 FPR64:$src), 0)), + (INSERT_SUBREG (nxv2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + + // Insert scalar into vector[0] + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)), + (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)), + (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>; + + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)), + (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, 
hsub), ZPR:$vec)>; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)), + (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)), + (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>; + + // Insert scalar into vector with scalar index + def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_B ZPR:$vec, + (CMPEQ_PPzZZ_B (PTRUE_B 31), + (INDEX_II_B 0, 1), + (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)), + (CPY_ZPmR_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + GPR32:$src)>; + def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)), + (CPY_ZPmR_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D GPR64:$index)), + GPR64:$src)>; + + // Insert FP scalar into vector with scalar index + def : Pat<(nxv2f16 (vector_insert (nxv2f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)), + (CPY_ZPmV_H ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D GPR64:$index)), + $src)>; + def : Pat<(nxv4f16 (vector_insert (nxv4f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)), + (CPY_ZPmV_H ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)), + (CPY_ZPmV_H ZPR:$vec, + (CMPEQ_PPzZZ_H (PTRUE_H 31), + (INDEX_II_H 0, 1), + (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv2f32 (vector_insert (nxv2f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)), + (CPY_ZPmV_S ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D GPR64:$index)), + $src) >; + def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)), + (CPY_ZPmV_S ZPR:$vec, + (CMPEQ_PPzZZ_S (PTRUE_S 31), + (INDEX_II_S 0, 1), + (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))), + $src)>; + def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)), + (CPY_ZPmV_D ZPR:$vec, + (CMPEQ_PPzZZ_D (PTRUE_D 31), + (INDEX_II_D 0, 1), + (DUP_ZR_D $index)), + $src)>; + + // Extract element from vector with scalar index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)), + (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f16 (vector_extract 
(nxv2f16 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_H (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_S (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)), + (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index), ZPR:$vec)>; + + // Extract element from vector with immediate index + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)), + (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; + def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)), + (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>; + def : Pat<(f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), hsub)>; + def : Pat<(f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), hsub)>; + def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)), + (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>; + def : Pat<(f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), ssub)>; + def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)), + (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>; + + // Extract element from vector with immediate index that's within the bottom 128-bits. 
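+  // The low 128 bits of a Z register alias the corresponding NEON V register
+  // (subregister zsub), so an extract whose immediate index is known to lie
+  // within those 128 bits can use a single NEON UMOV instead of the
+  // WHILELS + LASTB sequence needed for an arbitrary scalar index above, e.g.
+  //   umov w0, v0.s[1]   // vs. whilels p0.s, xzr, x1; lastb w0, p0, z0.s
+  // AddedComplexity = 1 lets these patterns win whenever both could match.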
+ let AddedComplexity = 1 in { + def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), + (i32 (UMOVvi8 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>; + def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), + (i32 (UMOVvi16 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>; + def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)), + (i32 (UMOVvi32 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>; + def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), VectorIndexD:$index)), + (i64 (UMOVvi64 (v2i64 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexD:$index))>; + } + + def : Pat<(sext_inreg (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index), i8), + (i32 (SMOVvi8to32 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>; + def : Pat<(sext_inreg (anyext (vector_extract (nxv16i8 ZPR:$vec), VectorIndexB:$index)), i8), + (i64 (SMOVvi8to64 (v16i8 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexB:$index))>; + + def : Pat<(sext_inreg (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index), i16), + (i32 (SMOVvi16to32 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>; + def : Pat<(sext_inreg (anyext (vector_extract (nxv8i16 ZPR:$vec), VectorIndexH:$index)), i16), + (i64 (SMOVvi16to64 (v8i16 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexH:$index))>; + + def : Pat<(sext (vector_extract (nxv4i32 ZPR:$vec), VectorIndexS:$index)), + (i64 (SMOVvi32to64 (v4i32 (EXTRACT_SUBREG ZPR:$vec, zsub)), VectorIndexS:$index))>; + + // Extract first element from vector. + let AddedComplexity = 2 in { + def : Pat<(vector_extract (nxv16i8 ZPR:$Zs), (i64 0)), + (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv8i16 ZPR:$Zs), (i64 0)), + (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv4i32 ZPR:$Zs), (i64 0)), + (i32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv2i64 ZPR:$Zs), (i64 0)), + (i64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; + def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; + def : Pat<(vector_extract (nxv4f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; + def : Pat<(vector_extract (nxv2f16 ZPR:$Zs), (i64 0)), + (f16 (EXTRACT_SUBREG ZPR:$Zs, hsub))>; + def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)), + (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv2f32 ZPR:$Zs), (i64 0)), + (f32 (EXTRACT_SUBREG ZPR:$Zs, ssub))>; + def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)), + (f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>; + } +} // End HasSVEorStreamingSVE + +let Predicates = [HasSVE, HasMatMulInt8] in { + defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>; + defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>; + defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>; +} // End HasSVE, HasMatMulInt8 + +let Predicates = [HasSVEorStreamingSVE, HasMatMulInt8] in { + defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>; + defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; + defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; +} // End HasSVEorStreamingSVE, HasMatMulInt8 + +let Predicates = [HasSVE, HasMatMulFP32] in { + defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>; +} // End HasSVE, HasMatMulFP32 + +let Predicates = [HasSVE, HasMatMulFP64] in { + defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, 
"fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>; + defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>; + defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>; + defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>; + defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>; + defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>; + defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>; + defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>; + defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>; +} // End HasSVE, HasMatMulFP64 + +let Predicates = [HasSVEorStreamingSVE, HasMatMulFP64] in { + defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>; + defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>; + defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>; + defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>; + defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>; + defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>; +} // End HasSVEorStreamingSVE, HasMatMulFP64 + +let Predicates = [HasSVE2orStreamingSVE] in { + // SVE2 integer multiply-add (indexed) + defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; + defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>; + + // SVE2 saturating multiply-add high (indexed) + defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>; + defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>; + + // SVE2 saturating multiply-add high (vectors, unpredicated) + defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>; + defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>; + + // SVE2 integer multiply (indexed) + defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>; + + // SVE2 saturating multiply high (indexed) + defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>; + defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>; + + // SVE2 signed saturating doubling multiply high (unpredicated) + defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>; + defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>; + + // SVE2 integer multiply vectors (unpredicated) + defm MUL_ZZZ : sve2_int_mul<0b000, "mul", null_frag, AArch64mul_p>; + defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag, AArch64smulh_p>; + defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag, AArch64umulh_p>; + defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>; + + // SVE2 complex integer dot product (indexed) + defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>; + + // SVE2 complex integer 
dot product + defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>; + + // SVE2 complex integer multiply-add (indexed) + defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>; + // SVE2 complex saturating multiply-add (indexed) + defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>; + + // SVE2 complex integer multiply-add + defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>; + defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>; + + // SVE2 integer multiply long (indexed) + defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>; + defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>; + defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>; + defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>; + + // SVE2 saturating multiply (indexed) + defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>; + defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>; + + // SVE2 integer multiply-add long (indexed) + defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>; + defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>; + defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>; + defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>; + defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>; + defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>; + defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>; + defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>; + + // SVE2 integer multiply-add long (vectors, unpredicated) + defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>; + defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>; + defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>; + defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>; + defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>; + defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>; + defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>; + defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>; + + // SVE2 saturating multiply-add long (indexed) + defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>; + defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>; + defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>; + defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>; + + // SVE2 saturating multiply-add long (vectors, unpredicated) + defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>; + defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, 
"sqdmlalt", int_aarch64_sve_sqdmlalt>; + defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>; + defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>; + + // SVE2 saturating multiply-add interleaved long + defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>; + defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>; + + // SVE2 integer halving add/subtract (predicated) + defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>; + defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>; + defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>; + defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>; + defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>; + defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>; + defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>; + defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>; + + // SVE2 integer pairwise add and accumulate long + defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>; + defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>; + + // SVE2 integer pairwise arithmetic + defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>; + defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>; + defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>; + defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>; + defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>; + + // SVE2 integer unary operations (predicated) + defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>; + defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>; + defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>; + defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>; + + // SVE2 saturating add/subtract + defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>; + defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>; + defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>; + defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>; + defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>; + defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>; + defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>; + defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>; + + // SVE2 saturating/rounding bitwise shift left (predicated) + defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl, "SRSHL_ZPZZ", DestructiveBinaryCommWithRev, "SRSHLR_ZPmZ">; + defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl, "URSHL_ZPZZ", DestructiveBinaryCommWithRev, "URSHLR_ZPmZ">; + defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag, "SRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SRSHL_ZPmZ", /*isReverseInstr*/ 1>; + defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag, 
"URSHLR_ZPZZ", DestructiveBinaryCommWithRev, "URSHL_ZPmZ", /*isReverseInstr*/ 1>; + defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl, "SQSHL_ZPZZ", DestructiveBinaryCommWithRev, "SQSHLR_ZPmZ">; + defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl, "UQSHL_ZPZZ", DestructiveBinaryCommWithRev, "UQSHLR_ZPmZ">; + defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl, "SQRSHL_ZPZZ", DestructiveBinaryCommWithRev, "SQRSHLR_ZPmZ">; + defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl, "UQRSHL_ZPZZ", DestructiveBinaryCommWithRev, "UQRSHLR_ZPmZ">; + defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag, "SQSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SQSHL_ZPmZ", /*isReverseInstr*/ 1>; + defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag, "UQSHLR_ZPZZ", DestructiveBinaryCommWithRev, "UQSHL_ZPmZ", /*isReverseInstr*/ 1>; + defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag, "SQRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "SQRSHL_ZPmZ", /*isReverseInstr*/ 1>; + defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag, "UQRSHLR_ZPZZ", DestructiveBinaryCommWithRev, "UQRSHL_ZPmZ", /*isReverseInstr*/ 1>; + + defm SRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; + defm URSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; + defm SQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; + defm UQSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; + defm SQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; + defm UQRSHL_ZPZZ : sve_int_bin_pred_all_active_bhsd; +} // End HasSVE2orStreamingSVE + +let Predicates = [HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos] in { + defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; + defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; + defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; + defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd; + defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd; +} // End HasSVE2orStreamingSVE, UseExperimentalZeroingPseudos + +let Predicates = [HasSVE2orStreamingSVE] in { + // SVE2 predicated shifts + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0110, "sqshl", "SQSHL_ZPZI", int_aarch64_sve_sqshl>; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left_dup<0b0111, "uqshl", "UQSHL_ZPZI", int_aarch64_sve_uqshl>; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right< 0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; + defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>; + + // SVE2 integer add/subtract long + defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>; + defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>; + defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>; + defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>; + defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>; + defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>; + defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>; + defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>; + defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, 
"sabdlb", int_aarch64_sve_sabdlb>; + defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>; + defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>; + defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>; + + // SVE2 integer add/subtract wide + defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>; + defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>; + defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>; + defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>; + defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>; + defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>; + defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>; + defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>; + + // SVE2 integer multiply long + defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>; + defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>; + defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>; + defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>; + defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>; + defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>; + defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>; + defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>; + + // SVE2 bitwise shift and insert + defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>; + defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>; + + // SVE2 bitwise shift right and accumulate + defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>; + defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>; + defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>; + defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>; + + // SVE2 complex integer add + defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>; + defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>; + + // SVE2 integer absolute difference and accumulate + defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>; + defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>; + + // SVE2 integer absolute difference and accumulate long + defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>; + defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>; + defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>; + defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>; + + // SVE2 integer add/subtract long with carry + defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>; + defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>; + defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>; + defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", 
int_aarch64_sve_sbclt>; + + // SVE2 bitwise shift right narrow (bottom) + defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>; + defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>; + defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb", int_aarch64_sve_shrnb>; + defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb", int_aarch64_sve_rshrnb>; + defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb", int_aarch64_sve_sqshrnb>; + defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb", int_aarch64_sve_sqrshrnb>; + defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb", int_aarch64_sve_uqshrnb>; + defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb", int_aarch64_sve_uqrshrnb>; + + // SVE2 bitwise shift right narrow (top) + defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt", int_aarch64_sve_sqshrunt>; + defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>; + defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt", int_aarch64_sve_shrnt>; + defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt", int_aarch64_sve_rshrnt>; + defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt", int_aarch64_sve_sqshrnt>; + defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt", int_aarch64_sve_sqrshrnt>; + defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt", int_aarch64_sve_uqshrnt>; + defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt", int_aarch64_sve_uqrshrnt>; + + // SVE2 integer add/subtract narrow high part (bottom) + defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb", int_aarch64_sve_addhnb>; + defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb", int_aarch64_sve_raddhnb>; + defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb", int_aarch64_sve_subhnb>; + defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb", int_aarch64_sve_rsubhnb>; + + // SVE2 integer add/subtract narrow high part (top) + defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt", int_aarch64_sve_addhnt>; + defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt", int_aarch64_sve_raddhnt>; + defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt", int_aarch64_sve_subhnt>; + defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt", int_aarch64_sve_rsubhnt>; + + // SVE2 saturating extract narrow (bottom) + defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb", int_aarch64_sve_sqxtnb>; + defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb", int_aarch64_sve_uqxtnb>; + defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb", int_aarch64_sve_sqxtunb>; + + // SVE2 saturating extract narrow (top) + defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>; + defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>; + defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; +} // End HasSVE2orStreamingSVE + +let Predicates = [HasSVE2] in { + // SVE2 character match + defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>; + defm 
NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>; +} // End HasSVE2 + +let Predicates = [HasSVE2orStreamingSVE] in { + // SVE2 bitwise exclusive-or interleaved + defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>; + defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>; + + // SVE2 bitwise shift left long + defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>; + defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>; + defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>; + defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>; + + // SVE2 integer add/subtract interleaved long + defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>; + defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>; + defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>; +} // End HasSVE2orStreamingSVE + +let Predicates = [HasSVE2] in { + // SVE2 histogram generation (segment) + def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>; + + // SVE2 histogram generation (vector) + defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>; +} // End HasSVE2 + +let Predicates = [HasSVE2orStreamingSVE] in { + // SVE2 floating-point base 2 logarithm as integer + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; + + // SVE2 floating-point convert precision + defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">; + defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">; + defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">; + defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">; + + // SVE2 floating-point pairwise operations + defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>; + defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>; + defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>; + defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>; + defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>; + + // SVE2 floating-point multiply-add long (indexed) + defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>; + defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>; + defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>; + defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>; + + // SVE2 floating-point multiply-add long + defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>; + defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>; + defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>; + defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>; + + // SVE2 bitwise ternary operations + defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>; + defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; + defm BSL_ZZZZ : 
sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>; + defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; + defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; + defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>; + + // SVE2 bitwise xor and rotate right by immediate + defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>; + + // SVE2 extract vector (immediate offset, constructive) + def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; +} // End HasSVE2orStreamingSVE + +let Predicates = [HasSVE2] in { + // SVE2 non-temporal gather loads + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>; +} // End HasSVE2 + +let Predicates = [HasSVE2orStreamingSVE] in { + // SVE2 vector splice (constructive) + defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; +} // End HasSVE2orStreamingSVE + +let Predicates = [HasSVE2] in { + // SVE2 non-temporal scatter stores + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; + + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; +} // End HasSVE2 + +let Predicates = [HasSVE2orStreamingSVE] in { + // SVE2 table lookup (three sources) + defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; + defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>; + + // SVE2 integer compare scalar count and limit + defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>; + defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>; + defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs>; + defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi>; + + defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege>; + defm WHILEGT_PXX 
: sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt>; + defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs>; + defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>; + + // SVE2 pointer conflict compare + defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; + defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; +} // End HasSVE2orStreamingSVE + +let Predicates = [HasSVE2AES] in { + // SVE2 crypto destructive binary operations + defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>; + defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>; + + // SVE2 crypto unary operations + defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>; + defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>; + + // PMULLB and PMULLT instructions which operate with 64-bit source and + // 128-bit destination elements are enabled with crypto extensions, similar + // to NEON PMULL2 instruction. + defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>; + defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>; +} // End HasSVE2AES + +let Predicates = [HasSVE2SM4] in { + // SVE2 crypto constructive binary operations + defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>; + // SVE2 crypto destructive binary operations + defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>; +} // End HasSVE2SM4 + +let Predicates = [HasSVE2SHA3] in { + // SVE2 crypto constructive binary operations + defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>; +} // End HasSVE2SHA3 + +let Predicates = [HasSVE2BitPerm] in { + // SVE2 bitwise permute + defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>; + defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>; + defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>; +} // End HasSVE2BitPerm diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedA53.td b/suite/synctools/tablegen/AArch64/AArch64SchedA53.td index f253a4f3..d18a05fd 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedA53.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedA53.td @@ -1,9 +1,8 @@ //==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -27,7 +26,9 @@ def CortexA53Model : SchedMachineModel {
   // v 1.0 Spreadsheet
   let CompleteModel = 1;
 
-  list<Predicate> UnsupportedFeatures = [HasSVE];
+  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                    PAUnsupported.F,
+                                                    SMEUnsupported.F);
 }
 
@@ -126,7 +127,8 @@ def : WriteRes { let Latency = 6; }
 def : WriteRes { let Latency = 6; }
 def : WriteRes { let Latency = 6; }
 def : WriteRes { let Latency = 6; }
-def : WriteRes { let Latency = 6; }
+def : WriteRes { let Latency = 6; }
+def : WriteRes { let Latency = 6; }
 
 // FP Mul, Div, Sqrt
 def : WriteRes { let Latency = 6; }
@@ -148,6 +150,7 @@ def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
 // No forwarding for these reads.
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance;
 def : ReadAdvance;
 
 // ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedA55.td b/suite/synctools/tablegen/AArch64/AArch64SchedA55.td
new file mode 100644
index 00000000..009219ce
--- /dev/null
+++ b/suite/synctools/tablegen/AArch64/AArch64SchedA55.td
@@ -0,0 +1,361 @@
+//==- AArch64SchedCortexA55.td - ARM Cortex-A55 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-A55 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A55 machine model for scheduling and other instruction cost heuristics.
+def CortexA55Model : SchedMachineModel {
+  let MicroOpBufferSize = 0;  // The Cortex-A55 is an in-order processor
+  let IssueWidth = 2;         // It dual-issues under most circumstances
+  let LoadLatency = 4;        // Cycles for loads to access the cache. The
+                              // optimisation guide shows that most loads have
+                              // a latency of 3, but some have a latency of 4
+                              // or 5. Setting it to 4 looked to be a good
+                              // trade-off.
+  let MispredictPenalty = 8;  // A branch direction mispredict.
+  let PostRAScheduler = 1;    // Enable PostRA scheduler pass.
+  let CompleteModel = 0;      // Covers instructions applicable to Cortex-A55.
+
+  list<Predicate> UnsupportedFeatures = [HasSVE];
+
+  // FIXME: Remove when all errors have been fixed.
+  let FullInstRWOverlapCheck = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since the
+// Cortex-A55 is in-order.
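+// (An illustrative aside, not part of the upstream file: BufferSize = 0 means
+// a micro-op must issue to its unit in the cycle it becomes ready or it
+// stalls, which is how the per-operand model expresses an in-order pipeline.
+// An out-of-order core would instead give each unit a reservation buffer,
+// along the lines of the hypothetical
+//   def ExampleOoOUnit : ProcResource<2> { let BufferSize = 16; }
+// where "ExampleOoOUnit" is a made-up name used only for illustration.)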
+
+def CortexA55UnitALU : ProcResource<2> { let BufferSize = 0; }  // Int ALU
+def CortexA55UnitMAC : ProcResource<1> { let BufferSize = 0; }  // Int MAC, 64-bit wide
+def CortexA55UnitDiv : ProcResource<1> { let BufferSize = 0; }  // Int Division, not pipelined
+def CortexA55UnitLd  : ProcResource<1> { let BufferSize = 0; }  // Load pipe
+def CortexA55UnitSt  : ProcResource<1> { let BufferSize = 0; }  // Store pipe
+def CortexA55UnitB   : ProcResource<1> { let BufferSize = 0; }  // Branch
+
+// The FP DIV/SQRT instructions execute totally differently from the FP ALU
+// instructions, which can mostly be dual-issued; that's why for now we model
+// them with 2 resources.
+def CortexA55UnitFPALU : ProcResource<2> { let BufferSize = 0; } // FP ALU
+def CortexA55UnitFPMAC : ProcResource<2> { let BufferSize = 0; } // FP MAC
+def CortexA55UnitFPDIV : ProcResource<1> { let BufferSize = 0; } // FP Div/SQRT, 64/128
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types
+
+let SchedModel = CortexA55Model in {
+
+// These latencies are modeled without taking into account forwarding paths
+// (the software optimisation guide lists latencies taking into account
+// typical forwarding paths).
+def : WriteRes { let Latency = 3; }    // MOVN, MOVZ
+def : WriteRes { let Latency = 3; }    // ALU
+def : WriteRes { let Latency = 3; }    // ALU of Shifted-Reg
+def : WriteRes { let Latency = 3; }    // ALU of Extended-Reg
+def : WriteRes { let Latency = 3; }    // EXTR from a reg pair
+def : WriteRes { let Latency = 3; }    // Shift/Scale
+
+// MAC
+def : WriteRes { let Latency = 4; }    // 32-bit Multiply
+def : WriteRes { let Latency = 4; }    // 64-bit Multiply
+
+// Div
+def : WriteRes {
+  let Latency = 8; let ResourceCycles = [8];
+}
+def : WriteRes {
+  let Latency = 8; let ResourceCycles = [8];
+}
+
+// Load
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 5; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+// below, we choose the median of 3, which makes the latency 6.
+// An extra cycle is needed to get the swizzling right.
+def : WriteRes { let Latency = 6;
+                 let ResourceCycles = [3]; }
+def CortexA55WriteVLD1 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; }
+def CortexA55WriteVLD1SI : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 4; let SingleIssue = 1; }
+def CortexA55WriteVLD2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5;
+                                                            let ResourceCycles = [2]; }
+def CortexA55WriteVLD3 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 6;
+                                                            let ResourceCycles = [3]; }
+def CortexA55WriteVLD4 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 7;
+                                                            let ResourceCycles = [4]; }
+def CortexA55WriteVLD5 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 8;
+                                                            let ResourceCycles = [5]; }
+def CortexA55WriteVLD6 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 9;
+                                                            let ResourceCycles = [6]; }
+def CortexA55WriteVLD7 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 10;
+                                                            let ResourceCycles = [7]; }
+def CortexA55WriteVLD8 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 11;
+                                                            let ResourceCycles = [8]; }
+
+def CortexA55WriteLDP1 : SchedWriteRes<[]> { let Latency = 4; }
+def CortexA55WriteLDP2 : SchedWriteRes<[CortexA55UnitLd]> { let Latency = 5; }
+def CortexA55WriteLDP4 : SchedWriteRes<[CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd, CortexA55UnitLd]> { let Latency = 6; }
+
+// Pre/Post Indexing - Performed as part of address generation
+def : WriteRes { let Latency = 0; }
+
+// Store
+let RetireOOO = 1 in {
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+}
+def : WriteRes { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes { let Latency = 5;
+                 let ResourceCycles = [2];}
+def CortexA55WriteVST1 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 4; }
+def CortexA55WriteVST2 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
+                                                            let ResourceCycles = [2]; }
+def CortexA55WriteVST3 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 6;
+                                                            let ResourceCycles = [3]; }
+def CortexA55WriteVST4 : SchedWriteRes<[CortexA55UnitSt]> { let Latency = 5;
+                                                            let ResourceCycles = [4]; }
+
+def : WriteRes { let Unsupported = 1; }
+
+// Branch
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
+// FP ALU
+// As the WriteF result is produced in F5 and can mostly be forwarded
+// to the consumer at F1, the effective latency is set to 4.
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; let BeginGroup = 1; }
+
+// FP ALU specific new schedwrite definitions
+def CortexA55WriteFPALU_F2 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 2;}
+def CortexA55WriteFPALU_F3 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 3;}
+def CortexA55WriteFPALU_F4 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 4;}
+def CortexA55WriteFPALU_F5 : SchedWriteRes<[CortexA55UnitFPALU]> { let Latency = 5;}
+
+// FP Mul, Div, Sqrt. Div/Sqrt are not pipelined
+def : WriteRes { let Latency = 4; }
+
+let RetireOOO = 1 in {
+def : WriteRes { let Latency = 22;
+                 let ResourceCycles = [29]; }
+def CortexA55WriteFMAC : SchedWriteRes<[CortexA55UnitFPMAC]> { let Latency = 4; }
+def CortexA55WriteFDivHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
+                                                                 let ResourceCycles = [5]; }
+def CortexA55WriteFDivSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 13;
+                                                                 let ResourceCycles = [10]; }
+def CortexA55WriteFDivDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
+                                                                 let ResourceCycles = [19]; }
+def CortexA55WriteFSqrtHP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 8;
+                                                                  let ResourceCycles = [5]; }
+def CortexA55WriteFSqrtSP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 12;
+                                                                  let ResourceCycles = [9]; }
+def CortexA55WriteFSqrtDP : SchedWriteRes<[CortexA55UnitFPDIV]> { let Latency = 22;
+                                                                  let ResourceCycles = [19]; }
+}
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+// ALU - ALU input operands are generally needed in EX1. An operand produced
+// in, say, EX2 can be forwarded for consumption to ALU in EX1, thereby
+// allowing back-to-back ALU operations such as add. If an operand requires
+// a shift, it will, however, be required in ISS stage.
+def : ReadAdvance;
+// Shifted operand
+def CortexA55ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+                                                WriteISReg, WriteIEReg,WriteIS,
+                                                WriteID32,WriteID64,
+                                                WriteIM32,WriteIM64]>;
+def CortexA55ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+                                                   WriteISReg, WriteIEReg,WriteIS,
+                                                   WriteID32,WriteID64,
+                                                   WriteIM32,WriteIM64]>;
+def CortexA55ReadISReg : SchedReadVariant<[
+  SchedVar,
+  SchedVar]>;
+def : SchedAlias;
+
+def CortexA55ReadIEReg : SchedReadVariant<[
+  SchedVar,
+  SchedVar]>;
+def : SchedAlias;
+
+// MUL
+def : ReadAdvance;
+def : ReadAdvance;
+
+// Div
+def : ReadAdvance;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
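+
+// (Illustrative note, not part of the upstream file: a SchedReadAdvance of N
+// lets a consumer read its operand N cycles before the producer's nominal
+// latency expires. With CortexA55ReadNotShifted above, a 4-cycle WriteIM32
+// producer is seen by an unshifted ALU consumer with an effective latency of
+// 4 - 2 = 2 cycles. A hypothetical minimal pairing looks like:
+//   def ExampleWrite : SchedWriteRes<[CortexA55UnitALU]> { let Latency = 4; }
+//   def ExampleRead  : SchedReadAdvance<2, [ExampleWrite]>;
+// The "Example*" names are invented for illustration only.)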
+ +//--- +// Miscellaneous +//--- +def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?Wi")>; +def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPSi")>; +def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)i")>; +def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQi")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W(pre|post)")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS(pre|post)")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)(pre|post)")>; +def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ(pre|post)")>; +def : InstRW<[WriteI], (instrs COPY)>; +//--- +// Vector Loads - 64-bit per cycle +//--- +// 1-element structures +def : InstRW<[CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; // single element +def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // replicate +def : InstRW<[CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)$")>; // multiple structures +def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// 2-element structures +def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +// 3-element structures +def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>; +def : 
InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>; + +// 4-element structures +def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; // load single 4-el structure to one lane of 4 regs. +def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; // load single 4-el structure, replicate to all lanes of 4 regs. +def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>; // load multiple 4-el structures to 4 regs. +def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>; + +def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//--- +// Floating Point Conversions, MAC, DIV, SQRT +//--- +def : InstRW<[CortexA55WriteFPALU_F2], (instregex 
"^DUP(v2i64|v4i32|v8i16|v16i8)")>; +def : InstRW<[CortexA55WriteFPALU_F2], (instregex "^XTN")>; +def : InstRW<[CortexA55WriteFPALU_F3], (instregex "^FCVT[ALMNPZ][SU](S|U)?(W|X)")>; +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^FCVT(X)?[ALMNPXZ](S|U|N)?v")>; + +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(S|U)(W|X)(H|S|D)")>; +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTF(h|s|d)")>; +def : InstRW<[CortexA55WriteFPALU_F4], (instregex "^(S|U)CVTFv")>; + +def : InstRW<[CortexA55WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[CortexA55WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[CortexA55WriteFDivHP], (instrs FDIVHrr)>; +def : InstRW<[CortexA55WriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[CortexA55WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[CortexA55WriteFDivHP], (instregex "^FDIVv.*16$")>; +def : InstRW<[CortexA55WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[CortexA55WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[CortexA55WriteFSqrtHP], (instregex "^.*SQRT.*16$")>; +def : InstRW<[CortexA55WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[CortexA55WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedA57.td b/suite/synctools/tablegen/AArch64/AArch64SchedA57.td index ade03f23..a860aa90 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedA57.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedA57.td @@ -1,9 +1,8 @@ //=- AArch64SchedA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -32,7 +31,9 @@ def CortexA57Model : SchedMachineModel {
   let LoopMicroOpBufferSize = 16;
   let CompleteModel = 1;
 
-  list<Predicate> UnsupportedFeatures = [HasSVE];
+  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                    PAUnsupported.F,
+                                                    SMEUnsupported.F);
 }
 
 //===----------------------------------------------------------------------===//
@@ -93,9 +94,10 @@ def : SchedAlias;
 def : SchedAlias;
 def : SchedAlias;
 def : SchedAlias;
-def : SchedAlias;
+def : WriteRes { let Latency = 5;}
 def : SchedAlias;
-def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
 def : SchedAlias;
 def : SchedAlias;
 
@@ -115,6 +117,7 @@ def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
 
@@ -350,12 +353,16 @@ def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")
 // D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
 // Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
 
+// Cortex A57 Software Optimization Guide Sec 3.14
+// Advance for absolute diff accum, pairwise add and accumulate, shift accumulate
+def A57ReadIVA3 : SchedReadAdvance<3, [A57Write_4cyc_1X_NonMul_Forward, A57Write_5cyc_2X_NonMul_Forward]>;
+
 // ASIMD absolute diff accum, D-form
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
 // ASIMD absolute diff accum, Q-form
-def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+def : InstRW<[A57Write_5cyc_2X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
 // ASIMD absolute diff accum long
-def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ABAL")>;
 
 // ASIMD arith, reduce, 4H/4S
 def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
@@ -372,32 +379,41 @@ def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>
 def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
 
 // ASIMD multiply, D-form
-def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// MUL
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^MUL(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// PMUL, SQDMULH, SQRDMULH
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(PMUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+
 // ASIMD multiply, Q-form
-def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// MUL
+def : InstRW<[A57Write_6cyc_2W_Mul_Forward], (instregex "^MUL(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// PMUL, SQDMULH, SQRDMULH
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(PMUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// Cortex A57 Software Optimization Guide Sec 3.14
+def A57ReadIVMA4 : SchedReadAdvance<4, [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
+def A57ReadIVMA3 : SchedReadAdvance<3, [A57Write_5cyc_1W_Mul_Forward, A57Write_6cyc_2W_Mul_Forward]>;
 
 // ASIMD multiply accumulate, D-form
-def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
 // ASIMD multiply
accumulate, Q-form -def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[A57Write_6cyc_2W_Mul_Forward, A57ReadIVMA4], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; // ASIMD multiply accumulate long // ASIMD multiply accumulate saturating long -def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; } -def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>; -def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA4], (instregex "^(S|U)ML[AS]L")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward, A57ReadIVMA3], (instregex "^SQDML[AS]L")>; // ASIMD multiply long -def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[A57Write_5cyc_1W_Mul_Forward], (instregex "^(S|U)MULL")>; +def : InstRW<[A57Write_5cyc_1W], (instregex "^SQDMULL")>; def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>; def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>; // ASIMD pairwise add and accumulate // ASIMD shift accumulate -def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; } -def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>; -def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>; -def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^[SU]ADALP")>; +def : InstRW<[A57Write_4cyc_1X_NonMul_Forward, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>; // ASIMD shift by immed, complex def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>; @@ -474,17 +490,22 @@ def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i6 def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>; // ASIMD FP multiply, D-form, FZ -def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[A57Write_5cyc_1V_FP_Forward], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; // ASIMD FP multiply, Q-form, FZ -def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57Write_5cyc_2V_FP_Forward], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP multiply accumulate, D-form, FZ // ASIMD FP multiply accumulate, Q-form, FZ def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; } -def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>; + +// Cortex A57 Software Optimization Guide Sec 3.15 +// Advances from FP mul and mul-accum to mul-accum +def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; +def A57ReadFPVMA6 : SchedReadAdvance<6, [A57WriteFPVMAD, A57WriteFPVMAQ, A57Write_5cyc_1V_FP_Forward, A57Write_5cyc_2V_FP_Forward]>; + def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; -def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; +def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA6], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; // ASIMD FP round, D-form def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>; @@ -502,10 +523,10 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; // Q form - v16i8, v8i16, v4i32, v2i64 // ASIMD bitwise insert, Q-form -def : InstRW<[A57Write_3cyc_2V], (instregex 
"^(BIF|BIT|BSL)v16i8")>; +def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>; // ASIMD duplicate, gen reg, D-form and Q-form -def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>; +def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>; // ASIMD move, saturating @@ -547,8 +568,9 @@ def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>; +// Cortex A57 Software Optimization Guide Sec 3.10 def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; } -def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>; +def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA, WriteFMul]>; def A57ReadFPM : SchedReadAdvance<0>; def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedA57WriteRes.td b/suite/synctools/tablegen/AArch64/AArch64SchedA57WriteRes.td index 55005e1d..a4c090d4 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedA57WriteRes.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedA57WriteRes.td @@ -1,9 +1,8 @@ //=- AArch64SchedA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,6 +13,10 @@ // Prefix: A57Write // Latency: #cyc // MicroOp Count/Types: #(B|I|M|L|S|X|W|V) +// Postfix (optional): (XYZ)_Forward +// +// The postfix is added to differentiate SchedWriteRes that are used in +// subsequent SchedReadAdvances. // // e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are // 11 micro-ops to be issued down one I pipe, six S pipes and four V pipes. 
@@ -26,7 +29,9 @@ def A57Write_5cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 5; } def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; } def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; } +def A57Write_5cyc_1V_FP_Forward : SchedWriteRes<[A57UnitV]> { let Latency = 5; } def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; } +def A57Write_5cyc_1W_Mul_Forward : SchedWriteRes<[A57UnitW]> { let Latency = 5; } def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; } def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17; let ResourceCycles = [17]; } @@ -46,6 +51,7 @@ def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; } def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; } def A57Write_4cyc_1L : SchedWriteRes<[A57UnitL]> { let Latency = 4; } def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; } +def A57Write_4cyc_1X_NonMul_Forward : SchedWriteRes<[A57UnitX]> { let Latency = 4; } def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; } def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; } def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; } @@ -94,6 +100,10 @@ def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> { let Latency = 6; let NumMicroOps = 2; } +def A57Write_6cyc_2W_Mul_Forward : SchedWriteRes<[A57UnitW, A57UnitW]> { + let Latency = 6; + let NumMicroOps = 2; +} def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI, A57UnitL]> { let Latency = 5; @@ -103,10 +113,18 @@ def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 5; let NumMicroOps = 2; } +def A57Write_5cyc_2V_FP_Forward : SchedWriteRes<[A57UnitV, A57UnitV]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> { let Latency = 5; let NumMicroOps = 2; } +def A57Write_5cyc_2X_NonMul_Forward : SchedWriteRes<[A57UnitX, A57UnitX]> { + let Latency = 5; + let NumMicroOps = 2; +} def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL, A57UnitV]> { let Latency = 10; diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedA64FX.td b/suite/synctools/tablegen/AArch64/AArch64SchedA64FX.td new file mode 100644 index 00000000..fa10d056 --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SchedA64FX.td @@ -0,0 +1,3896 @@ +//=- AArch64SchedA64FX.td - Fujitsu A64FX Scheduling Defs -*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for the Fujitsu A64FX processors. +// +//===----------------------------------------------------------------------===// + +def A64FXModel : SchedMachineModel { + let IssueWidth = 6; // 6 micro-ops dispatched at a time. + let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. + let LoadLatency = 5; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 128; + let PostRAScheduler = 1; // Using PostRA sched. 
+ let CompleteModel = 1; + + list UnsupportedFeatures = + [HasSVE2, HasSVE2AES, HasSVE2SM4, HasSVE2SHA3, HasSVE2BitPerm, HasPAuth, + HasSVE2orStreamingSVE]; + + let FullInstRWOverlapCheck = 0; +} + +let SchedModel = A64FXModel in { + +// Define the issue ports. + +// A64FXIP* + +// Port 0 +def A64FXIPFLA : ProcResource<1>; + +// Port 1 +def A64FXIPPR : ProcResource<1>; + +// Port 2 +def A64FXIPEXA : ProcResource<1>; + +// Port 3 +def A64FXIPFLB : ProcResource<1>; + +// Port 4 +def A64FXIPEXB : ProcResource<1>; + +// Port 5 +def A64FXIPEAGA : ProcResource<1>; + +// Port 6 +def A64FXIPEAGB : ProcResource<1>; + +// Port 7 +def A64FXIPBR : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes later on. + +def A64FXGI7 : ProcResGroup<[A64FXIPBR]>; + +def A64FXGI0 : ProcResGroup<[A64FXIPFLA]>; + +def A64FXGI1 : ProcResGroup<[A64FXIPPR]>; + +def A64FXGI2 : ProcResGroup<[A64FXIPEXA]>; + +def A64FXGI3 : ProcResGroup<[A64FXIPFLB]>; + +def A64FXGI4 : ProcResGroup<[A64FXIPEXB]>; + +def A64FXGI5 : ProcResGroup<[A64FXIPEAGA]>; + +def A64FXGI6 : ProcResGroup<[A64FXIPEAGB]>; + +def A64FXGI03 : ProcResGroup<[A64FXIPFLA, A64FXIPFLB]>; + +def A64FXGI01 : ProcResGroup<[A64FXIPFLA, A64FXIPPR]>; + +def A64FXGI02 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA]>; + +def A64FXGI12 : ProcResGroup<[A64FXIPEXA, A64FXIPPR]>; + +def A64FXGI15 : ProcResGroup<[A64FXIPEAGA, A64FXIPPR]>; + +def A64FXGI05 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA]>; + +def A64FXGI24 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB]>; + +def A64FXGI124 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPPR]>; + +def A64FXGI056 : ProcResGroup<[A64FXIPFLA, A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXGI0256 : ProcResGroup<[A64FXIPFLA, A64FXIPEXA, A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXGI56 : ProcResGroup<[A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXGI2456 : ProcResGroup<[A64FXIPEXA, A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB]>; + +def A64FXAny : ProcResGroup<[A64FXIPFLA, A64FXIPPR, A64FXIPEXA, A64FXIPFLB, + A64FXIPEXB, A64FXIPEAGA, A64FXIPEAGB, A64FXIPBR]> { + let BufferSize = 60; +} + +def A64FXWrite_6Cyc : SchedWriteRes<[]> { + let Latency = 6; +} + +def A64FXWrite_1Cyc_GI7 : SchedWriteRes<[A64FXGI7]> { + let Latency = 1; +} + +def A64FXWrite_2Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 2; +} + +def A64FXWrite_4Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 4; +} + +def A64FXWrite_5Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 5; +} + +def A64FXWrite_6Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 8; +} + +def A64FXWrite_9Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 9; +} + +def A64FXWrite_13Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 13; +} + +def A64FXWrite_37Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 37; +} + +def A64FXWrite_98Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 98; +} + +def A64FXWrite_134Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 134; +} + +def A64FXWrite_154Cyc_GI0 : SchedWriteRes<[A64FXGI0]> { + let Latency = 154; +} + +def A64FXWrite_4Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 4; +} + +def A64FXWrite_6Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 8; +} + +def A64FXWrite_12Cyc_GI01 : SchedWriteRes<[A64FXGI01]> { + let Latency = 12; +} + +def A64FXWrite_10Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { + let Latency = 10; +} + +def 
A64FXWrite_17Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { + let Latency = 17; +} + +def A64FXWrite_21Cyc_GI02 : SchedWriteRes<[A64FXGI02]> { + let Latency = 21; +} + +def A64FXWrite_3Cyc_GI1 : SchedWriteRes<[A64FXGI1]> { + let Latency = 3; +} + +def A64FXWrite_6Cyc_NGI1 : SchedWriteRes<[A64FXGI1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def A64FXWrite_4Cyc_GI12 : SchedWriteRes<[A64FXGI12]> { + let Latency = 4; +} + +def A64FXWrite_3Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { + let Latency = 3; +} + +def A64FXWrite_5Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { + let Latency = 5; +} + +def A64FXWrite_6Cyc_GI2 : SchedWriteRes<[A64FXGI2]> { + let Latency = 6; +} + +def A64FXWrite_4Cyc_GI3 : SchedWriteRes<[A64FXGI3]> { + let Latency = 4; +} + +def A64FXWrite_6Cyc_GI3 : SchedWriteRes<[A64FXGI3]> { + let Latency = 6; +} + +def A64FXWrite_6Cyc_GI15 : SchedWriteRes<[A64FXGI15]> { + let Latency = 6; +} + +def A64FXWrite_3Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 3; +} + +def A64FXWrite_4Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 4; +} + +def A64FXWrite_6Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; +} + +def A64FXWrite_9Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 9; +} + +def A64FXWrite_10Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_12Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 12; +} + +def A64FXWrite_14Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_15Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; +} + +def A64FXWrite_15Cyc_NGI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 2; +} + +def A64FXWrite_18Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 18; +} + +def A64FXWrite_45Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 45; +} + +def A64FXWrite_60Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 60; +} + +def A64FXWrite_75Cyc_GI03 : SchedWriteRes<[A64FXGI03]> { + let Latency = 75; +} + +def A64FXWrite_6Cyc_GI05 : SchedWriteRes<[A64FXGI05]> { + let Latency = 6; +} + +def A64FXWrite_10Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { + let Latency = 10; +} + +def A64FXWrite_12Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { + let Latency = 12; +} + +def A64FXWrite_20Cyc_GI4 : SchedWriteRes<[A64FXGI4]> { + let Latency = 20; +} + +def A64FXWrite_5Cyc_GI5 : SchedWriteRes<[A64FXGI5]> { + let Latency = 5; +} + +def A64FXWrite_11Cyc_GI5 : SchedWriteRes<[A64FXGI5]> { + let Latency = 11; +} + +def A64FXWrite_5Cyc_GI6 : SchedWriteRes<[A64FXGI6]> { + let Latency = 5; +} + +def A64FXWrite_1Cyc_GI24 : SchedWriteRes<[A64FXGI24]> { + let Latency = 1; +} + +def A64FXWrite_2Cyc_GI24 : SchedWriteRes<[A64FXGI24]> { + let Latency = 2; +} + +def A64FXWrite_4Cyc_NGI24 : SchedWriteRes<[A64FXGI24]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def A64FXWrite_6Cyc_GI124: SchedWriteRes<[A64FXGI124]> { + let Latency = 6; +} + +def A64FXWrite_8Cyc_GI124 : SchedWriteRes<[A64FXGI124]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_6Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_1Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 1; +} + +def A64FXWrite_5Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 5; +} + +def A64FXWrite_8Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 8; +} + +def A64FXWrite_11Cyc_GI56 : SchedWriteRes<[A64FXGI56]> { + let Latency = 11; +} + +def A64FXWrite_44Cyc_GI56 : 
SchedWriteRes<[A64FXGI56]> { + let Latency = 44; +} + +def A64FXWrite_10Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 10; +} + +def A64FXWrite_15Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 15; +} + +def A64FXWrite_19Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 19; +} + +def A64FXWrite_25Cyc_GI056 : SchedWriteRes<[A64FXGI056]> { + let Latency = 25; +} + +def A64FXWrite_14Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { + let Latency = 14; +} + +def A64FXWrite_19Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { + let Latency = 19; +} + +def A64FXWrite_29Cyc_GI0256 : SchedWriteRes<[A64FXGI0256]> { + let Latency = 29; +} + +def A64FXWrite_LDNP: SchedWriteRes<[A64FXGI56]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def A64FXWrite_LDP01: SchedWriteRes<[A64FXGI2456]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def A64FXWrite_LDR01: SchedWriteRes<[A64FXGI2456]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def A64FXWrite_LD102: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_LD103: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 2; + +} + +def A64FXWrite_LD104: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_LD105: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def A64FXWrite_LD106: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def A64FXWrite_LD107: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 4; +} + +def A64FXWrite_LD108: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_LD109: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def A64FXWrite_LD110: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_LD111: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def A64FXWrite_LD112: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def A64FXWrite_LD113: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 4; +} + +def A64FXWrite_LD114: SchedWriteRes<[A64FXGI56]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def A64FXWrite_LD115: SchedWriteRes<[A64FXGI56]> { + let Latency = 11; + let NumMicroOps = 5; +} + +def A64FXWrite_LD1I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_LD1I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_LD2I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def A64FXWrite_LD2I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 5; +} + +def A64FXWrite_LD3I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def A64FXWrite_LD3I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 7; +} + +def A64FXWrite_LD4I0: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def A64FXWrite_LD4I1: SchedWriteRes<[A64FXGI056]> { + let Latency = 8; + let NumMicroOps = 9; +} + +def A64FXWrite_1Cyc_GI2456 : SchedWriteRes<[A64FXGI2456]> { + let Latency = 1; +} + +def A64FXWrite_FMOV_GV : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_FMOV_VG14 : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_FMOV_VG : SchedWriteRes<[A64FXGI03]> { + let Latency = 25; +} + +def A64FXWrite_ADDLV : SchedWriteRes<[A64FXGI03]> { + let Latency = 12; +} + 
+def A64FXWrite_MULLE : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_MULLV : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_MADDL : SchedWriteRes<[A64FXGI03]> { + let Latency = 6; +} + +def A64FXWrite_ABA : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; +} + +def A64FXWrite_ABAL : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_ADDLV1 : SchedWriteRes<[A64FXGI03]> { + let Latency = 12; + let NumMicroOps = 6; +} + +def A64FXWrite_MINMAXV : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; + let NumMicroOps = 6; +} + +def A64FXWrite_SQRDMULH : SchedWriteRes<[A64FXGI03]> { + let Latency = 9; +} + +def A64FXWrite_PMUL : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; +} + + +def A64FXWrite_SRSRAV : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def A64FXWrite_SSRAV : SchedWriteRes<[A64FXGI03]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def A64FXWrite_RSHRN : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_SHRN : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 2; +} + + +def A64FXWrite_ADDP : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_FMULXE : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 2; +} + +def A64FXWrite_FADDPV : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 3; +} + +def A64FXWrite_SADALP : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_SADDLP : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def A64FXWrite_FCVTXNV : SchedWriteRes<[A64FXGI03]> { + let Latency = 15; + let NumMicroOps = 2; +} + +def A64FXWrite_FMAXVVH : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; + let NumMicroOps = 7; +} + +def A64FXWrite_FMAXVVS : SchedWriteRes<[A64FXGI03]> { + let Latency = 14; +} + +def A64FXWrite_BIF : SchedWriteRes<[A64FXGI03]> { + let Latency = 5; +} + +def A64FXWrite_DUPGENERAL : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; +} + +def A64FXWrite_SHA00 : SchedWriteRes<[A64FXGI0]> { + let Latency = 9; +} + +def A64FXWrite_SHA01 : SchedWriteRes<[A64FXGI0]> { + let Latency = 12; +} + +def A64FXWrite_SMOV : SchedWriteRes<[A64FXGI03]> { + let Latency = 25; +} + +def A64FXWrite_TBX1 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def A64FXWrite_TBX2 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 5; +} + +def A64FXWrite_TBX3 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 7; +} + +def A64FXWrite_TBX4 : SchedWriteRes<[A64FXGI03]> { + let Latency = 10; + let NumMicroOps = 9; +} + +def A64FXWrite_PREF0: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_PREF1: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_SWP: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_STUR: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_STNP: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_STP01: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST10: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST11: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST12: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST13: SchedWriteRes<[A64FXGI56]> { + let Latency = 0; +} + +def A64FXWrite_ST14: SchedWriteRes<[A64FXGI56]> { + let Latency = 
+def A64FXWrite_ST14: SchedWriteRes<[A64FXGI56]> {
+  let Latency = 1;
+}
+
+def A64FXWrite_ST15: SchedWriteRes<[A64FXGI56]> {
+  let Latency = 1;
+}
+
+def A64FXWrite_ST16: SchedWriteRes<[A64FXGI56]> {
+  let Latency = 1;
+}
+
+def A64FXWrite_ST17: SchedWriteRes<[A64FXGI56]> {
+  let Latency = 1;
+}
+
+def A64FXWrite_ST1W_6: SchedWriteRes<[A64FXGI056]> {
+  let Latency = 6;
+}
+
+def A64FXWrite_ST2W_7: SchedWriteRes<[A64FXGI056]> {
+  let Latency = 7;
+}
+
+def A64FXWrite_ST3W_8: SchedWriteRes<[A64FXGI056]> {
+  let Latency = 8;
+}
+
+def A64FXWrite_ST4W_9: SchedWriteRes<[A64FXGI056]> {
+  let Latency = 9;
+}
+
+def A64FXWrite_ST1W_15: SchedWriteRes<[A64FXGI056]> {
+  let Latency = 15;
+}
+
+def A64FXWrite_ST1W_19: SchedWriteRes<[A64FXGI056]> {
+  let Latency = 19;
+}
+
+def A64FXWrite_CAS: SchedWriteRes<[A64FXGI56]> {
+  let Latency = 7;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadST,      0>;
+def : ReadAdvance<ReadVLD,     0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [A64FXGI7]> {
+  let Latency = 1;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [A64FXGI7]> {
+  let Latency = 1;
+}
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> {
+  let Latency = 4;
+}
+
+//---
+// Branch
+//---
+def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs B, BL, BR, BLR)>;
+def : InstRW<[A64FXWrite_1Cyc_GI7], (instrs RET)>;
+def : InstRW<[A64FXWrite_1Cyc_GI7], (instregex "^B..$")>;
+def : InstRW<[A64FXWrite_1Cyc_GI7],
+            (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>;
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [A64FXGI2456]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteI],
+            (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+                       "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+                       "ADC(W|X)r",
+                       "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+                       "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+                       "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+                       "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+                       "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+                       "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+                       "CSINC(W|X)r", "CSINV(W|X)r",
+                       "CSNEG(W|X)r")>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [A64FXGI2456]> {
+  let Latency = 2;
+  let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteISReg],
+            (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+                       "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+                       "ADC(W|X)r",
+                       "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+                       "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+                       "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+                       "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+                       "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+                       "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+                       "CSINC(W|X)r", "CSINV(W|X)r",
+                       "CSNEG(W|X)r")>;
+
+def : WriteRes<WriteIEReg, [A64FXGI2456]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+
+def : InstRW<[WriteIEReg],
+            (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+                       "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+                       "ADC(W|X)r",
+                       "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+                       "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
"ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +// Move immed +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} + +def : InstRW<[A64FXWrite_1Cyc_GI2456], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[A64FXWrite_2Cyc_GI24], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; + +// Variable shift +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +def : WriteRes { + let Latency = 39; + let ResourceCycles = [39]; +} + +// Divide, X-form +def : WriteRes { + let Latency = 23; + let ResourceCycles = [23]; +} + +// Multiply accumulate, W-form +def : WriteRes { + let Latency = 5; + let ResourceCycles = [1]; +} + +// Multiply accumulate, X-form +def : WriteRes { + let Latency = 5; + let ResourceCycles = [1]; +} + +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[A64FXWrite_MADDL], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; + +// Bitfield extract, two reg +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; +} + +// Multiply high +def : InstRW<[A64FXWrite_5Cyc_GI2], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[A64FXWrite_1Cyc_GI24], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; + +// Bitfield move, insert +def : InstRW<[A64FXWrite_4Cyc_NGI24], (instregex "^BFM")>; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instregex "(S|U)?BFM.*")>; + +// Count leading +def : InstRW<[A64FXWrite_2Cyc_GI0], (instregex "^CLS(W|X)r$", + "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AES[DE]")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^AESI?MC")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^PMULL")>; +def : InstRW<[A64FXWrite_SHA00], (instregex "^SHA1SU0")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA1[CMP]")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU0")>; +def : InstRW<[A64FXWrite_8Cyc_GI0], (instregex "^SHA256SU1")>; +def : InstRW<[A64FXWrite_SHA01], (instregex "^SHA256(H|H2)")>; + +// CRC Instructions +def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32Brr, CRC32Hrr)>; +def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32Wrr)>; +def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32Xrr)>; + +def : InstRW<[A64FXWrite_10Cyc_GI4], (instrs CRC32CBrr, CRC32CHrr)>; +def : InstRW<[A64FXWrite_12Cyc_GI4], (instrs CRC32CWrr)>; +def : InstRW<[A64FXWrite_20Cyc_GI4], (instrs CRC32CXrr)>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes { + let Latency = 4; + let ResourceCycles = [3]; +} + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. 
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [A64FXGI2456]> {
+  let Latency = 1;
+  let ResourceCycles = [1];
+}
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 5;
+}
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def A64FXWriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [A64FXWrite_5Cyc_GI56]>,
+  SchedVar<NoSchedPred,   [A64FXWrite_5Cyc_GI56]>]>;
+def : SchedAlias<WriteLDIdx, A64FXWriteLDIdx>;
+
+def A64FXReadAdrBase : SchedReadVariant<[
+  SchedVar<ScaledIdxPred, [ReadDefault]>,
+  SchedVar<NoSchedPred,   [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, A64FXReadAdrBase>;
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[A64FXWrite_LDNP, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRBui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRDui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRHui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRQui)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDRSui)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRDl)>;
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRQl)>;
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRWl)>;
+def : InstRW<[A64FXWrite_5Cyc_GI6], (instrs LDRXl)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRBi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRHi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRXi)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSBXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSHXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDTRSWi)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPDpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPQpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPSpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPWpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPWpre)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSBXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSHXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBBpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHHpost)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPDpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPQpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPSpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPWpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPDpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPQpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPSpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPWpre)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPXpre)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[A64FXWrite_LDR01, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPDpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPQpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPSpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPWpost)>;
+def : InstRW<[A64FXWrite_LDP01, WriteLDHi, WriteAdr],
+            (instrs LDPXpost)>;
+
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRWpost)>;
+def : InstRW<[A64FXWrite_LDR01, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRBroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRBroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRDroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRHHroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRQroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRSroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRSHWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRSHXroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRWroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRXroW)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRBroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRDroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRHHroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRQroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRSroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRSHWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRSHXroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRWroX)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56, ReadAdrBase],
+            (instrs LDRXroX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURBBi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURDi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURHHi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURQi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSBXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHWi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSHXi)>;
+def : InstRW<[A64FXWrite_5Cyc_GI56], (instrs LDURSWi)>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[A64FXWrite_PREF0], (instrs PRFMl)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFUMi)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFMui)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFMroW)>;
+def : InstRW<[A64FXWrite_PREF1], (instrs PRFMroX)>;
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [A64FXGI56]> {
+  let Latency = 1;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [A64FXGI56]> {
+  let Latency = 1;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [A64FXGI56]> {
+  let Latency = 1;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+def : InstRW<[A64FXWrite_STUR], (instrs STURBi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURBBi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURDi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURHi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURHHi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURQi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURSi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURWi)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, A64FXWrite_STUR], (instrs STTRXi)>;
+
+def : InstRW<[A64FXWrite_STNP], (instrs STNPDi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STNPQi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STNPXi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STNPWi)>;
+
+def : InstRW<[A64FXWrite_STNP], (instrs STPDi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STPQi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STPXi)>;
+def : InstRW<[A64FXWrite_STNP], (instrs STPWi)>;
+
+def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRBui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRDui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRHui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRQui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRXui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>;
+def : InstRW<[A64FXWrite_STUR], (instrs STRWui)>;
+
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPDpre, STPDpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPQpre, STPQpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPSpre, STPSpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPWpre, STPWpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPXpre, STPXpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPXpre, STPXpost)>;
+def : InstRW<[A64FXWrite_STP01],
+            (instrs STPXpre, STPXpost)>;
+def : InstRW<[A64FXWrite_STP01, ReadAdrBase],
+            (instrs STPXpre, STPXpost)>;
+
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01],
+            (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, A64FXWrite_STP01, ReadAdrBase],
+            (instrs STRXpre, STRXpost)>;
+
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRBroW, STRBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRBroW, STRBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRDroW, STRDroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRDroW, STRDroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRHroW, STRHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRHroW, STRHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRQroW, STRQroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRQroW, STRQroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRSroW, STRSroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRSroW, STRSroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRWroW, STRWroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRWroW, STRWroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRXroW, STRXroX)>;
+def : InstRW<[A64FXWrite_STUR, ReadAdrBase],
+            (instrs STRXroW, STRXroX)>;
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [A64FXGI03]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+
+// FP arithmetic
+
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FADDDrr, FADDHrr)>;
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FSUBDrr, FSUBHrr)>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [A64FXGI03]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+
+// FP Div, Sqrt
+def : WriteRes<WriteFDiv, [A64FXGI0]> {
+  let Latency = 43;
+}
+
+def A64FXXWriteFDiv : SchedWriteRes<[A64FXGI0]> {
+  let Latency = 38;
+}
+
+def A64FXXWriteFDivSP : SchedWriteRes<[A64FXGI0]> {
+  let Latency = 29;
+}
+
+def A64FXXWriteFDivDP : SchedWriteRes<[A64FXGI0]> {
+  let Latency = 43;
+}
+
+def A64FXXWriteFSqrtSP : SchedWriteRes<[A64FXGI0]> {
+  let Latency = 29;
+}
+
+def A64FXXWriteFSqrtDP : SchedWriteRes<[A64FXGI0]> {
+  let Latency = 43;
+}
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A64FXXWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A64FXXWriteFDivSP], (instregex "^FDIVSrr")>;
+def : InstRW<[A64FXXWriteFSqrtSP], (instregex "^FSQRTSr")>;
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A64FXXWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[A64FXXWriteFDivDP], (instregex "^FDIVDrr")>;
+def : InstRW<[A64FXXWriteFSqrtDP], (instregex "^FSQRTDr")>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [A64FXGI03]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
+def A64FXXWriteFMul : SchedWriteRes<[A64FXGI03]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
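+// In the multiply entries around this point, Latency models the result
+// delay (9 cycles) while ResourceCycles = [2] holds the A64FXGI03 issue
+// resource for two cycles, so sustained throughput is one multiply every
+// other cycle. A minimal sketch of that trade-off, for a hypothetical
+// variant that blocks its pipe for four cycles instead:
+//
+//   def A64FXXWriteFMulSlow : SchedWriteRes<[A64FXGI03]> {
+//     let Latency = 9;
+//     let ResourceCycles = [4];
+//   }
+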
+def A64FXXWriteFMulAcc : SchedWriteRes<[A64FXGI03]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
+def : InstRW<[A64FXXWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[A64FXXWriteFMulAcc],
+            (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
+
+// FP round to integral
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+            (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [A64FXGI03]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [A64FXGI03]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [A64FXGI03]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+
+def : InstRW<[A64FXWrite_FMOV_GV], (instrs FMOVXDHighr)>;
+def : InstRW<[A64FXWrite_FMOV_VG14], (instrs FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteVd, [A64FXGI03]> {
+  let Latency = 4;
+  let ResourceCycles = [1];
+}
+def : WriteRes<WriteVq, [A64FXGI03]> {
+  let Latency = 4;
+  let ResourceCycles = [1];
+}
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+
+// ASIMD logical (MVN (alias for NOT), ORN, ORR)
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD arith, reduce
+def : InstRW<[A64FXWrite_ADDLV],
+            (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[A64FXWrite_MULLE], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A64FXWrite_MULLV],
+            (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A64FXWrite_8Cyc_GI03], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[A64FXWrite_ABA],
+            (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[A64FXWrite_ABA],
+            (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[A64FXWrite_ABAL],
+            (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[A64FXWrite_ADDLV1],
+            (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B
+def : InstRW<[A64FXWrite_ADDLV1],
+            (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B/16H
+def : InstRW<[A64FXWrite_ADDLV1],
+            (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A64FXWrite_MINMAXV],
+            (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A64FXWrite_MINMAXV],
+            (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B/16H
+def : InstRW<[A64FXWrite_MINMAXV],
+            (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[A64FXWrite_PMUL],
+            (instregex "^(P?MUL|SQR?DMUL)" #
+                       "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+                       "(_indexed)?$")>;
+
+// ASIMD multiply, Q-form
+def : InstRW<[A64FXWrite_PMUL],
+            (instregex "^(P?MUL)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply, Q-form
+def : InstRW<[A64FXWrite_SQRDMULH],
+            (instregex "^(SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+            (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A64FXWrite_9Cyc_GI03],
+            (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[A64FXWrite_SRSRAV],
+            (instregex "SRSRAv", "URSRAv")>;
+def : InstRW<[A64FXWrite_SSRAV],
+            (instregex "SSRAv", "USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[A64FXWrite_RSHRN],
+            (instregex "RSHRNv", "SQRSHRNv", "SQRSHRUNv", "UQRSHRNv")>;
+def : InstRW<[A64FXWrite_SHRN],
+            (instregex "SHRNv", "SQSHRNv", "SQSHRUNv", "UQSHRNv")>;
+
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+            (instregex "SQXTNv", "SQXTUNv", "UQXTNv")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A64FXWrite_ABA], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+            (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+            (instregex "^[SU][QR]{1,2}SHL" #
+                       "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+            (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[A64FXWrite_SHRN], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[A64FXWrite_RSHRN], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+                       "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[A64FXWrite_ADDP],
+            (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+                       "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[A64FXWrite_4Cyc_GI0],
+            (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[A64FXWrite_SADALP], (instregex "^SADALP", "^UADALP")>;
+def : InstRW<[A64FXWrite_SADDLP], (instregex "^SADDLPv", "^UADDLPv")>;
+def : InstRW<[A64FXWrite_ADDLV1], (instregex "^SADDLV", "^UADDLV")>;
+def : InstRW<[A64FXWrite_MINMAXV],
+            (instregex "^ADDVv", "^SMAXVv", "^UMAXVv", "^SMINVv", "^UMINVv")>;
+def : InstRW<[A64FXWrite_ABA],
+            (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>;
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "^SQADDv", "^SQSUBv", "^UQADDv", "^UQSUBv")>;
"^USQADDv")>; +def : InstRW<[A64FXWrite_SHRN], + (instregex "^ADDHNv", "^SUBHNv")>; +def : InstRW<[A64FXWrite_RSHRN], + (instregex "^RADDHNv", "^RSUBHNv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUQADD", "^UQADD", "^UQSUB", + "^URHADD", "^USQADD")>; + +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", + "^CMLEv", "^CMLTv", "^CMHIv", "^CMHSv")>; +def : InstRW<[A64FXWrite_MINMAXV], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[A64FXWrite_ADDP], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], + (instregex "^SABDv", "^UABDv")>; +def : InstRW<[A64FXWrite_TBX1], + (instregex "^SABDLv", "^UABDLv")>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith, pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[A64FXWrite_FADDPV], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; +// ASIMD FP round, D-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[A64FXWrite_9Cyc_GI03], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form + +// ASIMD FP convert, long and narrow +def : InstRW<[A64FXWrite_FCVTXNV], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[A64FXWrite_FCVTXNV], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[A64FXWrite_FCVTXNV], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[A64FXXWriteFDivSP], (instrs FDIVv2f32)>; +def : InstRW<[A64FXXWriteFDivSP], (instregex "FDIVv2f32")>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[A64FXXWriteFDiv], (instrs FDIVv4f32)>; +def : InstRW<[A64FXXWriteFDiv], (instregex "FDIVv4f32")>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[A64FXXWriteFDivDP], (instrs FDIVv2f64)>; +def : InstRW<[A64FXXWriteFDivDP], (instregex "FDIVv2f64")>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[A64FXWrite_ADDP], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[A64FXWrite_FMAXVVH], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[A64FXWrite_FMULXE], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[A64FXWrite_FMULXE], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply 
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[A64FXWrite_9Cyc_GI03], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[A64FXWrite_FMULXE],
+            (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A64FXWrite_FMULXE],
+            (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP negate
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instregex "^FNEGv")>;
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[A64FXWrite_1Cyc_GI2456], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[A64FXWrite_BIF],
+            (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[A64FXWrite_4Cyc_GI0],
+            (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[A64FXWrite_DUPGENERAL], (instregex "^DUPv")>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUP(i8|i16|i32|i64)$")>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^XTNv")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[A64FXWrite_6Cyc_GI3],
+            (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
+
+// ASIMD insert, element to element
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^MOVIv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[A64FXWrite_4Cyc_GI0], (instregex "^FMOVv")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv8i8One")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv8i8Two")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv8i8Three")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv8i8Four")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv8i8One")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv8i8Two")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv8i8Three")>;
+def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv8i8Four")>;
+
+// ASIMD table lookup, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI3], (instregex "^TBLv16i8One")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBLv16i8Two")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBLv16i8Three")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBLv16i8Four")>;
+def : InstRW<[A64FXWrite_TBX1], (instregex "^TBXv16i8One")>;
+def : InstRW<[A64FXWrite_TBX2], (instregex "^TBXv16i8Two")>;
+def : InstRW<[A64FXWrite_TBX3], (instregex "^TBXv16i8Three")>;
+def : InstRW<[A64FXWrite_TBX4], (instregex "^TBXv16i8Four")>;
+
+// ASIMD transpose
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[A64FXWrite_6Cyc_GI0],
+            (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+                       "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[A64FXWrite_9Cyc_GI0], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[A64FXWrite_4Cyc_GI03],
+            (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or dword
+def : InstRW<[A64FXWrite_SMOV], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A64FXWrite_SMOV], (instregex "(S|U)MOVv.*")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^TRN1v", "^TRN2v",
+                                               "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[A64FXWrite_8Cyc_GI56],
+            (instregex "^LD1Onev(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_11Cyc_GI56],
+            (instregex "^LD1Onev(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD108, WriteAdr],
+            (instregex "^LD1Onev(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD109, WriteAdr],
+            (instregex "^LD1Onev(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[A64FXWrite_LD102],
+            (instregex "^LD1Twov(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_LD103],
+            (instregex "^LD1Twov(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD110, WriteAdr],
+            (instregex "^LD1Twov(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD111, WriteAdr],
+            (instregex "^LD1Twov(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[A64FXWrite_LD104],
+            (instregex "^LD1Threev(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_LD105],
+            (instregex "^LD1Threev(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD112, WriteAdr],
+            (instregex "^LD1Threev(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD113, WriteAdr],
+            (instregex "^LD1Threev(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[A64FXWrite_LD106],
+            (instregex "^LD1Fourv(8b|4h|2s|1d|2d)$")>;
+def : InstRW<[A64FXWrite_LD107],
+            (instregex "^LD1Fourv(16b|8h|4s)$")>;
+def : InstRW<[A64FXWrite_LD114, WriteAdr],
+            (instregex "^LD1Fourv(8b|4h|2s|1d|2d)_POST$")>;
+def : InstRW<[A64FXWrite_LD115, WriteAdr],
+            (instregex "^LD1Fourv(16b|8h|4s)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[A64FXWrite_LD1I0], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[A64FXWrite_LD1I1, WriteAdr],
+            (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[A64FXWrite_8Cyc_GI03],
+            (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD108, WriteAdr],
+            (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[A64FXWrite_LD103],
+            (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[A64FXWrite_LD111, WriteAdr],
+            (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
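+// As in the LD1* entries above, the _POST (post-increment) variants
+// append WriteAdr so the base-register update is costed separately from
+// the structure load itself. A sketch of the pairing, using a
+// hypothetical loader LDFOO named only for illustration:
+//
+//   def : InstRW<[A64FXWrite_LD103],           (instregex "^LDFOOv16b$")>;
+//   def : InstRW<[A64FXWrite_LD111, WriteAdr], (instregex "^LDFOOv16b_POST$")>;
+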
(instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD2I1, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[A64FXWrite_LD102], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD110, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_LD105], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD113, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[A64FXWrite_LD3I0], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD3I1, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[A64FXWrite_LD104], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD112, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_LD107], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD115, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[A64FXWrite_LD4I0], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_LD4I1, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[A64FXWrite_LD106], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_LD114, WriteAdr], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[A64FXWrite_ST10], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST14, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[A64FXWrite_ST11], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST15, WriteAdr], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[A64FXWrite_ST12], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST16, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[A64FXWrite_ST13], + (instregex 
"^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST17, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[A64FXWrite_ST10], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST14, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_ST11], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST15, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[A64FXWrite_ST11], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST15, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_ST12], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST16, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[A64FXWrite_ST12], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST16, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[A64FXWrite_ST13], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[A64FXWrite_ST17, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[A64FXWrite_ST13], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[A64FXWrite_ST17, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + +// V8.1a Atomics (LSE) +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASB, CASH, CASW, CASX)>; + +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASAB, CASAH, CASAW, CASAX)>; + +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASLB, CASLH, CASLW, CASLX)>; + +def : InstRW<[A64FXWrite_CAS, WriteAtomic], + (instrs CASALB, CASALH, CASALW, CASALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; + +def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic], + (instrs LDEORB, LDEORH, LDEORW, 
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDEORB, LDEORH, LDEORW, LDEORX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDSETB, LDSETH, LDSETW, LDSETX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX,
+                    LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX,
+                    LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX,
+                    LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX,
+                    LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX,
+                    LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX,
+                    LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX,
+                    LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX,
+                    LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX,
+                    LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>;
+
+def : InstRW<[A64FXWrite_5Cyc_GI5, WriteAtomic],
+            (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX,
+                    LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX,
+                    LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX,
+                    LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+            (instrs SWPB, SWPH, SWPW, SWPX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+            (instrs SWPAB, SWPAH, SWPAW, SWPAX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+            (instrs SWPLB, SWPLH, SWPLW, SWPLX)>;
+
+def : InstRW<[A64FXWrite_SWP, WriteAtomic],
+            (instrs SWPALB, SWPALH, SWPALW, SWPALX)>;
+
+def : InstRW<[A64FXWrite_STUR, WriteAtomic],
+            (instrs STLLRB, STLLRH, STLLRW, STLLRX)>;
+
+// [ 1] "abs $Zd, $Pg/m, $Zn";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ABS_ZPmZ_B, ABS_ZPmZ_D, ABS_ZPmZ_H, ABS_ZPmZ_S)>;
+
+// [ 2] "add $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZZZ_B, ADD_ZZZ_D, ADD_ZZZ_H, ADD_ZZZ_S)>;
+
+// [ 3] "add $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZPmZ_B, ADD_ZPmZ_D, ADD_ZPmZ_H, ADD_ZPmZ_S)>;
+
+// [ 4] "add $Zdn, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ADD_ZI_B, ADD_ZI_D, ADD_ZI_H, ADD_ZI_S)>;
+
+// [ 5] "addpl $Rd, $Rn, $imm6";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDPL_XXI)>;
+
+// [ 6] "addvl $Rd, $Rn, $imm6";
+def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs ADDVL_XXI)>;
+
+// [ 7] "adr $Zd, [$Zn, $Zm]";
+def : InstRW<[A64FXWrite_5Cyc_GI0], (instrs ADR_LSL_ZZZ_D_0, ADR_LSL_ZZZ_D_1, ADR_LSL_ZZZ_D_2, ADR_LSL_ZZZ_D_3, ADR_LSL_ZZZ_S_0, ADR_LSL_ZZZ_S_1, ADR_LSL_ZZZ_S_2, ADR_LSL_ZZZ_S_3, ADR_SXTW_ZZZ_D_0, ADR_SXTW_ZZZ_D_1, ADR_SXTW_ZZZ_D_2, ADR_SXTW_ZZZ_D_3, ADR_UXTW_ZZZ_D_0, ADR_UXTW_ZZZ_D_1, ADR_UXTW_ZZZ_D_2, ADR_UXTW_ZZZ_D_3)>;
+
+// [ 8] "and $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs AND_PPzPP)>;
+
+// [ 9] "and $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZZZ)>;
+
+// [10] "and $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZPmZ_B, AND_ZPmZ_D, AND_ZPmZ_H, AND_ZPmZ_S)>;
+
+// [11] "and $Zdn, $_Zdn, $imms13";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs AND_ZI)>;
+
+// [12] "ands $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ANDS_PPzPP)>;
+
+// [13] "andv $Vd, $Pg, $Zn";
+def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ANDV_VPZ_B, ANDV_VPZ_D, ANDV_VPZ_H, ANDV_VPZ_S)>;
+
+// [14] "asr $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZZZ_B, ASR_WIDE_ZZZ_H, ASR_WIDE_ZZZ_S)>;
+
+// [15] "asr $Zd, $Zn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZZI_B, ASR_ZZI_D, ASR_ZZI_H, ASR_ZZI_S)>;
+
+// [16] "asr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_WIDE_ZPmZ_B, ASR_WIDE_ZPmZ_H, ASR_WIDE_ZPmZ_S, ASR_ZPmZ_B, ASR_ZPmZ_D, ASR_ZPmZ_H, ASR_ZPmZ_S)>;
+
+// [17] "asr $Zdn, $Pg/m, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASR_ZPmI_B, ASR_ZPmI_D, ASR_ZPmI_H, ASR_ZPmI_S)>;
+
+// [18] "asrd $Zdn, $Pg/m, $_Zdn, $imm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRD_ZPmI_B, ASRD_ZPmI_D, ASRD_ZPmI_H, ASRD_ZPmI_S)>;
+
+// [19] "asrr $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ASRR_ZPmZ_B, ASRR_ZPmZ_D, ASRR_ZPmZ_H, ASRR_ZPmZ_S)>;
+
+// [20] "bic $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BIC_PPzPP)>;
+
+// [21] "bic $Zd, $Zn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZZZ)>;
+
+// [22] "bic $Zdn, $Pg/m, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs BIC_ZPmZ_B, BIC_ZPmZ_D, BIC_ZPmZ_H, BIC_ZPmZ_S)>;
+
+// [23] "bics $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BICS_PPzPP)>;
+
+// [24] "brka $Pd, $Pg/m, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPmP)>;
+
+// [25] "brka $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKA_PPzP)>;
+
+// [26] "brkas $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKAS_PPzP)>;
+
+// [27] "brkb $Pd, $Pg/m, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPmP)>;
+
+// [28] "brkb $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKB_PPzP)>;
+
+// [29] "brkbs $Pd, $Pg/z, $Pn";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKBS_PPzP)>;
+
+// [30] "brkn $Pdm, $Pg/z, $Pn, $_Pdm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKN_PPzP)>;
+
+// [31] "brkns $Pdm, $Pg/z, $Pn, $_Pdm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKNS_PPzP)>;
+
+// [32] "brkpa $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPA_PPzPP)>;
+
+// [33] "brkpas $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPAS_PPzPP)>;
+
+// [34] "brkpb $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPB_PPzPP)>;
+
+// [35] "brkpbs $Pd, $Pg/z, $Pn, $Pm";
+def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs BRKPBS_PPzPP)>;
+
+// [36] "clasta $Rdn, $Pg, $_Rdn, $Zm";
+def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTA_RPZ_B, CLASTA_RPZ_D, CLASTA_RPZ_H, CLASTA_RPZ_S)>;
+
+// [37] "clasta $Vdn, $Pg, $_Vdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_VPZ_B, CLASTA_VPZ_D, CLASTA_VPZ_H, CLASTA_VPZ_S)>;
+
+// [38] "clasta $Zdn, $Pg, $_Zdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTA_ZPZ_B, CLASTA_ZPZ_D, CLASTA_ZPZ_H, CLASTA_ZPZ_S)>;
+
+// [39] "clastb $Rdn, $Pg, $_Rdn, $Zm";
+def : InstRW<[A64FXWrite_29Cyc_GI0256], (instrs CLASTB_RPZ_B, CLASTB_RPZ_D, CLASTB_RPZ_H, CLASTB_RPZ_S)>;
+
+// [40] "clastb $Vdn, $Pg, $_Vdn, $Zm";
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CLASTB_VPZ_B, CLASTB_VPZ_D, CLASTB_VPZ_H, CLASTB_VPZ_S)>;
+
CLASTB_ZPZ_D, CLASTB_ZPZ_H, CLASTB_ZPZ_S)>; + +// [42] "cls $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLS_ZPmZ_B, CLS_ZPmZ_D, CLS_ZPmZ_H, CLS_ZPmZ_S)>; + +// [43] "clz $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs CLZ_ZPmZ_B, CLZ_ZPmZ_D, CLZ_ZPmZ_H, CLZ_ZPmZ_S)>; + +// [44] "cmpeq $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZZ_B, CMPEQ_PPzZZ_D, CMPEQ_PPzZZ_H, CMPEQ_PPzZZ_S, CMPEQ_WIDE_PPzZZ_B, CMPEQ_WIDE_PPzZZ_H, CMPEQ_WIDE_PPzZZ_S)>; + +// [45] "cmpeq $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPEQ_PPzZI_B, CMPEQ_PPzZI_D, CMPEQ_PPzZI_H, CMPEQ_PPzZI_S)>; + +// [46] "cmpge $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZZ_B, CMPGE_PPzZZ_D, CMPGE_PPzZZ_H, CMPGE_PPzZZ_S, CMPGE_WIDE_PPzZZ_B, CMPGE_WIDE_PPzZZ_H, CMPGE_WIDE_PPzZZ_S)>; + +// [47] "cmpge $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGE_PPzZI_B, CMPGE_PPzZI_D, CMPGE_PPzZI_H, CMPGE_PPzZI_S)>; + +// [48] "cmpgt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZZ_B, CMPGT_PPzZZ_D, CMPGT_PPzZZ_H, CMPGT_PPzZZ_S, CMPGT_WIDE_PPzZZ_B, CMPGT_WIDE_PPzZZ_H, CMPGT_WIDE_PPzZZ_S)>; + +// [49] "cmpgt $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPGT_PPzZI_B, CMPGT_PPzZI_D, CMPGT_PPzZI_H, CMPGT_PPzZI_S)>; + +// [50] "cmphi $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZZ_B, CMPHI_PPzZZ_D, CMPHI_PPzZZ_H, CMPHI_PPzZZ_S, CMPHI_WIDE_PPzZZ_B, CMPHI_WIDE_PPzZZ_H, CMPHI_WIDE_PPzZZ_S)>; + +// [51] "cmphi $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHI_PPzZI_B, CMPHI_PPzZI_D, CMPHI_PPzZI_H, CMPHI_PPzZI_S)>; + +// [52] "cmphs $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZZ_B, CMPHS_PPzZZ_D, CMPHS_PPzZZ_H, CMPHS_PPzZZ_S, CMPHS_WIDE_PPzZZ_B, CMPHS_WIDE_PPzZZ_H, CMPHS_WIDE_PPzZZ_S)>; + +// [53] "cmphs $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPHS_PPzZI_B, CMPHS_PPzZI_D, CMPHS_PPzZI_H, CMPHS_PPzZI_S)>; + +// [54] "cmple $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_WIDE_PPzZZ_B, CMPLE_WIDE_PPzZZ_H, CMPLE_WIDE_PPzZZ_S)>; + +// [55] "cmple $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLE_PPzZI_B, CMPLE_PPzZI_D, CMPLE_PPzZI_H, CMPLE_PPzZI_S)>; + +// [56] "cmplo $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_WIDE_PPzZZ_B, CMPLO_WIDE_PPzZZ_H, CMPLO_WIDE_PPzZZ_S)>; + +// [57] "cmplo $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLO_PPzZI_B, CMPLO_PPzZI_D, CMPLO_PPzZI_H, CMPLO_PPzZI_S)>; + +// [58] "cmpls $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_WIDE_PPzZZ_B, CMPLS_WIDE_PPzZZ_H, CMPLS_WIDE_PPzZZ_S)>; + +// [59] "cmpls $Pd, $Pg/z, $Zn, $imm7"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLS_PPzZI_B, CMPLS_PPzZI_D, CMPLS_PPzZI_H, CMPLS_PPzZI_S)>; + +// [60] "cmplt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_WIDE_PPzZZ_B, CMPLT_WIDE_PPzZZ_H, CMPLT_WIDE_PPzZZ_S)>; + +// [61] "cmplt $Pd, $Pg/z, $Zn, $imm5"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPLT_PPzZI_B, CMPLT_PPzZI_D, CMPLT_PPzZI_H, CMPLT_PPzZI_S)>; + +// [62] "cmpne $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZZ_B, CMPNE_PPzZZ_D, CMPNE_PPzZZ_H, CMPNE_PPzZZ_S, CMPNE_WIDE_PPzZZ_B, CMPNE_WIDE_PPzZZ_H, CMPNE_WIDE_PPzZZ_S)>; + +// [63] "cmpne $Pd, $Pg/z, $Zn, $imm5"; 
+def : InstRW<[A64FXWrite_4Cyc_GI01], (instrs CMPNE_PPzZI_B, CMPNE_PPzZI_D, CMPNE_PPzZI_H, CMPNE_PPzZI_S)>; + +// [64] "cnot $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs CNOT_ZPmZ_B, CNOT_ZPmZ_D, CNOT_ZPmZ_H, CNOT_ZPmZ_S)>; + +// [65] "cnt $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI3], (instrs CNT_ZPmZ_B, CNT_ZPmZ_D, CNT_ZPmZ_H, CNT_ZPmZ_S)>; + +// [66] "cntb $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTB_XPiI)>; + +// [67] "cntd $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTD_XPiI)>; + +// [68] "cnth $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTH_XPiI)>; + +// [69] "cntp $Rd, $Pg, $Pn"; +def : InstRW<[A64FXWrite_6Cyc_GI01], (instrs CNTP_XPP_B, CNTP_XPP_D, CNTP_XPP_H, CNTP_XPP_S)>; + +// [70] "cntw $Rd, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTW_XPiI)>; + +// [71] "compact $Zd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs COMPACT_ZPZ_D, COMPACT_ZPZ_S)>; + +// [72] "cpy $Zd, $Pg/m, $Rn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>; + +// [73] "cpy $Zd, $Pg/m, $Vn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>; + +// [74] "cpy $Zd, $Pg/m, $imm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>; + +// [75] "cpy $Zd, $Pg/z, $imm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>; + +// [76] "ctermeq $Rn, $Rm"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMEQ_WW, CTERMEQ_XX)>; + +// [77] "ctermne $Rn, $Rm"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMNE_WW, CTERMNE_XX)>; + +// [78] "decb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECB_XPiI)>; + +// [79] "decd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECD_XPiI)>; + +// [80] "decd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECD_ZPiI)>; + +// [81] "dech $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECH_XPiI)>; + +// [82] "dech $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECH_ZPiI)>; + +// [83] "decp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs DECP_XP_B, DECP_XP_D, DECP_XP_H, DECP_XP_S)>; + +// [84] "decp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs DECP_ZP_D, DECP_ZP_H, DECP_ZP_S)>; + +// [85] "decw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs DECW_XPiI)>; + +// [86] "decw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs DECW_ZPiI)>; + +// [87] "dup $Zd, $Rn"; +def : InstRW<[A64FXWrite_8Cyc_GI01], (instrs DUP_ZR_B, DUP_ZR_D, DUP_ZR_H, DUP_ZR_S)>; + +// [88] "dup $Zd, $Zn$idx"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs DUP_ZZI_B, DUP_ZZI_D, DUP_ZZI_H, DUP_ZZI_Q, DUP_ZZI_S)>; + +// [89] "dup $Zd, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUP_ZI_B, DUP_ZI_D, DUP_ZI_H, DUP_ZI_S)>; + +// [90] "dupm $Zd, $imms"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs DUPM_ZI)>; + +// [91] "eor $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EOR_PPzPP)>; + +// [92] "eor $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZZZ)>; + +// [93] "eor $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs EOR_ZPmZ_B, EOR_ZPmZ_D, EOR_ZPmZ_H, EOR_ZPmZ_S)>; + +// [94] "eor $Zdn, $_Zdn, $imms13"; +def : 
InstRW<[A64FXWrite_4Cyc_GI0], (instrs EOR_ZI)>; + +// [95] "eors $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs EORS_PPzPP)>; + +// [96] "eorv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs EORV_VPZ_B, EORV_VPZ_D, EORV_VPZ_H, EORV_VPZ_S)>; + +// [97] "ext $Zdn, $_Zdn, $Zm, $imm8"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs EXT_ZZI)>; + +// [99] "fabd $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FABD_ZPmZ_D, FABD_ZPmZ_H, FABD_ZPmZ_S)>; + +// [100] "fabs $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FABS_ZPmZ_D, FABS_ZPmZ_H, FABS_ZPmZ_S)>; + +// [101] "facge $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGE_PPzZZ_D, FACGE_PPzZZ_H, FACGE_PPzZZ_S)>; + +// [102] "facgt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FACGT_PPzZZ_D, FACGT_PPzZZ_H, FACGT_PPzZZ_S)>; + +// [103] "fadd $Zd, $Zn, $Zm"; def is line 1638 +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZZZ_D, FADD_ZZZ_H, FADD_ZZZ_S)>; + +// [104] "fadd $Zdn, $Pg/m, $_Zdn, $Zm"; def is line 1638 +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmZ_D, FADD_ZPmZ_H, FADD_ZPmZ_S)>; + +// [105] "fadd $Zdn, $Pg/m, $_Zdn, $i1"; def is line 1638 +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FADD_ZPmI_D, FADD_ZPmI_H, FADD_ZPmI_S)>; + +// [106] "fadda $Vdn, $Pg, $_Vdn, $Zm"; +def : InstRW<[A64FXWrite_18Cyc_GI03], (instrs FADDA_VPZ_D, FADDA_VPZ_H, FADDA_VPZ_S)>; + +// [107] "faddv $Vd, $Pg, $Zn"; +// H : 4 / 6 / ([1,2]9 / [1]6) x 4 / [1,2]9 = 75 cycle +// S : 4 / 6 / ([1,2]9 / [1]6) x 3 / [1,2]9 = 60 cycle +// D : 4 / 6 / ([1,2]9 / [1]6) x 2 / [1,2]9 = 45 cycle +def : InstRW<[A64FXWrite_75Cyc_GI03], (instrs FADDV_VPZ_H)>; +def : InstRW<[A64FXWrite_60Cyc_GI03], (instrs FADDV_VPZ_S)>; +def : InstRW<[A64FXWrite_45Cyc_GI03], (instrs FADDV_VPZ_D)>; + +// [108] "fcadd $Zdn, $Pg/m, $_Zdn, $Zm, $imm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCADD_ZPmZ_D, FCADD_ZPmZ_H, FCADD_ZPmZ_S)>; + +// [109] "fcmeq $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZ0_D, FCMEQ_PPzZ0_H, FCMEQ_PPzZ0_S)>; + +// [110] "fcmeq $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMEQ_PPzZZ_D, FCMEQ_PPzZZ_H, FCMEQ_PPzZZ_S)>; + +// [111] "fcmge $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZ0_D, FCMGE_PPzZ0_H, FCMGE_PPzZ0_S)>; + +// [112] "fcmge $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGE_PPzZZ_D, FCMGE_PPzZZ_H, FCMGE_PPzZZ_S)>; + +// [113] "fcmgt $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZ0_D, FCMGT_PPzZ0_H, FCMGT_PPzZ0_S)>; + +// [114] "fcmgt $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMGT_PPzZZ_D, FCMGT_PPzZZ_H, FCMGT_PPzZZ_S)>; + +// [115] "fcmla $Zda, $Pg/m, $Zn, $Zm, $imm"; +def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZPmZZ_D, FCMLA_ZPmZZ_H, FCMLA_ZPmZZ_S)>; + +// [116] "fcmla $Zda, $Zn, $Zm$iop, $imm"; +def : InstRW<[A64FXWrite_15Cyc_GI03], (instrs FCMLA_ZZZI_H, FCMLA_ZZZI_S)>; + +// [117] "fcmle $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLE_PPzZ0_D, FCMLE_PPzZ0_H, FCMLE_PPzZ0_S)>; + +// [118] "fcmlt $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMLT_PPzZ0_D, FCMLT_PPzZ0_H, FCMLT_PPzZ0_S)>; + +// [119] "fcmne $Pd, $Pg/z, $Zn, #0.0"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMNE_PPzZ0_D, FCMNE_PPzZ0_H, FCMNE_PPzZ0_S)>; + +// [120] "fcmne $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs 
FCMNE_PPzZZ_D, FCMNE_PPzZZ_H, FCMNE_PPzZZ_S)>; + +// [121] "fcmuo $Pd, $Pg/z, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCMUO_PPzZZ_D, FCMUO_PPzZZ_H, FCMUO_PPzZZ_S)>; + +// [122] "fcpy $Zd, $Pg/m, $imm8"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FCPY_ZPmI_D, FCPY_ZPmI_H, FCPY_ZPmI_S)>; + +// [123] "fcvt $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVT_ZPmZ_DtoH, FCVT_ZPmZ_DtoS, FCVT_ZPmZ_HtoD, FCVT_ZPmZ_HtoS, FCVT_ZPmZ_StoD, FCVT_ZPmZ_StoH)>; + +// [124] "fcvtzs $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZS_ZPmZ_DtoD, FCVTZS_ZPmZ_DtoS, FCVTZS_ZPmZ_HtoD, FCVTZS_ZPmZ_HtoH, FCVTZS_ZPmZ_HtoS, FCVTZS_ZPmZ_StoD, FCVTZS_ZPmZ_StoS)>; + +// [125] "fcvtzu $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FCVTZU_ZPmZ_DtoD, FCVTZU_ZPmZ_DtoS, FCVTZU_ZPmZ_HtoD, FCVTZU_ZPmZ_HtoH, FCVTZU_ZPmZ_HtoS, FCVTZU_ZPmZ_StoD, FCVTZU_ZPmZ_StoS)>; + +// [126] "fdiv $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIV_ZPmZ_D)>; +def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIV_ZPmZ_H)>; +def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIV_ZPmZ_S)>; + +// [127] "fdivr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FDIVR_ZPmZ_D)>; +def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FDIVR_ZPmZ_H)>; +def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FDIVR_ZPmZ_S)>; + +// [128] "fdup $Zd, $imm8"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FDUP_ZI_D, FDUP_ZI_H, FDUP_ZI_S)>; + +// [129] "fexpa $Zd, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FEXPA_ZZ_D, FEXPA_ZZ_H, FEXPA_ZZ_S)>; + +// [130] "fmad $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMAD_ZPmZZ_D, FMAD_ZPmZZ_H, FMAD_ZPmZZ_S)>; + +// [131] "fmax $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAX_ZPmZ_D, FMAX_ZPmZ_H, FMAX_ZPmZ_S)>; + +// [132] "fmax $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAX_ZPmI_D, FMAX_ZPmI_H, FMAX_ZPmI_S)>; + +// [133] "fmaxnm $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMAXNM_ZPmZ_D, FMAXNM_ZPmZ_H, FMAXNM_ZPmZ_S)>; + +// [134] "fmaxnm $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMAXNM_ZPmI_D, FMAXNM_ZPmI_H, FMAXNM_ZPmI_S)>; + +// [135] "fmaxnmv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXNMV_VPZ_D, FMAXNMV_VPZ_H, FMAXNMV_VPZ_S)>; + +// [136] "fmaxv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMAXV_VPZ_D, FMAXV_VPZ_H, FMAXV_VPZ_S)>; + +// [137] "fmin $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMIN_ZPmZ_D, FMIN_ZPmZ_H, FMIN_ZPmZ_S)>; + +// [138] "fmin $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMIN_ZPmI_D, FMIN_ZPmI_H, FMIN_ZPmI_S)>; + +// [139] "fminnm $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FMINNM_ZPmZ_D, FMINNM_ZPmZ_H, FMINNM_ZPmZ_S)>; + +// [140] "fminnm $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs FMINNM_ZPmI_D, FMINNM_ZPmI_H, FMINNM_ZPmI_S)>; + +// [141] "fminnmv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINNMV_VPZ_D, FMINNMV_VPZ_H, FMINNMV_VPZ_S)>; + +// [142] "fminv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_10Cyc_GI03], (instrs FMINV_VPZ_D, FMINV_VPZ_H, FMINV_VPZ_S)>; + +// [143] "fmla $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZPmZZ_D, FMLA_ZPmZZ_H, FMLA_ZPmZZ_S)>; + +// [144] "fmla $Zda, $Zn, $Zm$iop"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLA_ZZZI_D, FMLA_ZZZI_H, 
FMLA_ZZZI_S)>; + +// [145] "fmls $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZPmZZ_D, FMLS_ZPmZZ_H, FMLS_ZPmZZ_S)>; + +// [146] "fmls $Zda, $Zn, $Zm$iop"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FMLS_ZZZI_D, FMLS_ZZZI_H, FMLS_ZZZI_S)>; + +// [147] "fmsb $Zdn, $Pg/m, $Zm, $Za"; + +// [148] "fmul $Zd, $Zn, $Zm"; + +// [149] "fmul $Zd, $Zn, $Zm$iop"; + +// [150] "fmul $Zdn, $Pg/m, $_Zdn, $Zm"; + +// [151] "fmul $Zdn, $Pg/m, $_Zdn, $i1"; + +// [152] "fmulx $Zdn, $Pg/m, $_Zdn, $Zm"; + +// [153] "fneg $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FNEG_ZPmZ_D, FNEG_ZPmZ_H, FNEG_ZPmZ_S)>; + +// [154] "fnmad $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMAD_ZPmZZ_D, FNMAD_ZPmZZ_H, FNMAD_ZPmZZ_S)>; + +// [155] "fnmla $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLA_ZPmZZ_D, FNMLA_ZPmZZ_H, FNMLA_ZPmZZ_S)>; + +// [156] "fnmls $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMLS_ZPmZZ_D, FNMLS_ZPmZZ_H, FNMLS_ZPmZZ_S)>; + +// [157] "fnmsb $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FNMSB_ZPmZZ_D, FNMSB_ZPmZZ_H, FNMSB_ZPmZZ_S)>; + +// [158] "frecpe $Zd, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPE_ZZ_D, FRECPE_ZZ_H, FRECPE_ZZ_S)>; + +// [159] "frecps $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRECPS_ZZZ_D, FRECPS_ZZZ_H, FRECPS_ZZZ_S)>; + +// [160] "frecpx $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRECPX_ZPmZ_D, FRECPX_ZPmZ_H, FRECPX_ZPmZ_S)>; + +// [161] "frinta $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTA_ZPmZ_D, FRINTA_ZPmZ_H, FRINTA_ZPmZ_S)>; + +// [162] "frinti $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTI_ZPmZ_D, FRINTI_ZPmZ_H, FRINTI_ZPmZ_S)>; + +// [163] "frintm $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTM_ZPmZ_D, FRINTM_ZPmZ_H, FRINTM_ZPmZ_S)>; + +// [164] "frintn $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTN_ZPmZ_D, FRINTN_ZPmZ_H, FRINTN_ZPmZ_S)>; + +// [165] "frintp $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTP_ZPmZ_D, FRINTP_ZPmZ_H, FRINTP_ZPmZ_S)>; + +// [166] "frintx $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTX_ZPmZ_D, FRINTX_ZPmZ_H, FRINTX_ZPmZ_S)>; + +// [167] "frintz $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRINTZ_ZPmZ_D, FRINTZ_ZPmZ_H, FRINTZ_ZPmZ_S)>; + +// [168] "frsqrte $Zd, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FRSQRTE_ZZ_D, FRSQRTE_ZZ_H, FRSQRTE_ZZ_S)>; + +// [169] "frsqrts $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs FRSQRTS_ZZZ_D, FRSQRTS_ZZZ_H, FRSQRTS_ZZZ_S)>; + +// [170] "fscale $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSCALE_ZPmZ_D, FSCALE_ZPmZ_H, FSCALE_ZPmZ_S)>; + +// [171] "fsqrt $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_154Cyc_GI0], (instrs FSQRT_ZPmZ_D)>; +def : InstRW<[A64FXWrite_134Cyc_GI0], (instrs FSQRT_ZPmZ_H)>; +def : InstRW<[A64FXWrite_98Cyc_GI0], (instrs FSQRT_ZPmZ_S)>; + +// [172] "fsub $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZZZ_D, FSUB_ZZZ_H, FSUB_ZZZ_S)>; + +// [173] "fsub $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUB_ZPmZ_D, FSUB_ZPmZ_H, FSUB_ZPmZ_S)>; + +// [174] "fsub $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUB_ZPmI_D, FSUB_ZPmI_H, FSUB_ZPmI_S)>; + +// [175] "fsubr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : 
InstRW<[A64FXWrite_9Cyc_GI03], (instrs FSUBR_ZPmZ_D, FSUBR_ZPmZ_H, FSUBR_ZPmZ_S)>; + +// [176] "fsubr $Zdn, $Pg/m, $_Zdn, $i1"; +def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs FSUBR_ZPmI_D, FSUBR_ZPmI_H, FSUBR_ZPmI_S)>; + +// [177] "ftmad $Zdn, $_Zdn, $Zm, $imm3"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTMAD_ZZI_D, FTMAD_ZZI_H, FTMAD_ZZI_S)>; + +// [178] "ftsmul $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs FTSMUL_ZZZ_D, FTSMUL_ZZZ_H, FTSMUL_ZZZ_S)>; + +// [180] "incb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCB_XPiI)>; + +// [181] "incd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCD_XPiI)>; + +// [182] "incd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCD_ZPiI)>; + +// [183] "inch $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCH_XPiI)>; + +// [184] "inch $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCH_ZPiI)>; + +// [185] "incp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_6Cyc_GI124], (instrs INCP_XP_B, INCP_XP_D, INCP_XP_H, INCP_XP_S)>; + +// [186] "incp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs INCP_ZP_D, INCP_ZP_H, INCP_ZP_S)>; + +// [187] "incw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs INCW_XPiI)>; + +// [188] "incw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs INCW_ZPiI)>; + +// [189] "index $Zd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_17Cyc_GI02], (instrs INDEX_RR_B, INDEX_RR_D, INDEX_RR_H, INDEX_RR_S)>; + +// [190] "index $Zd, $Rn, $imm5"; +def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_RI_B, INDEX_RI_D, INDEX_RI_H, INDEX_RI_S)>; + +// [191] "index $Zd, $imm5, $Rm"; +def : InstRW<[A64FXWrite_21Cyc_GI02], (instrs INDEX_IR_B, INDEX_IR_D, INDEX_IR_H, INDEX_IR_S)>; + +// [192] "index $Zd, $imm5, $imm5b"; +def : InstRW<[A64FXWrite_13Cyc_GI0], (instrs INDEX_II_B, INDEX_II_D, INDEX_II_H, INDEX_II_S)>; + +// [193] "insr $Zdn, $Rm"; +def : InstRW<[A64FXWrite_10Cyc_GI02], (instrs INSR_ZR_B, INSR_ZR_D, INSR_ZR_H, INSR_ZR_S)>; + +// [194] "insr $Zdn, $Vm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs INSR_ZV_B, INSR_ZV_D, INSR_ZV_H, INSR_ZV_S)>; + +// [195] "lasta $Rd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTA_RPZ_B, LASTA_RPZ_D, LASTA_RPZ_H, LASTA_RPZ_S)>; + +// [196] "lasta $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTA_VPZ_B, LASTA_VPZ_D, LASTA_VPZ_H, LASTA_VPZ_S)>; + +// [197] "lastb $Rd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_25Cyc_GI056], (instrs LASTB_RPZ_B, LASTB_RPZ_D, LASTB_RPZ_H, LASTB_RPZ_S)>; + +// [198] "lastb $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs LASTB_VPZ_B, LASTB_VPZ_D, LASTB_VPZ_H, LASTB_VPZ_S)>; + +// [199] "ld1b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B, LD1B_D, LD1B_H, LD1B_S)>; + +// [200] "ld1b $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1B_D_REAL, GLD1B_D_SXTW_REAL, GLD1B_D_UXTW_REAL, GLD1B_S_SXTW_REAL, GLD1B_S_UXTW_REAL)>; + +// [201] "ld1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1B_D_IMM_REAL, LD1B_H_IMM_REAL, LD1B_IMM_REAL, LD1B_S_IMM_REAL)>; + +// [202] "ld1b $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1B_D_IMM_REAL, GLD1B_S_IMM_REAL)>; + +// [203] "ld1d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D)>; + +// [204] "ld1d $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], 
(instrs GLD1D_REAL, GLD1D_SCALED_REAL, GLD1D_SXTW_REAL, GLD1D_SXTW_SCALED_REAL, GLD1D_UXTW_REAL, GLD1D_UXTW_SCALED_REAL)>; + +// [205] "ld1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1D_IMM_REAL)>; + +// [206] "ld1d $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1D_IMM_REAL)>; + +// [207] "ld1h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H, LD1H_D, LD1H_S)>; + +// [208] "ld1h $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1H_D_REAL, GLD1H_D_SCALED_REAL, GLD1H_D_SXTW_REAL, GLD1H_D_SXTW_SCALED_REAL, GLD1H_D_UXTW_REAL, GLD1H_D_UXTW_SCALED_REAL, GLD1H_S_SXTW_REAL, GLD1H_S_SXTW_SCALED_REAL, GLD1H_S_UXTW_REAL, GLD1H_S_UXTW_SCALED_REAL)>; + +// [209] "ld1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1H_D_IMM_REAL, LD1H_IMM_REAL, LD1H_S_IMM_REAL)>; + +// [210] "ld1h $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1H_D_IMM_REAL, GLD1H_S_IMM_REAL)>; + +// [211] "ld1rb $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RB_D_IMM, LD1RB_H_IMM, LD1RB_IMM, LD1RB_S_IMM)>; + +// [212] "ld1rd $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RD_IMM)>; + +// [213] "ld1rh $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RH_D_IMM, LD1RH_IMM, LD1RH_S_IMM)>; + +// [214] "ld1rqb $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B)>; + +// [215] "ld1rqb $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_B_IMM)>; + +// [216] "ld1rqd $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D)>; + +// [217] "ld1rqd $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_D_IMM)>; + +// [218] "ld1rqh $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H)>; + +// [219] "ld1rqh $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_H_IMM)>; + +// [220] "ld1rqw $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W)>; + +// [221] "ld1rqw $Zt, $Pg/z, [$Rn, $imm4]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RQ_W_IMM)>; + +// [222] "ld1rsb $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSB_D_IMM, LD1RSB_H_IMM, LD1RSB_S_IMM)>; + +// [223] "ld1rsh $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSH_D_IMM, LD1RSH_S_IMM)>; + +// [224] "ld1rsw $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RSW_IMM)>; + +// [225] "ld1rw $Zt, $Pg/z, [$Rn, $imm6]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1RW_D_IMM, LD1RW_IMM)>; + +// [226] "ld1sb $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D, LD1SB_H, LD1SB_S)>; + +// [227] "ld1sb $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SB_D_REAL, GLD1SB_D_SXTW_REAL, GLD1SB_D_UXTW_REAL, GLD1SB_S_SXTW_REAL, GLD1SB_S_UXTW_REAL)>; + +// [228] "ld1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SB_D_IMM_REAL, LD1SB_H_IMM_REAL, LD1SB_S_IMM_REAL)>; + +// [229] "ld1sb $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SB_D_IMM_REAL, GLD1SB_S_IMM_REAL)>; + +// [230] "ld1sh $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D, LD1SH_S)>; + +// [231] "ld1sh $Zt, $Pg/z, [$Rn, $Zm]"; +def : 
InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SH_D_REAL, GLD1SH_D_SCALED_REAL, GLD1SH_D_SXTW_REAL, GLD1SH_D_SXTW_SCALED_REAL, GLD1SH_D_UXTW_REAL, GLD1SH_D_UXTW_SCALED_REAL, GLD1SH_S_SXTW_REAL, GLD1SH_S_SXTW_SCALED_REAL, GLD1SH_S_UXTW_REAL, GLD1SH_S_UXTW_SCALED_REAL)>; + +// [232] "ld1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SH_D_IMM_REAL, LD1SH_S_IMM_REAL)>; + +// [233] "ld1sh $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SH_D_IMM_REAL, GLD1SH_S_IMM_REAL)>; + +// [234] "ld1sw $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D)>; + +// [235] "ld1sw $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1SW_D_REAL, GLD1SW_D_SCALED_REAL, GLD1SW_D_SXTW_REAL, GLD1SW_D_SXTW_SCALED_REAL, GLD1SW_D_UXTW_REAL, GLD1SW_D_UXTW_SCALED_REAL)>; + +// [236] "ld1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1SW_D_IMM_REAL)>; + +// [237] "ld1sw $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1SW_D_IMM_REAL)>; + +// [238] "ld1w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W, LD1W_D)>; + +// [239] "ld1w $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLD1W_D_REAL, GLD1W_D_SCALED_REAL, GLD1W_D_SXTW_REAL, GLD1W_D_SXTW_SCALED_REAL, GLD1W_D_UXTW_REAL, GLD1W_D_UXTW_SCALED_REAL, GLD1W_SXTW_REAL, GLD1W_SXTW_SCALED_REAL, GLD1W_UXTW_REAL, GLD1W_UXTW_SCALED_REAL)>; + +// [240] "ld1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD1W_D_IMM_REAL, LD1W_IMM_REAL)>; + +// [241] "ld1w $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLD1W_D_IMM_REAL, GLD1W_IMM_REAL)>; + +// [242] "ld2b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B)>; + +// [243] "ld2b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2B_IMM)>; + +// [244] "ld2d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D)>; + +// [245] "ld2d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2D_IMM)>; + +// [246] "ld2h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H)>; + +// [247] "ld2h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD2H_IMM)>; + +// [248] "ld2w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W)>; + +// [249] "ld2w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD2W_IMM)>; + +// [250] "ld3b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B)>; + +// [251] "ld3b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3B_IMM)>; + +// [252] "ld3d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D)>; + +// [253] "ld3d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3D_IMM)>; + +// [254] "ld3h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H)>; + +// [255] "ld3h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD3H_IMM)>; + +// [256] "ld3w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W)>; + +// [257] "ld3w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD3W_IMM)>; + +// [258] "ld4b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B)>; 
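+
+// A note on the write-res identifiers used by these load entries: they follow
+// the pattern A64FXWrite_<latency>Cyc_GI<pipeline set> and are bound to the
+// processor resources defined near the top of this file. Assuming those
+// definitions, the 44-cycle class consumed by the structured loads above
+// corresponds to a record along these lines (record and resource names here
+// are illustrative, not copied from the definitions section):
+//
+//   def A64FXWrite_44Cyc_GI56 : SchedWriteRes<[A64FXGI56]> {
+//     let Latency = 44;
+//   }
+//
+// i.e. LD4B issues to the GI5/GI6 load pipes and its result becomes
+// available after 44 cycles.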
+ +// [259] "ld4b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_44Cyc_GI56], (instrs LD4B_IMM)>; + +// [260] "ld4d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D)>; + +// [261] "ld4d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4D_IMM)>; + +// [262] "ld4h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H)>; + +// [263] "ld4h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4H_IMM)>; + +// [264] "ld4w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W)>; + +// [265] "ld4w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LD4W_IMM)>; + +// [266] "ldff1b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1B_D_REAL, LDFF1B_H_REAL, LDFF1B_REAL, LDFF1B_S_REAL)>; + +// [267] "ldff1b $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1B_D_REAL, GLDFF1B_D_SXTW_REAL, GLDFF1B_D_UXTW_REAL, GLDFF1B_S_SXTW_REAL, GLDFF1B_S_UXTW_REAL)>; + +// [268] "ldff1b $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1B_D_IMM_REAL, GLDFF1B_S_IMM_REAL)>; + +// [269] "ldff1d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1D_REAL)>; + +// [270] "ldff1d $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1D_REAL, GLDFF1D_SCALED_REAL, GLDFF1D_SXTW_REAL, GLDFF1D_SXTW_SCALED_REAL, GLDFF1D_UXTW_REAL, GLDFF1D_UXTW_SCALED_REAL)>; + +// [271] "ldff1d $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1D_IMM_REAL)>; + +// [272] "ldff1h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1H_D_REAL, LDFF1H_REAL, LDFF1H_S_REAL)>; + +// [273] "ldff1h $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1H_D_REAL, GLDFF1H_D_SCALED_REAL, GLDFF1H_D_SXTW_REAL, GLDFF1H_D_SXTW_SCALED_REAL, GLDFF1H_D_UXTW_REAL, GLDFF1H_D_UXTW_SCALED_REAL, GLDFF1H_S_SXTW_REAL, GLDFF1H_S_SXTW_SCALED_REAL, GLDFF1H_S_UXTW_REAL, GLDFF1H_S_UXTW_SCALED_REAL)>; + +// [274] "ldff1h $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1H_D_IMM_REAL, GLDFF1H_S_IMM_REAL)>; + +// [275] "ldff1sb $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SB_D_REAL, LDFF1SB_H_REAL, LDFF1SB_S_REAL)>; + +// [276] "ldff1sb $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SB_D_REAL, GLDFF1SB_D_SXTW_REAL, GLDFF1SB_D_UXTW_REAL, GLDFF1SB_S_SXTW_REAL, GLDFF1SB_S_UXTW_REAL)>; + +// [277] "ldff1sb $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SB_D_IMM_REAL, GLDFF1SB_S_IMM_REAL)>; + +// [278] "ldff1sh $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SH_D_REAL, LDFF1SH_S_REAL)>; + +// [279] "ldff1sh $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SH_D_REAL, GLDFF1SH_D_SCALED_REAL, GLDFF1SH_D_SXTW_REAL, GLDFF1SH_D_SXTW_SCALED_REAL, GLDFF1SH_D_UXTW_REAL, GLDFF1SH_D_UXTW_SCALED_REAL, GLDFF1SH_S_SXTW_REAL, GLDFF1SH_S_SXTW_SCALED_REAL, GLDFF1SH_S_UXTW_REAL, GLDFF1SH_S_UXTW_SCALED_REAL)>; + +// [280] "ldff1sh $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SH_D_IMM_REAL, GLDFF1SH_S_IMM_REAL)>; + +// [281] "ldff1sw $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1SW_D_REAL)>; + +// [282] "ldff1sw $Zt, $Pg/z, [$Rn, $Zm]"; +def : 
InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1SW_D_REAL, GLDFF1SW_D_SCALED_REAL, GLDFF1SW_D_SXTW_REAL, GLDFF1SW_D_SXTW_SCALED_REAL, GLDFF1SW_D_UXTW_REAL, GLDFF1SW_D_UXTW_SCALED_REAL)>; + +// [283] "ldff1sw $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1SW_D_IMM_REAL)>; + +// [284] "ldff1w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDFF1W_D_REAL, LDFF1W_REAL)>; + +// [285] "ldff1w $Zt, $Pg/z, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_19Cyc_GI0256], (instrs GLDFF1W_D_REAL, GLDFF1W_D_SCALED_REAL, GLDFF1W_D_SXTW_REAL, GLDFF1W_D_SXTW_SCALED_REAL, GLDFF1W_D_UXTW_REAL, GLDFF1W_D_UXTW_SCALED_REAL, GLDFF1W_SXTW_REAL, GLDFF1W_SXTW_SCALED_REAL, GLDFF1W_UXTW_REAL, GLDFF1W_UXTW_SCALED_REAL)>; + +// [286] "ldff1w $Zt, $Pg/z, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_15Cyc_GI056], (instrs GLDFF1W_D_IMM_REAL, GLDFF1W_IMM_REAL)>; + +// [287] "ldnf1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1B_D_IMM_REAL, LDNF1B_H_IMM_REAL, LDNF1B_IMM_REAL, LDNF1B_S_IMM_REAL)>; + +// [288] "ldnf1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1D_IMM_REAL)>; + +// [289] "ldnf1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1H_D_IMM_REAL, LDNF1H_IMM_REAL, LDNF1H_S_IMM_REAL)>; + +// [290] "ldnf1sb $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SB_D_IMM_REAL, LDNF1SB_H_IMM_REAL, LDNF1SB_S_IMM_REAL)>; + +// [291] "ldnf1sh $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SH_D_IMM_REAL, LDNF1SH_S_IMM_REAL)>; + +// [292] "ldnf1sw $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1SW_D_IMM_REAL)>; + +// [293] "ldnf1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNF1W_D_IMM_REAL, LDNF1W_IMM_REAL)>; + +// [294] "ldnt1b $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRR)>; + +// [295] "ldnt1b $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1B_ZRI)>; + +// [296] "ldnt1d $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRR)>; + +// [297] "ldnt1d $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1D_ZRI)>; + +// [298] "ldnt1h $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRR)>; + +// [299] "ldnt1h $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1H_ZRI)>; + +// [300] "ldnt1w $Zt, $Pg/z, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRR)>; + +// [301] "ldnt1w $Zt, $Pg/z, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI56], (instrs LDNT1W_ZRI)>; + +// [302] "ldr $Pt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_PXI)>; + +// [303] "ldr $Zt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_11Cyc_GI5], (instrs LDR_ZXI)>; + +// [304] "lsl $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZZZ_B, LSL_WIDE_ZZZ_H, LSL_WIDE_ZZZ_S)>; + +// [305] "lsl $Zd, $Zn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_ZZI_B, LSL_ZZI_D, LSL_ZZI_H, LSL_ZZI_S)>; + +// [306] "lsl $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSL_WIDE_ZPmZ_B, LSL_WIDE_ZPmZ_H, LSL_WIDE_ZPmZ_S, LSL_ZPmZ_B, LSL_ZPmZ_D, LSL_ZPmZ_H, LSL_ZPmZ_S)>; + +// [307] "lsl $Zdn, $Pg/m, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs 
LSL_ZPmI_B, LSL_ZPmI_D, LSL_ZPmI_H, LSL_ZPmI_S)>; + +// [308] "lslr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSLR_ZPmZ_B, LSLR_ZPmZ_D, LSLR_ZPmZ_H, LSLR_ZPmZ_S)>; + +// [309] "lsr $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZZZ_B, LSR_WIDE_ZZZ_H, LSR_WIDE_ZZZ_S)>; + +// [310] "lsr $Zd, $Zn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZZI_B, LSR_ZZI_D, LSR_ZZI_H, LSR_ZZI_S)>; + +// [311] "lsr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_WIDE_ZPmZ_B, LSR_WIDE_ZPmZ_H, LSR_WIDE_ZPmZ_S, LSR_ZPmZ_B, LSR_ZPmZ_D, LSR_ZPmZ_H, LSR_ZPmZ_S)>; + +// [312] "lsr $Zdn, $Pg/m, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSR_ZPmI_B, LSR_ZPmI_D, LSR_ZPmI_H, LSR_ZPmI_S)>; + +// [313] "lsrr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs LSRR_ZPmZ_B, LSRR_ZPmZ_D, LSRR_ZPmZ_H, LSRR_ZPmZ_S)>; + +// [314] "mad $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MAD_ZPmZZ_B, MAD_ZPmZZ_D, MAD_ZPmZZ_H, MAD_ZPmZZ_S)>; + +// [315] "mla $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLA_ZPmZZ_B, MLA_ZPmZZ_D, MLA_ZPmZZ_H, MLA_ZPmZZ_S)>; + +// [316] "mls $Zda, $Pg/m, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MLS_ZPmZZ_B, MLS_ZPmZZ_D, MLS_ZPmZZ_H, MLS_ZPmZZ_S)>; + +// [317] "movprfx $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPmZ_B, MOVPRFX_ZPmZ_D, MOVPRFX_ZPmZ_H, MOVPRFX_ZPmZ_S)>; + +// [318] "movprfx $Zd, $Pg/z, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZPzZ_B, MOVPRFX_ZPzZ_D, MOVPRFX_ZPzZ_H, MOVPRFX_ZPzZ_S)>; + +// [319] "movprfx $Zd, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs MOVPRFX_ZZ)>; + +// [320] "msb $Zdn, $Pg/m, $Zm, $Za"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MSB_ZPmZZ_B, MSB_ZPmZZ_D, MSB_ZPmZZ_H, MSB_ZPmZZ_S)>; + +// [321] "mul $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs MUL_ZPmZ_B, MUL_ZPmZ_D, MUL_ZPmZ_H, MUL_ZPmZ_S)>; + +// [322] "mul $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_9Cyc_GI0], (instrs MUL_ZI_B, MUL_ZI_D, MUL_ZI_H, MUL_ZI_S)>; + +// [323] "nand $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NAND_PPzPP)>; + +// [324] "nands $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NANDS_PPzPP)>; + +// [325] "neg $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NEG_ZPmZ_B, NEG_ZPmZ_D, NEG_ZPmZ_H, NEG_ZPmZ_S)>; + +// [326] "nor $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NOR_PPzPP)>; + +// [327] "nors $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs NORS_PPzPP)>; + +// [328] "not $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs NOT_ZPmZ_B, NOT_ZPmZ_D, NOT_ZPmZ_H, NOT_ZPmZ_S)>; + +// [329] "orn $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORN_PPzPP)>; + +// [330] "orns $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORNS_PPzPP)>; + +// [331] "orr $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORR_PPzPP)>; + +// [332] "orr $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZZZ)>; + +// [333] "orr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs ORR_ZPmZ_B, ORR_ZPmZ_D, ORR_ZPmZ_H, ORR_ZPmZ_S)>; + +// [334] "orr $Zdn, $_Zdn, $imms13"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs ORR_ZI)>; + +// [335] "orrs $Pd, $Pg/z, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs ORRS_PPzPP)>; + +// [336] "orv $Vd, $Pg, 
$Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs ORV_VPZ_B, ORV_VPZ_D, ORV_VPZ_H, ORV_VPZ_S)>; + +// [337] "pfalse $Pd"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PFALSE)>; + +// [338] "pnext $Pdn, $Pg, $_Pdn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PNEXT_B, PNEXT_D, PNEXT_H, PNEXT_S)>; + +// [339] "prfb $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRR)>; + +// [340] "prfb $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFB_D_SCALED, PRFB_D_SXTW_SCALED, PRFB_D_UXTW_SCALED, PRFB_S_SXTW_SCALED, PRFB_S_UXTW_SCALED)>; + +// [341] "prfb $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFB_PRI)>; + +// [342] "prfb $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFB_D_PZI, PRFB_S_PZI)>; + +// [343] "prfd $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRR)>; + +// [344] "prfd $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFD_D_SCALED, PRFD_D_SXTW_SCALED, PRFD_D_UXTW_SCALED, PRFD_S_SXTW_SCALED, PRFD_S_UXTW_SCALED)>; + +// [345] "prfd $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFD_PRI)>; + +// [346] "prfd $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFD_D_PZI, PRFD_S_PZI)>; + +// [347] "prfh $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRR)>; + +// [348] "prfh $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFH_D_SCALED, PRFH_D_SXTW_SCALED, PRFH_D_UXTW_SCALED, PRFH_S_SXTW_SCALED, PRFH_S_UXTW_SCALED)>; + +// [349] "prfh $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFH_PRI)>; + +// [350] "prfh $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFH_D_PZI, PRFH_S_PZI)>; + +// [351] "prfw $prfop, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFS_PRR)>; + +// [352] "prfw $prfop, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_14Cyc_GI0256], (instrs PRFW_D_SCALED, PRFW_D_SXTW_SCALED, PRFW_D_UXTW_SCALED, PRFW_S_SXTW_SCALED, PRFW_S_UXTW_SCALED)>; + +// [353] "prfw $prfop, $Pg, [$Rn, $imm6, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI56], (instrs PRFW_PRI)>; + +// [354] "prfw $prfop, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_10Cyc_GI056], (instrs PRFW_D_PZI, PRFW_S_PZI)>; + +// [355] "ptest $Pg, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTEST_PP)>; + +// [356] "ptrue $Pd, $pattern"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUE_B, PTRUE_D, PTRUE_H, PTRUE_S)>; + +// [357] "ptrues $Pd, $pattern"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PTRUES_B, PTRUES_D, PTRUES_H, PTRUES_S)>; + +// [358] "punpkhi $Pd, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKHI_PP)>; + +// [359] "punpklo $Pd, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs PUNPKLO_PP)>; + +// [360] "rbit $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs RBIT_ZPmZ_B, RBIT_ZPmZ_D, RBIT_ZPmZ_H, RBIT_ZPmZ_S)>; + +// [361] "rdffr $Pd"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_P)>; + +// [362] "rdffr $Pd, $Pg/z"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFR_PPz)>; + +// [363] "rdffrs $Pd, $Pg/z"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs RDFFRS_PPz)>; + +// [364] "rdvl $Rd, $imm6"; +def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs RDVLI_XI)>; + +// [365] "rev $Pd, $Pn"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs REV_PP_B, REV_PP_D, REV_PP_H, REV_PP_S)>; + +// [366] "rev $Zd, $Zn"; +def : 
InstRW<[A64FXWrite_6Cyc_GI0], (instrs REV_ZZ_B, REV_ZZ_D, REV_ZZ_H, REV_ZZ_S)>; + +// [367] "revb $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVB_ZPmZ_D, REVB_ZPmZ_H, REVB_ZPmZ_S)>; + +// [368] "revh $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVH_ZPmZ_D, REVH_ZPmZ_S)>; + +// [369] "revw $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs REVW_ZPmZ_D)>; + +// [370] "sabd $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SABD_ZPmZ_B, SABD_ZPmZ_D, SABD_ZPmZ_H, SABD_ZPmZ_S)>; + +// [371] "saddv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs SADDV_VPZ_B, SADDV_VPZ_H, SADDV_VPZ_S)>; + +// [372] "scvtf $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SCVTF_ZPmZ_DtoD, SCVTF_ZPmZ_DtoH, SCVTF_ZPmZ_DtoS, SCVTF_ZPmZ_HtoH, SCVTF_ZPmZ_StoD, SCVTF_ZPmZ_StoH, SCVTF_ZPmZ_StoS)>; + +// [373] "sdiv $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIV_ZPmZ_D, SDIV_ZPmZ_S)>; + +// [374] "sdivr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs SDIVR_ZPmZ_D, SDIVR_ZPmZ_S)>; + +// [375] "sdot $Zda, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SDOT_ZZZ_D, SDOT_ZZZ_S)>; + +// [376] "sdot $Zda, $Zn, $Zm$iop"; +def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs SDOT_ZZZI_D, SDOT_ZZZI_S)>; + +// [377] "sel $Pd, $Pg, $Pn, $Pm"; +def : InstRW<[A64FXWrite_3Cyc_GI1], (instrs SEL_PPPP)>; + +// [378] "sel $Zd, $Pg, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SEL_ZPZZ_B, SEL_ZPZZ_D, SEL_ZPZZ_H, SEL_ZPZZ_S)>; + +// [379] "setffr"; +def : InstRW<[A64FXWrite_6Cyc], (instrs SETFFR)>; + +// [380] "smax $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMAX_ZPmZ_B, SMAX_ZPmZ_D, SMAX_ZPmZ_H, SMAX_ZPmZ_S)>; + +// [381] "smax $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMAX_ZI_B, SMAX_ZI_D, SMAX_ZI_H, SMAX_ZI_S)>; + +// [382] "smaxv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMAXV_VPZ_B, SMAXV_VPZ_D, SMAXV_VPZ_H, SMAXV_VPZ_S)>; + +// [383] "smin $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SMIN_ZPmZ_B, SMIN_ZPmZ_D, SMIN_ZPmZ_H, SMIN_ZPmZ_S)>; + +// [384] "smin $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SMIN_ZI_B, SMIN_ZI_D, SMIN_ZI_H, SMIN_ZI_S)>; + +// [385] "sminv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs SMINV_VPZ_B, SMINV_VPZ_D, SMINV_VPZ_H, SMINV_VPZ_S)>; + +// [386] "smulh $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs SMULH_ZPmZ_B, SMULH_ZPmZ_D, SMULH_ZPmZ_H, SMULH_ZPmZ_S)>; + +// [387] "splice $Zdn, $Pg, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SPLICE_ZPZ_B, SPLICE_ZPZ_D, SPLICE_ZPZ_H, SPLICE_ZPZ_S)>; + +// [388] "sqadd $Zd, $Zn, $Zm"; + +// [389] "sqadd $Zdn, $_Zdn, $imm"; + +// [390] "sqdecb $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiWdI)>; + +// [391] "sqdecb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECB_XPiI)>; + +// [392] "sqdecd $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiWdI)>; + +// [393] "sqdecd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECD_XPiI)>; + +// [394] "sqdecd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECD_ZPiI)>; + +// [395] "sqdech $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiWdI)>; + +// [396] "sqdech $Rdn, $pattern, mul 
$imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECH_XPiI)>; + +// [397] "sqdech $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECH_ZPiI)>; + +// [398] "sqdecp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XP_B, SQDECP_XP_D, SQDECP_XP_H, SQDECP_XP_S)>; + +// [399] "sqdecp $Rdn, $Pg, $_Rdn"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQDECP_XPWd_B, SQDECP_XPWd_D, SQDECP_XPWd_H, SQDECP_XPWd_S)>; + +// [400] "sqdecp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQDECP_ZP_D, SQDECP_ZP_H, SQDECP_ZP_S)>; + +// [401] "sqdecw $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiWdI)>; + +// [402] "sqdecw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQDECW_XPiI)>; + +// [403] "sqdecw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQDECW_ZPiI)>; + +// [404] "sqincb $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiWdI)>; + +// [405] "sqincb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCB_XPiI)>; + +// [406] "sqincd $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiWdI)>; + +// [407] "sqincd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCD_XPiI)>; + +// [408] "sqincd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCD_ZPiI)>; + +// [409] "sqinch $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiWdI)>; + +// [410] "sqinch $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCH_XPiI)>; + +// [411] "sqinch $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCH_ZPiI)>; + +// [412] "sqincp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XP_B, SQINCP_XP_D, SQINCP_XP_H, SQINCP_XP_S)>; + +// [413] "sqincp $Rdn, $Pg, $_Rdn"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs SQINCP_XPWd_B, SQINCP_XPWd_D, SQINCP_XPWd_H, SQINCP_XPWd_S)>; + +// [414] "sqincp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs SQINCP_ZP_D, SQINCP_ZP_H, SQINCP_ZP_S)>; + +// [415] "sqincw $Rdn, $_Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiWdI)>; + +// [416] "sqincw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs SQINCW_XPiI)>; + +// [417] "sqincw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SQINCW_ZPiI)>; + +// [418] "sqsub $Zd, $Zn, $Zm"; + +// [419] "sqsub $Zdn, $_Zdn, $imm"; + +// [420] "st1b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B, ST1B_D, ST1B_H, ST1B_S)>; + +// [421] "st1b $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1B_D_REAL, SST1B_D_SXTW, SST1B_D_UXTW, SST1B_S_SXTW, SST1B_S_UXTW)>; + +// [422] "st1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1B_D_IMM, ST1B_H_IMM, ST1B_IMM, ST1B_S_IMM)>; + +// [423] "st1b $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1B_D_IMM, SST1B_S_IMM)>; + +// [424] "st1d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D)>; + +// [425] "st1d $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1D_REAL, SST1D_SCALED_SCALED_REAL, SST1D_SXTW, SST1D_SXTW_SCALED, SST1D_UXTW, SST1D_UXTW_SCALED)>; + +// [426] "st1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1D_IMM)>; + +// [427] 
"st1d $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1D_IMM)>; + +// [428] "st1h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H, ST1H_D, ST1H_S)>; + +// [429] "st1h $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1H_D_REAL, SST1H_D_SCALED_SCALED_REAL, SST1H_D_SXTW, SST1H_D_SXTW_SCALED, SST1H_D_UXTW, SST1H_D_UXTW_SCALED, SST1H_S_SXTW, SST1H_S_SXTW_SCALED, SST1H_S_UXTW, SST1H_S_UXTW_SCALED)>; + +// [430] "st1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1H_D_IMM, ST1H_IMM, ST1H_S_IMM)>; + +// [431] "st1h $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1H_D_IMM, SST1H_S_IMM)>; + +// [432] "st1w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W, ST1W_D)>; + +// [433] "st1w $Zt, $Pg, [$Rn, $Zm]"; +def : InstRW<[A64FXWrite_ST1W_19], (instrs SST1W_D_REAL, SST1W_D_SCALED_SCALED_REAL, SST1W_D_SXTW, SST1W_D_SXTW_SCALED, SST1W_D_UXTW, SST1W_D_UXTW_SCALED, SST1W_SXTW, SST1W_SXTW_SCALED, SST1W_UXTW, SST1W_UXTW_SCALED)>; + +// [434] "st1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs ST1W_D_IMM, ST1W_IMM)>; + +// [435] "st1w $Zt, $Pg, [$Zn, $imm5]"; +def : InstRW<[A64FXWrite_ST1W_15], (instrs SST1W_D_IMM, SST1W_IMM)>; + +// [436] "st2b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B)>; + +// [437] "st2b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2B_IMM)>; + +// [438] "st2d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D)>; + +// [439] "st2d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2D_IMM)>; + +// [440] "st2h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H)>; + +// [441] "st2h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2H_IMM)>; + +// [442] "st2w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W)>; + +// [443] "st2w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST2W_7], (instrs ST2W_IMM)>; + +// [444] "st3b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B)>; + +// [445] "st3b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3B_IMM)>; + +// [446] "st3d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D)>; + +// [447] "st3d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3D_IMM)>; + +// [448] "st3h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H)>; + +// [449] "st3h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3H_IMM)>; + +// [450] "st3w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W)>; + +// [451] "st3w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST3W_8], (instrs ST3W_IMM)>; + +// [452] "st4b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B)>; + +// [453] "st4b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4B_IMM)>; + +// [454] "st4d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D)>; + +// [455] "st4d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4D_IMM)>; + +// [456] "st4h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H)>; + +// [457] "st4h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4H_IMM)>; + +// [458] "st4w $Zt, $Pg, [$Rn, $Rm]"; +def : 
InstRW<[A64FXWrite_ST4W_9], (instrs ST4W)>; + +// [459] "st4w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST4W_9], (instrs ST4W_IMM)>; + +// [460] "stnt1b $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRR)>; + +// [461] "stnt1b $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1B_ZRI)>; + +// [462] "stnt1d $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRR)>; + +// [463] "stnt1d $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1D_ZRI)>; + +// [464] "stnt1h $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRR)>; + +// [465] "stnt1h $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1H_ZRI)>; + +// [466] "stnt1w $Zt, $Pg, [$Rn, $Rm]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRR)>; + +// [467] "stnt1w $Zt, $Pg, [$Rn, $imm4, mul vl]"; +def : InstRW<[A64FXWrite_ST1W_6], (instrs STNT1W_ZRI)>; + +// [468] "str $Pt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI15], (instrs STR_PXI)>; + +// [469] "str $Zt, [$Rn, $imm9, mul vl]"; +def : InstRW<[A64FXWrite_6Cyc_GI05], (instrs STR_ZXI)>; + +// [470] "sub $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZZZ_B, SUB_ZZZ_D, SUB_ZZZ_H, SUB_ZZZ_S)>; + +// [471] "sub $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZPmZ_B, SUB_ZPmZ_D, SUB_ZPmZ_H, SUB_ZPmZ_S)>; + +// [472] "sub $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUB_ZI_B, SUB_ZI_D, SUB_ZI_H, SUB_ZI_S)>; + +// [473] "subr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SUBR_ZPmZ_B, SUBR_ZPmZ_D, SUBR_ZPmZ_H, SUBR_ZPmZ_S)>; + +// [474] "subr $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs SUBR_ZI_B, SUBR_ZI_D, SUBR_ZI_H, SUBR_ZI_S)>; + +// [475] "sunpkhi $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKHI_ZZ_D, SUNPKHI_ZZ_H, SUNPKHI_ZZ_S)>; + +// [476] "sunpklo $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs SUNPKLO_ZZ_D, SUNPKLO_ZZ_H, SUNPKLO_ZZ_S)>; + +// [477] "sxtb $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTB_ZPmZ_D, SXTB_ZPmZ_H, SXTB_ZPmZ_S)>; + +// [478] "sxth $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTH_ZPmZ_D, SXTH_ZPmZ_S)>; + +// [479] "sxtw $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs SXTW_ZPmZ_D)>; + +// [480] "tbl $Zd, $Zn, $Zm"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs TBL_ZZZ_B, TBL_ZZZ_D, TBL_ZZZ_H, TBL_ZZZ_S)>; + +// [481] "trn1 $Pd, $Pn, $Pm"; + +// [482] "trn1 $Zd, $Zn, $Zm"; + +// [483] "trn2 $Pd, $Pn, $Pm"; + +// [484] "trn2 $Zd, $Zn, $Zm"; + +// [486] "uabd $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UABD_ZPmZ_B, UABD_ZPmZ_D, UABD_ZPmZ_H, UABD_ZPmZ_S)>; + +// [487] "uaddv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_12Cyc_GI03], (instrs UADDV_VPZ_B, UADDV_VPZ_D, UADDV_VPZ_H, UADDV_VPZ_S)>; + +// [488] "ucvtf $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UCVTF_ZPmZ_DtoD, UCVTF_ZPmZ_DtoH, UCVTF_ZPmZ_DtoS, UCVTF_ZPmZ_HtoH, UCVTF_ZPmZ_StoD, UCVTF_ZPmZ_StoH, UCVTF_ZPmZ_StoS)>; + +// [489] "udiv $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIV_ZPmZ_D, UDIV_ZPmZ_S)>; + +// [490] "udivr $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_37Cyc_GI0], (instrs UDIVR_ZPmZ_D, UDIVR_ZPmZ_S)>; + +// [491] "udot $Zda, $Zn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UDOT_ZZZ_D, UDOT_ZZZ_S)>; + +// [492] "udot $Zda, $Zn, 
$Zm$iop"; +def : InstRW<[A64FXWrite_15Cyc_NGI03], (instrs UDOT_ZZZI_D, UDOT_ZZZI_S)>; + +// [493] "umax $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMAX_ZPmZ_B, UMAX_ZPmZ_D, UMAX_ZPmZ_H, UMAX_ZPmZ_S)>; + +// [494] "umax $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMAX_ZI_B, UMAX_ZI_D, UMAX_ZI_H, UMAX_ZI_S)>; + +// [495] "umaxv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMAXV_VPZ_B, UMAXV_VPZ_D, UMAXV_VPZ_H, UMAXV_VPZ_S)>; + +// [496] "umin $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UMIN_ZPmZ_B, UMIN_ZPmZ_D, UMIN_ZPmZ_H, UMIN_ZPmZ_S)>; + +// [497] "umin $Zdn, $_Zdn, $imm"; +def : InstRW<[A64FXWrite_4Cyc_GI0], (instrs UMIN_ZI_B, UMIN_ZI_D, UMIN_ZI_H, UMIN_ZI_S)>; + +// [498] "uminv $Vd, $Pg, $Zn"; +def : InstRW<[A64FXWrite_14Cyc_GI03], (instrs UMINV_VPZ_B, UMINV_VPZ_D, UMINV_VPZ_H, UMINV_VPZ_S)>; + +// [499] "umulh $Zdn, $Pg/m, $_Zdn, $Zm"; +def : InstRW<[A64FXWrite_9Cyc_GI03], (instrs UMULH_ZPmZ_B, UMULH_ZPmZ_D, UMULH_ZPmZ_H, UMULH_ZPmZ_S)>; + +// [500] "uqadd $Zd, $Zn, $Zm"; + +// [501] "uqadd $Zdn, $_Zdn, $imm"; + +// [502] "uqdecb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECB_WPiI, UQDECB_XPiI)>; + +// [503] "uqdecd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECD_WPiI, UQDECD_XPiI)>; + +// [504] "uqdecd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECD_ZPiI)>; + +// [505] "uqdech $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECH_WPiI, UQDECH_XPiI)>; + +// [506] "uqdech $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECH_ZPiI)>; + +// [507] "uqdecp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQDECP_WP_B, UQDECP_WP_D, UQDECP_WP_H, UQDECP_WP_S, UQDECP_XP_B, UQDECP_XP_D, UQDECP_XP_H, UQDECP_XP_S)>; + +// [508] "uqdecp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQDECP_ZP_D, UQDECP_ZP_H, UQDECP_ZP_S)>; + +// [509] "uqdecw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQDECW_WPiI, UQDECW_XPiI)>; + +// [510] "uqdecw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQDECW_ZPiI)>; + +// [511] "uqincb $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCB_WPiI, UQINCB_XPiI)>; + +// [512] "uqincd $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCD_WPiI, UQINCD_XPiI)>; + +// [513] "uqincd $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCD_ZPiI)>; + +// [514] "uqinch $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCH_WPiI, UQINCH_XPiI)>; + +// [515] "uqinch $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCH_ZPiI)>; + +// [516] "uqincp $Rdn, $Pg"; +def : InstRW<[A64FXWrite_8Cyc_GI124], (instrs UQINCP_WP_B, UQINCP_WP_D, UQINCP_WP_H, UQINCP_WP_S, UQINCP_XP_B, UQINCP_XP_D, UQINCP_XP_H, UQINCP_XP_S)>; + +// [517] "uqincp $Zdn, $Pg"; +def : InstRW<[A64FXWrite_12Cyc_GI01], (instrs UQINCP_ZP_D, UQINCP_ZP_H, UQINCP_ZP_S)>; + +// [518] "uqincw $Rdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs UQINCW_WPiI, UQINCW_XPiI)>; + +// [519] "uqincw $Zdn, $pattern, mul $imm4"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQINCW_ZPiI)>; + +// [520] "uqsub $Zd, $Zn, $Zm"; +//@@@ def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZZZ_B, UQSUB_ZZZ_D, UQSUB_ZZZ_H, UQSUB_ZZZ_S)>; + +// [521] "uqsub $Zdn, $_Zdn, $imm"; +//@@@ 
def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UQSUB_ZI_B, UQSUB_ZI_D, UQSUB_ZI_H, UQSUB_ZI_S)>; + +// [522] "uunpkhi $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKHI_ZZ_D, UUNPKHI_ZZ_H, UUNPKHI_ZZ_S)>; + +// [523] "uunpklo $Zd, $Zn"; +def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs UUNPKLO_ZZ_D, UUNPKLO_ZZ_H, UUNPKLO_ZZ_S)>; + +// [524] "uxtb $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTB_ZPmZ_D, UXTB_ZPmZ_H, UXTB_ZPmZ_S)>; + +// [525] "uxth $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTH_ZPmZ_D, UXTH_ZPmZ_S)>; + +// [526] "uxtw $Zd, $Pg/m, $Zn"; +def : InstRW<[A64FXWrite_4Cyc_GI03], (instrs UXTW_ZPmZ_D)>; + +// [527] "uzp1 $Pd, $Pn, $Pm"; + +// [528] "uzp1 $Zd, $Zn, $Zm"; + +// [529] "uzp2 $Pd, $Pn, $Pm"; + +// [530] "uzp2 $Zd, $Zn, $Zm"; + +// [531] "whilele $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELE_PWW_B, WHILELE_PWW_D, WHILELE_PWW_H, WHILELE_PWW_S, WHILELE_PXX_B, WHILELE_PXX_D, WHILELE_PXX_H, WHILELE_PXX_S)>; + +// [532] "whilelo $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELO_PWW_B, WHILELO_PWW_D, WHILELO_PWW_H, WHILELO_PWW_S, WHILELO_PXX_B, WHILELO_PXX_D, WHILELO_PXX_H, WHILELO_PXX_S)>; + +// [533] "whilels $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELS_PWW_B, WHILELS_PWW_D, WHILELS_PWW_H, WHILELS_PWW_S, WHILELS_PXX_B, WHILELS_PXX_D, WHILELS_PXX_H, WHILELS_PXX_S)>; + +// [534] "whilelt $Pd, $Rn, $Rm"; +def : InstRW<[A64FXWrite_4Cyc_GI12], (instrs WHILELT_PWW_B, WHILELT_PWW_D, WHILELT_PWW_H, WHILELT_PWW_S, WHILELT_PXX_B, WHILELT_PXX_D, WHILELT_PXX_H, WHILELT_PXX_S)>; + +// [535] "wrffr $Pn"; +def : InstRW<[A64FXWrite_6Cyc_NGI1], (instrs WRFFR)>; + +// [536] "zip1 $Pd, $Pn, $Pm"; + +// [537] "zip1 $Zd, $Zn, $Zm"; + +// [538] "zip2 $Pd, $Pn, $Pm"; + +// [539] "zip2 $Zd, $Zn, $Zm"; + +} // SchedModel = A64FXModel diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedAmpere1.td b/suite/synctools/tablegen/AArch64/AArch64SchedAmpere1.td new file mode 100644 index 00000000..32f7299f --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SchedAmpere1.td @@ -0,0 +1,1136 @@ +//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1 to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). 
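+//
+// Illustrative sketch only (not part of this patch): a machine model such as
+// the Ampere1Model below is bound to a CPU name through a ProcessorModel
+// record in AArch64.td, from which TableGen emits the MCSchedModel tables
+// consumed by the instruction scheduler and llvm-mca. The feature list here
+// is hypothetical and abbreviated:
+//
+//   def : ProcessorModel<"ampere1", Ampere1Model,
+//                        [FeatureNEON, FeaturePerfMon]>;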
+
+def Ampere1Model : SchedMachineModel {
+  let IssueWidth = 4;             // 4-way decode and dispatch
+  let MicroOpBufferSize = 174;    // micro-op re-order buffer size
+  let LoadLatency = 4;            // Optimistic load latency
+  let MispredictPenalty = 10;     // Branch mispredict penalty
+  let LoopMicroOpBufferSize = 32; // Instruction queue size
+  let CompleteModel = 1;
+
+  list<SubtargetFeature> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                           SMEUnsupported.F);
+}
+
+let SchedModel = Ampere1Model in {
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Ampere-1.
+// Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP,
+// and 2 memory) issue into. The integer and FP schedulers can each issue
+// one uop per cycle, while the memory schedulers can each issue one load
+// and one store address calculation per cycle.
+
+def Ampere1UnitA  : ProcResource<2>; // integer single-cycle, branch, and flags r/w
+def Ampere1UnitB  : ProcResource<2>; // integer single-cycle, and complex shifts
+def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle
+def Ampere1UnitL  : ProcResource<2>; // load
+def Ampere1UnitS  : ProcResource<2>; // store address calculation
+def Ampere1UnitX  : ProcResource<1>; // FP and vector operations, and flag write
+def Ampere1UnitY  : ProcResource<1>; // FP and vector operations, and crypto
+def Ampere1UnitZ  : ProcResource<1>; // FP store data and FP-to-integer moves
+
+def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>;
+def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>;
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Ampere-1.
+
+def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> {
+  let Latency = 2;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+                                              Ampere1UnitS]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+}
+
+def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS,
+                                                 Ampere1UnitZ]> {
+  let Latency = 2;
+
let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency 
= 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, 
Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + 
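+// A note on the naming convention used throughout this catalogue (an
+// explanatory sketch, not part of the upstream patch): each write type is
+// named Ampere1Write_<latency>cyc_<uops and units>. A hypothetical three-uop
+// operation occupying the XY group twice and Z once, with an 8-cycle result
+// latency, would read:
+//
+//   def Ampere1Write_8cyc_2XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY,
+//                                                 Ampere1UnitZ]> {
+//     let Latency = 8;
+//     let NumMicroOps = 3;
+//   }
+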
+def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 32;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> {
+  let Latency = 34;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 34;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 39;
+  let NumMicroOps = 1;
+}
+
+def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> {
+  let Latency = 62;
+  let NumMicroOps = 1;
+}
+
+// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4),
+// which are a single uop, and for extended registers, which have full flexibility
+// across Unit A or B for both uops.
+def Ampere1Write_Arith : SchedWriteVariant<[
+                           SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1AB]>,
+                           SchedVar<RegExtendedPred, [Ampere1Write_2cyc_2AB]>,
+                           SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1AB]>]>;
+
+def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[
+                           SchedVar<AmpereCheapLSL, [Ampere1Write_1cyc_1A]>,
+                           SchedVar<RegExtendedPred, [Ampere1Write_2cyc_1AB_1A]>,
+                           SchedVar<NoSchedPred, [Ampere1Write_2cyc_1B_1A]>]>;
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latencies for Ampere-1.
+// This provides a coarse model, which is then specialised below.
+
+def : WriteRes<WriteImm, [Ampere1UnitAB]>;  // MOVN, MOVZ
+def : WriteRes<WriteI, [Ampere1UnitAB]>;    // ALU
+def : WriteRes<WriteISReg, [Ampere1UnitB]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+} // ALU of Shifted-Reg
+def : WriteRes<WriteIEReg, [Ampere1UnitAB]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+} // ALU of Extended-Reg
+def : WriteRes<WriteExtr, [Ampere1UnitB]>;  // EXTR shifts a reg pair
+def : WriteRes<WriteIS, [Ampere1UnitB]>;    // Shift/Scale
+def : WriteRes<WriteID32, [Ampere1UnitBS]> {
+  let Latency = 18;
+} // 32-bit Divide
+def : WriteRes<WriteID64, [Ampere1UnitBS]> {
+  let Latency = 34;
+} // 64-bit Divide
+def : WriteRes<WriteIM32, [Ampere1UnitBS]> {
+  let Latency = 3;
+} // 32-bit Multiply
+def : WriteRes<WriteIM64, [Ampere1UnitBS]> {
+  let Latency = 3;
+} // 64-bit Multiply
+def : WriteRes<WriteBr, [Ampere1UnitA]>;
+def : WriteRes<WriteBrReg, [Ampere1UnitA, Ampere1UnitA]>;
+def : WriteRes<WriteLD, [Ampere1UnitL]> {
+  let Latency = 4;
+} // Load from base addr plus immediate offset
+def : WriteRes<WriteST, [Ampere1UnitS]> {
+  let Latency = 1;
+} // Store to base addr plus immediate offset
+def : WriteRes<WriteSTP, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+} // Store a register pair.
+def : WriteRes<WriteAdr, [Ampere1UnitAB]>;
+def : WriteRes<WriteLDIdx, [Ampere1UnitAB, Ampere1UnitL]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+} // Load from a register index (maybe scaled).
+def : WriteRes<WriteSTIdx, [Ampere1UnitS, Ampere1UnitS]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+} // Store to a register index (maybe scaled).
+def : WriteRes<WriteF, [Ampere1UnitXY]> {
+  let Latency = 2;
+} // General floating-point ops.
+def : WriteRes<WriteFCmp, [Ampere1UnitX]> {
+  let Latency = 5;
+} // Floating-point compare.
+def : WriteRes<WriteFCvt, [Ampere1UnitXY]> {
+  let Latency = 6;
+} // Float conversion.
+def : WriteRes<WriteFCopy, [Ampere1UnitZ]> {
+} // Float-int register copy.
+def : WriteRes<WriteFImm, [Ampere1UnitXY]> {
+  let Latency = 2;
+} // Floating-point immediate.
+def : WriteRes<WriteFMul, [Ampere1UnitXY]> {
+  let Latency = 5;
+} // Floating-point multiply.
+def : WriteRes<WriteFDiv, [Ampere1UnitXY]> {
+  let Latency = 34;
+} // Floating-point division.
+def : WriteRes<WriteVd, [Ampere1UnitXY]> {
+  let Latency = 3;
+} // 64bit Vector D ops.
+def : WriteRes<WriteVq, [Ampere1UnitXY]> {
+  let Latency = 3;
+} // 128bit Vector Q ops.
+def : WriteRes<WriteVLD, [Ampere1UnitL]> {
+  let Latency = 5;
+} // Vector loads.
+def : WriteRes<WriteVST, [Ampere1UnitS, Ampere1UnitZ]> {
+  let Latency = 2;
+} // Vector stores.
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+def : WriteRes<WriteSys, [Ampere1UnitAB]> { let Latency = 1; }
+def : WriteRes<WriteBarrier, [Ampere1UnitAB]> { let Latency = 1; }
+def : WriteRes<WriteHint, [Ampere1UnitAB]> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 4;
+} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP
+
+// Forwarding logic.
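+// A ReadAdvance lets a consumer read an operand N cycles after issue when it
+// is produced by one of the listed writes, which is how forwarding paths are
+// modelled. As an explanatory sketch with placeholder names (ReadExampleAcc
+// and WriteExampleMul are not real types in this file):
+//
+//   def : ReadAdvance<ReadExampleAcc, 2, [WriteExampleMul]>;
+//
+// would let an accumulator operand be read two cycles late when fed by the
+// example multiply write.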
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 1, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadST, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+//===----------------------------------------------------------------------===//
+// Specialising the scheduling model further for Ampere-1.
+
+def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>;
+
+// Branch instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>;
+def : InstRW<[Ampere1Write_1cyc_1A],
+             (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>;
+
+// Cryptography instructions
+// -- AES encryption/decryption
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>;
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>;
+// -- Polynomial multiplication
+def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>;
+// -- SHA-256 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>;
+// -- SHA-256 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>;
+// -- SHA-3 instructions
+def : InstRW<[Ampere1Write_2cyc_1XY],
+             (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>;
+// -- SHA-512 hash
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>;
+// -- SHA-512 schedule update
+def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>;
+// -- SHA1 choose/majority/parity
+def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>;
+// -- SHA1 hash/schedule update
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>;
+def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>;
+
+// FP and vector load instructions
+// -- Load 1-element structure to one/all lanes
+// ---- all lanes
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+             (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// ---- one lane
+def : InstRW<[Ampere1Write_7cyc_1L_1XY],
+             (instregex "^LD1i(8|16|32|64)")>;
+// -- Load 1-element structure to one/all lanes, 1D size
+def : InstRW<[Ampere1Write_5cyc_1L],
+             (instregex "^LD1Rv1d")>;
+// -- Load 1-element structures to 1 register
+def : InstRW<[Ampere1Write_5cyc_1L],
+             (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 2 registers
+def : InstRW<[Ampere1Write_5cyc_2L],
+             (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 3 registers
+def : InstRW<[Ampere1Write_6cyc_3L],
+             (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 1-element structures to 4 registers
+def : InstRW<[Ampere1Write_6cyc_4L],
+             (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to all lanes of 2 registers, 1D size
+def : InstRW<[Ampere1Write_5cyc_2L],
+             (instregex "^LD2Rv1d")>;
+// -- Load 2-element structure to all lanes of 2 registers, other sizes
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+             (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>;
+// -- Load 2-element structure to one lane of 2 registers
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+             (instregex "^LD2i(8|16|32|64)")>;
+// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size
+def : InstRW<[Ampere1Write_7cyc_2L_2XY],
+             (instregex "^LD2Twov(16b|8h|4s|2d)")>;
+// -- Load 2-element structures to 2 registers, 8B/4H/2S size
+def : InstRW<[Ampere1Write_9cyc_2L_3XY],
+             (instregex "^LD2Twov(8b|4h|2s)")>;
+// -- Load 3-element structure to all lanes of 3 registers, 1D size
+def : InstRW<[Ampere1Write_6cyc_3L],
+             (instregex "^LD3Rv1d")>;
+// -- Load 3-element structure
to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex 
"^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex 
"^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : 
InstRW<[Ampere1Write_5cyc_1AB_1L],
+             (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1L],
+             (instrs PRFMl, PRFUMi)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_1L],
+             (instrs PRFMroW, PRFMroX)>;
+
+// Integer miscellaneous instructions
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>;
+def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>;
+def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>;
+def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>;
+def : InstRW<[Ampere1Write_1cyc_1AB],
+             (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+             (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>;
+def : InstRW<[Ampere1Write_1cyc_1B],
+             (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>;
+
+// Integer store instructions
+def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>;
+def : InstRW<[Ampere1Write_2cyc_1B_1S],
+             (instrs STPWi, STPXi)>;
+def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB],
+             (instregex "STP(W|X)(pre|post)")>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+             (instrs STTRBi, STTRHi, STTRWi, STTRXi)>;
+def : InstRW<[Ampere1Write_1cyc_1S],
+             (instregex "STUR(BB|HH|X|W)i",
+                        "STR(X|W)ui")>;
+def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>;
+def : InstRW<[Ampere1Write_2cyc_1AB_2S], (instrs STRWroW, STRXroW)>;
+
+// Pointer authentication
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+//             (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>;
+def : InstRW<[Ampere1Write_8cyc_1BS_1A],
+             (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>;
+def : InstRW<[Ampere1Write_8cyc_1BS_2A],
+             (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>;
+//def : InstRW<[Ampere1Write_7cyc_1BS],
+//             (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>;
+def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>;
+def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>;
+
+// Vector integer instructions
+// -- absolute difference
+def : InstRW<[Ampere1Write_3cyc_1XY],
+             (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv",
+                        "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>;
+// -- arithmetic
+def : InstRW<[Ampere1Write_3cyc_1XY],
+             (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD",
+                        "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW",
+                        "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>;
+// -- arithmetic, horizontal, 16B
+def : InstRW<[Ampere1Write_12cyc_4XY],
+             (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>;
+def : InstRW<[Ampere1Write_12cyc_4XY],
+             (instregex "^[SU](MIN|MAX)Vv16i8v")>;
+// -- arithmetic, horizontal, 4H/4S
+def : InstRW<[Ampere1Write_6cyc_2XY],
+             (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>;
+def : InstRW<[Ampere1Write_6cyc_2XY],
+             (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>;
+// -- arithmetic, horizontal, 8B/8H
+def : InstRW<[Ampere1Write_9cyc_3XY],
+             (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>;
+def : InstRW<[Ampere1Write_9cyc_3XY],
+             (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>;
+// -- arithmetic, narrowing
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>;
+// -- arithmetic, pairwise
+def : InstRW<[Ampere1Write_3cyc_1XY],
+             (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>;
+// -- arithmetic, saturating
+def : InstRW<[Ampere1Write_3cyc_1XY],
+             (instregex "^SQADD",
"^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : 
InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>;
+
+} // SchedModel = Ampere1Model
diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedCyclone.td b/suite/synctools/tablegen/AArch64/AArch64SchedCyclone.td
index 7a474ba8..9fbb4691 100644
--- a/suite/synctools/tablegen/AArch64/AArch64SchedCyclone.td
+++ b/suite/synctools/tablegen/AArch64/AArch64SchedCyclone.td
@@ -1,9 +1,8 @@
 //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -19,7 +18,9 @@ def CycloneModel : SchedMachineModel {
   let MispredictPenalty = 16; // 14-19 cycles are typical.
   let CompleteModel = 1;
 
-  list<SubtargetFeature> UnsupportedFeatures = [HasSVE];
+  list<SubtargetFeature> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                           PAUnsupported.F,
+                                                           SMEUnsupported.F);
 }
 
 //===----------------------------------------------------------------------===//
@@ -257,6 +258,7 @@ def CyReadAdrBase : SchedReadVariant<[
   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
   SchedVar<NoSchedPred,   [ReadDefault]>]>; // Read base reg with no shift.
 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
+def : ReadAdvance<ReadST, 0>;
 
 //---
 // 7.8.9,7.8.11. Load/Store, paired
@@ -302,7 +304,8 @@ def : WriteRes {let Latency = -1;}
 // 7.9 Vector Unit Instructions
 
 // Simple vector operations take 2 cycles.
-def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
+def : WriteRes<WriteVd, [CyUnitV]> {let Latency = 2;}
+def : WriteRes<WriteVq, [CyUnitV]> {let Latency = 2;}
 
 // Define some longer latency vector op types for Cyclone.
 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
@@ -333,7 +336,7 @@ def : WriteRes {let Latency = 2;}
 // COPY is handled above in the WriteMov Variant.
 def WriteVMov : SchedWriteVariant<[
                   SchedVar,
-                  SchedVar<NoSchedPred, [WriteV]>]>;
+                  SchedVar<NoSchedPred, [WriteVq]>]>;
 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
 
 // FMOVSr,FMOVDr are WriteF.
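// Explanatory sketch (not part of the patch): a SchedWriteVariant such as
// WriteVMov above selects between writes at scheduling time by testing each
// SchedVar predicate in order, with NoSchedPred as the fallback. ExamplePred,
// WriteCheap and WriteCostly below are placeholder names:
//
//   def ExampleVariant : SchedWriteVariant<[
//     SchedVar<ExamplePred, [WriteCheap]>,
//     SchedVar<NoSchedPred, [WriteCostly]>]>;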
@@ -353,7 +356,7 @@ def : WriteRes { def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; // INS V[x],R -def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; +def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteVq]>; def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; // SMOV,UMOV R,V[x] @@ -495,7 +498,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; // WriteV includes: // SHLL,SSHLL,USHLL // SLI,SRI -// BIF,BIT,BSL +// BIF,BIT,BSL,BSP // EXT // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN // XTN2 @@ -569,7 +572,7 @@ def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; //--- // FCVT lengthen f16/s32 -def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; +def : InstRW<[WriteVq], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; // FCVT,FCVTN,FCVTXN // SCVTF,UCVTF V,V @@ -679,61 +682,61 @@ def : InstRW<[WriteVLDShuffle], def : InstRW<[WriteVLDShuffle, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVq], (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq], (instregex "LD2Twov(8b|4h|2s)_POST$")>; def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], (instregex "LD2Twov(16b|8h|4s|2d)$")>; def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq], (instregex "LD2i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq], (instregex "LD2i(8|16|32)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq], (instregex "LD2i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq], (instregex "LD2i64_POST")>; -def : InstRW<[WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVq], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq], (instregex "LD3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq], (instregex "LD3Threev(8b|4h|2s)_POST")>; def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], (instregex "LD3Threev(16b|8h|4s|2d)$")>; def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq, WriteVq], (instregex "LD3i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq, WriteVq], (instregex "LD3i(8|16|32)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteVq], (instregex "LD3i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteVq], (instregex "LD3i64_POST")>; -def : InstRW<[WriteVLDShuffle, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, 
WriteVq, WriteVq], (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq, WriteVq], (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq], (instrs LD3Rv1d,LD3Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq], (instrs LD3Rv1d_POST,LD3Rv2d_POST)>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq, WriteVq], (instregex "LD4Fourv(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq, WriteVq], (instregex "LD4Fourv(8b|4h|2s)_POST")>; def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, WriteVLDPairShuffle, WriteVLDPairShuffle], @@ -742,25 +745,25 @@ def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, WriteVLDPairShuffle, WriteVLDPairShuffle], (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVq, WriteVq, WriteVq], (instregex "LD4i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVq, WriteVq, WriteVq], (instregex "LD4i(8|16|32)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteVq, WriteVq], (instrs LD4i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteVq], (instrs LD4i64_POST)>; -def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVq, WriteVq, WriteVq], (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVq, WriteVq, WriteVq], (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVq, WriteVq], (instrs LD4Rv1d,LD4Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVq, WriteVq], (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; //--- diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedExynosM1.td b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM1.td deleted file mode 100644 index ecc68aed..00000000 --- a/suite/synctools/tablegen/AArch64/AArch64SchedExynosM1.td +++ /dev/null @@ -1,847 +0,0 @@ -//=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the Samsung Exynos M1 to support -// instruction scheduling and other instruction cost heuristics. 
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// The Exynos-M1 is a traditional superscalar microprocessor with a
-// 4-wide in-order stage for decode and dispatch and a wider issue stage.
-// The execution units and loads and stores are out-of-order.
-
-def ExynosM1Model : SchedMachineModel {
-  let IssueWidth = 4; // Up to 4 uops per cycle.
-  let MicroOpBufferSize = 96; // ROB size.
-  let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
-  let LoadLatency = 4; // Optimistic load cases.
-  let MispredictPenalty = 14; // Minimum branch misprediction penalty.
-  let CompleteModel = 1; // Use the default model otherwise.
-
-  list<SubtargetFeature> UnsupportedFeatures = [HasSVE];
-}
-
-//===----------------------------------------------------------------------===//
-// Define each kind of processor resource and number available on the Exynos-M1,
-// which has 9 pipelines, each with its own queue with out-of-order dispatch.
-
-let SchedModel = ExynosM1Model in {
-
-def M1UnitA : ProcResource<2>; // Simple integer
-def M1UnitC : ProcResource<1>; // Simple and complex integer
-def M1UnitD : ProcResource<1>; // Integer division (inside C, serialized)
-def M1UnitB : ProcResource<2>; // Branch
-def M1UnitL : ProcResource<1>; // Load
-def M1UnitS : ProcResource<1>; // Store
-def M1PipeF0 : ProcResource<1>; // FP #0
-let Super = M1PipeF0 in {
-  def M1UnitFMAC : ProcResource<1>; // FP multiplication
-  def M1UnitNAL0 : ProcResource<1>; // Simple vector
-  def M1UnitNMISC : ProcResource<1>; // Miscellanea
-  def M1UnitFCVT : ProcResource<1>; // FP conversion
-  def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
-}
-def M1PipeF1 : ProcResource<1>; // FP #1
-let Super = M1PipeF1 in {
-  def M1UnitFADD : ProcResource<1>; // Simple FP
-  def M1UnitNAL1 : ProcResource<1>; // Simple vector
-  def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
-  def M1UnitFST : ProcResource<1>; // FP store
-}
-
-def M1UnitALU : ProcResGroup<[M1UnitA,
-                              M1UnitC]>; // All integer
-def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
-                               M1UnitNAL1]>; // All simple vector
-
-//===----------------------------------------------------------------------===//
-// Predicates.
-
-def M1BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                            MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M1ShiftLeftFastPred : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
-
-//===----------------------------------------------------------------------===//
-// Coarse scheduling model.
- -def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } -def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; } -def M1WriteAA : SchedWriteRes<[M1UnitALU]> { let Latency = 2; - let ResourceCycles = [2]; } -def M1WriteAB : SchedWriteRes<[M1UnitALU, - M1UnitC]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteAC : SchedWriteRes<[M1UnitALU, - M1UnitALU, - M1UnitC]> { let Latency = 2; - let NumMicroOps = 3; } -def M1WriteAD : SchedWriteRes<[M1UnitALU, - M1UnitC]> { let Latency = 2; - let NumMicroOps = 2; } -def M1WriteAX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } -def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } - -def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } -def M1WriteBX : SchedWriteVariant<[SchedVar, - SchedVar]>; - -def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } -def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; } -def M1WriteLA : SchedWriteRes<[M1UnitL]> { let Latency = 6; - let ResourceCycles = [2]; } -def M1WriteLB : SchedWriteRes<[M1UnitL, - M1UnitA]> { let Latency = 4; - let NumMicroOps = 2; } -def M1WriteLC : SchedWriteRes<[M1UnitL, - M1UnitA]> { let Latency = 5; - let NumMicroOps = 2; } -def M1WriteLD : SchedWriteRes<[M1UnitL, - M1UnitA]> { let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [2, 1]; } -def M1WriteLH : SchedWriteRes<[]> { let Latency = 5; - let NumMicroOps = 0; } -def M1WriteLX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteLY : SchedWriteVariant<[SchedVar, - SchedVar]>; - -def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } -def M1WriteS3 : SchedWriteRes<[M1UnitS]> { let Latency = 3; } -def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } -def M1WriteSA : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteSB : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 3; - let NumMicroOps = 2; } -def M1WriteSC : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 3; - let NumMicroOps = 3; } -def M1WriteSD : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteSE : SchedWriteRes<[M1UnitS, - M1UnitA]> { let Latency = 2; - let NumMicroOps = 2; } -def M1WriteSX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M1WriteSY : SchedWriteVariant<[SchedVar, - SchedVar]>; - -def M1ReadAdrBase : SchedReadVariant<[SchedVar, - SchedVar]>; - -// Branch instructions. -def : WriteRes { let Latency = 0; } -def : WriteRes { let Latency = 1; } - -// Arithmetic and logical integer instructions. -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -// Move instructions. -def : WriteRes { let Latency = 1; } - -// Divide and multiply instructions. -def : WriteRes { let Latency = 13; - let ResourceCycles = [1, 13]; } -def : WriteRes { let Latency = 21; - let ResourceCycles = [1, 21]; } -// TODO: Long multiplication take 5 cycles and also the ALU. -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 4; - let ResourceCycles = [2]; } - -// Miscellaneous instructions. -def : WriteRes { let Latency = 2; - let NumMicroOps = 2; } - -// Addressing modes. -def : WriteRes { let Latency = 1; - let NumMicroOps = 0; } -def : SchedAlias; - -// Load instructions. 
-def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; - let NumMicroOps = 0; } -def : SchedAlias; - -// Store instructions. -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : SchedAlias; - -// FP data instructions. -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 15; - let ResourceCycles = [15]; } -def : WriteRes { let Latency = 4; } - -// FP miscellaneous instructions. -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 4; } - -// FP load instructions. -def : WriteRes { let Latency = 5; } - -// FP store instructions. -def : WriteRes { let Latency = 1; - let NumMicroOps = 1; } - -// ASIMD FP instructions. -def : WriteRes { let Latency = 3; } - -// Other miscellaneous instructions. -def : WriteRes { let Unsupported = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -//===----------------------------------------------------------------------===// -// Fast forwarding. - -// TODO: Add FP register forwarding rules. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -// TODO: The forwarding for WriteIM32 saves actually 2 cycles. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -//===----------------------------------------------------------------------===// -// Finer scheduling model. - -def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, - M1UnitNALU, - M1UnitFADD]> { let Latency = 9; - let NumMicroOps = 3; } -def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 5; - let NumMicroOps = 2;} -def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, - M1UnitFST, - M1UnitL]> { let Latency = 10; - let NumMicroOps = 3; } -def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, - M1UnitFST]> { let Latency = 8; - let NumMicroOps = 2; } -def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, - M1UnitFST, - M1UnitL]> { let Latency = 13; - let NumMicroOps = 3; } -def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, - M1UnitFST]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 3; - let NumMicroOps = 2; } -def M1WriteNEONI : SchedWriteRes<[M1UnitFST, - M1UnitL]> { let Latency = 9; - let NumMicroOps = 2; } -def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, - M1UnitFMAC]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, - M1UnitFMAC]> { let Latency = 7; - let NumMicroOps = 2; } -def M1WriteNEONL : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; - let ResourceCycles = [2]; } -def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } -def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; } -def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; } -def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; } -def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; } -// TODO -def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; - let ResourceCycles = [15]; } -def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; - let ResourceCycles = [23]; } -def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; } -def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; } -def M1WriteNAL11 : 
SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; } -def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; } -def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; } -def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } -def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; } -def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; } -def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; } -def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } -def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } -def M1WriteTB : SchedWriteRes<[M1UnitC, - M1UnitALU]> { let Latency = 2; - let NumMicroOps = 2; } -def M1WriteVLDA : SchedWriteRes<[M1UnitL, - M1UnitL]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteVLDB : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 7; - let NumMicroOps = 3; } -def M1WriteVLDC : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 8; - let NumMicroOps = 4; } -def M1WriteVLDD : SchedWriteRes<[M1UnitL, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 2; - let ResourceCycles = [2, 1]; } -def M1WriteVLDE : SchedWriteRes<[M1UnitL, - M1UnitNALU]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteVLDF : SchedWriteRes<[M1UnitL, - M1UnitL]> { let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; } -def M1WriteVLDG : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [2, 1, 1]; } -def M1WriteVLDH : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU]> { let Latency = 6; - let NumMicroOps = 3; } -def M1WriteVLDI : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 12; - let NumMicroOps = 3; - let ResourceCycles = [2, 2, 2]; } -def M1WriteVLDJ : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitNALU]> { let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1, 1]; } -def M1WriteVLDK : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitNALU, - M1UnitNALU]> { let Latency = 9; - let NumMicroOps = 5; - let ResourceCycles = [2, 1, 1, 1, 1]; } -def M1WriteVLDL : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitL, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 5; - let ResourceCycles = [1, 1, 1, 1, 1]; } -def M1WriteVLDM : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitL, - M1UnitNALU, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 6; - let ResourceCycles = [1, 1, 1, 1, 1, 1]; } -def M1WriteVLDN : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 2, 1]; } -def M1WriteVSTA : WriteSequence<[WriteVST], 2>; -def M1WriteVSTB : WriteSequence<[WriteVST], 3>; -def M1WriteVSTC : WriteSequence<[WriteVST], 4>; -def M1WriteVSTD : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitFST]> { let Latency = 7; - let NumMicroOps = 2; - let ResourceCycles = [7, 1, 1]; } -def M1WriteVSTE : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST]> { let Latency = 8; - let NumMicroOps = 3; - let ResourceCycles = [7, 1, 1, 1, 1]; } -def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 15; - let NumMicroOps = 5; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1]; } -def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let 
Latency = 16; - let NumMicroOps = 6; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1]; } -def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [1, 7, 1, 7, 1]; } -def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 17; - let NumMicroOps = 7; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; } - -// Branch instructions -def : InstRW<[M1WriteB1], (instrs Bcc)>; -def : InstRW<[M1WriteA1], (instrs BL)>; -def : InstRW<[M1WriteBX], (instrs BLR)>; -def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; -def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>; - -// Arithmetic and logical integer instructions. -def : InstRW<[M1WriteA1], (instrs COPY)>; -def : InstRW<[M1WriteAX], (instregex ".+r[sx](64)?$")>; - -// Divide and multiply instructions. - -// Miscellaneous instructions. - -// Load instructions. -def : InstRW<[M1WriteLB, - WriteLDHi, - WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; -def : InstRW<[M1WriteLX, - ReadAdrBase], (instregex "^PRFMro[WX]")>; - -// Store instructions. - -// FP data instructions. -def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; -def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>; -def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>; -def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>; -def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>; -def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>; -def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; -def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>; -def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>; -def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>; -def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>; - -// FP miscellaneous instructions. -def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; -def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; -def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; -def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; -def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev1")>; -def : InstRW<[M1WriteNMISC1], (instregex "^FRECPXv1")>; -def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)S(16|32|64)")>; -def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; -def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; - -// FP load instructions. -def : InstRW<[WriteVLD], (instregex "^LDR[DSQ]l")>; -def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; -def : InstRW<[WriteVLD, - WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; -def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M1WriteLY, - ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; -def : InstRW<[M1WriteLD, - ReadAdrBase], (instregex "^LDRQro[WX]")>; -def : InstRW<[WriteVLD, - M1WriteLH], (instregex "^LDN?P[DS]i")>; -def : InstRW<[M1WriteLA, - M1WriteLH], (instregex "^LDN?PQi")>; -def : InstRW<[M1WriteLC, - M1WriteLH, - WriteAdr], (instregex "^LDP[DS](post|pre)")>; -def : InstRW<[M1WriteLD, - M1WriteLH, - WriteAdr], (instregex "^LDPQ(post|pre)")>; - -// FP store instructions. 
-def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; -def : InstRW<[WriteVST, - WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; -def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M1WriteSY, - ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; -def : InstRW<[M1WriteSB, - ReadAdrBase], (instregex "^STRQro[WX]")>; -def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; -def : InstRW<[WriteVST, - WriteAdr], (instregex "^STP[DS](post|pre)")>; -def : InstRW<[M1WriteSC, - WriteAdr], (instregex "^STPQ(post|pre)")>; - -// ASIMD instructions. -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; -def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>; -def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>; -def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>; -def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>; -def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>; -def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>; -def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>; -def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>; -def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>; -def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>; -def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>; -def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; -def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; -def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; -def : InstRW<[M1WriteNALU1], (instregex "^SHL[dv]")>; -def : InstRW<[M1WriteNALU1], (instregex "^[SU]SH[LR][dv]")>; -def : InstRW<[M1WriteNALU1], (instregex "^S[RS]I[dv]")>; -def : InstRW<[M1WriteNAL13], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]RSH[LR][dv]")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]QR?SHLU?[bdhsv]")>; - -// ASIMD FP instructions. -def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>; -def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>; -def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; -def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>; -def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>; -def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>; -def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; -def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; -def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; -def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; -def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>; -def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>; -def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; - -// ASIMD miscellaneous instructions. 
-def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>; -def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; -def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^EXTv8")>; -def : InstRW<[M1WriteNEONL], (instregex "^EXTv16")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>; -def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; -def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^MOVI[Dv]")>; -def : InstRW<[M1WriteNALU1], (instregex "^FMOVv")>; -def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev[248]")>; -def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>; -def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>; -def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>; -def : InstRW<[WriteSequence<[M1WriteNAL11], 2>], - (instregex "^TB[LX]v8i8Two")>; -def : InstRW<[WriteSequence<[M1WriteNAL11], 3>], - (instregex "^TB[LX]v8i8Three")>; -def : InstRW<[WriteSequence<[M1WriteNAL11], 4>], - (instregex "^TB[LX]v8i8Four")>; -def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>; -def : InstRW<[WriteSequence<[M1WriteNAL12], 2>], - (instregex "^TB[LX]v16i8Two")>; -def : InstRW<[WriteSequence<[M1WriteNAL12], 3>], - (instregex "^TB[LX]v16i8Three")>; -def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], - (instregex "^TB[LX]v16i8Four")>; -def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; -def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>; -def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>; -def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>; -def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; - -// ASIMD load instructions. 
-def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>; -def : InstRW<[M1WriteVLDD, - WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; -def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>; -def : InstRW<[M1WriteVLDE, - WriteAdr], (instregex "LD1i(64)_POST$")>; - -def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>; -def : InstRW<[M1WriteL5, - WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>; -def : InstRW<[M1WriteL5, - WriteAdr], (instregex "LD1Rv(1d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteL5, - WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteL5, - WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteL5, - WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteVLDA, - WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVLDA, - WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteVLDB, - WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVLDB, - WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteVLDC, - WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVLDC, - WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>; -def : InstRW<[M1WriteVLDG, - WriteAdr], (instregex "LD2i(8|16)_POST$")>; -def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>; -def : InstRW<[M1WriteVLDG, - WriteAdr], (instregex "LD2i(32)_POST$")>; -def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>; -def : InstRW<[M1WriteVLDH, - WriteAdr], (instregex "LD2i(64)_POST$")>; - -def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>; -def : InstRW<[M1WriteVLDA, - WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>; -def : InstRW<[M1WriteVLDA, - WriteAdr], (instregex "LD2Rv(1d)_POST$")>; -def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVLDA, - WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[M1WriteVLDF, - WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>; -def : InstRW<[M1WriteVLDF, - WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>; -def : InstRW<[M1WriteVLDF, - WriteAdr], (instregex "LD2Twov(2d)_POST$")>; - -def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>; -def : InstRW<[M1WriteVLDJ, - WriteAdr], (instregex "LD3i(8|16)_POST$")>; -def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>; -def : InstRW<[M1WriteVLDJ, - WriteAdr], (instregex "LD3i(32)_POST$")>; -def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>; -def : InstRW<[M1WriteVLDL, - WriteAdr], (instregex "LD3i(64)_POST$")>; - -def : InstRW<[M1WriteVLDB], 
(instregex "LD3Rv(8b|4h|2s)$")>; -def : InstRW<[M1WriteVLDB, - WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>; -def : InstRW<[M1WriteVLDB, - WriteAdr], (instregex "LD3Rv(1d)_POST$")>; -def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>; -def : InstRW<[M1WriteVLDB, - WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>; -def : InstRW<[M1WriteVLDB, - WriteAdr], (instregex "LD3Rv(2d)_POST$")>; - -def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>; -def : InstRW<[M1WriteVLDI, - WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>; -def : InstRW<[M1WriteVLDI, - WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>; -def : InstRW<[M1WriteVLDI, - WriteAdr], (instregex "LD3Threev(2d)_POST$")>; - -def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>; -def : InstRW<[M1WriteVLDK, - WriteAdr], (instregex "LD4i(8|16)_POST$")>; -def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>; -def : InstRW<[M1WriteVLDK, - WriteAdr], (instregex "LD4i(32)_POST$")>; -def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>; -def : InstRW<[M1WriteVLDM, - WriteAdr], (instregex "LD4i(64)_POST$")>; - -def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>; -def : InstRW<[M1WriteVLDC, - WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>; -def : InstRW<[M1WriteVLDC, - WriteAdr], (instregex "LD4Rv(1d)_POST$")>; -def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>; -def : InstRW<[M1WriteVLDC, - WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>; -def : InstRW<[M1WriteVLDC, - WriteAdr], (instregex "LD4Rv(2d)_POST$")>; - -def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>; -def : InstRW<[M1WriteVLDN, - WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>; -def : InstRW<[M1WriteVLDN, - WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>; -def : InstRW<[M1WriteVLDN, - WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; - -// ASIMD store instructions. 
-def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>; -def : InstRW<[M1WriteVSTD, - WriteAdr], (instregex "ST1i(8|16|32)_POST$")>; -def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>; -def : InstRW<[M1WriteVSTD, - WriteAdr], (instregex "ST1i(64)_POST$")>; - -def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVST, - WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVST, - WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteVSTA, - WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVSTA, - WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteVSTB, - WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVSTB, - WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[M1WriteVSTC, - WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[M1WriteVSTC, - WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>; -def : InstRW<[M1WriteVSTD, - WriteAdr], (instregex "ST2i(8|16|32)_POST$")>; -def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>; -def : InstRW<[M1WriteVSTD, - WriteAdr], (instregex "ST2i(64)_POST$")>; - -def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[M1WriteVSTD, - WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>; -def : InstRW<[M1WriteVSTE, - WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>; -def : InstRW<[M1WriteVSTE, - WriteAdr], (instregex "ST2Twov(2d)_POST$")>; - -def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>; -def : InstRW<[M1WriteVSTH, - WriteAdr], (instregex "ST3i(8|16)_POST$")>; -def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>; -def : InstRW<[M1WriteVSTH, - WriteAdr], (instregex "ST3i(32)_POST$")>; -def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>; -def : InstRW<[M1WriteVSTF, - WriteAdr], (instregex "ST3i(64)_POST$")>; - -def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>; -def : InstRW<[M1WriteVSTF, - WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>; -def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>; -def : InstRW<[M1WriteVSTG, - WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>; -def : InstRW<[M1WriteVSTG, - WriteAdr], (instregex "ST3Threev(2d)_POST$")>; - -def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>; -def : InstRW<[M1WriteVSTH, - WriteAdr], (instregex "ST4i(8|16)_POST$")>; -def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>; -def : InstRW<[M1WriteVSTH, - WriteAdr], (instregex "ST4i(32)_POST$")>; -def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>; -def : InstRW<[M1WriteVSTF, - WriteAdr], (instregex "ST4i(64)_POST$")>; - -def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>; -def : InstRW<[M1WriteVSTF, - WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>; -def : 
InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>; -def : InstRW<[M1WriteVSTI, - WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>; -def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>; -def : InstRW<[M1WriteVSTI, - WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; - -// Cryptography instructions. -def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } -def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; -def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>; -def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>; - -def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; -def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; -def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; -def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>; -def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>; - -// CRC instructions. -def : InstRW<[M1WriteC2], (instregex "^CRC32")>; - -} // SchedModel = ExynosM1Model diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedExynosM3.td b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM3.td index 5e5369a5..d66efb82 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedExynosM3.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM3.td @@ -1,9 +1,8 @@ //=- AArch64SchedExynosM3.td - Samsung Exynos M3 Sched Defs --*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -25,10 +24,9 @@ def ExynosM3Model : SchedMachineModel { let MispredictPenalty = 16; // Minimum branch misprediction penalty. let CompleteModel = 1; // Use the default model otherwise. - list UnsupportedFeatures = [HasSVE]; - - // FIXME: Remove when all errors have been fixed. - let FullInstRWOverlapCheck = 0; + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F, + SMEUnsupported.F); } //===----------------------------------------------------------------------===// @@ -106,24 +104,13 @@ def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0, M3UnitNSHF1, M3UnitNSHF2]>; -//===----------------------------------------------------------------------===// -// Predicates. - -def M3BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR && - MI->getOperand(0).isReg() && - MI->getOperand(0).getReg() != AArch64::LR}]>; -def M3ResetFastPred : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>; -def M3RotateRightFastPred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri || - MI->getOpcode() == AArch64::EXTRXrri) && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>; -def M3ShiftLeftFastPred : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>; - //===----------------------------------------------------------------------===// // Coarse scheduling model. 
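Each of these per-core files opens by instantiating SchedMachineModel (ExynosM3Model above, ExynosM4Model further down) before refining it with per-write resource definitions. A minimal sketch of that shape, with illustrative values only rather than those of any Exynos core:

  def ExampleSchedModel : SchedMachineModel {
    let IssueWidth        = 4;   // Uops issued per cycle.
    let MicroOpBufferSize = 128; // Reorder-buffer capacity.
    let LoadLatency       = 4;   // Optimistic load-to-use latency.
    let MispredictPenalty = 14;  // Minimum cost of a mispredicted branch.
    let CompleteModel     = 0;   // Tolerate opcodes the model omits.
  }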
def M3WriteZ0 : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; } +def M3WriteZ1 : SchedWriteRes<[]> { let Latency = 1; + let NumMicroOps = 0; } def M3WriteA1 : SchedWriteRes<[M3UnitALU]> { let Latency = 1; } def M3WriteAA : SchedWriteRes<[M3UnitALU]> { let Latency = 2; @@ -140,15 +127,25 @@ def M3WriteAD : SchedWriteRes<[M3UnitALU, let NumMicroOps = 2; } def M3WriteC1 : SchedWriteRes<[M3UnitC]> { let Latency = 1; } def M3WriteC2 : SchedWriteRes<[M3UnitC]> { let Latency = 2; } -def M3WriteAX : SchedWriteVariant<[SchedVar, - SchedVar, - SchedVar]>; -def M3WriteAY : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteAU : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M3WriteAV : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M3WriteAW : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M3WriteAX : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M3WriteAY : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; } -def M3WriteBX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; } def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; } @@ -165,44 +162,46 @@ def M3WriteLC : SchedWriteRes<[M3UnitA, def M3WriteLD : SchedWriteRes<[M3UnitA, M3UnitL]> { let Latency = 4; let NumMicroOps = 2; } +def M3WriteLE : SchedWriteRes<[M3UnitA, + M3UnitL]> { let Latency = 6; + let NumMicroOps = 2; } def M3WriteLH : SchedWriteRes<[]> { let Latency = 5; let NumMicroOps = 0; } - -def M3WriteLX : SchedWriteVariant<[SchedVar, - SchedVar]>; +def M3WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteLY : SchedWriteVariant<[SchedVar, + SchedVar]>; def M3WriteS1 : SchedWriteRes<[M3UnitS]> { let Latency = 1; } def M3WriteSA : SchedWriteRes<[M3UnitA, M3UnitS, - M3UnitFST]> { let Latency = 2; + M3UnitFST]> { let Latency = 3; let NumMicroOps = 2; } def M3WriteSB : SchedWriteRes<[M3UnitA, - M3UnitS]> { let Latency = 1; - let NumMicroOps = 2; } -def M3WriteSC : SchedWriteRes<[M3UnitA, M3UnitS]> { let Latency = 2; let NumMicroOps = 2; } +def M3WriteSC : SchedWriteRes<[M3UnitA, + M3UnitS, + M3UnitFST]> { let Latency = 1; + let NumMicroOps = 2; } +def M3WriteSY : SchedWriteVariant<[SchedVar, + SchedVar]>; -def M3WriteSX : SchedWriteVariant<[SchedVar, - SchedVar]>; -def M3WriteSY : SchedWriteVariant<[SchedVar, - SchedVar]>; - -def M3ReadAdrBase : SchedReadVariant<[SchedVar, - SchedVar]>; +def M3ReadAdrBase : SchedReadVariant<[SchedVar, + SchedVar]>; // Branch instructions. def : SchedAlias; -def : WriteRes { let Latency = 1; } +def : SchedAlias; // Arithmetic and logical integer instructions. -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; // Move instructions. -def : WriteRes { let Latency = 1; } +def : SchedAlias; // Divide and multiply instructions. def : WriteRes { let Latency = 4; let ResourceCycles = [2]; } // Miscellaneous instructions. -def : WriteRes { let Latency = 1; - let NumMicroOps = 2; } +def : SchedAlias; // Addressing modes. -def : WriteRes { let Latency = 1; - let NumMicroOps = 0; } +def : SchedAlias; def : SchedAlias; // Load instructions. def : SchedAlias; def : WriteRes { let Latency = 4; let NumMicroOps = 0; } -def : SchedAlias; +def : SchedAlias; // Store instructions. 
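The SchedWriteVariant definitions above are predicated writes: upstream, each SchedVar carries two template arguments, a scheduling predicate and the write list to apply when it matches, with NoSchedPred as the fall-through (the arguments were lost in transit here). A hypothetical variant in full form, assuming the ExynosArithPred predicate that AArch64SchedPredExynos.td provides:

  def M3WriteAXSketch : SchedWriteVariant<[SchedVar<ExynosArithPred, [M3WriteA1]>,
                                           SchedVar<NoSchedPred,     [M3WriteAA]>]>;

The scheduler picks the one-cycle write when the arithmetic-idiom predicate holds and the two-cycle, double-pumped write otherwise.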
def : SchedAlias; def : SchedAlias; def : SchedAlias; -def : SchedAlias; +def : SchedAlias; // FP data instructions. def : WriteRes { let Latency = 2; } @@ -245,7 +241,6 @@ def : WriteRes { let Latency = 12; def : WriteRes { let Latency = 4; } // FP miscellaneous instructions. -// TODO: Conversion between register files is much different. def : WriteRes { let Latency = 3; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 1; } @@ -259,7 +254,8 @@ def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } // Other miscellaneous instructions. def : WriteRes { let Unsupported = 1; } @@ -282,6 +278,7 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; //===----------------------------------------------------------------------===// // Finer scheduling model. @@ -481,11 +478,15 @@ def M3WriteVSTI : SchedWriteRes<[M3UnitNALU, // Special cases. def M3WriteAES : SchedWriteRes<[M3UnitNCRY]> { let Latency = 1; } +def M3WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M3WriteMOVI : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. def M3ReadAES : SchedReadAdvance<1, [M3WriteAES]>; def M3ReadFMAC : SchedReadAdvance<1, [M3WriteFMAC4, M3WriteFMAC5]>; -def M3WriteMOVI : SchedWriteVariant<[SchedVar, - SchedVar]>; def M3ReadNMUL : SchedReadAdvance<1, [M3WriteNMUL3]>; // Branch instructions @@ -496,29 +497,40 @@ def : InstRW<[M3WriteC1], (instregex "^CBN?Z[WX]")>; def : InstRW<[M3WriteAD], (instregex "^TBN?Z[WX]")>; // Arithmetic and logical integer instructions. -def : InstRW<[M3WriteA1], (instrs COPY)>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?Xrx64")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)[WX]r[sx]$")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|BIC|SUB)S[WX]r[sx]$")>; -def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|EOR|ORR|SUB)[WX]ri")>; +def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)[WX]rs$")>; +def : InstRW<[M3WriteAU], (instrs ORRWrs, ORRXrs)>; +def : InstRW<[M3WriteAX], (instregex "^(ADD|SUB)S?[WX]rx(64)?$")>; +def : InstRW<[M3WriteAX], (instregex "^(ADD|AND|BIC|SUB)S[WX]rs$")>; +def : InstRW<[M3WriteAV], (instrs ADDWri, ADDXri)>; +def : InstRW<[M3WriteAW], (instrs ORRWri, ORRXri)>; // Move instructions. -def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>; -def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>; +def : InstRW<[M3WriteCOPY], (instrs COPY)>; +def : InstRW<[M3WriteZ0], (instrs ADR, ADRP)>; +def : InstRW<[M3WriteZ0], (instregex "^MOV[NZ][WX]i")>; // Divide and multiply instructions. // Miscellaneous instructions. -def : InstRW<[M3WriteAY], (instrs EXTRWrri, EXTRXrri)>; // Load instructions. def : InstRW<[M3WriteLD, WriteLDHi, WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; +def : InstRW<[M3WriteLB, + ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>; def : InstRW<[M3WriteLX, - ReadAdrBase], (instregex "^PRFMro[WX]")>; + ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>; +def : InstRW<[M3WriteLB, + ReadAdrBase], (instrs PRFMroW)>; +def : InstRW<[M3WriteLX, + ReadAdrBase], (instrs PRFMroX)>; // Store instructions. +def : InstRW<[M3WriteSB, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; +def : InstRW<[WriteST, + ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>; // FP data instructions. 
def : InstRW<[M3WriteNSHF1], (instregex "^FABS[DS]r")>; @@ -555,9 +567,11 @@ def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M3WriteLX, - ReadAdrBase], (instregex "^LDR[BDHS]ro[WX]")>; -def : InstRW<[M3WriteLB, +def : InstRW<[M3WriteLE, + ReadAdrBase], (instregex "^LDR[BDHS]roW")>; +def : InstRW<[WriteVLD, + ReadAdrBase], (instregex "^LDR[BDHS]roX")>; +def : InstRW<[M3WriteLY, ReadAdrBase], (instregex "^LDRQro[WX]")>; def : InstRW<[WriteVLD, M3WriteLH], (instregex "^LDN?P[DS]i")>; @@ -575,20 +589,24 @@ def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M3WriteSY, - ReadAdrBase], (instregex "^STR[BDHS]ro[WX]")>; def : InstRW<[M3WriteSA, - ReadAdrBase], (instregex "^STRQro[WX]")>; + ReadAdrBase], (instregex "^STR[BDHS]roW")>; +def : InstRW<[M3WriteSA, + ReadAdrBase], (instregex "^STRQroW")>; +def : InstRW<[WriteVST, + ReadAdrBase], (instregex "^STR[BDHS]roX")>; +def : InstRW<[M3WriteSY, + ReadAdrBase], (instregex "^STRQroX")>; def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; def : InstRW<[WriteVST, WriteAdr], (instregex "^STP[DS](post|pre)")>; -def : InstRW<[M3WriteSA, +def : InstRW<[M3WriteSC, WriteAdr], (instregex "^STPQ(post|pre)")>; // ASIMD instructions. def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ABAL?v")>; def : InstRW<[M3WriteNMSC1], (instregex "^[SU]ABDL?v")>; -def : InstRW<[M3WriteNMSC1], (instregex "^(SQ)?(ABS|NEG)v")>; +def : InstRW<[M3WriteNMSC1], (instregex "^((SQ)?ABS|SQNEG)v")>; def : InstRW<[M3WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>; def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Pv")>; def : InstRW<[M3WriteNMSC3], (instregex "^[SU]H(ADD|SUB)v")>; @@ -597,7 +615,6 @@ def : InstRW<[M3WriteNMSC3], (instregex "^R?(ADD|SUB)HN2?v")>; def : InstRW<[M3WriteNMSC3], (instregex "^[SU]Q(ADD|SUB)v")>; def : InstRW<[M3WriteNMSC3], (instregex "^(SU|US)QADDv")>; def : InstRW<[M3WriteNMSC3], (instregex "^[SU]RHADDv")>; -def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Vv")>; def : InstRW<[M3WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>; def : InstRW<[M3WriteNALU1], (instregex "^CMTSTv")>; def : InstRW<[M3WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>; @@ -647,12 +664,12 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>; // ASIMD miscellaneous instructions. def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; def : InstRW<[M3WriteNSHF1], (instregex "^[SU]?Q?XTU?Nv")>; -def : InstRW<[M3WriteNSHF1], (instregex "^CPY")>; +def : InstRW<[M3WriteNSHF1], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[M3WriteNSHF1], (instregex "^INSv.+lane")>; def : InstRW<[M3WriteMOVI], (instregex "^MOVI")>; def : InstRW<[M3WriteNALU1], (instregex "^FMOVv")>; @@ -668,108 +685,108 @@ def : InstRW<[M3WriteNSHF1], (instregex "^(TRN|UZP|ZIP)[12]v")>; // ASIMD load instructions. 
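This hunk swaps WriteAdr for M3WriteA1 on every post-indexed (_POST) vector load below, so the writeback uop is costed as a one-cycle ALU write instead of the generic addressing-mode write. Each load therefore appears as a pair, the plain form and a _POST form with the extra write, as in:

  def : InstRW<[M3WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
  def : InstRW<[M3WriteL5,
                M3WriteA1], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>;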
def : InstRW<[M3WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteL5, - WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteL5, - WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD1Onev(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteVLDA, - WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDA, - WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteVLDB, - WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDB, - WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteVLDC, - WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDC, - WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDD], (instregex "LD1i(8|16|32)$")>; def : InstRW<[M3WriteVLDD, - WriteAdr], (instregex "LD1i(8|16|32)_POST")>; + M3WriteA1], (instregex "LD1i(8|16|32)_POST")>; def : InstRW<[M3WriteVLDE], (instregex "LD1i(64)$")>; def : InstRW<[M3WriteVLDE, - WriteAdr], (instregex "LD1i(64)_POST")>; + M3WriteA1], (instregex "LD1i(64)_POST")>; def : InstRW<[M3WriteL5], (instregex "LD1Rv(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteL5, - WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD1Rv(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteL5, - WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD1Rv(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>; def : InstRW<[M3WriteVLDF, - WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST")>; + M3WriteA1], (instregex "LD2Twov(8b|4h|2s)_POST")>; def : InstRW<[M3WriteVLDF], (instregex "LD2Twov(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDF, - WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDG], (instregex "LD2i(8|16|32)$")>; def : InstRW<[M3WriteVLDG, - WriteAdr], (instregex "LD2i(8|16|32)_POST")>; + M3WriteA1], (instregex "LD2i(8|16|32)_POST")>; def : InstRW<[M3WriteVLDH], (instregex "LD2i(64)$")>; def : InstRW<[M3WriteVLDH, - WriteAdr], (instregex "LD2i(64)_POST")>; + M3WriteA1], (instregex "LD2i(64)_POST")>; def : InstRW<[M3WriteVLDA], (instregex "LD2Rv(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteVLDA, - WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD2Rv(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDA, - WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD2Rv(16b|8h|4s|2d)_POST")>; def : 
InstRW<[M3WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>; def : InstRW<[M3WriteVLDI, - WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST")>; + M3WriteA1], (instregex "LD3Threev(8b|4h|2s)_POST")>; def : InstRW<[M3WriteVLDI], (instregex "LD3Threev(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDI, - WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDJ], (instregex "LD3i(8|16|32)$")>; def : InstRW<[M3WriteVLDJ, - WriteAdr], (instregex "LD3i(8|16|32)_POST")>; + M3WriteA1], (instregex "LD3i(8|16|32)_POST")>; def : InstRW<[M3WriteVLDL], (instregex "LD3i(64)$")>; def : InstRW<[M3WriteVLDL, - WriteAdr], (instregex "LD3i(64)_POST")>; + M3WriteA1], (instregex "LD3i(64)_POST")>; def : InstRW<[M3WriteVLDB], (instregex "LD3Rv(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteVLDB, - WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD3Rv(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteVLDB], (instregex "LD3Rv(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDB, - WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD3Rv(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>; def : InstRW<[M3WriteVLDN, - WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST")>; + M3WriteA1], (instregex "LD4Fourv(8b|4h|2s)_POST")>; def : InstRW<[M3WriteVLDN], (instregex "LD4Fourv(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDN, - WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; def : InstRW<[M3WriteVLDK], (instregex "LD4i(8|16|32)$")>; def : InstRW<[M3WriteVLDK, - WriteAdr], (instregex "LD4i(8|16|32)_POST")>; + M3WriteA1], (instregex "LD4i(8|16|32)_POST")>; def : InstRW<[M3WriteVLDM], (instregex "LD4i(64)$")>; def : InstRW<[M3WriteVLDM, - WriteAdr], (instregex "LD4i(64)_POST")>; + M3WriteA1], (instregex "LD4i(64)_POST")>; def : InstRW<[M3WriteVLDC], (instregex "LD4Rv(8b|4h|2s|1d)$")>; def : InstRW<[M3WriteVLDC, - WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST")>; + M3WriteA1], (instregex "LD4Rv(8b|4h|2s|1d)_POST")>; def : InstRW<[M3WriteVLDC], (instregex "LD4Rv(16b|8h|4s|2d)$")>; def : InstRW<[M3WriteVLDC, - WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST")>; + M3WriteA1], (instregex "LD4Rv(16b|8h|4s|2d)_POST")>; // ASIMD store instructions. def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>; diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedExynosM4.td b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM4.td new file mode 100644 index 00000000..94e70793 --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM4.td @@ -0,0 +1,1017 @@ +//=- AArch64SchedExynosM4.td - Samsung Exynos M4 Sched Defs --*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Samsung Exynos M4 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Exynos-M4 is an advanced superscalar microprocessor with a 6-wide +// in-order stage for decode and dispatch and a wider issue stage. 
+// The execution units and loads and stores are out-of-order. + +def ExynosM4Model : SchedMachineModel { + let IssueWidth = 6; // Up to 6 uops per cycle. + let MicroOpBufferSize = 228; // ROB size. + let LoopMicroOpBufferSize = 48; // Based on the instruction queue size. + let LoadLatency = 4; // Optimistic load cases. + let MispredictPenalty = 16; // Minimum branch misprediction penalty. + let CompleteModel = 1; // Use the default model otherwise. + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F, + SMEUnsupported.F); +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on the Exynos-M4. + +let SchedModel = ExynosM4Model in { + +def M4UnitA : ProcResource<2>; // Simple integer +def M4UnitC : ProcResource<2>; // Simple and complex integer +let Super = M4UnitC, BufferSize = 1 in +def M4UnitD : ProcResource<1>; // Integer division (inside C0, serialized) +let Super = M4UnitC in +def M4UnitE : ProcResource<1>; // CRC (inside C0) +def M4UnitB : ProcResource<2>; // Branch +def M4UnitL0 : ProcResource<1>; // Load +def M4UnitS0 : ProcResource<1>; // Store +def M4PipeLS : ProcResource<1>; // Load/Store +let Super = M4PipeLS in { + def M4UnitL1 : ProcResource<1>; + def M4UnitS1 : ProcResource<1>; +} +def M4PipeF0 : ProcResource<1>; // FP #0 +let Super = M4PipeF0 in { + def M4UnitFMAC0 : ProcResource<1>; // FP multiplication + def M4UnitFADD0 : ProcResource<1>; // Simple FP + def M4UnitFCVT0 : ProcResource<1>; // FP conversion + def M4UnitNALU0 : ProcResource<1>; // Simple vector + def M4UnitNHAD : ProcResource<1>; // Horizontal vector + def M4UnitNMSC : ProcResource<1>; // FP and vector miscellanea + def M4UnitNMUL0 : ProcResource<1>; // Vector multiplication + def M4UnitNSHT0 : ProcResource<1>; // Vector shifting + def M4UnitNSHF0 : ProcResource<1>; // Vector shuffling + def M4UnitNCRY0 : ProcResource<1>; // Cryptographic +} +def M4PipeF1 : ProcResource<1>; // FP #1 +let Super = M4PipeF1 in { + def M4UnitFMAC1 : ProcResource<1>; // FP multiplication + def M4UnitFADD1 : ProcResource<1>; // Simple FP + def M4UnitFDIV0 : ProcResource<2>; // FP division (serialized) + def M4UnitFSQR0 : ProcResource<2>; // FP square root (serialized) + def M4UnitFST0 : ProcResource<1>; // FP store + def M4UnitNALU1 : ProcResource<1>; // Simple vector + def M4UnitNSHT1 : ProcResource<1>; // Vector shifting + def M4UnitNSHF1 : ProcResource<1>; // Vector shuffling +} +def M4PipeF2 : ProcResource<1>; // FP #2 +let Super = M4PipeF2 in { + def M4UnitFMAC2 : ProcResource<1>; // FP multiplication + def M4UnitFADD2 : ProcResource<1>; // Simple FP + def M4UnitFCVT1 : ProcResource<1>; // FP conversion + def M4UnitFDIV1 : ProcResource<2>; // FP division (serialized) + def M4UnitFSQR1 : ProcResource<2>; // FP square root (serialized) + def M4UnitFST1 : ProcResource<1>; // FP store + def M4UnitNALU2 : ProcResource<1>; // Simple vector + def M4UnitNMUL1 : ProcResource<1>; // Vector multiplication + def M4UnitNSHT2 : ProcResource<1>; // Vector shifting + def M4UnitNCRY1 : ProcResource<1>; // Cryptographic +} + +def M4UnitALU : ProcResGroup<[M4UnitA, + M4UnitC]>; +def M4UnitL : ProcResGroup<[M4UnitL0, + M4UnitL1]>; +def M4UnitS : ProcResGroup<[M4UnitS0, + M4UnitS1]>; +def M4UnitFMAC : ProcResGroup<[M4UnitFMAC0, + M4UnitFMAC1, + M4UnitFMAC2]>; +def M4UnitFMACH : ProcResGroup<[M4UnitFMAC0, + M4UnitFMAC1]>; +def M4UnitFADD : ProcResGroup<[M4UnitFADD0, + M4UnitFADD1, + M4UnitFADD2]>; +def M4UnitFADDH : 
ProcResGroup<[M4UnitFADD0, + M4UnitFADD1]>; +def M4UnitFCVT : ProcResGroup<[M4UnitFCVT0, + M4UnitFCVT1]>; +def M4UnitFCVTH : ProcResGroup<[M4UnitFCVT0]>; +def M4UnitFDIV : ProcResGroup<[M4UnitFDIV0, + M4UnitFDIV1]>; +def M4UnitFDIVH : ProcResGroup<[M4UnitFDIV0]>; +def M4UnitFSQR : ProcResGroup<[M4UnitFSQR0, + M4UnitFSQR1]>; +def M4UnitFSQRH : ProcResGroup<[M4UnitFSQR0]>; +def M4UnitFST : ProcResGroup<[M4UnitFST0, + M4UnitFST1]>; +def M4UnitNALU : ProcResGroup<[M4UnitNALU0, + M4UnitNALU1, + M4UnitNALU2]>; +def M4UnitNALUH : ProcResGroup<[M4UnitNALU0, + M4UnitNALU1]>; +def M4UnitNMUL : ProcResGroup<[M4UnitNMUL0, + M4UnitNMUL1]>; +def M4UnitNSHT : ProcResGroup<[M4UnitNSHT0, + M4UnitNSHT1, + M4UnitNSHT2]>; +def M4UnitNSHF : ProcResGroup<[M4UnitNSHF0, + M4UnitNSHF1]>; +def M4UnitNSHFH : ProcResGroup<[M4UnitNSHF0]>; +def M4UnitNCRY : ProcResGroup<[M4UnitNCRY0, + M4UnitNCRY1]>; + +//===----------------------------------------------------------------------===// +// Resources details. + +def M4WriteZ0 : SchedWriteRes<[]> { let Latency = 0; } +def M4WriteZ1 : SchedWriteRes<[]> { let Latency = 1; + let NumMicroOps = 0; } +def M4WriteZ4 : SchedWriteRes<[]> { let Latency = 4; + let NumMicroOps = 0; } + +def M4WriteA1 : SchedWriteRes<[M4UnitALU]> { let Latency = 1; } +def M4WriteA2 : SchedWriteRes<[M4UnitALU]> { let Latency = 2; } +def M4WriteAA : SchedWriteRes<[M4UnitALU]> { let Latency = 2; + let ResourceCycles = [2]; } +def M4WriteAB : SchedWriteRes<[M4UnitALU, + M4UnitC]> { let Latency = 2; + let NumMicroOps = 2; } +def M4WriteAC : SchedWriteRes<[M4UnitALU, + M4UnitALU, + M4UnitC]> { let Latency = 3; + let NumMicroOps = 3; } +def M4WriteAD : SchedWriteRes<[M4UnitALU, + M4UnitC]> { let Latency = 2; + let NumMicroOps = 2; } +def M4WriteAF : SchedWriteRes<[M4UnitALU]> { let Latency = 2; + let NumMicroOps = 2; } +def M4WriteAU : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M4WriteAV : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar, + SchedVar]>; +def M4WriteAX : SchedWriteVariant<[SchedVar, + SchedVar, + SchedVar]>; +def M4WriteAY : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M4WriteB1 : SchedWriteRes<[M4UnitB]> { let Latency = 1; } +def M4WriteBX : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M4WriteC1 : SchedWriteRes<[M4UnitC]> { let Latency = 1; } +def M4WriteC3 : SchedWriteRes<[M4UnitC]> { let Latency = 3; } +def M4WriteCA : SchedWriteRes<[M4UnitC]> { let Latency = 4; + let ResourceCycles = [2]; } + +def M4WriteD12 : SchedWriteRes<[M4UnitD]> { let Latency = 12; + let ResourceCycles = [12]; } +def M4WriteD21 : SchedWriteRes<[M4UnitD]> { let Latency = 21; + let ResourceCycles = [21]; } + +def M4WriteE2 : SchedWriteRes<[M4UnitE]> { let Latency = 2; } + +def M4WriteL4 : SchedWriteRes<[M4UnitL]> { let Latency = 4; } +def M4WriteL5 : SchedWriteRes<[M4UnitL]> { let Latency = 5; } +def M4WriteLA : SchedWriteRes<[M4UnitL, + M4UnitL]> { let Latency = 5; + let NumMicroOps = 1; } +def M4WriteLB : SchedWriteRes<[M4UnitA, + M4UnitL]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteLC : SchedWriteRes<[M4UnitA, + M4UnitL, + M4UnitL]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteLD : SchedWriteRes<[M4UnitA, + M4UnitL]> { let Latency = 4; + let NumMicroOps = 2; } +def M4WriteLE : SchedWriteRes<[M4UnitA, + M4UnitL]> { let Latency = 6; + let NumMicroOps = 2; } +def M4WriteLH : SchedWriteRes<[]> { let Latency = 5; + let NumMicroOps = 0; } +def M4WriteLX : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M4WriteLY : SchedWriteVariant<[SchedVar, + SchedVar]>; + 
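Each SchedWriteRes above binds a write to the processor resources it occupies: Latency is the cycle count until the result can be forwarded, NumMicroOps counts against the model's IssueWidth, and ResourceCycles (one entry per listed unit, defaulting to 1) is how long each unit stays busy. A hypothetical two-uop load write in that shape:

  def M4WriteLSketch : SchedWriteRes<[M4UnitA, M4UnitL]> {
    let Latency        = 5;      // Result ready after five cycles.
    let NumMicroOps    = 2;      // One address uop plus one load uop.
    let ResourceCycles = [1, 2]; // The load pipe is held for two cycles.
  }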
+def M4WriteS1 : SchedWriteRes<[M4UnitS]> { let Latency = 1; } +def M4WriteSA : SchedWriteRes<[M4UnitS0]> { let Latency = 3; } +def M4WriteSB : SchedWriteRes<[M4UnitA, + M4UnitS]> { let Latency = 2; + let NumMicroOps = 1; } +def M4WriteSX : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M4ReadAdrBase : SchedReadVariant<[SchedVar< + MCSchedPredicate< + CheckAny< + [ScaledIdxFn, + ExynosScaledIdxFn]>>, [ReadDefault]>, + SchedVar]>; + +def M4WriteNEONA : SchedWriteRes<[M4UnitNSHF, + M4UnitFADD]> { let Latency = 3; + let NumMicroOps = 2; } +def M4WriteNEONB : SchedWriteRes<[M4UnitNALU, + M4UnitS0]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteNEOND : SchedWriteRes<[M4UnitNSHF, + M4UnitFST]> { let Latency = 6; + let NumMicroOps = 2; } +def M4WriteNEONH : SchedWriteRes<[M4UnitNALU, + M4UnitS0]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteNEONI : SchedWriteRes<[M4UnitNSHF, + M4UnitS0]> { let Latency = 2; + let NumMicroOps = 2; } +def M4WriteNEONJ : SchedWriteRes<[M4UnitNMSC, + M4UnitS0]> { let Latency = 4; } +def M4WriteNEONK : SchedWriteRes<[M4UnitNSHF, + M4UnitNMSC, + M4UnitS0]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteNEONL : SchedWriteRes<[M4UnitNMUL]> { let Latency = 3; } +def M4WriteNEONN : SchedWriteRes<[M4UnitNMSC, + M4UnitNMSC]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteNEONO : SchedWriteRes<[M4UnitNMSC, + M4UnitNMSC, + M4UnitNMSC]> { let Latency = 8; + let NumMicroOps = 3; } +def M4WriteNEONP : SchedWriteRes<[M4UnitNSHF, + M4UnitNMSC]> { let Latency = 4; + let NumMicroOps = 2; } +def M4WriteNEONQ : SchedWriteRes<[M4UnitNMSC, + M4UnitC]> { let Latency = 3; + let NumMicroOps = 1; } +def M4WriteNEONR : SchedWriteRes<[M4UnitFCVT0, + M4UnitS0]> { let Latency = 4; + let NumMicroOps = 1; } +def M4WriteNEONV : SchedWriteRes<[M4UnitFDIV, + M4UnitFDIV]> { let Latency = 7; + let ResourceCycles = [6, 6]; } +def M4WriteNEONVH : SchedWriteRes<[M4UnitFDIVH, + M4UnitFDIVH]> { let Latency = 7; + let ResourceCycles = [6, 6]; } +def M4WriteNEONW : SchedWriteRes<[M4UnitFDIV, + M4UnitFDIV]> { let Latency = 12; + let ResourceCycles = [9, 9]; } +def M4WriteNEONX : SchedWriteRes<[M4UnitFSQR, + M4UnitFSQR]> { let Latency = 8; + let ResourceCycles = [7, 7]; } +def M4WriteNEONXH : SchedWriteRes<[M4UnitFSQRH, + M4UnitFSQRH]> { let Latency = 7; + let ResourceCycles = [6, 6]; } +def M4WriteNEONY : SchedWriteRes<[M4UnitFSQR, + M4UnitFSQR]> { let Latency = 12; + let ResourceCycles = [9, 9]; } +def M4WriteNEONZ : SchedWriteVariant<[SchedVar, + SchedVar]>; + +def M4WriteFADD2 : SchedWriteRes<[M4UnitFADD]> { let Latency = 2; } +def M4WriteFADD2H : SchedWriteRes<[M4UnitFADDH]> { let Latency = 2; } + +def M4WriteFCVT2 : SchedWriteRes<[M4UnitFCVT]> { let Latency = 2; } +def M4WriteFCVT2A : SchedWriteRes<[M4UnitFCVT0]> { let Latency = 2; } +def M4WriteFCVT2H : SchedWriteRes<[M4UnitFCVTH]> { let Latency = 2; } +def M4WriteFCVT3 : SchedWriteRes<[M4UnitFCVT]> { let Latency = 3; } +def M4WriteFCVT3A : SchedWriteRes<[M4UnitFCVT0]> { let Latency = 3; } +def M4WriteFCVT3H : SchedWriteRes<[M4UnitFCVTH]> { let Latency = 3; } +def M4WriteFCVT4 : SchedWriteRes<[M4UnitFCVT]> { let Latency = 4; } +def M4WriteFCVT4A : SchedWriteRes<[M4UnitFCVT0]> { let Latency = 4; } +def M4WriteFCVT6A : SchedWriteRes<[M4UnitFCVT0]> { let Latency = 6; } + +def M4WriteFDIV7 : SchedWriteRes<[M4UnitFDIV]> { let Latency = 7; + let ResourceCycles = [6]; } +def M4WriteFDIV7H : SchedWriteRes<[M4UnitFDIVH]> { let Latency = 7; + let ResourceCycles = [6]; } +def M4WriteFDIV12 : SchedWriteRes<[M4UnitFDIV]> { let 
Latency = 12; + let ResourceCycles = [9]; } + +def M4WriteFMAC2H : SchedWriteRes<[M4UnitFMACH]> { let Latency = 2; } +def M4WriteFMAC3H : SchedWriteRes<[M4UnitFMACH]> { let Latency = 3; } +def M4WriteFMAC3 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 3; } +def M4WriteFMAC4 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 4; } +def M4WriteFMAC4H : SchedWriteRes<[M4UnitFMACH]> { let Latency = 4; } +def M4WriteFMAC5 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 5; } + +def M4WriteFSQR7H : SchedWriteRes<[M4UnitFSQRH]> { let Latency = 7; + let ResourceCycles = [6]; } +def M4WriteFSQR8 : SchedWriteRes<[M4UnitFSQR]> { let Latency = 8; + let ResourceCycles = [7]; } +def M4WriteFSQR12 : SchedWriteRes<[M4UnitFSQR]> { let Latency = 12; + let ResourceCycles = [9]; } + +def M4WriteNALU1 : SchedWriteRes<[M4UnitNALU]> { let Latency = 1; } +def M4WriteNALU1H : SchedWriteRes<[M4UnitNALUH]> { let Latency = 1; } + +def M4WriteNCRY1 : SchedWriteRes<[M4UnitNCRY]> { let Latency = 1; } +def M4WriteNCRY1A : SchedWriteRes<[M4UnitNCRY0]> { let Latency = 1; } +def M4WriteNCRY3A : SchedWriteRes<[M4UnitNCRY0]> { let Latency = 3; } +def M4WriteNCRY5A : SchedWriteRes<[M4UnitNCRY]> { let Latency = 5; } + +def M4WriteNHAD1 : SchedWriteRes<[M4UnitNHAD]> { let Latency = 1; } +def M4WriteNHAD3 : SchedWriteRes<[M4UnitNHAD]> { let Latency = 3; } + +def M4WriteNMSC1 : SchedWriteRes<[M4UnitNMSC]> { let Latency = 1; } +def M4WriteNMSC2 : SchedWriteRes<[M4UnitNMSC]> { let Latency = 2; } +def M4WriteNMSC3 : SchedWriteRes<[M4UnitNMSC]> { let Latency = 3; } + +def M4WriteNMUL3 : SchedWriteRes<[M4UnitNMUL]> { let Latency = 3; } + +def M4WriteNSHF1 : SchedWriteRes<[M4UnitNSHF]> { let Latency = 1; } +def M4WriteNSHF1H : SchedWriteRes<[M4UnitNSHFH]> { let Latency = 1; } +def M4WriteNSHF3 : SchedWriteRes<[M4UnitNSHF]> { let Latency = 3; } +def M4WriteNSHFA : SchedWriteRes<[M4UnitNSHF]> { let Latency = 1; + let ResourceCycles = [2]; } +def M4WriteNSHFB : SchedWriteRes<[M4UnitNSHF]> { let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; } +def M4WriteNSHFC : SchedWriteRes<[M4UnitNSHF]> { let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [4]; } +def M4WriteNSHFD : SchedWriteRes<[M4UnitNSHF]> { let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; } + +def M4WriteNSHT1 : SchedWriteRes<[M4UnitNSHT]> { let Latency = 1; } +def M4WriteNSHT2 : SchedWriteRes<[M4UnitNSHT]> { let Latency = 2; } +def M4WriteNSHT3 : SchedWriteRes<[M4UnitNSHT]> { let Latency = 3; } +def M4WriteNSHT4A : SchedWriteRes<[M4UnitNSHT1]> { let Latency = 4; } + +def M4WriteVLDA : SchedWriteRes<[M4UnitL, + M4UnitL]> { let Latency = 5; + let NumMicroOps = 2; } +def M4WriteVLDB : SchedWriteRes<[M4UnitL, + M4UnitL, + M4UnitL]> { let Latency = 6; + let NumMicroOps = 3; } +def M4WriteVLDC : SchedWriteRes<[M4UnitL, + M4UnitL, + M4UnitL, + M4UnitL]> { let Latency = 6; + let NumMicroOps = 4; } +def M4WriteVLDD : SchedWriteRes<[M4UnitL, + M4UnitNSHF]> { let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2, 1]; } +def M4WriteVLDF : SchedWriteRes<[M4UnitL, + M4UnitL]> { let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [3, 3]; } +def M4WriteVLDG : SchedWriteRes<[M4UnitL, + M4UnitNSHF, + M4UnitNSHF]> { let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2, 1, 1]; } +def M4WriteVLDI : SchedWriteRes<[M4UnitL, + M4UnitL, + M4UnitL]> { let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [3, 3, 3]; } +def M4WriteVLDJ : SchedWriteRes<[M4UnitL, + M4UnitNSHF, + M4UnitNSHF, + M4UnitNSHF]> { let Latency = 7; + 
let NumMicroOps = 4; + let ResourceCycles = [3, 1, 1, 1]; } +def M4WriteVLDK : SchedWriteRes<[M4UnitL, + M4UnitNSHF, + M4UnitNSHF, + M4UnitNSHF, + M4UnitNSHF]> { let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [3, 1, 1, 1, 1]; } +def M4WriteVLDL : SchedWriteRes<[M4UnitL, + M4UnitNSHF, + M4UnitNSHF, + M4UnitL, + M4UnitNSHF]> { let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [3, 1, 1, 6, 1]; } +def M4WriteVLDM : SchedWriteRes<[M4UnitL, + M4UnitNSHF, + M4UnitNSHF, + M4UnitL, + M4UnitNSHF, + M4UnitNSHF]> { let Latency = 7; + let NumMicroOps = 6; + let ResourceCycles = [3, 1, 1, 3, 1, 1]; } +def M4WriteVLDN : SchedWriteRes<[M4UnitL, + M4UnitL, + M4UnitL, + M4UnitL]> { let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [3, 3, 3, 3]; } + +def M4WriteVST1 : SchedWriteRes<[M4UnitS, + M4UnitFST]> { let Latency = 1; + let NumMicroOps = 1; } +def M4WriteVSTA : WriteSequence<[WriteVST], 2>; +def M4WriteVSTB : WriteSequence<[WriteVST], 3>; +def M4WriteVSTC : WriteSequence<[WriteVST], 4>; +def M4WriteVSTD : SchedWriteRes<[M4UnitS, + M4UnitFST]> { let Latency = 2; } +def M4WriteVSTE : SchedWriteRes<[M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST]> { let Latency = 2; + let NumMicroOps = 2; } +def M4WriteVSTF : SchedWriteRes<[M4UnitNSHF, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST]> { let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1, 2, 1]; } +def M4WriteVSTG : SchedWriteRes<[M4UnitNSHF, + M4UnitNSHF, + M4UnitNSHF, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST]> { let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1, 1, 1, 2, 1, 2, 1, 2, 1]; } +def M4WriteVSTI : SchedWriteRes<[M4UnitNSHF, + M4UnitNSHF, + M4UnitNSHF, + M4UnitNSHF, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST]> { let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1]; } +def M4WriteVSTJ : SchedWriteRes<[M4UnitA, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST]> { let Latency = 1; + let NumMicroOps = 2; } +def M4WriteVSTK : SchedWriteRes<[M4UnitA, + M4UnitS, + M4UnitFST]> { let Latency = 3; + let NumMicroOps = 2; } +def M4WriteVSTL : SchedWriteRes<[M4UnitNSHF, + M4UnitNSHF, + M4UnitS, + M4UnitFST, + M4UnitS, + M4UnitFST]> { let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 2, 1, 2, 1]; } +def M4WriteVSTY : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Special cases. +def M4WriteCOPY : SchedWriteVariant<[SchedVar, + SchedVar]>; +def M4WriteMOVI : SchedWriteVariant<[SchedVar, + SchedVar]>; + +// Fast forwarding. +def M4ReadAESM1 : SchedReadAdvance<+1, [M4WriteNCRY1]>; +def M4ReadFMACM1 : SchedReadAdvance<+1, [M4WriteFMAC4, + M4WriteFMAC4H, + M4WriteFMAC5]>; +def M4ReadNMULM1 : SchedReadAdvance<+1, [M4WriteNMUL3]>; +def M4ReadNMULP2 : SchedReadAdvance<-2, [M4WriteNMUL3]>; + + +//===----------------------------------------------------------------------===// +// Coarse scheduling model. + +// Branch instructions. +def : SchedAlias; +def : SchedAlias; + +// Arithmetic and logical integer instructions. +def : SchedAlias; +def : SchedAlias; // FIXME: M4WriteAX crashes TableGen. +def : SchedAlias; // FIXME: M4WriteAX crashes TableGen. +def : SchedAlias; + +// Move instructions. +def : SchedAlias; + +// Divide and multiply instructions. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// Miscellaneous instructions. +def : SchedAlias; + +// Addressing modes. 
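The SchedAlias lines in this hunk normally carry two operands, a generic AArch64 write type and the core-specific write that implements it; aliasing is how the coarse model reuses the resource definitions above. A hypothetical alias, assuming the generic WriteBr type from AArch64Schedule.td:

  def : SchedAlias<WriteBr, M4WriteZ0>; // Direct branches resolve for free here.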
+// Addressing modes.
+def : SchedAlias;
+def : SchedAlias;
+
+// Load instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// Store instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// FP data instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// FP miscellaneous instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// FP load instructions.
+def : SchedAlias;
+
+// FP store instructions.
+def : SchedAlias;
+
+// ASIMD FP instructions.
+def : SchedAlias;
+def : SchedAlias;
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Generic fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+// TODO: The forwarding for 32 bits actually saves 2 cycles.
+def : ReadAdvance<ReadIMA,     3, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+def : ReadAdvance<ReadST,      0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model.
+
+// Branch instructions
+def : InstRW<[M4WriteB1],  (instrs Bcc)>;
+def : InstRW<[M4WriteAF],  (instrs BL)>;
+def : InstRW<[M4WriteBX],  (instrs BLR)>;
+def : InstRW<[M4WriteC1],  (instregex "^CBN?Z[WX]")>;
+def : InstRW<[M4WriteAD],  (instregex "^TBN?Z[WX]")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M4WriteAX],  (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)[WX]rs$")>;
+def : InstRW<[M4WriteAU],  (instrs ORRWrs, ORRXrs)>;
+def : InstRW<[M4WriteAX],  (instregex "^(ADD|AND|BIC|SUB)S[WX]rs$")>;
+def : InstRW<[M4WriteAX],  (instregex "^(ADD|SUB)S?[WX]rx(64)?$")>;
+def : InstRW<[M4WriteAV],  (instrs ADDWri, ADDXri, ORRWri, ORRXri)>;
+
+// Move instructions.
+def : InstRW<[M4WriteCOPY], (instrs COPY)>;
+def : InstRW<[M4WriteZ0],   (instrs ADR, ADRP)>;
+def : InstRW<[M4WriteZ0],   (instregex "^MOV[NZ][WX]i")>;
+
+// Divide and multiply instructions.
+
+// Miscellaneous instructions.
+
+// Load instructions.
+def : InstRW<[M4WriteLD, WriteLDHi, WriteAdr],  (instregex "^LDP(SW|W|X)(post|pre)")>;
+def : InstRW<[M4WriteL5, ReadAdrBase],  (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>;
+def : InstRW<[WriteLDIdx, ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>;
+def : InstRW<[M4WriteL5, ReadAdrBase],  (instrs PRFMroW)>;
+def : InstRW<[WriteLDIdx, ReadAdrBase], (instrs PRFMroX)>;
+
+// Store instructions.
+def : InstRW<[M4WriteSB, ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>;
+def : InstRW<[WriteST, ReadAdrBase],   (instregex "^STR(BB|HH|W|X)roX")>;
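The ReadAdvance block above states how many cycles early each generic read sees its operand; only the multiply-accumulate read gets a nonzero advance. As a worked example, assuming the 64-bit multiply write has a 4-cycle latency, a chain of dependent MADDs pays 4 - 3 = 1 cycle per link on the accumulator input instead of the full multiply latency:

    //   madd x0, x1, x2, x0   ; accumulator x0 forwarded 3 cycles early
    //   madd x0, x3, x4, x0   ; chain advances every ~1 cycle, not every 4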
+// FP data instructions.
+def : InstRW<[M4WriteNSHF1H], (instrs FABSHr)>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^FABS[SD]r")>;
+def : InstRW<[M4WriteFADD2H], (instregex "^F(ADD|SUB)Hrr")>;
+def : InstRW<[M4WriteFADD2],  (instregex "^F(ADD|SUB)[SD]rr")>;
+def : InstRW<[M4WriteFADD2H], (instregex "^FADDPv.i16")>;
+def : InstRW<[M4WriteFADD2],  (instregex "^FADDPv.i(32|64)")>;
+def : InstRW<[M4WriteNEONQ],  (instregex "^FCCMPE?[HSD]rr")>;
+def : InstRW<[M4WriteNMSC2],  (instregex "^FCMPE?[HSD]r[ir]")>;
+def : InstRW<[M4WriteNMSC1],  (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(16|32|64|v1)")>;
+def : InstRW<[M4WriteFDIV7H], (instrs FDIVHrr)>;
+def : InstRW<[M4WriteFDIV7],  (instrs FDIVSrr)>;
+def : InstRW<[M4WriteFDIV12], (instrs FDIVDrr)>;
+def : InstRW<[M4WriteNMSC1],  (instregex "^F(MAX|MIN)(NM)?[HSD]rr")>;
+def : InstRW<[M4WriteFMAC3H], (instregex "^FN?MULHrr")>;
+def : InstRW<[M4WriteFMAC3],  (instregex "^FN?MUL[SD]rr")>;
+def : InstRW<[M4WriteFMAC3H], (instrs FMULX16)>;
+def : InstRW<[M4WriteFMAC3],  (instregex "^FMULX(32|64)")>;
+def : InstRW<[M4WriteFMAC4H, M4ReadFMACM1], (instregex "^FN?M(ADD|SUB)Hrrr")>;
+def : InstRW<[M4WriteFMAC4,  M4ReadFMACM1], (instregex "^FN?M(ADD|SUB)[SD]rrr")>;
+def : InstRW<[M4WriteNALU1H], (instrs FNEGHr)>;
+def : InstRW<[M4WriteNALU1],  (instregex "^FNEG[SD]r")>;
+def : InstRW<[M4WriteFCVT3A], (instregex "^FRINT.+r")>;
+def : InstRW<[M4WriteNEONH],  (instregex "^FCSEL[HSD]rrr")>;
+def : InstRW<[M4WriteFSQR7H], (instrs FSQRTHr)>;
+def : InstRW<[M4WriteFSQR8],  (instrs FSQRTSr)>;
+def : InstRW<[M4WriteFSQR12], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M4WriteFCVT2H], (instregex "^FCVTH[SD]r")>;
+def : InstRW<[M4WriteFCVT2H], (instregex "^FCVT[SD]Hr")>;
+def : InstRW<[M4WriteFCVT2],  (instregex "^FCVT[SD][SD]r")>;
+def : InstRW<[M4WriteFCVT6A], (instregex "^[SU]CVTF[SU][XW][HSD]ri")>;
+def : InstRW<[M4WriteNEONR],  (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+def : InstRW<[M4WriteNALU1],  (instregex "^FMOV[HSD][ir]")>;
+def : InstRW<[M4WriteSA],     (instregex "^FMOV[WX][HSD]r")>;
+def : InstRW<[M4WriteNEONJ],  (instregex "^FMOV[HSD][WX]r")>;
+def : InstRW<[M4WriteNEONI],  (instregex "^FMOVXDHighr")>;
+def : InstRW<[M4WriteNEONK],  (instregex "^FMOVDXHighr")>;
+def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev1f16")>;
+def : InstRW<[M4WriteFCVT3],  (instregex "^F(RECP|RSQRT)Ev1i(32|64)")>;
+def : InstRW<[M4WriteNMSC1],  (instregex "^FRECPXv1")>;
+def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)S16")>;
+def : InstRW<[M4WriteFMAC4],  (instregex "^F(RECP|RSQRT)S(32|64)")>;
+
+// FP load instructions.
+def : InstRW<[WriteVLD],  (instregex "^LDR[SDQ]l")>;
+def : InstRW<[WriteVLD],  (instregex "^LDUR[BHSDQ]i")>;
+def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BHSDQ](post|pre)")>;
+def : InstRW<[WriteVLD],  (instregex "^LDR[BHSDQ]ui")>;
+def : InstRW<[M4WriteLE, ReadAdrBase], (instregex "^LDR[BHSDQ]roW")>;
+def : InstRW<[WriteVLD, ReadAdrBase],  (instregex "^LDR[BHSD]roX")>;
+def : InstRW<[M4WriteLY, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[WriteVLD, M4WriteLH],    (instregex "^LDN?P[SD]i")>;
+def : InstRW<[M4WriteLA, M4WriteLH],   (instregex "^LDN?PQi")>;
+def : InstRW<[M4WriteL5, M4WriteLH, WriteAdr], (instregex "^LDP[SD]post")>;
+def : InstRW<[M4WriteLB, M4WriteLH, WriteAdr], (instrs LDPQpost)>;
+def : InstRW<[M4WriteLB, M4WriteLH, WriteAdr], (instregex "^LDP[SD]pre")>;
+def : InstRW<[M4WriteLC, M4WriteLH, WriteAdr], (instrs LDPQpre)>;
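In the FDIV and FSQRT writes used above, Latency and ResourceCycles play different roles: the former says when the result becomes available, the latter how long the serialized unit stays busy, which is what bounds throughput. A sketch under assumed names and numbers, not a definition from this patch:

    def SketchWriteFDIV : SchedWriteRes<[M4UnitFDIV]> {
      let Latency = 12;         // a dependent op can start 12 cycles later
      let ResourceCycles = [9]; // unit blocked 9 cycles: ~1 divide per 9 cycles
    }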
+// FP store instructions.
+def : InstRW<[WriteVST],  (instregex "^STUR[BHSDQ]i")>;
+def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BHSDQ](post|pre)")>;
+def : InstRW<[WriteVST],  (instregex "^STR[BHSDQ]ui")>;
+def : InstRW<[M4WriteVSTK, ReadAdrBase], (instregex "^STR[BHSD]roW")>;
+def : InstRW<[M4WriteVSTK, ReadAdrBase], (instrs STRQroW)>;
+def : InstRW<[WriteVST, ReadAdrBase],    (instregex "^STR[BHSD]roX")>;
+def : InstRW<[M4WriteVSTY, ReadAdrBase], (instrs STRQroX)>;
+def : InstRW<[WriteVST],    (instregex "^STN?P[SD]i")>;
+def : InstRW<[M4WriteVSTJ], (instregex "^STN?PQi")>;
+def : InstRW<[WriteVST, WriteAdr],    (instregex "^STP[SD](post|pre)")>;
+def : InstRW<[M4WriteVSTJ, WriteAdr], (instregex "^STPQ(post|pre)")>;
+
+// ASIMD instructions.
+def : InstRW<[M4WriteNHAD1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M4WriteNMSC1], (instregex "^ABSv")>;
+def : InstRW<[M4WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]?ADDL?Pv")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]H(ADD|SUB)v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU](ADD|SUB)[LW]v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^R?(ADD|SUB)HN2?v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]Q(ADD|SUB)v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^(SU|US)QADDv")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M4WriteNMSC1], (instregex "^SQ(ABS|NEG)v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]?ADDL?Vv")>;
+def : InstRW<[M4WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M4WriteNALU1], (instregex "^CMTSTv")>;
+def : InstRW<[M4WriteNALU1], (instregex "^(AND|BIC|EOR|NOT|ORN|ORR)v")>;
+def : InstRW<[M4WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M4WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULM1], (instregex "^ML[AS]v")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULM1], (instregex "^(SQR?D)?MULH?v")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULM1], (instregex "^SQRDML[AS]H")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULM1], (instregex "^(S|U|SQD)ML[AS]L(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULP2], (instregex "^(S|U|SQD)ML[AS]L(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULM1], (instregex "^(S|U|SQD)MULL(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M4WriteNMUL3, M4ReadNMULP2], (instregex "^(S|U|SQD)MULL(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M4WriteNMUL3], (instregex "^[SU]DOT(lane)?v")>;
+def : InstRW<[M4WriteNHAD3], (instregex "^[SU]ADALPv")>;
+def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]R?SRA[dv]")>;
+def : InstRW<[M4WriteNSHT1],  (instregex "^SHL[dv]")>;
+def : InstRW<[M4WriteNSHT1],  (instregex "^S[LR]I[dv]")>;
+def : InstRW<[M4WriteNSHT1],  (instregex "^[SU]SH[LR][dv]")>;
+def : InstRW<[M4WriteNSHT2],  (instregex "^[SU]?SHLLv")>;
+def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]?Q?R?SHRU?N[bhsv]")>;
+def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]RSH[LR][dv]")>;
+def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]QR?SHLU?[bhsdv]")>;
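Note the sign convention in the multiply mappings just above: M4ReadNMULM1 advances the accumulator read by one cycle, while M4ReadNMULP2 retards it by two for the 128-bit forms. The effective operand latency is the producer latency minus the advance:

    // 64-bit  ML[AS]L chain: 3 - (+1) = 2 cycles per accumulate
    // 128-bit ML[AS]L chain: 3 - (-2) = 5 cycles per accumulate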
+// ASIMD FP instructions.
+def : InstRW<[M4WriteNSHF1H], (instregex "^FABSv.f16")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^FABSv.f(32|64)")>;
+def : InstRW<[M4WriteFADD2H], (instregex "^F(ABD|ADD|SUB)v.f16")>;
+def : InstRW<[M4WriteFADD2],  (instregex "^F(ABD|ADD|SUB)v.f(32|64)")>;
+def : InstRW<[M4WriteFADD2H], (instregex "^FADDPv.f16")>;
+def : InstRW<[M4WriteFADD2],  (instregex "^FADDPv.f(32|64)")>;
+def : InstRW<[M4WriteNMSC1],  (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
+def : InstRW<[M4WriteFCVT2],  (instregex "^FCVT(L|N|XN)v")>;
+def : InstRW<[M4WriteFCVT2A], (instregex "^FCVT[AMNPZ][SU]v")>;
+def : InstRW<[M4WriteFCVT2H], (instregex "^[SU]CVTFv.[fi]16")>;
+def : InstRW<[M4WriteFCVT2],  (instregex "^[SU]CVTFv.[fi](32|64)")>;
+def : InstRW<[M4WriteFDIV7H], (instrs FDIVv4f16)>;
+def : InstRW<[M4WriteNEONVH], (instrs FDIVv8f16)>;
+def : InstRW<[M4WriteFDIV7],  (instrs FDIVv2f32)>;
+def : InstRW<[M4WriteNEONV],  (instrs FDIVv4f32)>;
+def : InstRW<[M4WriteNEONW],  (instrs FDIVv2f64)>;
+def : InstRW<[M4WriteNMSC1],  (instregex "^F(MAX|MIN)(NM)?v")>;
+def : InstRW<[M4WriteNMSC2],  (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M4WriteNEONZ],  (instregex "^F(MAX|MIN)(NM)?Vv")>;
+def : InstRW<[M4WriteFMAC2H], (instregex "^FMULX?v.[fi]16")>;
+def : InstRW<[M4WriteFMAC3],  (instregex "^FMULX?v.[fi](32|64)")>;
+def : InstRW<[M4WriteFMAC4H, M4ReadFMACM1], (instregex "^FML[AS]v.[fi]16")>;
+def : InstRW<[M4WriteFMAC4,  M4ReadFMACM1], (instregex "^FML[AS]v.[fi](32|64)")>;
+def : InstRW<[M4WriteNALU1H], (instregex "^FNEGv.f16")>;
+def : InstRW<[M4WriteNALU1],  (instregex "^FNEGv.f(32|64)")>;
+def : InstRW<[M4WriteFCVT3A], (instregex "^FRINT[AIMNPXZ]v")>;
+def : InstRW<[M4WriteFSQR7H], (instrs FSQRTv4f16)>;
+def : InstRW<[M4WriteNEONXH], (instrs FSQRTv8f16)>;
+def : InstRW<[M4WriteFSQR8],  (instrs FSQRTv2f32)>;
+def : InstRW<[M4WriteNEONX],  (instrs FSQRTv4f32)>;
+def : InstRW<[M4WriteNEONY],  (instrs FSQRTv2f64)>;
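Several writes referenced in the surrounding hunks (M4WriteMOVI, M4WriteCOPY, M4WriteVSTY) are SchedWriteVariants, which choose between alternative writes by predicate at scheduling time. A sketch of the shape such a variant takes; the predicate name is assumed rather than read from this patch:

    def SketchWriteMOVI : SchedWriteVariant<[SchedVar<IsZeroFPIdiomPred, [M4WriteZ0]>,
                                             SchedVar<NoSchedPred,       [M4WriteNALU1]>]>;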
+// ASIMD miscellaneous instructions.
+def : InstRW<[M4WriteNALU1],  (instregex "^RBITv")>;
+def : InstRW<[M4WriteNALU1],  (instregex "^(BIF|BIT|BSL|BSP)v")>;
+def : InstRW<[M4WriteNALU1],  (instregex "^CL[STZ]v")>;
+def : InstRW<[M4WriteNEONB],  (instregex "^DUPv.+gpr")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^DUP(i8|i16|i32|i64)$")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^DUPv.+lane")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^EXTv")>;
+def : InstRW<[M4WriteNSHT4A], (instregex "^XTNv")>;
+def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]?QXTU?Nv")>;
+def : InstRW<[M4WriteNEONB],  (instregex "^INSv.+gpr")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^INSv.+lane")>;
+def : InstRW<[M4WriteMOVI],   (instregex "^(MOV|MVN)I")>;
+def : InstRW<[M4WriteNALU1H], (instregex "^FMOVv.f16")>;
+def : InstRW<[M4WriteNALU1],  (instregex "^FMOVv.f(32|64)")>;
+def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev[248]f16")>;
+def : InstRW<[M4WriteFCVT3],  (instregex "^F(RECP|RSQRT)Ev[248]f(32|64)")>;
+def : InstRW<[M4WriteFCVT3],  (instregex "^U(RECP|RSQRT)Ev[24]i32")>;
+def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)Sv.f16")>;
+def : InstRW<[M4WriteFMAC4],  (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M4WriteNSHFA],  (instregex "^TB[LX]v(8|16)i8One")>;
+def : InstRW<[M4WriteNSHFB],  (instregex "^TB[LX]v(8|16)i8Two")>;
+def : InstRW<[M4WriteNSHFC],  (instregex "^TB[LX]v(8|16)i8Three")>;
+def : InstRW<[M4WriteNSHFD],  (instregex "^TB[LX]v(8|16)i8Four")>;
+def : InstRW<[M4WriteNEONP],  (instregex "^[SU]MOVv")>;
+def : InstRW<[M4WriteNSHF1],  (instregex "^(TRN|UZP|ZIP)[12]v")>;
+
+// ASIMD load instructions.
+def : InstRW<[WriteVLD],            (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, M4WriteA1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteVLD],            (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, M4WriteA1], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDA],            (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVLDA, M4WriteA1], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVLDA],            (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDA, M4WriteA1], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDB],            (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVLDB, M4WriteA1], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVLDB],            (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDB, M4WriteA1], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDC],            (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVLDC, M4WriteA1], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVLDC],            (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDC, M4WriteA1], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDD],            (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[M4WriteVLDD, M4WriteA1], (instregex "LD1i(8|16|32|64)_POST$")>;
+
+def : InstRW<[WriteVLD],            (instregex "LD1Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVLD, M4WriteA1], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteVLD],            (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD, M4WriteA1], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDF],            (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[M4WriteVLDF, M4WriteA1], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M4WriteVLDF],            (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDF, M4WriteA1], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDG],            (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[M4WriteVLDG, M4WriteA1], (instregex "LD2i(8|16|32|64)_POST$")>;
+
+def : InstRW<[M4WriteVLDA],            (instregex "LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVLDA, M4WriteA1], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVLDA],            (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDA, M4WriteA1], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDI],            (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[M4WriteVLDI, M4WriteA1], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M4WriteVLDI],            (instregex "LD3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDI, M4WriteA1], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDJ],            (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[M4WriteVLDJ, M4WriteA1], (instregex "LD3i(8|16|32)_POST$")>;
+def : InstRW<[M4WriteVLDL],            (instregex "LD3i64$")>;
+def : InstRW<[M4WriteVLDL, M4WriteA1], (instregex "LD3i64_POST$")>;
+
+def : InstRW<[M4WriteVLDB],            (instregex "LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVLDB, M4WriteA1], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVLDB],            (instregex "LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDB, M4WriteA1], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDN],            (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M4WriteVLDN, M4WriteA1], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M4WriteVLDN],            (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDN, M4WriteA1], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVLDK],            (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[M4WriteVLDK, M4WriteA1], (instregex "LD4i(8|16|32)_POST$")>;
+def : InstRW<[M4WriteVLDM],            (instregex "LD4i64$")>;
+def : InstRW<[M4WriteVLDM, M4WriteA1], (instregex "LD4i64_POST$")>;
+
+def : InstRW<[M4WriteVLDC],            (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVLDC, M4WriteA1], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVLDC],            (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVLDC, M4WriteA1], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
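Throughout the ASIMD load mappings above, each _POST variant appends M4WriteA1 to the write list so the post-increment of the base register is costed as an extra one-cycle ALU micro-op on top of the vector load itself. For instance:

    // ld2 { v0.4s, v1.4s }, [x0], #32
    //   -> M4WriteVLDF for the two-register structure load
    //    + M4WriteA1  for the writeback add to x0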
+// ASIMD store instructions.
+def : InstRW<[WriteVST],            (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVST, M4WriteA1], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteVST],            (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVST, M4WriteA1], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVSTA],            (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVSTA, M4WriteA1], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVSTA],            (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVSTA, M4WriteA1], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVSTB],            (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVSTB, M4WriteA1], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVSTB],            (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVSTB, M4WriteA1], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVSTC],            (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M4WriteVSTC, M4WriteA1], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M4WriteVSTC],            (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVSTC, M4WriteA1], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[WriteVST],            (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[WriteVST, M4WriteA1], (instregex "ST1i(8|16|32|64)_POST$")>;
+
+def : InstRW<[M4WriteVSTD],            (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M4WriteVSTD, M4WriteA1], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M4WriteVSTE],            (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVSTE, M4WriteA1], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVSTD],            (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[M4WriteVSTD, M4WriteA1], (instregex "ST2i(8|16|32|64)_POST$")>;
+
+def : InstRW<[M4WriteVSTF],            (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M4WriteVSTF, M4WriteA1], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M4WriteVSTG],            (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVSTG, M4WriteA1], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVSTE],            (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[M4WriteVSTE, M4WriteA1], (instregex "ST3i(8|16|32|64)_POST$")>;
+
+def : InstRW<[M4WriteVSTL],            (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M4WriteVSTL, M4WriteA1], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M4WriteVSTI],            (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M4WriteVSTI, M4WriteA1], (instregex "ST4Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M4WriteVSTE],            (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[M4WriteVSTE, M4WriteA1], (instregex "ST4i(8|16|32|64)_POST$")>;
+
+// Cryptography instructions.
+def : InstRW<[M4WriteNCRY1],  (instregex "^AES[DE]")>;
+def : InstRW<[M4WriteNCRY1, M4ReadAESM1], (instregex "^AESI?MC")>;
+def : InstRW<[M4WriteNCRY1A], (instregex "^PMULv")>;
+def : InstRW<[M4WriteNCRY1A], (instregex "^PMULLv(1|8)i")>;
+def : InstRW<[M4WriteNCRY3A], (instregex "^PMULLv(2|16)i")>;
+def : InstRW<[M4WriteNCRY1A], (instregex "^SHA1([CHMP]|SU[01])")>;
+def : InstRW<[M4WriteNCRY1A], (instrs SHA256SU0rr)>;
+def : InstRW<[M4WriteNCRY5A], (instrs SHA256SU1rrr)>;
+def : InstRW<[M4WriteNCRY5A], (instrs SHA256H2rrr)>;
+// CRC instructions.
+def : InstRW<[M4WriteE2], (instregex "^CRC32C?[BHWX]rr$")>;
+
+} // SchedModel = ExynosM4Model
diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedExynosM5.td b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM5.td
new file mode 100644
index 00000000..1db5f532
--- /dev/null
+++ b/suite/synctools/tablegen/AArch64/AArch64SchedExynosM5.td
@@ -0,0 +1,1016 @@
+//=- AArch64SchedExynosM5.td - Samsung Exynos M5 Sched Defs --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the Samsung Exynos M5 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The Exynos-M5 is an advanced superscalar microprocessor with a 6-wide
+// in-order stage for decode and dispatch and a wider issue stage.
+// The execution units and loads and stores are out-of-order.
+
+def ExynosM5Model : SchedMachineModel {
+  let IssueWidth            =   6; // Up to 6 uops per cycle.
+  let MicroOpBufferSize     = 228; // ROB size.
+  let LoopMicroOpBufferSize =  60; // Based on the instruction queue size.
+  let LoadLatency           =   4; // Optimistic load cases.
+  let MispredictPenalty     =  15; // Minimum branch misprediction penalty.
+  let CompleteModel         =   1; // Use the default model otherwise.
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                    PAUnsupported.F,
+                                                    SMEUnsupported.F);
+}
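SVEUnsupported.F, PAUnsupported.F and SMEUnsupported.F are predicate lists defined next to the generic AArch64 scheduling declarations; !listconcat splices them into a single list so the model can declare whole extensions out of scope instead of enumerating predicates one by one. The underlying pattern is roughly the following, with the class and member names here assumed for illustration:

    // class SketchUnsupported { list<Predicate> F = [HasSVE, HasSVE2]; }
    // ... UnsupportedFeatures = !listconcat(SketchUnsupported.F, OtherGroup.F);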
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on the Exynos-M5.
+
+let SchedModel = ExynosM5Model in {
+
+def M5UnitA  : ProcResource<2>; // Simple integer
+def M5UnitC  : ProcResource<2>; // Simple and complex integer
+let Super = M5UnitC, BufferSize = 1 in
+def M5UnitD  : ProcResource<1>; // Integer division (inside C0, serialized)
+def M5UnitE  : ProcResource<2>; // Simple 32-bit integer
+let Super = M5UnitC in
+def M5UnitF  : ProcResource<2>; // CRC (inside C)
+def M5UnitB  : ProcResource<1>; // Branch
+def M5UnitL0 : ProcResource<1>; // Load
+def M5UnitS0 : ProcResource<1>; // Store
+def M5PipeLS : ProcResource<1>; // Load/Store
+let Super = M5PipeLS in {
+  def M5UnitL1 : ProcResource<1>;
+  def M5UnitS1 : ProcResource<1>;
+}
+def M5PipeF0 : ProcResource<1>; // FP #0
+let Super = M5PipeF0 in {
+  def M5UnitFMAC0 : ProcResource<1>; // FP multiplication
+  def M5UnitFADD0 : ProcResource<1>; // Simple FP
+  def M5UnitNALU0 : ProcResource<1>; // Simple vector
+  def M5UnitNDOT0 : ProcResource<1>; // Dot product vector
+  def M5UnitNHAD  : ProcResource<1>; // Horizontal vector
+  def M5UnitNMSC  : ProcResource<1>; // FP and vector miscellanea
+  def M5UnitNMUL0 : ProcResource<1>; // Vector multiplication
+  def M5UnitNSHT0 : ProcResource<1>; // Vector shifting
+  def M5UnitNSHF0 : ProcResource<1>; // Vector shuffling
+  def M5UnitNCRY0 : ProcResource<1>; // Cryptographic
+}
+def M5PipeF1 : ProcResource<1>; // FP #1
+let Super = M5PipeF1 in {
+  def M5UnitFMAC1 : ProcResource<1>; // FP multiplication
+  def M5UnitFADD1 : ProcResource<1>; // Simple FP
+  def M5UnitFCVT0 : ProcResource<1>; // FP conversion
+  def M5UnitFDIV0 : ProcResource<2>; // FP division (serialized)
+  def M5UnitFSQR0 : ProcResource<2>; // FP square root (serialized)
+  def M5UnitFST0  : ProcResource<1>; // FP store
+  def M5UnitNALU1 : ProcResource<1>; // Simple vector
+  def M5UnitNDOT1 : ProcResource<1>; // Dot product vector
+  def M5UnitNSHT1 : ProcResource<1>; // Vector shifting
+  def M5UnitNSHF1 : ProcResource<1>; // Vector shuffling
+}
+def M5PipeF2 : ProcResource<1>; // FP #2
+let Super = M5PipeF2 in {
+  def M5UnitFMAC2 : ProcResource<1>; // FP multiplication
+  def M5UnitFADD2 : ProcResource<1>; // Simple FP
+  def M5UnitFCVT1 : ProcResource<1>; // FP conversion
+  def M5UnitFDIV1 : ProcResource<2>; // FP division (serialized)
+  def M5UnitFSQR1 : ProcResource<2>; // FP square root (serialized)
+  def M5UnitFST1  : ProcResource<1>; // FP store
+  def M5UnitNALU2 : ProcResource<1>; // Simple vector
+  def M5UnitNDOT2 : ProcResource<1>; // Dot product vector
+  def M5UnitNMUL1 : ProcResource<1>; // Vector multiplication
+  def M5UnitNSHT2 : ProcResource<1>; // Vector shifting
+  def M5UnitNCRY1 : ProcResource<1>; // Cryptographic
+}
+
+def M5UnitAX   : ProcResGroup<[M5UnitA, M5UnitC]>;
+def M5UnitAW   : ProcResGroup<[M5UnitA, M5UnitC, M5UnitE]>;
+def M5UnitL    : ProcResGroup<[M5UnitL0, M5UnitL1]>;
+def M5UnitS    : ProcResGroup<[M5UnitS0, M5UnitS1]>;
+def M5UnitFMAC : ProcResGroup<[M5UnitFMAC0, M5UnitFMAC1, M5UnitFMAC2]>;
+def M5UnitFADD : ProcResGroup<[M5UnitFADD0, M5UnitFADD1, M5UnitFADD2]>;
+def M5UnitFCVT : ProcResGroup<[M5UnitFCVT0, M5UnitFCVT1]>;
+def M5UnitFDIV : ProcResGroup<[M5UnitFDIV0, M5UnitFDIV1]>;
+def M5UnitFSQR : ProcResGroup<[M5UnitFSQR0, M5UnitFSQR1]>;
+def M5UnitFST  : ProcResGroup<[M5UnitFST0, M5UnitFST1]>;
+def M5UnitNALU : ProcResGroup<[M5UnitNALU0, M5UnitNALU1, M5UnitNALU2]>;
+def M5UnitNDOT : ProcResGroup<[M5UnitNDOT0, M5UnitNDOT1, M5UnitNDOT2]>;
+def M5UnitNMUL : ProcResGroup<[M5UnitNMUL0, M5UnitNMUL1]>;
+def M5UnitNSHT : ProcResGroup<[M5UnitNSHT0, M5UnitNSHT1,
+                               M5UnitNSHT2]>;
+def M5UnitNSHF : ProcResGroup<[M5UnitNSHF0, M5UnitNSHF1]>;
+def M5UnitNCRY : ProcResGroup<[M5UnitNCRY0, M5UnitNCRY1]>;
+
+//===----------------------------------------------------------------------===//
+// Resources details.
+
+def M5WriteZ0  : SchedWriteRes<[]> { let Latency = 0; }
+def M5WriteZ1  : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+def M5WriteZ4  : SchedWriteRes<[]> { let Latency = 4; let NumMicroOps = 0; }
+
+def M5WriteA1W : SchedWriteRes<[M5UnitAW]> { let Latency = 1; }
+def M5WriteA1X : SchedWriteRes<[M5UnitAX]> { let Latency = 1; }
+def M5WriteAAW : SchedWriteRes<[M5UnitAW]> { let Latency = 2; let ResourceCycles = [2]; }
+def M5WriteAAX : SchedWriteRes<[M5UnitAX]> { let Latency = 2; let ResourceCycles = [2]; }
+def M5WriteAB  : SchedWriteRes<[M5UnitAX, M5UnitC, M5UnitE]> { let Latency = 2; let NumMicroOps = 2; }
+def M5WriteAC  : SchedWriteRes<[M5UnitAX, M5UnitAX, M5UnitC]> { let Latency = 3; let NumMicroOps = 3; }
+def M5WriteAD  : SchedWriteRes<[M5UnitAW, M5UnitC]> { let Latency = 2; let NumMicroOps = 2; }
+def M5WriteAFW : SchedWriteRes<[M5UnitAW]> { let Latency = 2; let NumMicroOps = 2; }
+def M5WriteAFX : SchedWriteRes<[M5UnitAX]> { let Latency = 2; let NumMicroOps = 2; }
+def M5WriteAUW : SchedWriteVariant<[SchedVar, SchedVar, SchedVar, SchedVar]>;
+def M5WriteAUX : SchedWriteVariant<[SchedVar, SchedVar, SchedVar, SchedVar]>;
+def M5WriteAVW : SchedWriteVariant<[SchedVar, SchedVar, SchedVar, SchedVar]>;
+def M5WriteAVX : SchedWriteVariant<[SchedVar, SchedVar, SchedVar, SchedVar]>;
+def M5WriteAXW : SchedWriteVariant<[SchedVar, SchedVar, SchedVar]>;
+def M5WriteAXX : SchedWriteVariant<[SchedVar, SchedVar, SchedVar]>;
+def M5WriteAYW : SchedWriteVariant<[SchedVar, SchedVar]>;
+def M5WriteAYX : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+def M5WriteB1  : SchedWriteRes<[M5UnitB]> { let Latency = 1; }
+def M5WriteBX  : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+def M5WriteC1  : SchedWriteRes<[M5UnitC]> { let Latency = 1; }
+def M5WriteC2  : SchedWriteRes<[M5UnitC]> { let Latency = 2; }
+def M5WriteCA  : SchedWriteRes<[M5UnitC]> { let Latency = 3; let ResourceCycles = [2]; }
+
+def M5WriteD10 : SchedWriteRes<[M5UnitD]> { let Latency = 10; let ResourceCycles = [10]; }
+def M5WriteD16 : SchedWriteRes<[M5UnitD]> { let Latency = 16; let ResourceCycles = [16]; }
+
+def M5WriteF2  : SchedWriteRes<[M5UnitF]> { let Latency = 2; }
+
+def M5WriteL4  : SchedWriteRes<[M5UnitL]> { let Latency = 4; }
+def M5WriteL5  : SchedWriteRes<[M5UnitL]> { let Latency = 5; }
+def M5WriteL6  : SchedWriteRes<[M5UnitL]> { let Latency = 6; }
+def M5WriteLA  : SchedWriteRes<[M5UnitL, M5UnitL]> { let Latency = 6; let NumMicroOps = 1; }
+def M5WriteLB  : SchedWriteRes<[M5UnitAX, M5UnitL]> { let Latency = 6; let NumMicroOps = 2; }
+def M5WriteLC  : SchedWriteRes<[M5UnitAX, M5UnitL, M5UnitL]> { let Latency = 6; let NumMicroOps = 2; }
+def M5WriteLD  : SchedWriteRes<[M5UnitAX, M5UnitL]> { let Latency = 4; let NumMicroOps = 2; }
+def M5WriteLE  : SchedWriteRes<[M5UnitAX, M5UnitL]> { let Latency = 7; let NumMicroOps = 2; }
+def M5WriteLFW : SchedWriteRes<[M5UnitAW, M5UnitAW, M5UnitAW, M5UnitAW, M5UnitL]> { let Latency = 15; let NumMicroOps = 6; let ResourceCycles = [1, 1, 1, 1, 15]; }
+def M5WriteLFX : SchedWriteRes<[M5UnitAX, M5UnitAX, M5UnitAX, M5UnitAX, M5UnitL]> { let Latency = 15; let NumMicroOps = 6; let ResourceCycles = [1, 1, 1, 1, 15]; }
+def M5WriteLGW : SchedWriteRes<[M5UnitAW, M5UnitL]> { let Latency = 13; let NumMicroOps = 1; let ResourceCycles = [1, 13]; }
+def M5WriteLGX : SchedWriteRes<[M5UnitAX, M5UnitL]> { let Latency = 13; let NumMicroOps = 1; let ResourceCycles = [1, 13]; }
+def M5WriteLH  : SchedWriteRes<[]> { let Latency = 6; let NumMicroOps = 0; }
+def M5WriteLX  : SchedWriteVariant<[SchedVar, SchedVar]>;
+def M5WriteLY  : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+def M5WriteS1  : SchedWriteRes<[M5UnitS]> { let Latency = 1; }
+def M5WriteSA  : SchedWriteRes<[M5UnitS0]> { let Latency = 4; }
+def M5WriteSB  : SchedWriteRes<[M5UnitAX, M5UnitS]> { let Latency = 2; let NumMicroOps = 1; }
+def M5WriteSX  : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+def M5ReadAdrBase : SchedReadVariant<[SchedVar<
+                                        MCSchedPredicate<
+                                          CheckAny<
+                                            [ScaledIdxFn,
+                                             ExynosScaledIdxFn]>>, [ReadDefault]>,
+                                      SchedVar<NoSchedPred, [ReadDefault]>]>;
+
+def M5WriteNEONB : SchedWriteRes<[M5UnitNALU, M5UnitS0]> { let Latency = 5; let NumMicroOps = 2; }
+def M5WriteNEONH : SchedWriteRes<[M5UnitNALU, M5UnitS0]> { let Latency = 2; let NumMicroOps = 2; }
+def M5WriteNEONI : SchedWriteRes<[M5UnitS0, M5UnitNSHF]> { let Latency = 6; let NumMicroOps = 2; }
+def M5WriteNEONK : SchedWriteRes<[M5UnitNSHF, M5UnitFCVT0, M5UnitS0]> { let Latency = 5; let NumMicroOps = 2; }
+def M5WriteNEONN : SchedWriteRes<[M5UnitNMSC, M5UnitNMSC]> { let Latency = 5; let NumMicroOps = 2; let ResourceCycles = [7, 7]; }
+def M5WriteNEONO : SchedWriteRes<[M5UnitNMSC, M5UnitNMSC, M5UnitNMSC]> { let Latency = 8; let NumMicroOps = 3; let ResourceCycles = [10, 10, 10]; }
+def M5WriteNEONP : SchedWriteRes<[M5UnitNSHF, M5UnitS0, M5UnitFCVT]> { let Latency = 7; let NumMicroOps = 2; }
+def M5WriteNEONQ : SchedWriteRes<[M5UnitNMSC, M5UnitC]> { let Latency = 3; let NumMicroOps = 1; }
+def M5WriteNEONU : SchedWriteRes<[M5UnitFSQR, M5UnitFSQR]> { let Latency = 7;  let ResourceCycles = [4, 4]; }
+def M5WriteNEONV : SchedWriteRes<[M5UnitFDIV, M5UnitFDIV]> { let Latency = 7;  let ResourceCycles = [6, 6]; }
+def M5WriteNEONW : SchedWriteRes<[M5UnitFDIV, M5UnitFDIV]> { let Latency = 12; let ResourceCycles = [9, 9]; }
+def M5WriteNEONX : SchedWriteRes<[M5UnitFSQR, M5UnitFSQR]> { let Latency = 8;  let ResourceCycles = [5, 5]; }
+def M5WriteNEONY : SchedWriteRes<[M5UnitFSQR, M5UnitFSQR]> { let Latency = 12; let ResourceCycles = [9, 9]; }
+def M5WriteNEONZ : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+def M5WriteFADD2  : SchedWriteRes<[M5UnitFADD]> { let Latency = 2; }
+
+def M5WriteFCVT2  : SchedWriteRes<[M5UnitFCVT]> { let Latency = 2; }
+def M5WriteFCVT2A : SchedWriteRes<[M5UnitFCVT0]> { let Latency = 2; }
+def M5WriteFCVT3  : SchedWriteRes<[M5UnitFCVT]> { let Latency = 3; }
+def M5WriteFCVT3A : SchedWriteRes<[M5UnitFCVT0]> { let Latency = 3; }
+def M5WriteFCVTA  : SchedWriteRes<[M5UnitFCVT0, M5UnitS0]> { let Latency = 3; let NumMicroOps = 1; }
+def M5WriteFCVTB  : SchedWriteRes<[M5UnitFCVT, M5UnitS0]> { let Latency = 4; let NumMicroOps = 1; }
+def M5WriteFCVTC  : SchedWriteRes<[M5UnitFCVT, M5UnitS0]> { let Latency = 6; let NumMicroOps = 1; }
+
+def M5WriteFDIV5  : SchedWriteRes<[M5UnitFDIV]> { let Latency = 5;  let ResourceCycles = [2]; }
+def M5WriteFDIV7  : SchedWriteRes<[M5UnitFDIV]> { let Latency = 7;  let ResourceCycles = [4]; }
+def M5WriteFDIV12 : SchedWriteRes<[M5UnitFDIV]> { let Latency = 12; let ResourceCycles = [9]; }
+
+def M5WriteFMAC3  : SchedWriteRes<[M5UnitFMAC]> { let Latency = 3; }
+def M5WriteFMAC4  : SchedWriteRes<[M5UnitFMAC]> { let Latency = 4; }
+def M5WriteFMAC5  : SchedWriteRes<[M5UnitFMAC]> { let Latency = 5; }
+
+def M5WriteFSQR5  : SchedWriteRes<[M5UnitFSQR]> { let Latency = 5;  let ResourceCycles = [2]; }
+def M5WriteFSQR7  : SchedWriteRes<[M5UnitFSQR]> { let Latency = 7;  let ResourceCycles = [4]; }
+def M5WriteFSQR8  : SchedWriteRes<[M5UnitFSQR]> { let Latency = 8;  let ResourceCycles = [5]; }
+def M5WriteFSQR12 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 12; let ResourceCycles = [9]; }
+
+def M5WriteNALU1  : SchedWriteRes<[M5UnitNALU]> { let Latency = 1; }
+def M5WriteNALU2  : SchedWriteRes<[M5UnitNALU]> { let Latency = 2; }
+
+def M5WriteNDOT2  : SchedWriteRes<[M5UnitNDOT]> { let Latency = 2; }
+
+def M5WriteNCRY2  : SchedWriteRes<[M5UnitNCRY]> { let Latency = 2; }
+def M5WriteNCRY1A : SchedWriteRes<[M5UnitNCRY0]> { let Latency = 1; }
+def M5WriteNCRY2A : SchedWriteRes<[M5UnitNCRY0]> { let Latency = 2; }
+def M5WriteNCRY3A : SchedWriteRes<[M5UnitNCRY0]> { let Latency = 3; }
+def M5WriteNCRY5A : SchedWriteRes<[M5UnitNCRY]> { let Latency = 5; }
+
+def M5WriteNHAD1  : SchedWriteRes<[M5UnitNHAD]> { let Latency = 1; }
+def M5WriteNHAD3  : SchedWriteRes<[M5UnitNHAD]> { let Latency = 3; }
+
+def M5WriteNMSC1  : SchedWriteRes<[M5UnitNMSC]> { let Latency = 1; }
+def M5WriteNMSC2  : SchedWriteRes<[M5UnitNMSC]> { let Latency = 2; }
+
+def M5WriteNMUL3  : SchedWriteRes<[M5UnitNMUL]> { let Latency = 3; }
+
+def M5WriteNSHF1  : SchedWriteRes<[M5UnitNSHF]> { let Latency = 1; }
+def M5WriteNSHF2  : SchedWriteRes<[M5UnitNSHF]> { let Latency = 2; }
+def M5WriteNSHFA  : SchedWriteRes<[M5UnitNSHF]> { let Latency = 2; }
+def M5WriteNSHFB  : SchedWriteRes<[M5UnitNSHF]> { let Latency = 4; let NumMicroOps = 2; }
+def M5WriteNSHFC  : SchedWriteRes<[M5UnitNSHF]> { let Latency = 6; let NumMicroOps = 3; }
+def M5WriteNSHFD  : SchedWriteRes<[M5UnitNSHF]> { let Latency = 8; let NumMicroOps = 4; }
+
+def M5WriteNSHT2  : SchedWriteRes<[M5UnitNSHT]> { let Latency = 2; }
+def M5WriteNSHT4A : SchedWriteRes<[M5UnitNSHT1]> { let Latency = 4; }
+
+def M5WriteVLDA   : SchedWriteRes<[M5UnitL, M5UnitL]> { let Latency = 6; let NumMicroOps = 2; }
+def M5WriteVLDB   : SchedWriteRes<[M5UnitL, M5UnitL, M5UnitL]> { let Latency = 7; let NumMicroOps = 3; }
+def M5WriteVLDC   : SchedWriteRes<[M5UnitL, M5UnitL, M5UnitL, M5UnitL]> { let Latency = 7; let NumMicroOps = 4; }
+def M5WriteVLDD   : SchedWriteRes<[M5UnitL, M5UnitNSHF]> { let Latency = 7; let NumMicroOps = 2; let ResourceCycles = [2, 1]; }
+def M5WriteVLDF   : SchedWriteRes<[M5UnitL, M5UnitL]> { let Latency = 11; let NumMicroOps = 2; let ResourceCycles = [6, 5]; }
+def M5WriteVLDG   : SchedWriteRes<[M5UnitL, M5UnitNSHF, M5UnitNSHF]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [2, 1, 1]; }
+def M5WriteVLDI   : SchedWriteRes<[M5UnitL, M5UnitL, M5UnitL]> { let Latency = 13; let NumMicroOps = 3; }
+def M5WriteVLDJ   : SchedWriteRes<[M5UnitL, M5UnitNSHF, M5UnitNSHF, M5UnitNSHF]> { let Latency = 8; let NumMicroOps = 4; }
+def M5WriteVLDK   : SchedWriteRes<[M5UnitL, M5UnitNSHF, M5UnitNSHF, M5UnitNSHF, M5UnitNSHF]> { let Latency = 8; let NumMicroOps = 5; }
+def M5WriteVLDL   : SchedWriteRes<[M5UnitL, M5UnitNSHF, M5UnitNSHF, M5UnitL, M5UnitNSHF]> { let Latency = 8; let NumMicroOps = 5; }
+def M5WriteVLDM   : SchedWriteRes<[M5UnitL, M5UnitNSHF, M5UnitNSHF, M5UnitL, M5UnitNSHF, M5UnitNSHF]> { let Latency = 8; let NumMicroOps = 6; }
+def M5WriteVLDN   : SchedWriteRes<[M5UnitL, M5UnitL, M5UnitL, M5UnitL]> { let Latency = 15;
+                    let NumMicroOps = 4; let ResourceCycles = [2, 2, 2, 2]; }
+
+def M5WriteVST1   : SchedWriteRes<[M5UnitS, M5UnitFST]> { let Latency = 1; let NumMicroOps = 1; }
+def M5WriteVSTA   : SchedWriteRes<[M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 2; let NumMicroOps = 2; }
+def M5WriteVSTB   : SchedWriteRes<[M5UnitS, M5UnitFST, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 3; let NumMicroOps = 3; }
+def M5WriteVSTC   : SchedWriteRes<[M5UnitS, M5UnitFST, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 4; let NumMicroOps = 4; }
+def M5WriteVSTD   : SchedWriteRes<[M5UnitS, M5UnitFST]> { let Latency = 2; }
+def M5WriteVSTE   : SchedWriteRes<[M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 2; let NumMicroOps = 1; }
+def M5WriteVSTF   : SchedWriteRes<[M5UnitNSHF, M5UnitNSHF, M5UnitS, M5UnitFST]> { let Latency = 4; let NumMicroOps = 3; }
+def M5WriteVSTG   : SchedWriteRes<[M5UnitNSHF, M5UnitNSHF, M5UnitNSHF, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 4; let NumMicroOps = 5; }
+def M5WriteVSTH   : SchedWriteRes<[M5UnitS0, M5UnitFST]> { let Latency = 1; let NumMicroOps = 1; }
+def M5WriteVSTI   : SchedWriteRes<[M5UnitNSHF, M5UnitNSHF, M5UnitNSHF, M5UnitNSHF, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 8; let NumMicroOps = 5; let ResourceCycles = [1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1]; }
+def M5WriteVSTJ   : SchedWriteRes<[M5UnitA, M5UnitS0, M5UnitFST]> { let Latency = 1; let NumMicroOps = 1; }
+def M5WriteVSTK   : SchedWriteRes<[M5UnitAX, M5UnitS, M5UnitFST]> { let Latency = 3; let NumMicroOps = 2; }
+def M5WriteVSTL   : SchedWriteRes<[M5UnitNSHF, M5UnitNSHF, M5UnitS, M5UnitFST, M5UnitS, M5UnitFST]> { let Latency = 4; let NumMicroOps = 4; let ResourceCycles = [1, 1, 2, 1, 2, 1]; }
+def M5WriteVSTY   : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+// Special cases.
+def M5WriteCOPY   : SchedWriteVariant<[SchedVar, SchedVar]>;
+def M5WriteMOVI   : SchedWriteVariant<[SchedVar, SchedVar]>;
+
+// Fast forwarding.
+def M5ReadFM1     : SchedReadAdvance<+1, [M5WriteF2]>;
+def M5ReadAESM2   : SchedReadAdvance<+2, [M5WriteNCRY2]>;
+def M5ReadFMACM1  : SchedReadAdvance<+1, [M5WriteFMAC4, M5WriteFMAC5]>;
+def M5ReadNMULM1  : SchedReadAdvance<+1, [M5WriteNMUL3]>;
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model.
+
+// Branch instructions.
+def : SchedAlias;
+def : SchedAlias;
+
+// Arithmetic and logical integer instructions.
+def : SchedAlias;
+def : SchedAlias; // FIXME: M5WriteAX crashes TableGen.
+def : SchedAlias; // FIXME: M5WriteAX crashes TableGen.
+def : SchedAlias;
+
+// Move instructions.
+def : SchedAlias;
+
+// Divide and multiply instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// Miscellaneous instructions.
+def : SchedAlias;
+
+// Addressing modes.
+def : SchedAlias;
+def : SchedAlias;
+
+// Load instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// Store instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// Atomic load and store instructions.
+def : SchedAlias;
+
+// FP data instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+
+// FP miscellaneous instructions.
+def : SchedAlias;
+def : SchedAlias;
+def : SchedAlias;
+// FP load instructions.
+def : SchedAlias;
+
+// FP store instructions.
+def : SchedAlias;
+
+// ASIMD FP instructions.
+def : SchedAlias;
+def : SchedAlias;
+
+// Other miscellaneous instructions.
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Generic fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+// TODO: The forwarding for 32 bits actually saves 2 cycles.
+def : ReadAdvance<ReadIMA,     3, [WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+def : ReadAdvance<ReadST,      0>;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model.
+
+// Branch instructions
+def : InstRW<[M5WriteB1],  (instrs Bcc)>;
+def : InstRW<[M5WriteAFX], (instrs BL)>;
+def : InstRW<[M5WriteBX],  (instrs BLR)>;
+def : InstRW<[M5WriteC1],  (instregex "^CBN?Z[WX]")>;
+def : InstRW<[M5WriteAD],  (instregex "^TBN?ZW")>;
+def : InstRW<[M5WriteAB],  (instregex "^TBN?ZX")>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[M5WriteA1W], (instregex "^(ADC|SBC)S?Wr$")>;
+def : InstRW<[M5WriteA1X], (instregex "^(ADC|SBC)S?Xr$")>;
+def : InstRW<[M5WriteAXW], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)Wrs$")>;
+def : InstRW<[M5WriteAXX], (instregex "^(ADD|AND|BIC|EON|EOR|ORN|SUB)Xrs$")>;
+def : InstRW<[M5WriteAUW], (instrs ORRWrs)>;
+def : InstRW<[M5WriteAUX], (instrs ORRXrs)>;
+def : InstRW<[M5WriteAXW], (instregex "^(ADD|AND|BIC|SUB)SWrs$")>;
+def : InstRW<[M5WriteAXX], (instregex "^(ADD|AND|BIC|SUB)SXrs$")>;
+def : InstRW<[M5WriteAXW], (instregex "^(ADD|SUB)S?Wrx(64)?$")>;
+def : InstRW<[M5WriteAXX], (instregex "^(ADD|SUB)S?Xrx(64)?$")>;
+def : InstRW<[M5WriteAVW], (instrs ADDWri, ORRWri)>;
+def : InstRW<[M5WriteAVX], (instrs ADDXri, ORRXri)>;
+def : InstRW<[M5WriteA1W], (instregex "^CCM[NP]W[ir]$")>;
+def : InstRW<[M5WriteA1X], (instregex "^CCM[NP]X[ir]$")>;
+def : InstRW<[M5WriteA1W], (instrs CSELWr, CSINCWr, CSINVWr, CSNEGWr)>;
+def : InstRW<[M5WriteA1X], (instrs CSELXr, CSINCXr, CSINVXr, CSNEGXr)>;
+
+// Move instructions.
+def : InstRW<[M5WriteCOPY], (instrs COPY)>;
+def : InstRW<[M5WriteZ0],   (instrs ADR, ADRP)>;
+def : InstRW<[M5WriteZ0],   (instregex "^MOV[NZ][WX]i$")>;
+
+// Shift instructions.
+def : InstRW<[M5WriteA1W], (instrs ASRVWr, LSLVWr, LSRVWr, RORVWr)>;
+def : InstRW<[M5WriteA1X], (instrs ASRVXr, LSLVXr, LSRVXr, RORVXr)>;
+
+// Miscellaneous instructions.
+def : InstRW<[M5WriteAYW], (instrs EXTRWrri)>;
+def : InstRW<[M5WriteAYX], (instrs EXTRXrri)>;
+def : InstRW<[M5WriteA1W], (instrs BFMWri, SBFMWri, UBFMWri)>;
+def : InstRW<[M5WriteA1X], (instrs BFMXri, SBFMXri, UBFMXri)>;
+def : InstRW<[M5WriteA1W], (instrs CLSWr, CLZWr)>;
+def : InstRW<[M5WriteA1X], (instrs CLSXr, CLZXr)>;
+def : InstRW<[M5WriteA1W], (instrs RBITWr, REVWr, REV16Wr)>;
+def : InstRW<[M5WriteA1X], (instrs RBITXr, REVXr, REV16Xr, REV32Xr)>;
+
+// Load instructions.
+def : InstRW<[M5WriteLD, WriteLDHi, WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>;
+def : InstRW<[M5WriteL5, ReadAdrBase],  (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>;
+def : InstRW<[WriteLDIdx, ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>;
+def : InstRW<[M5WriteL5, ReadAdrBase],  (instrs PRFMroW)>;
+def : InstRW<[WriteLDIdx, ReadAdrBase], (instrs PRFMroX)>;
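In the load mappings above, register-offset forms with a 32-bit index (roW) are pinned to M5WriteL5, while the 64-bit forms (roX) keep the generic WriteLDIdx and let ReadAdrBase decide; M5ReadAdrBase, defined earlier in this file, selects an arm by an MCSchedPredicate on whether the index is scaled. Illustratively:

    // ldr x0, [x1, x2, lsl #3]  ; scaled index -> the ReadDefault arm of the variant
    // ldr x0, [x1, x2]          ; unscaled     -> the fallback arm applies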
+// Store instructions.
+def : InstRW<[M5WriteSB, ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>;
+def : InstRW<[WriteST, ReadAdrBase],   (instregex "^STR(BB|HH|W|X)roX")>;
+
+// Atomic load and store instructions.
+def : InstRW<[M5WriteLGW], (instregex "^CAS(A|AL|L)?[BHW]$")>;
+def : InstRW<[M5WriteLGX], (instregex "^CAS(A|AL|L)?X$")>;
+def : InstRW<[M5WriteLFW], (instregex "^CASP(A|AL|L)?W$")>;
+def : InstRW<[M5WriteLFX], (instregex "^CASP(A|AL|L)?X$")>;
+def : InstRW<[M5WriteLGW], (instregex "^LD(ADD|CLR|EOR|SET|[SU]MAX|[SU]MIN)(A|AL|L)?[BHW]$")>;
+def : InstRW<[M5WriteLGX], (instregex "^LD(ADD|CLR|EOR|SET|[SU]MAX|[SU]MIN)(A|AL|L)?X$")>;
+def : InstRW<[M5WriteLGW], (instregex "^SWP(A|AL|L)?[BHW]$")>;
+def : InstRW<[M5WriteLGX], (instregex "^SWP(A|AL|L)?X$")>;
+
+// FP data instructions.
+def : InstRW<[M5WriteNSHF1],  (instrs FABSHr, FABSSr, FABSDr)>;
+def : InstRW<[M5WriteFADD2],  (instregex "^F(ADD|SUB)[HSD]rr")>;
+def : InstRW<[M5WriteFADD2],  (instregex "^FADDPv.i(16|32|64)")>;
+def : InstRW<[M5WriteNEONQ],  (instregex "^FCCMPE?[HSD]rr")>;
+def : InstRW<[M5WriteNMSC2],  (instregex "^FCMPE?[HSD]r[ir]")>;
+def : InstRW<[M5WriteNMSC1],  (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(16|32|64|v1)")>;
+def : InstRW<[M5WriteFDIV5],  (instrs FDIVHrr)>;
+def : InstRW<[M5WriteFDIV7],  (instrs FDIVSrr)>;
+def : InstRW<[M5WriteFDIV12], (instrs FDIVDrr)>;
+def : InstRW<[M5WriteNMSC1],  (instregex "^F(MAX|MIN)(NM)?[HSD]rr")>;
+def : InstRW<[M5WriteFMAC3],  (instregex "^FN?MUL[HSD]rr")>;
+def : InstRW<[M5WriteFMAC3],  (instrs FMULX16, FMULX32, FMULX64)>;
+def : InstRW<[M5WriteFMAC4, M5ReadFMACM1], (instregex "^FN?M(ADD|SUB)[HSD]rrr")>;
+def : InstRW<[M5WriteNALU2],  (instrs FNEGHr, FNEGSr, FNEGDr)>;
+def : InstRW<[M5WriteFCVT3A], (instregex "^FRINT.+r")>;
+def : InstRW<[M5WriteNEONH],  (instregex "^FCSEL[HSD]rrr")>;
+def : InstRW<[M5WriteFSQR5],  (instrs FSQRTHr)>;
+def : InstRW<[M5WriteFSQR8],  (instrs FSQRTSr)>;
+def : InstRW<[M5WriteFSQR12], (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+def : InstRW<[M5WriteFCVT2],  (instregex "^FCVT[HSD][HSD]r")>;
+def : InstRW<[M5WriteFCVTC],  (instregex "^[SU]CVTF[SU][XW][HSD]ri")>;
+def : InstRW<[M5WriteFCVTB],  (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>;
+def : InstRW<[M5WriteNALU1],  (instregex "^FMOV[HSD]i")>;
+def : InstRW<[M5WriteNALU2],  (instregex "^FMOV[HSD]r")>;
+def : InstRW<[M5WriteSA],     (instregex "^FMOV[WX][HSD]r")>;
+def : InstRW<[M5WriteFCVTA],  (instregex "^FMOV[HSD][WX]r")>;
+def : InstRW<[M5WriteNEONI],  (instregex "^FMOVXDHighr")>;
+def : InstRW<[M5WriteNEONK],  (instregex "^FMOVDXHighr")>;
+def : InstRW<[M5WriteFCVT3],  (instregex "^F(RECP|RSQRT)Ev1(f16|i32|i64)")>;
+def : InstRW<[M5WriteNMSC1],  (instregex "^FRECPXv1")>;
+def : InstRW<[M5WriteFMAC4],  (instregex "^F(RECP|RSQRT)S(16|32|64)")>;
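The atomic mappings above make the cost of far atomics explicit: a word-sized CAS resolves to M5WriteLGW, whose ResourceCycles of [1, 13] keep a load port busy for 13 cycles. Back-to-back atomics, even to independent addresses, therefore sustain only about one operation per 13 cycles on that port:

    // throughput bound ~= busiest resource's cycles = 13 per CAS/SWP/LDADD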
+// FP load instructions.
+def : InstRW<[WriteVLD],  (instregex "^LDR[SDQ]l")>;
+def : InstRW<[WriteVLD],  (instregex "^LDUR[BHSDQ]i")>;
+def : InstRW<[WriteVLD, WriteAdr], (instregex "^LDR[BHSDQ](post|pre)")>;
+def : InstRW<[WriteVLD],  (instregex "^LDR[BHSDQ]ui")>;
+def : InstRW<[M5WriteLE, ReadAdrBase], (instregex "^LDR[BHSDQ]roW")>;
+def : InstRW<[WriteVLD, ReadAdrBase],  (instregex "^LDR[BHSD]roX")>;
+def : InstRW<[M5WriteLY, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[WriteVLD, M5WriteLH],    (instregex "^LDN?P[SD]i")>;
+def : InstRW<[M5WriteLA, M5WriteLH],   (instregex "^LDN?PQi")>;
+def : InstRW<[M5WriteLB, M5WriteLH, WriteAdr], (instregex "^LDP[SD](post|pre)")>;
+def : InstRW<[M5WriteLC, M5WriteLH, WriteAdr], (instregex "^LDPQ(post|pre)")>;
+
+// FP store instructions.
+def : InstRW<[WriteVST],  (instregex "^STUR[BHSDQ]i")>;
+def : InstRW<[WriteVST, WriteAdr], (instregex "^STR[BHSDQ](post|pre)")>;
+def : InstRW<[WriteVST],  (instregex "^STR[BHSDQ]ui")>;
+def : InstRW<[WriteVST, ReadAdrBase],    (instregex "^STR[BHSD]ro[WX]")>;
+def : InstRW<[M5WriteVSTK, ReadAdrBase], (instregex "^STRQroW")>;
+def : InstRW<[M5WriteVSTY, ReadAdrBase], (instregex "^STRQroX")>;
+def : InstRW<[WriteVST],    (instregex "^STN?P[SD]i")>;
+def : InstRW<[M5WriteVSTH], (instregex "^STN?PQi")>;
+def : InstRW<[WriteVST, WriteAdr],    (instregex "^STP[SD](post|pre)")>;
+def : InstRW<[M5WriteVSTJ, WriteAdr], (instregex "^STPQ(post|pre)")>;
+
+// ASIMD instructions.
+def : InstRW<[M5WriteNHAD1], (instregex "^[SU]ABDL?v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^ABSv")>;
+def : InstRW<[M5WriteNALU2], (instregex "^(ADD|NEG|SUB)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]?ADDL?Pv")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]H(ADD|SUB)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU](ADD|SUB)[LW]v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^R?(ADD|SUB)HN2?v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]Q(ADD|SUB)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^(SU|US)QADDv")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]RHADDv")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^SQ(ABS|NEG)v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]?ADDL?Vv")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
+def : InstRW<[M5WriteNALU2], (instregex "^CMTSTv")>;
+def : InstRW<[M5WriteNALU2], (instregex "^(AND|BIC|EOR|NOT|ORN|ORR)v")>;
+def : InstRW<[M5WriteNMSC1], (instregex "^[SU](MIN|MAX)v")>;
+def : InstRW<[M5WriteNMSC2], (instregex "^[SU](MIN|MAX)Pv")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU](MIN|MAX)Vv")>;
+def : InstRW<[M5WriteNMUL3], (instregex "^(SQR?D)?MULH?v")>;
+def : InstRW<[M5WriteNMUL3, M5ReadNMULM1], (instregex "^ML[AS]v")>;
+def : InstRW<[M5WriteNMUL3, M5ReadNMULM1], (instregex "^SQRDML[AS]H")>;
+def : InstRW<[M5WriteNMUL3], (instregex "^(S|U|SQD)ML[AS]L(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M5WriteNMUL3, M5ReadNMULM1], (instregex "^(S|U|SQD)ML[AS]L(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M5WriteNMUL3, M5ReadNMULM1], (instregex "^(S|U|SQD)MULL(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M5WriteNMUL3, M5ReadNMULM1], (instregex "^(S|U|SQD)MULL(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M5WriteNDOT2], (instregex "^[SU]DOT(lane)?v")>;
+def : InstRW<[M5WriteNHAD3], (instregex "^[SU]ADALPv")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]R?SRA[dv]")>;
+def : InstRW<[M5WriteNSHT2],  (instregex "^SHL[dv]")>;
+def : InstRW<[M5WriteNSHT2],  (instregex "^S[LR]I[dv]")>;
+def : InstRW<[M5WriteNSHT2],  (instregex "^[SU]SH[LR][dv]")>;
+def : InstRW<[M5WriteNSHT2],  (instregex "^[SU]?SHLLv")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]?Q?R?SHRU?N[bhsv]")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]RSH[LR][dv]")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]QR?SHLU?[bhsdv]")>;
+
+// ASIMD FP instructions.
+def : InstRW<[M5WriteNSHF2],  (instregex "^FABSv.f(16|32|64)")>;
+def : InstRW<[M5WriteFADD2],  (instregex "^F(ABD|ADD|SUB)v.f(16|32|64)")>;
+def : InstRW<[M5WriteFADD2],  (instregex "^FADDPv.f(16|32|64)")>;
+def : InstRW<[M5WriteNMSC1],  (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
+def : InstRW<[M5WriteFCVT2],  (instregex "^FCVT(L|N|XN)v")>;
+def : InstRW<[M5WriteFCVT2A], (instregex "^FCVT[AMNPZ][SU]v")>;
+def : InstRW<[M5WriteFCVT2],  (instregex "^[SU]CVTFv.[fi](16|32|64)")>;
+def : InstRW<[M5WriteFDIV7],  (instrs FDIVv4f16)>;
+def : InstRW<[M5WriteNEONV],  (instrs FDIVv8f16)>;
+def : InstRW<[M5WriteFDIV7],  (instrs FDIVv2f32)>;
+def : InstRW<[M5WriteNEONV],  (instrs FDIVv4f32)>;
+def : InstRW<[M5WriteNEONW],  (instrs FDIVv2f64)>;
+def : InstRW<[M5WriteNMSC1],  (instregex "^F(MAX|MIN)(NM)?v")>;
+def : InstRW<[M5WriteNMSC2],  (instregex "^F(MAX|MIN)(NM)?Pv")>;
+def : InstRW<[M5WriteNEONZ],  (instregex "^F(MAX|MIN)(NM)?Vv")>;
+def : InstRW<[M5WriteFMAC3],  (instregex "^FMULX?v.[fi](16|32|64)")>;
+def : InstRW<[M5WriteFMAC4, M5ReadFMACM1], (instregex "^FML[AS]v.[fi](16|32|64)")>;
+def : InstRW<[M5WriteNALU2],  (instregex "^FNEGv.f(16|32|64)")>;
+def : InstRW<[M5WriteFCVT3A], (instregex "^FRINT[AIMNPXZ]v")>;
+def : InstRW<[M5WriteFSQR7],  (instrs FSQRTv4f16)>;
+def : InstRW<[M5WriteNEONU],  (instrs FSQRTv8f16)>;
+def : InstRW<[M5WriteFSQR8],  (instrs FSQRTv2f32)>;
+def : InstRW<[M5WriteNEONX],  (instrs FSQRTv4f32)>;
+def : InstRW<[M5WriteNEONY],  (instrs FSQRTv2f64)>;
+
+// ASIMD miscellaneous instructions.
+def : InstRW<[M5WriteNALU2],  (instregex "^RBITv")>;
+def : InstRW<[M5WriteNALU2],  (instregex "^(BIF|BIT|BSL|BSP)v")>;
+def : InstRW<[M5WriteNALU2],  (instregex "^CL[STZ]v")>;
+def : InstRW<[M5WriteNEONB],  (instregex "^DUPv.+gpr")>;
+def : InstRW<[M5WriteNSHF2],  (instregex "^DUP(i8|i16|i32|i64)$")>;
+def : InstRW<[M5WriteNSHF2],  (instregex "^DUPv.+lane")>;
+def : InstRW<[M5WriteNSHF2],  (instregex "^EXTv")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^XTNv")>;
+def : InstRW<[M5WriteNSHT4A], (instregex "^[SU]?QXTU?Nv")>;
+def : InstRW<[M5WriteNEONB],  (instregex "^INSv.+gpr")>;
+def : InstRW<[M5WriteNSHF2],  (instregex "^INSv.+lane")>;
+def : InstRW<[M5WriteMOVI],   (instregex "^(MOV|MVN)I")>;
+def : InstRW<[M5WriteNALU1],  (instregex "^FMOVv.f(16|32|64)")>;
+def : InstRW<[M5WriteFCVT3],  (instregex "^F(RECP|RSQRT)Ev[248]f(16|32|64)")>;
+def : InstRW<[M5WriteFCVT3],  (instregex "^U(RECP|RSQRT)Ev[24]i32")>;
+def : InstRW<[M5WriteFMAC4],  (instregex "^F(RECP|RSQRT)Sv.f(16|32|64)")>;
+def : InstRW<[M5WriteNSHF2],  (instregex "^REV(16|32|64)v")>;
+def : InstRW<[M5WriteNSHFA],  (instregex "^TB[LX]v(8|16)i8One")>;
+def : InstRW<[M5WriteNSHFB],  (instregex "^TB[LX]v(8|16)i8Two")>;
+def : InstRW<[M5WriteNSHFC],  (instregex "^TB[LX]v(8|16)i8Three")>;
+def : InstRW<[M5WriteNSHFD],  (instregex "^TB[LX]v(8|16)i8Four")>;
+def : InstRW<[M5WriteNEONP],  (instregex "^[SU]MOVv")>;
+def : InstRW<[M5WriteNSHF2],  (instregex "^(TRN|UZP|ZIP)[12]v")>;
+// ASIMD load instructions.
+def : InstRW<[WriteVLD], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[WriteVLD, M5WriteA1X, WriteAdr], (instregex "LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDA], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVLDA, M5WriteA1X, WriteAdr], (instregex "LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDB], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVLDB, M5WriteA1X, WriteAdr], (instregex "LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDC], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVLDC, M5WriteA1X, WriteAdr], (instregex "LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDD], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[M5WriteVLDD, M5WriteA1X, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[WriteVLD, M5WriteA1X, WriteAdr], (instregex "LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDF], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|2d)$")>;
+def : InstRW<[M5WriteVLDF, M5WriteA1X, WriteAdr], (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|2d)_POST$")>;
+def : InstRW<[M5WriteVLDG], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[M5WriteVLDG, M5WriteA1X, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[M5WriteVLDA], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVLDA, M5WriteA1X, WriteAdr], (instregex "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDI], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|2d)$")>;
+def : InstRW<[M5WriteVLDI, M5WriteA1X, WriteAdr], (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|2d)_POST$")>;
+def : InstRW<[M5WriteVLDJ], (instregex "LD3i(8|16|32)$")>;
+def : InstRW<[M5WriteVLDJ, M5WriteA1X, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>;
+def : InstRW<[M5WriteVLDL], (instregex "LD3i64$")>;
+def : InstRW<[M5WriteVLDL, M5WriteA1X, WriteAdr], (instregex "LD3i64_POST$")>;
+def : InstRW<[M5WriteVLDB], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVLDB, M5WriteA1X], (instregex "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVLDN], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|2d)$")>;
+def : InstRW<[M5WriteVLDN, M5WriteA1X, WriteAdr], (instregex "LD4Fourv(8b|16b|4h|8h|2s|4s|2d)_POST$")>;
+def : InstRW<[M5WriteVLDK], (instregex "LD4i(8|16|32)$")>;
+def : InstRW<[M5WriteVLDK, M5WriteA1X, WriteAdr], (instregex "LD4i(8|16|32)_POST$")>;
+def : InstRW<[M5WriteVLDM], (instregex "LD4i64$")>;
+def : InstRW<[M5WriteVLDM, M5WriteA1X, WriteAdr], (instregex "LD4i64_POST$")>;
+def : InstRW<[M5WriteVLDC], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVLDC, M5WriteA1X, WriteAdr], (instregex "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
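The single-lane structure loads above combine load and shuffle resources: LD3i maps to M5WriteVLDJ, one M5UnitL micro-op for the memory access plus three M5UnitNSHF micro-ops that steer the loaded elements into their destination lanes. For example:

    // ld3 { v0.s, v1.s, v2.s }[1], [x0]
    //   -> M5WriteVLDJ: 1 x M5UnitL + 3 x M5UnitNSHF, latency 8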
+// ASIMD store instructions.
+def : InstRW<[WriteVST], (instregex "ST1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[WriteVST, M5WriteA1X, WriteAdr], (instregex "ST1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVSTA], (instregex "ST1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVSTA, M5WriteA1X, WriteAdr], (instregex "ST1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+
+def : InstRW<[M5WriteVSTB], (instregex "ST1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVSTB, M5WriteA1X, WriteAdr], (instregex "ST1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[M5WriteVSTC], (instregex "ST1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
+def : InstRW<[M5WriteVSTC, M5WriteA1X, WriteAdr], (instregex "ST1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
+def : InstRW<[WriteVST], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[WriteVST, M5WriteA1X, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[M5WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M5WriteVSTD, M5WriteA1X, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M5WriteVSTE], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M5WriteVSTE, M5WriteA1X, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M5WriteVSTD], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[M5WriteVSTD, M5WriteA1X, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[M5WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M5WriteVSTF, M5WriteA1X, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M5WriteVSTG], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M5WriteVSTG, M5WriteA1X, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M5WriteVSTA], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[M5WriteVSTA, M5WriteA1X, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[M5WriteVSTL], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M5WriteVSTL, M5WriteA1X, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M5WriteVSTI], (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M5WriteVSTI, M5WriteA1X, WriteAdr], (instregex "ST4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M5WriteVSTA], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[M5WriteVSTA, M5WriteA1X, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+
+// Cryptography instructions.
+def : InstRW<[M5WriteNCRY2],  (instregex "^AES[DE]")>;
+def : InstRW<[M5WriteNCRY2, M5ReadAESM2], (instregex "^AESI?MC")>;
+def : InstRW<[M5WriteNCRY2A], (instregex "^PMULv")>;
+def : InstRW<[M5WriteNCRY1A], (instregex "^PMULLv(1|8)i")>;
+def : InstRW<[M5WriteNCRY3A], (instregex "^PMULLv(2|16)i")>;
+def : InstRW<[M5WriteNCRY2A], (instregex "^SHA1(H|SU[01])")>;
+def : InstRW<[M5WriteNCRY5A], (instregex "^SHA1[CMP]")>;
+def : InstRW<[M5WriteNCRY2A], (instrs SHA256SU0rr)>;
+def : InstRW<[M5WriteNCRY5A], (instrs SHA256SU1rrr)>;
+def : InstRW<[M5WriteNCRY5A], (instregex "^SHA256H2?")>;
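The AES mappings above encode producer-consumer fusion: AESE/AESD complete in 2 cycles through M5WriteNCRY2, and M5ReadAESM2 lets a following AESMC/AESIMC read that result 2 cycles early. The effective latency between the pair is therefore 2 - 2 = 0, so a round chain behaves as if the two instructions were fused:

    // aese  v0.16b, v1.16b   ; M5WriteNCRY2, latency 2
    // aesmc v0.16b, v0.16b   ; reads v0 two cycles early: net 0-cycle chain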
+def : InstRW<[M5WriteF2, + M5ReadFM1], (instregex "^CRC32C?[BHWX]")>; + +} // SchedModel = ExynosM5Model diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedFalkor.td b/suite/synctools/tablegen/AArch64/AArch64SchedFalkor.td index 84825458..7c9b0afd 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedFalkor.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedFalkor.td @@ -1,9 +1,8 @@ //==- AArch64SchedFalkor.td - Falkor Scheduling Definitions -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -24,8 +23,9 @@ def FalkorModel : SchedMachineModel { let MispredictPenalty = 11; // Minimum branch misprediction penalty. let CompleteModel = 1; - list UnsupportedFeatures = [HasSVE]; - + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F, + SMEUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } @@ -92,7 +92,8 @@ def : WriteRes { let Unsupported = 1; } def : WriteRes { let Unsupported = 1; } def : WriteRes { let Unsupported = 1; } def : WriteRes { let Unsupported = 1; } -def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } def : WriteRes { let Unsupported = 1; } def : WriteRes { let Unsupported = 1; } def : WriteRes { let Unsupported = 1; } @@ -111,6 +112,7 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; // Detailed Refinements // ----------------------------------------------------------------------------- diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedFalkorDetails.td b/suite/synctools/tablegen/AArch64/AArch64SchedFalkorDetails.td index ff14e639..a3a038f8 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedFalkorDetails.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedFalkorDetails.td @@ -1,9 +1,8 @@ //==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -909,10 +908,10 @@ def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>; // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -936,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedKryo.td b/suite/synctools/tablegen/AArch64/AArch64SchedKryo.td index 68de3e07..cc568a2f 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedKryo.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedKryo.td @@ -1,9 +1,8 @@ //==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -28,8 +27,9 @@ def KryoModel : SchedMachineModel { let LoopMicroOpBufferSize = 16; let CompleteModel = 1; - list UnsupportedFeatures = [HasSVE]; - + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F, + SMEUnsupported.F); // FIXME: Remove when all errors have been fixed. 
let FullInstRWOverlapCheck = 0; } @@ -95,7 +95,8 @@ def : WriteRes { let Latency = 6; let NumMicroOps = 2; } def : WriteRes { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1 -def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } def : WriteRes { let Latency = 4; } def : WriteRes { let Latency = 4; } @@ -117,6 +118,7 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; //===----------------------------------------------------------------------===// diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedKryoDetails.td b/suite/synctools/tablegen/AArch64/AArch64SchedKryoDetails.td index cf4cdabb..bc5ad0f8 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedKryoDetails.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedKryoDetails.td @@ -1,9 +1,8 @@ //=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -463,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln : let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedPredAmpere.td b/suite/synctools/tablegen/AArch64/AArch64SchedPredAmpere.td new file mode 100644 index 00000000..8552c07b --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ +//===- AArch64SchedPredAmpere.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a LSL shift <= 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedPredExynos.td b/suite/synctools/tablegen/AArch64/AArch64SchedPredExynos.td new file mode 100644 index 00000000..fcda2394 --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SchedPredExynos.td @@ -0,0 +1,157 @@ +//===- AArch64SchedPredExynos.td - AArch64 Sched Preds -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 Exynos processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Auxiliary predicates.
+
+// Check the shift in arithmetic and logic instructions.
+def ExynosCheckShift : CheckAny<[CheckShiftBy0,
+                                 CheckAll<
+                                   [CheckShiftLSL,
+                                    CheckAny<
+                                      [CheckShiftBy1,
+                                       CheckShiftBy2,
+                                       CheckShiftBy3]>]>]>;
+
+// Exynos predicates.
+
+// Identify BLR specifying the LR register as the indirect target register.
+def ExynosBranchLinkLRPred : MCSchedPredicate<
+                               CheckAll<[CheckOpcode<[BLR]>,
+                                         CheckRegOperand<0, LR>]>>;
+
+// Identify arithmetic instructions without or with limited extension or shift.
+def ExynosArithFn   : TIIPredicate<
+                        "isExynosArithFast",
+                        MCOpcodeSwitchStatement<
+                          [MCOpcodeSwitchCase<
+                             IsArithExtOp.ValidOpcodes,
+                             MCReturnStatement<
+                               CheckAny<[CheckExtBy0,
+                                         CheckAll<
+                                           [CheckAny<
+                                              [CheckExtUXTW,
+                                               CheckExtUXTX]>,
+                                            CheckAny<
+                                              [CheckExtBy1,
+                                               CheckExtBy2,
+                                               CheckExtBy3]>]>]>>>,
+                           MCOpcodeSwitchCase<
+                             IsArithShiftOp.ValidOpcodes,
+                             MCReturnStatement<ExynosCheckShift>>,
+                           MCOpcodeSwitchCase<
+                             IsArithUnshiftOp.ValidOpcodes,
+                             MCReturnStatement<TruePred>>,
+                           MCOpcodeSwitchCase<
+                             IsArithImmOp.ValidOpcodes,
+                             MCReturnStatement<TruePred>>],
+                          MCReturnStatement<FalsePred>>>;
+def ExynosArithPred : MCSchedPredicate<ExynosArithFn>;
+
+// Identify logic instructions with limited shift.
+def ExynosLogicFn   : TIIPredicate<
+                        "isExynosLogicFast",
+                        MCOpcodeSwitchStatement<
+                          [MCOpcodeSwitchCase<
+                             IsLogicShiftOp.ValidOpcodes,
+                             MCReturnStatement<ExynosCheckShift>>,
+                           MCOpcodeSwitchCase<
+                             IsLogicUnshiftOp.ValidOpcodes,
+                             MCReturnStatement<TruePred>>,
+                           MCOpcodeSwitchCase<
+                             IsLogicImmOp.ValidOpcodes,
+                             MCReturnStatement<TruePred>>],
+                          MCReturnStatement<FalsePred>>>;
+def ExynosLogicPred : MCSchedPredicate<ExynosLogicFn>;
+
+// Identify more logic instructions with limited shift.
+def ExynosLogicExFn   : TIIPredicate<
+                          "isExynosLogicExFast",
+                          MCOpcodeSwitchStatement<
+                            [MCOpcodeSwitchCase<
+                               IsLogicShiftOp.ValidOpcodes,
+                               MCReturnStatement<
+                                 CheckAny<
+                                   [ExynosCheckShift,
+                                    CheckAll<
+                                      [CheckShiftLSL,
+                                       CheckShiftBy8]>]>>>,
+                             MCOpcodeSwitchCase<
+                               IsLogicUnshiftOp.ValidOpcodes,
+                               MCReturnStatement<TruePred>>,
+                             MCOpcodeSwitchCase<
+                               IsLogicImmOp.ValidOpcodes,
+                               MCReturnStatement<TruePred>>],
+                            MCReturnStatement<FalsePred>>>;
+def ExynosLogicExPred : MCSchedPredicate<ExynosLogicExFn>;
+
+// Identify a load or store using the register offset addressing mode
+// with a scaled non-extended register.
+def ExynosScaledIdxFn   : TIIPredicate<"isExynosScaledAddr",
+                            MCOpcodeSwitchStatement<
+                              [MCOpcodeSwitchCase<
+                                 IsLoadStoreRegOffsetOp.ValidOpcodes,
+                                 MCReturnStatement<
+                                   CheckAny<
+                                     [CheckMemExtSXTW,
+                                      CheckMemExtUXTW,
+                                      CheckMemScaled]>>>],
+                              MCReturnStatement<FalsePred>>>;
+def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>;
+
+// Identify FP instructions.
+def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckHForm, CheckSForm,
+                                              CheckDForm, CheckQForm]>>;
+
+// Identify 128-bit NEON instructions.
+def ExynosQFormPred : MCSchedPredicate<CheckQForm>;
+
+// Identify instructions that reset a register efficiently.
+def ExynosResetFn   : TIIPredicate<
+                        "isExynosResetFast",
+                        MCOpcodeSwitchStatement<
+                          [MCOpcodeSwitchCase<
+                             [ADR, ADRP,
+                              MOVNWi, MOVNXi,
+                              MOVZWi, MOVZXi],
+                             MCReturnStatement<TruePred>>,
+                           MCOpcodeSwitchCase<
+                             [ORRWri, ORRXri],
+                             MCReturnStatement<
+                               CheckAll<
+                                 [CheckIsRegOperand<1>,
+                                  CheckAny<
+                                    [CheckRegOperand<1, WZR>,
+                                     CheckRegOperand<1, XZR>]>]>>>],
+                          MCReturnStatement<
+                            CheckAny<
+                              [IsCopyIdiomFn,
+                               IsZeroFPIdiomFn]>>>>;
+def ExynosResetPred : MCSchedPredicate<ExynosResetFn>;
+
+// Identify EXTR as the alias for ROR (immediate).
+def ExynosRotateRightImmPred : MCSchedPredicate<
+                                 CheckAll<[CheckOpcode<[EXTRWrri, EXTRXrri]>,
+                                           CheckSameRegOperand<1, 2>]>>;
+
+// Identify cheap arithmetic and logic immediate instructions.
+def ExynosCheapFn : TIIPredicate<
+                      "isExynosCheapAsMove",
+                      MCOpcodeSwitchStatement<
+                        [MCOpcodeSwitchCase<
+                           IsArithLogicImmOp.ValidOpcodes,
+                           MCReturnStatement<TruePred>>],
+                        MCReturnStatement<
+                          CheckAny<
+                            [ExynosArithFn, ExynosResetFn, ExynosLogicFn]>>>>;
diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedPredicates.td b/suite/synctools/tablegen/AArch64/AArch64SchedPredicates.td
new file mode 100644
index 00000000..19a3780c
--- /dev/null
+++ b/suite/synctools/tablegen/AArch64/AArch64SchedPredicates.td
@@ -0,0 +1,441 @@
+//===- AArch64SchedPredicates.td - AArch64 Sched Preds -----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines scheduling predicate definitions that are used by the
+// AArch64 subtargets.
+//
+//===----------------------------------------------------------------------===//
+
+// Function mappers.
+
+// Check the extension type in arithmetic instructions.
+let FunctionMapper = "AArch64_AM::getArithExtendType" in {
+  def CheckExtUXTB : CheckImmOperand_s<3, "AArch64_AM::UXTB">;
+  def CheckExtUXTH : CheckImmOperand_s<3, "AArch64_AM::UXTH">;
+  def CheckExtUXTW : CheckImmOperand_s<3, "AArch64_AM::UXTW">;
+  def CheckExtUXTX : CheckImmOperand_s<3, "AArch64_AM::UXTX">;
+  def CheckExtSXTB : CheckImmOperand_s<3, "AArch64_AM::SXTB">;
+  def CheckExtSXTH : CheckImmOperand_s<3, "AArch64_AM::SXTH">;
+  def CheckExtSXTW : CheckImmOperand_s<3, "AArch64_AM::SXTW">;
+  def CheckExtSXTX : CheckImmOperand_s<3, "AArch64_AM::SXTX">;
+}
+
+// Check for shifting in extended arithmetic instructions.
+foreach I = {0-3} in {
+  let FunctionMapper = "AArch64_AM::getArithShiftValue" in
+  def CheckExtBy#I : CheckImmOperand<3, I>;
+}
+
+// Check the extension type in the register offset addressing mode.
+let FunctionMapper = "AArch64_AM::getMemExtendType" in {
+  def CheckMemExtUXTW : CheckImmOperand_s<3, "AArch64_AM::UXTW">;
+  def CheckMemExtLSL  : CheckImmOperand_s<3, "AArch64_AM::UXTX">;
+  def CheckMemExtSXTW : CheckImmOperand_s<3, "AArch64_AM::SXTW">;
+  def CheckMemExtSXTX : CheckImmOperand_s<3, "AArch64_AM::SXTX">;
+}
+
+// Check for scaling in the register offset addressing mode.
+let FunctionMapper = "AArch64_AM::getMemDoShift" in
+def CheckMemScaled : CheckImmOperandSimple<4>;
+
+// Check the shifting type in arithmetic and logic instructions.
+let FunctionMapper = "AArch64_AM::getShiftType" in { + def CheckShiftLSL : CheckImmOperand_s<3, "AArch64_AM::LSL">; + def CheckShiftLSR : CheckImmOperand_s<3, "AArch64_AM::LSR">; + def CheckShiftASR : CheckImmOperand_s<3, "AArch64_AM::ASR">; + def CheckShiftROR : CheckImmOperand_s<3, "AArch64_AM::ROR">; + def CheckShiftMSL : CheckImmOperand_s<3, "AArch64_AM::MSL">; +} + +// Check for shifting in arithmetic and logic instructions. +foreach I = {0-4, 8} in { + let FunctionMapper = "AArch64_AM::getShiftValue" in + def CheckShiftBy#I : CheckImmOperand<3, I>; +} + +// Generic predicates. + +// Identify whether an instruction is the 16-bit NEON form based on its result. +def CheckHForm : CheckAll<[CheckIsRegOperand<0>, + CheckAny<[CheckRegOperand<0, H0>, + CheckRegOperand<0, H1>, + CheckRegOperand<0, H2>, + CheckRegOperand<0, H3>, + CheckRegOperand<0, H4>, + CheckRegOperand<0, H5>, + CheckRegOperand<0, H6>, + CheckRegOperand<0, H7>, + CheckRegOperand<0, H8>, + CheckRegOperand<0, H9>, + CheckRegOperand<0, H10>, + CheckRegOperand<0, H11>, + CheckRegOperand<0, H12>, + CheckRegOperand<0, H13>, + CheckRegOperand<0, H14>, + CheckRegOperand<0, H15>, + CheckRegOperand<0, H16>, + CheckRegOperand<0, H17>, + CheckRegOperand<0, H18>, + CheckRegOperand<0, H19>, + CheckRegOperand<0, H20>, + CheckRegOperand<0, H21>, + CheckRegOperand<0, H22>, + CheckRegOperand<0, H23>, + CheckRegOperand<0, H24>, + CheckRegOperand<0, H25>, + CheckRegOperand<0, H26>, + CheckRegOperand<0, H27>, + CheckRegOperand<0, H28>, + CheckRegOperand<0, H29>, + CheckRegOperand<0, H30>, + CheckRegOperand<0, H31>]>]>; + +// Identify whether an instruction is the 32-bit NEON form based on its result. +def CheckSForm : CheckAll<[CheckIsRegOperand<0>, + CheckAny<[CheckRegOperand<0, S0>, + CheckRegOperand<0, S1>, + CheckRegOperand<0, S2>, + CheckRegOperand<0, S3>, + CheckRegOperand<0, S4>, + CheckRegOperand<0, S5>, + CheckRegOperand<0, S6>, + CheckRegOperand<0, S7>, + CheckRegOperand<0, S8>, + CheckRegOperand<0, S9>, + CheckRegOperand<0, S10>, + CheckRegOperand<0, S11>, + CheckRegOperand<0, S12>, + CheckRegOperand<0, S13>, + CheckRegOperand<0, S14>, + CheckRegOperand<0, S15>, + CheckRegOperand<0, S16>, + CheckRegOperand<0, S17>, + CheckRegOperand<0, S18>, + CheckRegOperand<0, S19>, + CheckRegOperand<0, S20>, + CheckRegOperand<0, S21>, + CheckRegOperand<0, S22>, + CheckRegOperand<0, S23>, + CheckRegOperand<0, S24>, + CheckRegOperand<0, S25>, + CheckRegOperand<0, S26>, + CheckRegOperand<0, S27>, + CheckRegOperand<0, S28>, + CheckRegOperand<0, S29>, + CheckRegOperand<0, S30>, + CheckRegOperand<0, S31>]>]>; + +// Identify whether an instruction is the 64-bit NEON form based on its result. 
+def CheckDForm : CheckAll<[CheckIsRegOperand<0>,
+                           CheckAny<[CheckRegOperand<0, D0>,
+                                     CheckRegOperand<0, D1>,
+                                     CheckRegOperand<0, D2>,
+                                     CheckRegOperand<0, D3>,
+                                     CheckRegOperand<0, D4>,
+                                     CheckRegOperand<0, D5>,
+                                     CheckRegOperand<0, D6>,
+                                     CheckRegOperand<0, D7>,
+                                     CheckRegOperand<0, D8>,
+                                     CheckRegOperand<0, D9>,
+                                     CheckRegOperand<0, D10>,
+                                     CheckRegOperand<0, D11>,
+                                     CheckRegOperand<0, D12>,
+                                     CheckRegOperand<0, D13>,
+                                     CheckRegOperand<0, D14>,
+                                     CheckRegOperand<0, D15>,
+                                     CheckRegOperand<0, D16>,
+                                     CheckRegOperand<0, D17>,
+                                     CheckRegOperand<0, D18>,
+                                     CheckRegOperand<0, D19>,
+                                     CheckRegOperand<0, D20>,
+                                     CheckRegOperand<0, D21>,
+                                     CheckRegOperand<0, D22>,
+                                     CheckRegOperand<0, D23>,
+                                     CheckRegOperand<0, D24>,
+                                     CheckRegOperand<0, D25>,
+                                     CheckRegOperand<0, D26>,
+                                     CheckRegOperand<0, D27>,
+                                     CheckRegOperand<0, D28>,
+                                     CheckRegOperand<0, D29>,
+                                     CheckRegOperand<0, D30>,
+                                     CheckRegOperand<0, D31>]>]>;
+
+// Identify whether an instruction is the 128-bit NEON form based on its result.
+def CheckQForm : CheckAll<[CheckIsRegOperand<0>,
+                           CheckAny<[CheckRegOperand<0, Q0>,
+                                     CheckRegOperand<0, Q1>,
+                                     CheckRegOperand<0, Q2>,
+                                     CheckRegOperand<0, Q3>,
+                                     CheckRegOperand<0, Q4>,
+                                     CheckRegOperand<0, Q5>,
+                                     CheckRegOperand<0, Q6>,
+                                     CheckRegOperand<0, Q7>,
+                                     CheckRegOperand<0, Q8>,
+                                     CheckRegOperand<0, Q9>,
+                                     CheckRegOperand<0, Q10>,
+                                     CheckRegOperand<0, Q11>,
+                                     CheckRegOperand<0, Q12>,
+                                     CheckRegOperand<0, Q13>,
+                                     CheckRegOperand<0, Q14>,
+                                     CheckRegOperand<0, Q15>,
+                                     CheckRegOperand<0, Q16>,
+                                     CheckRegOperand<0, Q17>,
+                                     CheckRegOperand<0, Q18>,
+                                     CheckRegOperand<0, Q19>,
+                                     CheckRegOperand<0, Q20>,
+                                     CheckRegOperand<0, Q21>,
+                                     CheckRegOperand<0, Q22>,
+                                     CheckRegOperand<0, Q23>,
+                                     CheckRegOperand<0, Q24>,
+                                     CheckRegOperand<0, Q25>,
+                                     CheckRegOperand<0, Q26>,
+                                     CheckRegOperand<0, Q27>,
+                                     CheckRegOperand<0, Q28>,
+                                     CheckRegOperand<0, Q29>,
+                                     CheckRegOperand<0, Q30>,
+                                     CheckRegOperand<0, Q31>]>]>;
+
+// Identify arithmetic instructions with extend.
+def IsArithExtOp : CheckOpcode<[ADDWrx, ADDXrx, ADDSWrx, ADDSXrx,
+                                SUBWrx, SUBXrx, SUBSWrx, SUBSXrx,
+                                ADDXrx64, ADDSXrx64,
+                                SUBXrx64, SUBSXrx64]>;
+
+// Identify arithmetic immediate instructions.
+def IsArithImmOp : CheckOpcode<[ADDWri, ADDXri, ADDSWri, ADDSXri,
+                                SUBWri, SUBXri, SUBSWri, SUBSXri]>;
+
+// Identify arithmetic instructions with shift.
+def IsArithShiftOp : CheckOpcode<[ADDWrs, ADDXrs, ADDSWrs, ADDSXrs,
+                                  SUBWrs, SUBXrs, SUBSWrs, SUBSXrs]>;
+
+// Identify arithmetic instructions without shift.
+def IsArithUnshiftOp : CheckOpcode<[ADDWrr, ADDXrr, ADDSWrr, ADDSXrr,
+                                    SUBWrr, SUBXrr, SUBSWrr, SUBSXrr]>;
+
+// Identify logic immediate instructions.
+def IsLogicImmOp : CheckOpcode<[ANDWri, ANDXri,
+                                EORWri, EORXri,
+                                ORRWri, ORRXri]>;
+
+// Identify logic instructions with shift.
+def IsLogicShiftOp : CheckOpcode<[ANDWrs, ANDXrs, ANDSWrs, ANDSXrs,
+                                  BICWrs, BICXrs, BICSWrs, BICSXrs,
+                                  EONWrs, EONXrs,
+                                  EORWrs, EORXrs,
+                                  ORNWrs, ORNXrs,
+                                  ORRWrs, ORRXrs]>;
+
+// Identify logic instructions without shift.
+def IsLogicUnshiftOp : CheckOpcode<[ANDWrr, ANDXrr, ANDSWrr, ANDSXrr,
+                                    BICWrr, BICXrr, BICSWrr, BICSXrr,
+                                    EONWrr, EONXrr,
+                                    EORWrr, EORXrr,
+                                    ORNWrr, ORNXrr,
+                                    ORRWrr, ORRXrr]>;
+
+// Identify arithmetic and logic immediate instructions.
+def IsArithLogicImmOp : CheckOpcode<!listconcat(IsArithImmOp.ValidOpcodes,
+                                                IsLogicImmOp.ValidOpcodes)>;
+
+// Identify arithmetic and logic instructions with shift.
+def IsArithLogicShiftOp : CheckOpcode<!listconcat(IsArithShiftOp.ValidOpcodes,
+                                                  IsLogicShiftOp.ValidOpcodes)>;
+
+// Identify arithmetic and logic instructions without shift.
+def IsArithLogicUnshiftOp : CheckOpcode<!listconcat(IsArithUnshiftOp.ValidOpcodes,
+                                                    IsLogicUnshiftOp.ValidOpcodes)>;
+
+// Identify whether an instruction is an ASIMD
+// load using the post index addressing mode.
+def IsLoadASIMDPostOp : CheckOpcode<[LD1Onev8b_POST, LD1Onev4h_POST, LD1Onev2s_POST, LD1Onev1d_POST,
+                                     LD1Onev16b_POST, LD1Onev8h_POST, LD1Onev4s_POST, LD1Onev2d_POST,
+                                     LD1Twov8b_POST, LD1Twov4h_POST, LD1Twov2s_POST, LD1Twov1d_POST,
+                                     LD1Twov16b_POST, LD1Twov8h_POST, LD1Twov4s_POST, LD1Twov2d_POST,
+                                     LD1Threev8b_POST, LD1Threev4h_POST, LD1Threev2s_POST, LD1Threev1d_POST,
+                                     LD1Threev16b_POST, LD1Threev8h_POST, LD1Threev4s_POST, LD1Threev2d_POST,
+                                     LD1Fourv8b_POST, LD1Fourv4h_POST, LD1Fourv2s_POST, LD1Fourv1d_POST,
+                                     LD1Fourv16b_POST, LD1Fourv8h_POST, LD1Fourv4s_POST, LD1Fourv2d_POST,
+                                     LD1i8_POST, LD1i16_POST, LD1i32_POST, LD1i64_POST,
+                                     LD1Rv8b_POST, LD1Rv4h_POST, LD1Rv2s_POST, LD1Rv1d_POST,
+                                     LD1Rv16b_POST, LD1Rv8h_POST, LD1Rv4s_POST, LD1Rv2d_POST,
+                                     LD2Twov8b_POST, LD2Twov4h_POST, LD2Twov2s_POST,
+                                     LD2Twov16b_POST, LD2Twov8h_POST, LD2Twov4s_POST, LD2Twov2d_POST,
+                                     LD2i8_POST, LD2i16_POST, LD2i32_POST, LD2i64_POST,
+                                     LD2Rv8b_POST, LD2Rv4h_POST, LD2Rv2s_POST, LD2Rv1d_POST,
+                                     LD2Rv16b_POST, LD2Rv8h_POST, LD2Rv4s_POST, LD2Rv2d_POST,
+                                     LD3Threev8b_POST, LD3Threev4h_POST, LD3Threev2s_POST,
+                                     LD3Threev16b_POST, LD3Threev8h_POST, LD3Threev4s_POST, LD3Threev2d_POST,
+                                     LD3i8_POST, LD3i16_POST, LD3i32_POST, LD3i64_POST,
+                                     LD3Rv8b_POST, LD3Rv4h_POST, LD3Rv2s_POST, LD3Rv1d_POST,
+                                     LD3Rv16b_POST, LD3Rv8h_POST, LD3Rv4s_POST, LD3Rv2d_POST,
+                                     LD4Fourv8b_POST, LD4Fourv4h_POST, LD4Fourv2s_POST,
+                                     LD4Fourv16b_POST, LD4Fourv8h_POST, LD4Fourv4s_POST, LD4Fourv2d_POST,
+                                     LD4i8_POST, LD4i16_POST, LD4i32_POST, LD4i64_POST,
+                                     LD4Rv8b_POST, LD4Rv4h_POST, LD4Rv2s_POST, LD4Rv1d_POST,
+                                     LD4Rv16b_POST, LD4Rv8h_POST, LD4Rv4s_POST, LD4Rv2d_POST]>;
+
+// Identify whether an instruction is an ASIMD
+// store using the post index addressing mode.
+def IsStoreASIMDPostOp : CheckOpcode<[ST1Onev8b_POST, ST1Onev4h_POST, ST1Onev2s_POST, ST1Onev1d_POST,
+                                      ST1Onev16b_POST, ST1Onev8h_POST, ST1Onev4s_POST, ST1Onev2d_POST,
+                                      ST1Twov8b_POST, ST1Twov4h_POST, ST1Twov2s_POST, ST1Twov1d_POST,
+                                      ST1Twov16b_POST, ST1Twov8h_POST, ST1Twov4s_POST, ST1Twov2d_POST,
+                                      ST1Threev8b_POST, ST1Threev4h_POST, ST1Threev2s_POST, ST1Threev1d_POST,
+                                      ST1Threev16b_POST, ST1Threev8h_POST, ST1Threev4s_POST, ST1Threev2d_POST,
+                                      ST1Fourv8b_POST, ST1Fourv4h_POST, ST1Fourv2s_POST, ST1Fourv1d_POST,
+                                      ST1Fourv16b_POST, ST1Fourv8h_POST, ST1Fourv4s_POST, ST1Fourv2d_POST,
+                                      ST1i8_POST, ST1i16_POST, ST1i32_POST, ST1i64_POST,
+                                      ST2Twov8b_POST, ST2Twov4h_POST, ST2Twov2s_POST,
+                                      ST2Twov16b_POST, ST2Twov8h_POST, ST2Twov4s_POST, ST2Twov2d_POST,
+                                      ST2i8_POST, ST2i16_POST, ST2i32_POST, ST2i64_POST,
+                                      ST3Threev8b_POST, ST3Threev4h_POST, ST3Threev2s_POST,
+                                      ST3Threev16b_POST, ST3Threev8h_POST, ST3Threev4s_POST, ST3Threev2d_POST,
+                                      ST3i8_POST, ST3i16_POST, ST3i32_POST, ST3i64_POST,
+                                      ST4Fourv8b_POST, ST4Fourv4h_POST, ST4Fourv2s_POST,
+                                      ST4Fourv16b_POST, ST4Fourv8h_POST, ST4Fourv4s_POST, ST4Fourv2d_POST,
+                                      ST4i8_POST, ST4i16_POST, ST4i32_POST, ST4i64_POST]>;
+
+// Identify whether an instruction is an ASIMD load
+// or store using the post index addressing mode.
+def IsLoadStoreASIMDPostOp : CheckOpcode<!listconcat(IsLoadASIMDPostOp.ValidOpcodes,
+                                                     IsStoreASIMDPostOp.ValidOpcodes)>;
+
+// Identify whether an instruction is a load
+// using the register offset addressing mode.
+def IsLoadRegOffsetOp : CheckOpcode<[PRFMroW, PRFMroX,
+                                     LDRBBroW, LDRBBroX,
+                                     LDRSBWroW, LDRSBWroX, LDRSBXroW, LDRSBXroX,
+                                     LDRHHroW, LDRHHroX,
+                                     LDRSHWroW, LDRSHWroX, LDRSHXroW, LDRSHXroX,
+                                     LDRWroW, LDRWroX,
+                                     LDRSWroW, LDRSWroX,
+                                     LDRXroW, LDRXroX,
+                                     LDRBroW, LDRBroX,
+                                     LDRHroW, LDRHroX,
+                                     LDRSroW, LDRSroX,
+                                     LDRDroW, LDRDroX,
+                                     LDRQroW, LDRQroX]>;
+
+// Identify whether an instruction is a store
+// using the register offset addressing mode.
+def IsStoreRegOffsetOp : CheckOpcode<[STRBBroW, STRBBroX,
+                                      STRHHroW, STRHHroX,
+                                      STRWroW, STRWroX,
+                                      STRXroW, STRXroX,
+                                      STRBroW, STRBroX,
+                                      STRHroW, STRHroX,
+                                      STRSroW, STRSroX,
+                                      STRDroW, STRDroX,
+                                      STRQroW, STRQroX]>;
+
+// Identify whether an instruction is a load or
+// store using the register offset addressing mode.
+def IsLoadStoreRegOffsetOp : CheckOpcode<!listconcat(IsLoadRegOffsetOp.ValidOpcodes,
+                                                     IsStoreRegOffsetOp.ValidOpcodes)>;
+
+// Target predicates.
+
+// Identify an instruction that effectively transfers a register to another.
+def IsCopyIdiomFn : TIIPredicate<"isCopyIdiom",
+                      MCOpcodeSwitchStatement<
+                        [// MOV {Rd, SP}, {SP, Rn} =>
+                         // ADD {Rd, SP}, {SP, Rn}, #0
+                         MCOpcodeSwitchCase<
+                           [ADDWri, ADDXri],
+                           MCReturnStatement<
+                             CheckAll<
+                               [CheckIsRegOperand<0>,
+                                CheckIsRegOperand<1>,
+                                CheckAny<
+                                  [CheckRegOperand<0, WSP>,
+                                   CheckRegOperand<0, SP>,
+                                   CheckRegOperand<1, WSP>,
+                                   CheckRegOperand<1, SP>]>,
+                                CheckZeroOperand<2>]>>>,
+                         // MOV Rd, Rm =>
+                         // ORR Rd, ZR, Rm, LSL #0
+                         MCOpcodeSwitchCase<
+                           [ORRWrs, ORRXrs],
+                           MCReturnStatement<
+                             CheckAll<
+                               [CheckIsRegOperand<1>,
+                                CheckIsRegOperand<2>,
+                                CheckAny<
+                                  [CheckRegOperand<1, WZR>,
+                                   CheckRegOperand<1, XZR>]>,
+                                CheckShiftBy0]>>>],
+                        MCReturnStatement<FalsePred>>>;
+def IsCopyIdiomPred : MCSchedPredicate<IsCopyIdiomFn>;
+
+// Identify arithmetic instructions with an extended register.
+def RegExtendedFn : TIIPredicate<"hasExtendedReg",
+                      MCOpcodeSwitchStatement<
+                        [MCOpcodeSwitchCase<
+                           IsArithExtOp.ValidOpcodes,
+                           MCReturnStatement<
+                             CheckNot<CheckExtBy0>>>],
+                        MCReturnStatement<FalsePred>>>;
+def RegExtendedPred : MCSchedPredicate<RegExtendedFn>;
+
+// Identify arithmetic and logic instructions with a shifted register.
+def RegShiftedFn : TIIPredicate<"hasShiftedReg",
+                     MCOpcodeSwitchStatement<
+                       [MCOpcodeSwitchCase<
+                          IsArithLogicShiftOp.ValidOpcodes,
+                          MCReturnStatement<
+                            CheckNot<CheckShiftBy0>>>],
+                       MCReturnStatement<FalsePred>>>;
+def RegShiftedPred : MCSchedPredicate<RegShiftedFn>;
+
+// Identify a load or store using the register offset addressing mode
+// with an extended or scaled register.
+def ScaledIdxFn : TIIPredicate<"isScaledAddr",
+                    MCOpcodeSwitchStatement<
+                      [MCOpcodeSwitchCase<
+                         IsLoadStoreRegOffsetOp.ValidOpcodes,
+                         MCReturnStatement<
+                           CheckAny<[CheckNot<CheckMemExtLSL>,
+                                     CheckMemScaled]>>>],
+                      MCReturnStatement<FalsePred>>>;
+def ScaledIdxPred : MCSchedPredicate<ScaledIdxFn>;
+
+// Identify an instruction that effectively resets a FP register to zero.
+def IsZeroFPIdiomFn : TIIPredicate<"isZeroFPIdiom",
+                        MCOpcodeSwitchStatement<
+                          [// MOVI Vd, #0
+                           MCOpcodeSwitchCase<
+                             [MOVIv8b_ns, MOVIv16b_ns,
+                              MOVID, MOVIv2d_ns],
+                             MCReturnStatement<CheckZeroOperand<1>>>,
+                           // MOVI Vd, #0, LSL #0
+                           MCOpcodeSwitchCase<
+                             [MOVIv4i16, MOVIv8i16,
+                              MOVIv2i32, MOVIv4i32],
+                             MCReturnStatement<
+                               CheckAll<
+                                 [CheckZeroOperand<1>,
+                                  CheckZeroOperand<2>]>>>],
+                          MCReturnStatement<FalsePred>>>;
+def IsZeroFPIdiomPred : MCSchedPredicate<IsZeroFPIdiomFn>;
+
+// Identify an instruction that effectively resets a GP register to zero.
+def IsZeroIdiomFn : TIIPredicate<"isZeroIdiom",
+                      MCOpcodeSwitchStatement<
+                        [// ORR Rd, ZR, #0
+                         MCOpcodeSwitchCase<
+                           [ORRWri, ORRXri],
+                           MCReturnStatement<
+                             CheckAll<
+                               [CheckIsRegOperand<1>,
+                                CheckAny<
+                                  [CheckRegOperand<1, WZR>,
+                                   CheckRegOperand<1, XZR>]>,
+                                CheckZeroOperand<2>]>>>],
+                        MCReturnStatement<FalsePred>>>;
+def IsZeroIdiomPred : MCSchedPredicate<IsZeroIdiomFn>;
diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedTSV110.td b/suite/synctools/tablegen/AArch64/AArch64SchedTSV110.td
new file mode 100644
index 00000000..77fca22a
--- /dev/null
+++ b/suite/synctools/tablegen/AArch64/AArch64SchedTSV110.td
@@ -0,0 +1,747 @@
+//==- AArch64SchedTSV110.td - Huawei TSV110 Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Huawei TSV110 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Huawei TSV110 scheduling machine model.
+def TSV110Model : SchedMachineModel {
+  let IssueWidth = 4;             // 4 micro-ops dispatched per cycle.
+  let MicroOpBufferSize = 128;    // 128 micro-op re-order buffer
+  let LoopMicroOpBufferSize = 16;
+  let LoadLatency = 4;            // Optimistic load latency.
+  let MispredictPenalty = 14;     // Fetch + Decode/Rename/Dispatch + Branch
+  let CompleteModel = 1;
+
+  list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+                                                    PAUnsupported.F);
+}
+
+// Define each kind of processor resource and number available on the TSV110,
+// which has 8 pipelines, each with its own queue where micro-ops wait for
+// their operands and issue out-of-order to one of eight execution pipelines.
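+//
+// A ProcResGroup below models micro-ops that may issue to any of its member
+// units. As an illustrative sketch only (ExampleWriteFP is a hypothetical
+// name, not a definition from this file), a 2-cycle write that can use
+// either FP/ASIMD pipe would be declared against the TSV110UnitF group:
+//
+//   def ExampleWriteFP : SchedWriteRes<[TSV110UnitF]> { let Latency = 2; }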
+let SchedModel = TSV110Model in {
+  def TSV110UnitALU  : ProcResource<1>; // Int ALU
+  def TSV110UnitAB   : ProcResource<2>; // Int ALU/BRU
+  def TSV110UnitMDU  : ProcResource<1>; // Multi-Cycle
+  def TSV110UnitFSU1 : ProcResource<1>; // FP/ASIMD
+  def TSV110UnitFSU2 : ProcResource<1>; // FP/ASIMD
+  def TSV110UnitLdSt : ProcResource<2>; // Load/Store
+
+  def TSV110UnitF     : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2]>;
+  def TSV110UnitALUAB : ProcResGroup<[TSV110UnitALU, TSV110UnitAB]>;
+  def TSV110UnitFLdSt : ProcResGroup<[TSV110UnitFSU1, TSV110UnitFSU2, TSV110UnitLdSt]>;
+}
+
+let SchedModel = TSV110Model in {
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// TSV110
+
+// Integer ALU
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+
+// Integer Mul/MAC/Div
+def : WriteRes { let Latency = 12;
+                 let ResourceCycles = [12]; }
+def : WriteRes { let Latency = 20;
+                 let ResourceCycles = [20]; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 4; }
+
+// Load
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+
+// Pre/Post Indexing
+def : WriteRes { let Latency = 1; }
+
+// Store
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+
+// FP
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 2; }
+def : WriteRes { let Latency = 5; }
+
+// FP Div, Sqrt
+def : WriteRes { let Latency = 18; }
+
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 4; }
+def : WriteRes { let Latency = 5; }
+def : WriteRes { let Latency = 1; }
+
+// Branch
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+
+def : WriteRes { let Unsupported = 1; }
+
+// Forwarding logic is modeled only for multiply and accumulate.
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// Detailed Refinements
+//===----------------------------------------------------------------------===//
+
+// Contains all of the TSV110 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+//   Prefix: TSV110Wr
+//   Latency: #cyc
+//   MicroOp Count/Types: #(ALU|AB|MDU|FSU1|FSU2|LdSt|ALUAB|F|FLdSt)
+//
+// e.g. TSV110Wr_6cyc_1ALU_6MDU_4LdSt means the total latency is 6 and there are
+// 11 micro-ops to be issued down one ALU pipe, six MDU pipes and four LdSt pipes.
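+//
+// As an illustrative sketch only (this exact type is not among the generic
+// types defined below), the hypothetical TSV110Wr_6cyc_1ALU_6MDU_4LdSt from
+// the example above would be spelled with one resource entry per micro-op,
+// repeating each unit to match its count, in the same style as the
+// definitions that follow:
+//
+//   def TSV110Wr_6cyc_1ALU_6MDU_4LdSt : SchedWriteRes<[TSV110UnitALU,
+//                                                      TSV110UnitMDU,  TSV110UnitMDU,
+//                                                      TSV110UnitMDU,  TSV110UnitMDU,
+//                                                      TSV110UnitMDU,  TSV110UnitMDU,
+//                                                      TSV110UnitLdSt, TSV110UnitLdSt,
+//                                                      TSV110UnitLdSt, TSV110UnitLdSt]> {
+//     let Latency = 6;
+//     let NumMicroOps = 11;
+//   }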
+// + +//===----------------------------------------------------------------------===// +// Define Generic 1 micro-op types + +def TSV110Wr_1cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 1; } +def TSV110Wr_1cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 1; } +def TSV110Wr_1cyc_1ALUAB : SchedWriteRes<[TSV110UnitALUAB]> { let Latency = 1; } +def TSV110Wr_1cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 1; } + +def TSV110Wr_2cyc_1AB : SchedWriteRes<[TSV110UnitAB]> { let Latency = 2; } +def TSV110Wr_2cyc_1ALU : SchedWriteRes<[TSV110UnitALU]> { let Latency = 2; } +def TSV110Wr_2cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 2; } +def TSV110Wr_2cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 2; } +def TSV110Wr_2cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 2; } +def TSV110Wr_2cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 2; } + +def TSV110Wr_3cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 3; } +def TSV110Wr_3cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 3; } +def TSV110Wr_3cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 3; } + +def TSV110Wr_4cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 4; } +def TSV110Wr_4cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 4; } +def TSV110Wr_4cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 4; } +def TSV110Wr_4cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 4; } + +def TSV110Wr_5cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 5; } +def TSV110Wr_5cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 5; } +def TSV110Wr_5cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 5; } +def TSV110Wr_5cyc_1LdSt : SchedWriteRes<[TSV110UnitLdSt]> { let Latency = 5; } + +def TSV110Wr_6cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 6; } + +def TSV110Wr_7cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 7; } + +def TSV110Wr_8cyc_1F : SchedWriteRes<[TSV110UnitF]> { let Latency = 8; } + +def TSV110Wr_11cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 11; } + +def TSV110Wr_12cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 12; } + +def TSV110Wr_17cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 17; } + +def TSV110Wr_18cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 18; } + +def TSV110Wr_20cyc_1MDU : SchedWriteRes<[TSV110UnitMDU]> { let Latency = 20; } + +def TSV110Wr_24cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 24; } + +def TSV110Wr_31cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 31; } + +def TSV110Wr_36cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 36; } + +def TSV110Wr_38cyc_1FSU1 : SchedWriteRes<[TSV110UnitFSU1]> { let Latency = 38; } + +def TSV110Wr_64cyc_1FSU2 : SchedWriteRes<[TSV110UnitFSU2]> { let Latency = 64; } + +//===----------------------------------------------------------------------===// +// Define Generic 2 micro-op types + +def TSV110Wr_1cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitLdSt]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_2F : SchedWriteRes<[TSV110UnitF, + TSV110UnitF]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_2cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU2]> 
{ + let Latency = 2; + let NumMicroOps = 2; +} + +def TSV110Wr_4cyc_2F : SchedWriteRes<[TSV110UnitF, + TSV110UnitF]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def TSV110Wr_4cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU2]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def TSV110Wr_4cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def TSV110Wr_5cyc_1ALU_1F : SchedWriteRes<[TSV110UnitALU, + TSV110UnitF]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def TSV110Wr_6cyc_2LdSt : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitLdSt]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def TSV110Wr_6cyc_1LdSt_1ALUAB : SchedWriteRes<[TSV110UnitLdSt, + TSV110UnitALUAB]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def TSV110Wr_7cyc_1F_1LdSt : SchedWriteRes<[TSV110UnitF, + TSV110UnitLdSt]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def TSV110Wr_8cyc_2FSU1 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU1]> { + let Latency = 8; + let NumMicroOps = 2; +} + + +def TSV110Wr_8cyc_1FSU1_1FSU2 : SchedWriteRes<[TSV110UnitFSU1, + TSV110UnitFSU2]> { + let Latency = 8; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define Generic 3 micro-op types + +def TSV110Wr_6cyc_3F : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def TSV110Wr_6cyc_3LdSt : SchedWriteRes<[TSV110UnitLdSt, TSV110UnitLdSt, + TSV110UnitLdSt]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def TSV110Wr_7cyc_2F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitLdSt]> { + let Latency = 7; + let NumMicroOps = 3; +} + +//===----------------------------------------------------------------------===// +// Define Generic 4 micro-op types + +def TSV110Wr_8cyc_4F : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF, TSV110UnitF]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def TSV110Wr_8cyc_3F_1LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF, TSV110UnitLdSt]> { + let Latency = 8; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define Generic 5 micro-op types + +def TSV110Wr_8cyc_3F_2LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, TSV110UnitF, + TSV110UnitLdSt, TSV110UnitLdSt]> { + let Latency = 8; + let NumMicroOps = 5; +} + +//===----------------------------------------------------------------------===// +// Define Generic 8 micro-op types + +def TSV110Wr_10cyc_4F_4LdSt : SchedWriteRes<[TSV110UnitF, TSV110UnitF, + TSV110UnitF, TSV110UnitF, + TSV110UnitLdSt, TSV110UnitLdSt, + TSV110UnitLdSt, TSV110UnitLdSt]> { + let Latency = 10; + let NumMicroOps = 8; +} + + +// Branch Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1AB], (instrs B)>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BL)>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instrs BLR)>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ))$")>; + + +// Cryptography Extensions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AES[DE]")>; +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^AESI?MC")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA1SU1")>; +def : InstRW<[TSV110Wr_2cyc_2F], (instregex "^SHA1(H|SU0)")>; +def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA1[CMP]")>; +def : 
InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^SHA256SU0")>; +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^SHA256SU1")>; +def : InstRW<[TSV110Wr_5cyc_1FSU1], (instregex "^SHA256(H|H2)")>; +def TSV110ReadCRC: SchedReadAdvance<1, [TSV110Wr_2cyc_1MDU]>; +def : InstRW<[TSV110Wr_2cyc_1MDU, TSV110ReadCRC], (instregex "^CRC32.*$")>; + + +// Arithmetic and Logical Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(BIC|EON|ORN)[WX]rr")>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(BIC)S[WX]rr")>; + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(ADD|AND|EOR|ORR|SUB)[WX]r(r|i)")>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "(ADD|AND|EOR|ORR|SUB)S[WX]r(r|i)")>; + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(ADC|SBC|BIC)[WX]r$")>; +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(ADC|SBC)S[WX]r$")>; + +def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)[WX]rs$")>; +def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(AND|BIC|EON|EOR|ORN|ORR)S[WX]rs$")>; +def : InstRW<[TSV110Wr_2cyc_1MDU], (instregex "^(ADD|SUB)[WX]r(s|x|x64)$")>; +def : InstRW<[TSV110Wr_2cyc_1AB], (instregex "^(ADD|SUB)S[WX]r(s|x|x64)$")>; + +def : InstRW<[TSV110Wr_1cyc_1AB], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; + + +// Move and Shift Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instrs ADR, ADRP)>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^MOV[NZK][WX]i")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "(LSLV|LSRV|ASRV|RORV)(W|X)r")>; + + +// Divide and Multiply Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>; + +def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>; +def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; +def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>; +def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>; + + +// Miscellaneous Data-Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^EXTR(W|X)rri$")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(S|U)?BFM(W|X)ri$")>; +def : InstRW<[TSV110Wr_1cyc_1ALUAB], (instregex "^(CLS|CLZ|RBIT|REV(16|32)?)(W|X)r$")>; + + +// Load Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(W|X)l$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs LDRSWl)>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteAdr], (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDTR(B|H|W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], 
(instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDNP(W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instregex "^LDP(W|X)i$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt_1ALUAB, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpost)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt, WriteLDHi, WriteAdr], (instrs LDPSWpre)>; + +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFMl)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instrs PRFUMi)>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMui$")>; +def : InstRW<[TSV110Wr_4cyc_1LdSt], (instregex "^PRFMro(W|X)$")>; + + +// Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STN?P(W|X)i$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR(BB|HH|W|X)i$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR(BB|HH|W|X)ui$")>; + +def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt, WriteAdr], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; + + +// FP Data Processing Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "F(ABS|NEG)(D|S)r")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCCMP(E)?(S|D)rr$")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCSEL(S|D)rrr$")>; + +def : InstRW<[TSV110Wr_11cyc_1FSU1], (instrs FDIVSrr)>; +def : InstRW<[TSV110Wr_18cyc_1FSU1], (instrs FDIVDrr)>; +def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTSr)>; +def : InstRW<[TSV110Wr_31cyc_1FSU2], (instrs FSQRTDr)>; + +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN).+rr")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^FN?M(ADD|SUB)Hrrr")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FN?M(ADD|SUB)Srrr")>; +def : InstRW<[TSV110Wr_7cyc_1F], (instregex "^FN?M(ADD|SUB)Drrr")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Hrr")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|SUB)Srr")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(ADD|SUB)Drr")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(N)?MULHrr$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULSrr$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(N)?MULDrr$")>; + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT.+r")>; + + +// FP Miscellaneous Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_5cyc_1ALU_1F], (instregex "^[SU]CVTF[SU][WX][SD]ri")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT[HSD][HSD]r")>; + +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^FMOV(DX|WS|XD|SW|DXHigh|XDHigh)r$")>; +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOV[SD][ir]$")>; + + +// FP Load Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[DSQ]l")>; +def : 
InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDUR[BDHSQ]i")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LDR[BDHSQ]ui")>; +def : InstRW<[TSV110Wr_6cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi], (instregex "^LDN?P[DQS]i")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteLDHi, WriteAdr], (instregex "^LDP[DQS](post|pre)")>; + + +// FP Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STUR[BHSDQ]i")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ](post|pre)")>; +def : InstRW<[TSV110Wr_1cyc_1LdSt], (instregex "^STR[BHSDQ]ui")>; +def : InstRW<[TSV110Wr_2cyc_1LdSt_1ALUAB, ReadAdrBase], (instregex "^STR[BHSDQ]ro[WX]")>; +def : InstRW<[TSV110Wr_2cyc_2LdSt], (instregex "^STN?P[SDQ]i")>; +def : InstRW<[TSV110Wr_2cyc_2LdSt, WriteAdr], (instregex "^STP[SDQ](post|pre)")>; + + +// ASIMD Integer Instructions +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v8i8, v4i16, v2i32 +// Q form - v16i8, v8i16, v4i32 +// D form - v1i8, v1i16, v1i32, v1i64 +// Q form - v16i8, v8i16, v4i32, v2i64 +// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64 +// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64 + +// ASIMD simple arithmetic +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(ABS|ADD(P)?|NEG|SUB)v")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](ADD(L|LP|W)|SUB(L|W))v")>; + +// ASIMD complex arithmetic +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]H(ADD|SUB)v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^R?(ADD|SUB)HN2?v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]Q(ADD|SUB)v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^(SU|US)QADDv")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]RHADDv")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABAL?v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ABDL?v")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]ADALPv")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^((SQ)(ABS|NEG))v")>; + +// ASIMD compare +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT|TST)v")>; + +// ASIMD max/min +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)P?v")>; + +// ASIMD logical +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(AND|BIC|BIF|BIT|BSL|EOR|MVN|NOT|ORN|ORR)v")>; + +// ASIMD multiply accumulate, D-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[TSV110Wr_8cyc_2FSU1], (instregex "^(MUL|ML[AS]|SQR?D(MULH))(v16i8|v8i16|v4i32)")>; + +// ASIMD multiply accumulate long +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD shift +// ASIMD shift accumulate +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^(S|SR|U|UR)SRA")>; +// ASIMD shift by immed, basic +def : InstRW<[TSV110Wr_4cyc_1FSU1], + (instregex "SHLv","SLIv","SRIv","SHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : 
InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by register, complex, D-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[TSV110Wr_4cyc_1FSU1], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD reduction +// ASIMD arith, reduce, 4H/4S +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B/8H +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU]?ADDL?Vv16i8v$")>; + +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[TSV110Wr_4cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B +def : InstRW<[TSV110Wr_8cyc_1FSU1_1FSU2], (instregex "^[SU](MIN|MAX)Vv16i8v$")>; + + +// Vector - Floating Point +// ----------------------------------------------------------------------------- + +// Reference for forms in this group +// D form - v2f32 +// Q form - v4f32, v2f64 +// D form - 32, 64 +// D form - v1i32, v1i64 +// D form - v2i32 +// Q form - v4i32, v2i64 + +// ASIMD FP sign manipulation +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FABSv")>; +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FNEGv")>; + +// ASIMD FP compare +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v")>; + +// ASIMD FP convert +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FCVT[AMNPZ][SU]v")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FCVT(L)v")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FCVT(N|XN)v")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[TSV110Wr_11cyc_1FSU1], (instregex "FDIVv2f32")>; +// ASIMD FP divide, Q-form, F32 +def : InstRW<[TSV110Wr_24cyc_1FSU1], (instregex "FDIVv4f32")>; +// ASIMD FP divide, Q-form, F64 +def : InstRW<[TSV110Wr_38cyc_1FSU1], (instregex "FDIVv2f64")>; + +// ASIMD FP SQRT +def : InstRW<[TSV110Wr_17cyc_1FSU2], (instrs FSQRTv2f32)>; +def : InstRW<[TSV110Wr_36cyc_1FSU2], (instrs FSQRTv4f32)>; +def : InstRW<[TSV110Wr_64cyc_1FSU2], (instrs FSQRTv2f64)>; + +// ASIMD FP max,min +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?v")>; +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^F(MAX|MIN)(NM)?Pv")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^F(MAX|MIN)(NM)?Vv")>; + +// ASIMD FP add +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^F(ADD|ADDP|SUB)v")>; + +// ASIMD FP multiply +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^FMULX?v")>; + + +// ASIMD Miscellaneous Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(CLS|CLZ|CNT)v")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(DUP|INS)v.+lane")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^REV(16|32|64)v")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^(UZP|ZIP)[12]v")>; + +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^EXTv")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex "^XTNv")>; +def : InstRW<[TSV110Wr_2cyc_1FSU1_1FSU2], (instregex 
"^RBITv")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^(INS|DUP)v.+gpr")>; + +def : InstRW<[TSV110Wr_3cyc_1FSU1], (instregex "^[SU]MOVv")>; + +// ASIMD table lookup, D-form +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v8i8One")>; +def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v8i8Two")>; +def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v8i8Three")>; +def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v8i8Four")>; +// ASIMD table lookup, Q-form +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^TB[LX]v16i8One")>; +def : InstRW<[TSV110Wr_4cyc_2F], (instregex "^TB[LX]v16i8Two")>; +def : InstRW<[TSV110Wr_6cyc_3F], (instregex "^TB[LX]v16i8Three")>; +def : InstRW<[TSV110Wr_8cyc_4F], (instregex "^TB[LX]v16i8Four")>; + +def : InstRW<[TSV110Wr_2cyc_1F], (instregex "^FMOVv")>; + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^FRINT[AIMNPXZ]v")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[SU]CVTFv")>; +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^[FU](RECP|RSQRT)(E|X)v")>; + + +// ASIMD Load Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_7cyc_1F_1LdSt, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_2LdSt, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; + +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1LdSt, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_3LdSt], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_6cyc_3LdSt, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_2LdSt], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_6cyc_2LdSt, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_7cyc_2F_1LdSt, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : 
InstRW<[TSV110Wr_8cyc_3F_1LdSt], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_3F_1LdSt, WriteAdr], (instregex "^LD3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_10cyc_4F_4LdSt], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_10cyc_4F_4LdSt, WriteAdr], (instregex "^LD4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + + +// ASIMD Store Instructions +// ----------------------------------------------------------------------------- + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_1F], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; + +def : InstRW<[TSV110Wr_3cyc_1F], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_3cyc_1F, WriteAdr], (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[TSV110Wr_6cyc_1F], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_6cyc_1F, WriteAdr], (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_4cyc_1F], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_4cyc_1F, WriteAdr], (instregex "^ST2Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_5cyc_1F], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_5cyc_1F, WriteAdr], (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[TSV110Wr_8cyc_1F], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[TSV110Wr_8cyc_1F, WriteAdr], (instregex "^ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +} // SchedModel = TSV110Model diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedThunderX.td b/suite/synctools/tablegen/AArch64/AArch64SchedThunderX.td index fbbd3850..ff34c0ce 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedThunderX.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedThunderX.td @@ -1,9 +1,8 @@ //==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,8 +25,9 @@ def ThunderXT8XModel : SchedMachineModel { let PostRAScheduler = 1; // Use PostRA scheduler. 
let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; - + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F, + SMEUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } @@ -154,7 +154,8 @@ def : WriteRes { let Latency = 6; } def : WriteRes { let Latency = 6; } def : WriteRes { let Latency = 6; } def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } // FP Mul, Div, Sqrt def : WriteRes { let Latency = 6; } @@ -192,6 +193,7 @@ def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> { def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; // FIXME: This needs more targeted benchmarking. // ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedThunderX2T99.td b/suite/synctools/tablegen/AArch64/AArch64SchedThunderX2T99.td index bee3392b..ffa0a5e7 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SchedThunderX2T99.td +++ b/suite/synctools/tablegen/AArch64/AArch64SchedThunderX2T99.td @@ -1,9 +1,8 @@ //=- AArch64SchedThunderX2T99.td - Cavium ThunderX T99 ---*- tablegen -*-=// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -26,8 +25,9 @@ def ThunderX2T99Model : SchedMachineModel { let PostRAScheduler = 1; // Using PostRA sched. let CompleteModel = 1; - list<Predicate> UnsupportedFeatures = [HasSVE]; - + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F, + SMEUnsupported.F); // FIXME: Remove when all errors have been fixed. let FullInstRWOverlapCheck = 0; } @@ -362,6 +362,7 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; //===----------------------------------------------------------------------===// // 3. Instruction Tables. 
@@ -1249,7 +1250,12 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; // ASIMD shift by register, basic, Q-form // ASIMD shift by register, complex, D-form // ASIMD shift by register, complex, Q-form -def : WriteRes { +def : WriteRes { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def : WriteRes { let Latency = 7; let NumMicroOps = 4; let ResourceCycles = [4]; @@ -1483,7 +1489,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>; // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form @@ -1493,7 +1499,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], // ASIMD duplicate, gen reg // ASIMD duplicate, element def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>; -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>; +def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUP(i8|i16|i32|i64)$")>; def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>; // ASIMD extract @@ -1518,25 +1524,6 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv")>; // ASIMD move, FP immed def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>; -// ASIMD table lookup, D-form -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>; -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>; -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>; -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>; - -// ASIMD table lookup, Q-form -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>; -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>; -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>; -def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>; - -// ASIMD transpose -def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>; - -// ASIMD unzip/zip -def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; - // ASIMD reciprocal estimate, D-form // ASIMD reciprocal estimate, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], diff --git a/suite/synctools/tablegen/AArch64/AArch64SchedThunderX3T110.td b/suite/synctools/tablegen/AArch64/AArch64SchedThunderX3T110.td new file mode 100644 index 00000000..46a1c217 --- /dev/null +++ b/suite/synctools/tablegen/AArch64/AArch64SchedThunderX3T110.td @@ -0,0 +1,2003 @@ +//=- AArch64SchedThunderX3T110.td - Marvell ThunderX3 T110 ---*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for Marvell ThunderX3T110 +// family of processors. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pipeline Description. + +def ThunderX3T110Model : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. + let MicroOpBufferSize = 70; // 70 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. 
+ // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 128; // FIXME: might be much bigger in TX3. + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + PAUnsupported.F); + // FIXME: Remove when all errors have been fixed. + let FullInstRWOverlapCheck = 0; +} + +let SchedModel = ThunderX3T110Model in { + +// Issue ports. + +// Port 0: ALU. +def THX3T110P0 : ProcResource<1>; + +// Port 1: ALU. +def THX3T110P1 : ProcResource<1>; + +// Port 2: ALU/Branch. +def THX3T110P2 : ProcResource<1>; + +// Port 3: ALU/Branch. +def THX3T110P3 : ProcResource<1>; + +// Port 4: Load/Store. +def THX3T110P4 : ProcResource<1>; + +// Port 5: Load/store. +def THX3T110P5 : ProcResource<1>; + +// Port 6: FP/Neon/SIMD/Crypto. +def THX3T110P6FP0 : ProcResource<1>; + +// Port 7: FP/Neon/SIMD/Crypto. +def THX3T110P7FP1 : ProcResource<1>; + +// Port 8: FP/Neon/SIMD/Crypto. +def THX3T110P8FP2 : ProcResource<1>; + +// Port 9: FP/Neon/SIMD/Crypto. +def THX3T110P9FP3 : ProcResource<1>; + +// Port 10: Store Data Unit. +def THX3T110SD0 : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes. + +// Integer divide/mulhi micro-ops only on port I1. +def THX3T110I1 : ProcResGroup<[THX3T110P1]>; + +// Branch micro-ops on ports I2/I3. +def THX3T110I23 : ProcResGroup<[THX3T110P2, THX3T110P3]>; + +// Branch micro-ops on ports I1/I2/I3. +def THX3T110I123 : ProcResGroup<[THX3T110P1, THX3T110P2, THX3T110P3]>; + +// Integer micro-ops on ports I0/I1/I2. +def THX3T110I012 : ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2]>; + +// Integer micro-ops on ports I0/I1/I2/I3. +def THX3T110I0123 : ProcResGroup<[THX3T110P0, THX3T110P1, + THX3T110P2, THX3T110P3]>; + +// FP micro-ops on ports FP0/FP1/FP2/FP3. +def THX3T110FP0123 : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]>; + +// FP micro-ops on ports FP2/FP3. +def THX3T110FP23 : ProcResGroup<[THX3T110P8FP2, THX3T110P9FP3]>; + +// ASIMD micro-ops on ports FP0/FP1/FP2/FP3. +def THX3T110SIMD : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]>; + +// Store data micro-ops only on port 10. +def THX3T110SD : ProcResGroup<[THX3T110SD0]>; + +// Load/store micro-ops on ports P4/P5. +def THX3T110LS : ProcResGroup<[THX3T110P4, THX3T110P5]>; + +// 70 entry unified scheduler. +def THX3T110ANY: ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2, + THX3T110P3, THX3T110P4, THX3T110P5, + THX3T110P6FP0, THX3T110P7FP1, + THX3T110P8FP2, THX3T110P9FP3]> { + let BufferSize = 70; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: THX3T110Write_<NumCycles>Cyc_<Resources>. + +// 3 cycles on I1. +def THX3T110Write_3Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on I1. +def THX3T110Write_4Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on I1. +def THX3T110Write_5Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 7 cycles on I1. +def THX3T110Write_7Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 23 cycles on I1. +def THX3T110Write_23Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 23; + let ResourceCycles = [13, 23]; + let NumMicroOps = 4; +} + +// 39 cycles on I1. 
+def THX3T110Write_39Cyc_I1 : SchedWriteRes<[THX3T110I1]> { + let Latency = 39; + let ResourceCycles = [13, 39]; + let NumMicroOps = 4; +} + +// 1 cycle on I2/I3 +def THX3T110Write_1Cyc_I23 : SchedWriteRes<[THX3T110I23]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 8 cycles on I2/I3 +def THX3T110Write_8Cyc_I23 : SchedWriteRes<[THX3T110I23]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 1 cycle on I1/I2/I3 +def THX3T110Write_1Cyc_I123 : SchedWriteRes<[THX3T110I123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 8 cycles on I1/I2/I3 +def THX3T110Write_8Cyc_I123 : SchedWriteRes<[THX3T110I123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 1 cycle on I0/I1/I2/I3. +def THX3T110Write_1Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on I0/I1/I2/I3. +def THX3T110Write_2Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 3 cycles on I0/I1/I2/I3. +def THX3T110Write_3Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on I0/I1/I2/I3. +def THX3T110Write_4Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on I0/I1/I2/I3. +def THX3T110Write_5Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on I0/I1/I2/I3. +def THX3T110Write_6Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 8 cycles on I0/I1/I2/I3. +def THX3T110Write_8Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 8; + let NumMicroOps = 4; +} + +// 13 cycles on I0/I1/I2/I3. +def THX3T110Write_13Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 13; + let NumMicroOps = 3; +} + +// 23 cycles on I0/I1/I2/I3. +def THX3T110Write_23Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 23; + let NumMicroOps = 3; +} + +// 39 cycles on I0/I1/I2/I3. +def THX3T110Write_39Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> { + let Latency = 39; + let NumMicroOps = 3; +} + +// 4 cycles on F2/F3. +def THX3T110Write_4Cyc_F23 : SchedWriteRes<[THX3T110FP23]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0/F1/F2/F3. +def THX3T110Write_5Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 6 cycles on F0/F1/F2/F3. +def THX3T110Write_6Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on F0/F1/F2/F3. +def THX3T110Write_7Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on F0/F1/F2/F3. +def THX3T110Write_8Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 10 cycles on F0/F1/F2/F3. +def THX3T110Write_10Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 10; + let NumMicroOps = 3; +} + +// 16 cycles on F0/F1/F2/F3. +def THX3T110Write_16Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [8]; +} + +// 23 cycles on F0/F1/F2/F3. +def THX3T110Write_23Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [11]; +} + +// 1 cycle on LS0/LS1. +def THX3T110Write_1Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +// 2 cycles on LS0/LS1. +def THX3T110Write_2Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1. 
+def THX3T110Write_4Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +// 5 cycles on LS0/LS1. +def THX3T110Write_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1. +def THX3T110Write_6Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 4 + 5 cycles on LS0/LS1. +// First resource is available after 4 cycles. +// Second resource is available after 5 cycles. +// Load vector pair, immed offset, Q-form [LDP/LDNP]. +def THX3T110Write_4_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4, 5]; +} + +// 4 + 8 cycles on LS0/LS1. +// First resource is available after 4 cycles. +// Second resource is available after 8 cycles. +// Load vector pair, immed offset, S/D-form [LDP/LDNP]. +def THX3T110Write_4_8Cyc_LS01 : SchedWriteRes<[THX3T110LS]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [4, 8]; +} + +// 11 cycles on LS0/LS1 and I1. +def THX3T110Write_11Cyc_LS01_I1 : + SchedWriteRes<[THX3T110LS, THX3T110I1]> { + let Latency = 11; + let NumMicroOps = 4; +} + +// 1 cycle on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 1 cycle on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_1Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 3; +} + +// 4 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 4 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_I012 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1 and 2 of I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_I0123_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 1 cycle on LS0/LS1 and SD. +def THX3T110Write_1Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on LS0/LS1 and SD. +def THX3T110Write_2Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1 and SD. +def THX3T110Write_4Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1 and SD. +def THX3T110Write_5Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 5; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1 and SD. +def THX3T110Write_6Cyc_LS01_SD : + SchedWriteRes<[THX3T110LS, THX3T110SD]> { + let Latency = 6; + let NumMicroOps = 5; +} + +// 1 cycle on LS0/LS1, SD and I0/I1/I2/I3. 
+def THX3T110Write_1Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 2 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_2Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 2; + let NumMicroOps = 2; +} + +// 4 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_4Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 4; + let NumMicroOps = 3; +} + +// 5 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_5Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 5; + let NumMicroOps = 4; +} + +// 6 cycles on LS0/LS1, SD and I0/I1/I2/I3. +def THX3T110Write_6Cyc_LS01_SD_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> { + let Latency = 6; + let NumMicroOps = 5; +} + +// 1 cycle on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_1Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 1; + let NumMicroOps = 2; +} + +// 5 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_5Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 3; +} + +// 6 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_6Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +// 7 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_7Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 8 cycles on LS0/LS1 and F0/F1/F2/F3. +def THX3T110Write_8Cyc_LS01_F0123 : + SchedWriteRes<[THX3T110LS, THX3T110FP0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 8 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_8Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 8; + let NumMicroOps = 3; +} + +// 12 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_12Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 12; + let NumMicroOps = 4; +} + +// 16 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_16Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 16; + let NumMicroOps = 5; +} + +// 24 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_24Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 24; + let NumMicroOps = 10; +} + +// 32 cycles on LS0/LS1 and I0/I1/I2/I3. +def THX3T110Write_32Cyc_LS01_I0123 : + SchedWriteRes<[THX3T110LS, THX3T110I0123]> { + let Latency = 32; + let NumMicroOps = 14; +} + +// 3 cycles on F0/F1/F2/F3. +def THX3T110Write_3Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +// 4 cycles on F0/F1/F2/F3. +def THX3T110Write_4Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 4; + let NumMicroOps = 2; +} + +// 5 cycles on F0/F1/F2/F3. +def THX3T110Write_5Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// 10 cycles on F0/F1/F2/F3. +def THX3T110Write_10Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 10; + let NumMicroOps = 4; +} + +// 15 cycles on F0/F1/F2/F3. +def THX3T110Write_15Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 15; + let NumMicroOps = 7; +} + +// 16 cycles on F0/F1/F2/F3. +def THX3T110Write_16Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let NumMicroOps = 3; +} + +// 18 cycles on F0/F1/F2/F3. 
+def THX3T110Write_18Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 18; + let NumMicroOps = 3; +} + +// 19 cycles on F0/F1/F2/F3. +def THX3T110Write_19Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 19; + let NumMicroOps = 4; +} + +// 20 cycles on F0/F1/F2/F3. +def THX3T110Write_20Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 20; + let NumMicroOps = 4; +} + +// 23 cycles on F0/F1/F2/F3. +def THX3T110Write_23Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let NumMicroOps = 4; +} + +// 3 cycles on F2/F3 and 4 cycles on F0/F1/F2/F3. +def THX3T110Write_3_4Cyc_F23_F0123 : + SchedWriteRes<[THX3T110FP23, THX3T110FP0123]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [3, 4]; +} + + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; +} + +//--- +// Branch +//--- +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs B, BL, BR, BLR)>; +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs Bcc)>; +def : InstRW<[THX3T110Write_1Cyc_I23], (instrs RET)>; +def : InstRW<[THX3T110Write_1Cyc_I23], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteI], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteISReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +def : WriteRes { + let Latency = 1; + let ResourceCycles = [1]; + let NumMicroOps = 2; +} + +def : InstRW<[WriteIEReg], + (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?", + 
"AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)", + "ADC(W|X)r", + "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)", + "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)", + "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)", + "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r", + "SBCS(W|X)r", "CCMN(W|X)(i|r)", + "CCMP(W|X)(i|r)", "CSEL(W|X)r", + "CSINC(W|X)r", "CSINV(W|X)r", + "CSNEG(W|X)r")>; + +// Move immed +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; + +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>; + +// Variable shift +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23/13-39. +def : WriteRes { + let Latency = 39; + let ResourceCycles = [39]; + let NumMicroOps = 4; +} + +// Divide, X-form +def : WriteRes { + let Latency = 23; + let ResourceCycles = [23]; + let NumMicroOps = 4; +} + +// Multiply accumulate, W-form +def : WriteRes { + let Latency = 5; + let NumMicroOps = 3; +} + +// Multiply accumulate, X-form +def : WriteRes { + let Latency = 5; + let NumMicroOps = 3; +} + +//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX3T110Write_5Cyc_I012], +// (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[THX3T110Write_5Cyc_I0123], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; + +def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>; + +// Bitfield extract, two reg +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Multiply high +def : InstRW<[THX3T110Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>; + +// Miscellaneous Data-Processing Instructions +// Bitfield extract +def : InstRW<[THX3T110Write_1Cyc_I0123], (instrs EXTRWrri, EXTRXrri)>; + +// Bitifield move - basic +def : InstRW<[THX3T110Write_1Cyc_I0123], + (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>; + +// Bitfield move, insert +def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "^BFM")>; +def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "(S|U)?BFM.*")>; + +// Count leading +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$")>; + +// Reverse bits +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instrs RBITWr, RBITXr)>; + +// Cryptography Extensions +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AES[DE]")>; +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AESI?MC")>; +def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^PMULL")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1SU0")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1(H|SU1)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1[CMP]")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256SU0")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256(H|H2|SU1)")>; + +// CRC Instructions +// def : InstRW<[THX3T110Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>; +def : InstRW<[THX3T110Write_4Cyc_I1], + (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>; + +def : InstRW<[THX3T110Write_4Cyc_I1], + (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. 
+ +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Load pair, immed offset, normal +// Load pair, immed offset, signed words, base != SP +// Load pair, immed offset signed words, base = SP +// LDP only breaks into *one* LS micro-op. Thus +// the resources are handled by WriteLD. +def : WriteRes { + let Latency = 4; + let NumMicroOps = 4; +} + +// Load register offset, basic +// Load register, register offset, scale by 4/8 +// Load register, register offset, scale by 2 +// Load register offset, extend +// Load register, register offset, extend, scale by 4/8 +// Load register, register offset, extend, scale by 2 +def THX3T110WriteLDIdx : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +def THX3T110ReadAdrBase : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRBui)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDui)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRHui)>; +def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRQui)>; +def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRSui)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRQl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRWl)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRXl)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRXi)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSWi)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPQpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, 
WriteLDHi, WriteAdr], + (instrs LDPSpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPWpre)>; +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre, + LDRSpre, LDRWpre, LDRXpre, + LDRSBWpre, LDRSBXpre, LDRSBWpost, LDRSBXpost, + LDRSHWpre, LDRSHXpre, LDRSHWpost, LDRSHXpost, + LDRBBpre, LDRBBpost, LDRHHpre, LDRHHpost)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>; + +def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteI], + (instrs LDRBpost, LDRDpost, LDRHpost, + LDRQpost, LDRSpost, LDRWpost, LDRXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpre, LDPQpre, LDPSpre, LDPWpre, LDPXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteAdr], + (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre, + LDRSpre, LDRWpre, LDRXpre)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr], + (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteI], + (instrs LDRBpost, LDRDpost, LDRHpost, LDRQpost, + LDRSpost, LDRWpost, LDRXpost)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroW)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroW)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroX)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroX)>; + +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBBi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURDi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHHi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURQi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBXi)>; +def : 
InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHWi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSWi)>; + +// Load exclusive +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXP(W|X)$")>; + +//--- +// Prefetch +//--- +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMl)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFUMi)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMui)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroW)>; +def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroX)>; + +//-- +// 3.7 Store Instructions +// 3.11 FP Store Instructions +//-- + +// Store register, unscaled immed +// Store register, immed unprivileged +// Store register, unsigned immed +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store register, immed post-index +// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteAdr, WriteST + +// Store register, register offset, basic +// Store register, register offset, scaled by 4/8 +// Store register, register offset, scaled by 2 +// Store register, register offset, extend +// Store register, register offset, extend, scale by 4/8 +// Store register, register offset, extend, scale by 1 +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed offset, W-form +// Store pair, immed offset, X-form +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} + +// Store pair, immed post-index, W-form +// Store pair, immed post-index, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteAdr, WriteSTP. 
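+
+// A minimal sketch of the writeback convention in the NOTE above (an assumed
+// reading, not upstream text): pre/post-index stores produce two results, the
+// updated base register and the store itself, so the address write is listed
+// first, e.g. for a pre-index store pair:
+//
+//   def : InstRW<[WriteAdr, WriteSTP], (instrs STPXpre)>;
+//
+// The concrete THX3T110 mappings below follow the same [WriteAdr, ...] shape.
+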
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBBi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHHi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURSi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURWi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURXi)>; + +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRBi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRHi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRWi)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRXi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPXi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPWi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPDi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPQi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPXi)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPWi)>; + +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRBui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRDui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRHui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRQui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRXui)>; +def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRWui)>; + +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRBui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRDui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRHui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRQui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRXui)>; +def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRWui)>; + +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRBui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRDui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRHui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRQui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRXui)>; +def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRWui)>; + +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPDpre, STPDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPQpre, STPQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPSpre, STPSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPWpre, STPWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, 
THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STPXpre, STPXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBpre, STRBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRDpre, STRDpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHpre, STRHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRQpre, STRQpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRSpre, STRSpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRWpre, STRWpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRXpre, STRXpost)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBroW, STRBroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRBBroW, STRBBroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRDroW, STRDroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHroW, STRHroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRQroW, STRQroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRSroW, STRSroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRWroW, STRWroX)>; +def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase], + (instrs STRXroW, STRXroX)>; + +// Store exclusive +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instrs STNPWi, STNPXi)>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXR(B|H|W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXP(W|X)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXR(B|H|W|X)$")>; + +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP arithmetic +def : InstRW<[THX3T110Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} + +// FP Mul, Div, Sqrt +def : WriteRes { + let Latency 
= 22; + let ResourceCycles = [19]; +} + +def THX3T110XWriteFDiv : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFDivSP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFDivDP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFSqrtSP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 16; + let ResourceCycles = [8]; + let NumMicroOps = 4; +} + +def THX3T110XWriteFSqrtDP : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 23; + let ResourceCycles = [12]; + let NumMicroOps = 4; +} + +// FP divide, S-form +// FP square root, S-form +def : InstRW<[THX3T110XWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[THX3T110XWriteFSqrtSP], (instrs FSQRTSr)>; +def : InstRW<[THX3T110XWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[THX3T110XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[THX3T110Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>; + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[THX3T110XWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[THX3T110XWriteFSqrtDP], (instrs FSQRTDr)>; +def : InstRW<[THX3T110XWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[THX3T110XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; +def : InstRW<[THX3T110Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>; + +// FP multiply +// FP multiply accumulate +def : WriteRes { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX3T110XWriteFMul : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def THX3T110XWriteFMulAcc : SchedWriteRes<[THX3T110FP0123]> { + let Latency = 6; + let ResourceCycles = [2]; + let NumMicroOps = 3; +} + +def : InstRW<[THX3T110XWriteFMul], (instregex "^FMUL", "^FNMUL")>; +def : InstRW<[THX3T110XWriteFMulAcc], + (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>; + +// FP round to integral +def : InstRW<[THX3T110Write_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes { + let Latency = 7; + let NumMicroOps = 3; +} + +// FP move, immed +// FP move, register +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; +} + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes { + let Latency = 4; + let NumMicroOps = 2; +} + +def : InstRW<[THX3T110Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// 
ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def : WriteRes { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B + +// ASIMD logical (MVN (alias for NOT), ORN, ORR) +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD arith, reduce +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^(S|U|SQD)MULL")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v8i8|v16i8)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v1i64|v2i64)")>; + +// ASIMD absolute diff accum, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>; +// ASIMD absolute diff accum, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>; +// ASIMD absolute diff accum long +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]ABAL")>; +// ASIMD arith, reduce, 4H/4S +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>; +// ASIMD arith, reduce, 8B +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>; +// ASIMD arith, reduce, 16B/16H +def : InstRW<[THX3T110Write_10Cyc_F0123], + (instregex "^[SU]?ADDL?Vv16i8v$")>; +// ASIMD max/min, reduce, 4H/4S +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>; +// ASIMD max/min, reduce, 8B/8H +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>; +// ASIMD max/min, reduce, 16B/16H +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU](MIN|MAX)Vv16i8v$")>; +// ASIMD multiply, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^(P?MUL|SQR?DMULH)" # + "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" # + "(_indexed)?$")>; +// ASIMD multiply, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD multiply accumulate, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>; +// ASIMD multiply accumulate, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>; +// ASIMD shift accumulate +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// ASIMD shift by immed, basic +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv", + "SQSHRNv","SQSHRUNv", "UQRSHRNv", + "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; +// ASIMD shift by immed, complex +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]?(Q|R){1,2}SHR")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SQSHLU")>; +// ASIMD shift by register, basic, Q-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// ASIMD shift by 
register, complex, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU][QR]{1,2}SHL" # + "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>; +// ASIMD shift by register, complex, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>; + +// ASIMD Arithmetic +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(RADD|RSUB)HNv.*")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD", + "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" # + "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADALP","^UADALP")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLPv","^UADDLPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLV","^UADDLV")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SUQADDv","^USQADDv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv", + "^SQABS", "^SQADD", "^SQNEG", "^SQSUB", + "^SRHADD", "^SUBHNv", "^SUQADD", + "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv", + "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith,pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FACGEv", "^FACGTv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP round, D-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRINT[AIMNPXZ](v2f32)")>; +// ASIMD FP round, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form +// NOTE: Handled by WriteV. 
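+
+// Illustrative reading (an assumption, not upstream text) of the D-form/Q-form
+// split used in this section: the 64-bit (D) and 128-bit (Q) variants of one
+// operation are bound to different write types, so the wider form can carry a
+// longer latency on the same FP port group, e.g. from the divides below:
+//
+//   def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv2f32)>;  // D-form
+//   def : InstRW<[THX3T110Write_23Cyc_F0123], (instrs FDIVv2f64)>;  // Q-form
+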
+ +// ASIMD FP convert, long and narrow +def : InstRW<[THX3T110Write_5Cyc_F01], (instregex "^FCVT(L|N|XN)v")>; +// ASIMD FP convert, other, D-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>; +// ASIMD FP convert, other, Q-form +def : InstRW<[THX3T110Write_5Cyc_F01], + (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP divide, D-form, F32 +def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv2f32)>; +def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv2f32")>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv4f32)>; +def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv4f32")>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[THX3T110Write_23Cyc_F0123], (instrs FDIVv2f64)>; +def : InstRW<[THX3T110Write_23Cyc_F0123], (instregex "FDIVv2f64")>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP multiply accumulate, Dform, FZ +// ASIMD FP multiply accumulate, Dform, no FZ +// ASIMD FP multiply accumulate, Qform, FZ +// ASIMD FP multiply accumulate, Qform, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>; + +// ASIMD FP negate +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FNEGv")>; + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], + (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUP(i8|i16|i32|i64)$")>; +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>; + +// ASIMD extract +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^EXTv")>; + +// ASIMD extract narrow +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^XTNv")>; + +// ASIMD extract narrow, saturating +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>; + +// ASIMD insert, element to element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>; + +// ASIMD transfer, element to gen reg +def : 
InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>; + +// ASIMD move, integer immed +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^MOVIv")>; + +// ASIMD move, FP immed +def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FMOVv")>; + +// ASIMD transpose +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^TRN1", "^TRN2")>; + +// ASIMD unzip/zip +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[THX3T110Write_10Cyc_F0123], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[THX3T110Write_15Cyc_F0123], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[THX3T110Write_20Cyc_F0123], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; + +// ASIMD transfer, element to word or dword +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(S|U)MOVv.*")>; + +// ASIMD transfer gen reg to element +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[THX3T110Write_5Cyc_F0123], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX3T110Write_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX3T110Write_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX3T110Write_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD1i(8|16|32|64)$")>; +def : 
InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lane, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123], + (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123], + (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr], + (instregex 
"^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[THX3T110Write_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123], + (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr], + 
(instregex "^ST4i(8|16|32|64)_POST$")>; + +// V8.1a Atomics (LSE) +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs CASB, CASH, CASW, CASX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs CASAB, CASAH, CASAW, CASAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs CASLB, CASLH, CASLW, CASLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs CASALB, CASALH, CASALW, CASALX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDLARB, LDLARH, LDLARW, LDLARX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDADDB, LDADDH, LDADDW, LDADDX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDEORB, LDEORH, LDEORW, LDEORX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSETB, LDSETH, LDSETW, LDSETX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX, + LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX, + LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX, + LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX, + LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX, + LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX, + LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX, + LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX, + LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX, + LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX, + LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX, + LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX, + LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs SWPB, SWPH, SWPW, SWPX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs SWPAB, SWPAH, SWPAW, SWPAX)>; + +def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic], + (instrs SWPLB, SWPLH, SWPLW, SWPLX)>; + +def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic], + (instrs SWPALB, SWPALH, SWPALW, 
SWPALX)>; + +def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic], + (instrs STLLRB, STLLRH, STLLRW, STLLRX)>; + +// V8.3a PAC +def : InstRW<[THX3T110Write_11Cyc_LS01_I1], (instregex "^LDRAA", "^LDRAB")>; +def : InstRW<[THX3T110Write_8Cyc_I123], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, + BRAA, BRAAZ, BRAB, BRABZ)>; +def : InstRW<[THX3T110Write_8Cyc_I123], (instrs RETAA, RETAB)>; + +} // SchedModel = ThunderX3T110Model diff --git a/suite/synctools/tablegen/AArch64/AArch64Schedule.td b/suite/synctools/tablegen/AArch64/AArch64Schedule.td index ce81f48a..b8572c9b 100644 --- a/suite/synctools/tablegen/AArch64/AArch64Schedule.td +++ b/suite/synctools/tablegen/AArch64/AArch64Schedule.td @@ -1,9 +1,8 @@ //==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// @@ -48,19 +47,9 @@ def WriteAdr : SchedWrite; // Address pre/post increment. def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). +def ReadST : SchedRead; // Read the stored value. def ReadAdrBase : SchedRead; // Read the base register of a reg-offset LD/ST. -// Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>; - -// Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>; - -// ScaledIdxPred is true if a WriteLDIdx operand will be -// scaled. Subtargets can use this to dynamically select resources and -// latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>; - // Serialized two-level address load. // EXAMPLE: LOADGot def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; @@ -88,7 +77,8 @@ def WriteFImm : SchedWrite; // Floating-point immediate. def WriteFMul : SchedWrite; // Floating-point multiply. def WriteFDiv : SchedWrite; // Floating-point division. -def WriteV : SchedWrite; // Vector ops. +def WriteVd : SchedWrite; // 64bit Vector D ops. +def WriteVq : SchedWrite; // 128bit Vector Q ops. def WriteVLD : SchedWrite; // Vector loads. def WriteVST : SchedWrite; // Vector stores. @@ -98,9 +88,9 @@ def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP) def ReadVLD : SchedRead; // Sequential vector load and shuffle. -def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; -def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; +def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteVq]>; +def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteVq, WriteVq]>; // Store a shuffled vector. 
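+// (Roughly: a WriteSequence expands into its member writes issued in order, +// so a shuffled store is modeled as the vector shuffle write followed by the +// store write; a subtarget then only needs WriteRes entries for the leaf +// writes such as WriteVq and WriteVST.)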
-def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; -def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; +def WriteVSTShuffle : WriteSequence<[WriteVq, WriteVST]>; +def WriteVSTPairShuffle : WriteSequence<[WriteVq, WriteVq, WriteVST]>; diff --git a/suite/synctools/tablegen/AArch64/AArch64SystemOperands.td b/suite/synctools/tablegen/AArch64/AArch64SystemOperands.td index dbc4deaf..cce5813f 100644 --- a/suite/synctools/tablegen/AArch64/AArch64SystemOperands.td +++ b/suite/synctools/tablegen/AArch64/AArch64SystemOperands.td @@ -1,9 +1,8 @@ //===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -14,6 +13,30 @@ include "llvm/TableGen/SearchableTable.td" +//===----------------------------------------------------------------------===// +// Features that, for the compiler, only enable system operands and PStates +//===----------------------------------------------------------------------===// + +def HasCCPP : Predicate<"Subtarget->hasCCPP()">, + AssemblerPredicate<(all_of FeatureCCPP), "ccpp">; + +def HasPAN : Predicate<"Subtarget->hasPAN()">, + AssemblerPredicate<(all_of FeaturePAN), + "ARM v8.1 Privileged Access-Never extension">; + +def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">, + AssemblerPredicate<(all_of FeaturePsUAO), + "ARM v8.2 UAO PState extension (psuao)">; + +def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">, + AssemblerPredicate<(all_of FeaturePAN_RWV), + "ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">; + +def HasCONTEXTIDREL2 + : Predicate<"Subtarget->hasCONTEXTIDREL2()">, + AssemblerPredicate<(all_of FeatureCONTEXTIDREL2), + "Target contains CONTEXTIDR_EL2 RW operand">; + //===----------------------------------------------------------------------===// // AT (address translate) instruction options. //===----------------------------------------------------------------------===// @@ -45,7 +68,7 @@ def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>; def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>; def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>; -let Requires = [{ {AArch64::HasV8_2aOps} }] in { +let Requires = [{ {AArch64::FeaturePAN_RWV} }] in { def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>; def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>; } @@ -75,6 +98,21 @@ def : DB<"ld", 0xd>; def : DB<"st", 0xe>; def : DB<"sy", 0xf>; +class DBnXS<string name, bits<4> encoding, bits<5> immValue> : SearchableTable { + let SearchableFields = ["Name", "Encoding", "ImmValue"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding = encoding; + bits<5> ImmValue = immValue; + code Requires = [{ {AArch64::FeatureXS} }]; +} + +def : DBnXS<"oshnxs", 0x3, 0x10>; +def : DBnXS<"nshnxs", 0x7, 0x14>; +def : DBnXS<"ishnxs", 0xb, 0x18>; +def : DBnXS<"synxs", 0xf, 0x1c>; + //===----------------------------------------------------------------------===// // DC (data cache maintenance) instruction options. 
//===----------------------------------------------------------------------===// @@ -102,9 +140,33 @@ def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>; def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>; def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>; -let Requires = [{ {AArch64::HasV8_2aOps} }] in +let Requires = [{ {AArch64::FeatureCCPP} }] in def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>; +let Requires = [{ {AArch64::FeatureCacheDeepPersist} }] in +def : DC<"CVADP", 0b011, 0b0111, 0b1101, 0b001>; + +let Requires = [{ {AArch64::FeatureMTE} }] in { +def : DC<"IGVAC", 0b000, 0b0111, 0b0110, 0b011>; +def : DC<"IGSW", 0b000, 0b0111, 0b0110, 0b100>; +def : DC<"CGSW", 0b000, 0b0111, 0b1010, 0b100>; +def : DC<"CIGSW", 0b000, 0b0111, 0b1110, 0b100>; +def : DC<"CGVAC", 0b011, 0b0111, 0b1010, 0b011>; +def : DC<"CGVAP", 0b011, 0b0111, 0b1100, 0b011>; +def : DC<"CGVADP", 0b011, 0b0111, 0b1101, 0b011>; +def : DC<"CIGVAC", 0b011, 0b0111, 0b1110, 0b011>; +def : DC<"GVA", 0b011, 0b0111, 0b0100, 0b011>; +def : DC<"IGDVAC", 0b000, 0b0111, 0b0110, 0b101>; +def : DC<"IGDSW", 0b000, 0b0111, 0b0110, 0b110>; +def : DC<"CGDSW", 0b000, 0b0111, 0b1010, 0b110>; +def : DC<"CIGDSW", 0b000, 0b0111, 0b1110, 0b110>; +def : DC<"CGDVAC", 0b011, 0b0111, 0b1010, 0b101>; +def : DC<"CGDVAP", 0b011, 0b0111, 0b1100, 0b101>; +def : DC<"CGDVADP", 0b011, 0b0111, 0b1101, 0b101>; +def : DC<"CIGDVAC", 0b011, 0b0111, 0b1110, 0b101>; +def : DC<"GZVA", 0b011, 0b0111, 0b0100, 0b100>; +} + //===----------------------------------------------------------------------===// // IC (instruction cache maintenance) instruction options. //===----------------------------------------------------------------------===// @@ -154,7 +216,7 @@ class TSB<string name, bits<4> encoding> : SearchableTable{ bits<4> Encoding; let Encoding = encoding; - code Requires = [{ {AArch64::HasV8_4aOps} }]; + code Requires = [{ {AArch64::FeatureTRACEV8_4} }]; } def : TSB<"csync", 0>; @@ -290,14 +352,41 @@ def : PState<"SPSel", 0b00101>; def : PState<"DAIFSet", 0b11110>; def : PState<"DAIFClr", 0b11111>; // v8.1a "Privileged Access Never" extension-specific PStates -let Requires = [{ {AArch64::HasV8_1aOps} }] in +let Requires = [{ {AArch64::FeaturePAN} }] in def : PState<"PAN", 0b00100>; + // v8.2a "User Access Override" extension-specific PStates -let Requires = [{ {AArch64::HasV8_2aOps} }] in +let Requires = [{ {AArch64::FeaturePsUAO} }] in def : PState<"UAO", 0b00011>; -// v8.4a timining insensitivity of data processing instructions -let Requires = [{ {AArch64::HasV8_4aOps} }] in +// v8.4a timing insensitivity of data processing instructions +let Requires = [{ {AArch64::FeatureDIT} }] in def : PState<"DIT", 0b11010>; +// v8.5a Spectre Mitigation +let Requires = [{ {AArch64::FeatureSSBS} }] in +def : PState<"SSBS", 0b11001>; +// v8.5a Memory Tagging Extension +let Requires = [{ {AArch64::FeatureMTE} }] in +def : PState<"TCO", 0b11100>; + +//===----------------------------------------------------------------------===// +// SVCR instruction options. 
+//===----------------------------------------------------------------------===// + +class SVCR<string name, bits<3> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<3> Encoding; + let Encoding = encoding; + code Requires = [{ {} }]; +} + +let Requires = [{ {AArch64::FeatureSME} }] in { +def : SVCR<"SVCRSM", 0b001>; +def : SVCR<"SVCRZA", 0b010>; +def : SVCR<"SVCRSMZA", 0b011>; +} //===----------------------------------------------------------------------===// // PSB instruction options. @@ -315,14 +404,28 @@ class PSB<string name, bits<5> encoding> : SearchableTable { def : PSB<"csync", 0x11>; //===----------------------------------------------------------------------===// -// TLBI (translation lookaside buffer invalidate) instruction options. +// BTI instruction options. //===----------------------------------------------------------------------===// -class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, - bits<3> op2, bit needsreg = 1> : SearchableTable { +class BTI<string name, bits<3> encoding> : SearchableTable { let SearchableFields = ["Name", "Encoding"]; let EnumValueField = "Encoding"; + string Name = name; + bits<3> Encoding; + let Encoding = encoding; +} + +def : BTI<"c", 0b010>; +def : BTI<"j", 0b100>; +def : BTI<"jc", 0b110>; + +//===----------------------------------------------------------------------===// +// TLBI (translation lookaside buffer invalidate) instruction options. +//===----------------------------------------------------------------------===// + +class TLBIEntry<string name, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg> { string Name = name; bits<14> Encoding; let Encoding{13-11} = op1; @@ -330,94 +433,147 @@ class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm, let Encoding{6-3} = crm; let Encoding{2-0} = op2; bit NeedsReg = needsreg; - code Requires = [{ {} }]; + list<string> Requires = []; + list<string> ExtraRequires = []; + code RequiresStr = [{ { }] # !interleave(Requires # ExtraRequires, [{, }]) # [{ } }]; } -def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>; -def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>; -def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>; -def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>; -def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>; -def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>; -def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>; -def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>; -def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>; -def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>; -def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>; -def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>; -def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>; -def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>; -def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>; -def : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>; -def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>; -def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>; -def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>; -def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>; -def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>; -def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>; -def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>; -def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>; -def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>; 
-def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; -def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; -def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; +def TLBITable : GenericTable { + let FilterClass = "TLBIEntry"; + let CppTypeName = "TLBI"; + let Fields = ["Name", "Encoding", "NeedsReg", "RequiresStr"]; +} +def lookupTLBIByName : SearchIndex { + let Table = TLBITable; + let Key = ["Name"]; +} + +def lookupTLBIByEncoding : SearchIndex { + let Table = TLBITable; + let Key = ["Encoding"]; +} + +multiclass TLBI op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg = 1> { + def : TLBIEntry; + def : TLBIEntry { + let Encoding{7} = 1; + let ExtraRequires = ["AArch64::FeatureXS"]; + } +} + +defm : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>; +defm : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>; +defm : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>; +defm : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>; +defm : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>; +defm : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>; +defm : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>; +defm : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>; +defm : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>; +defm : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>; +defm : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>; +defm : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>; +defm : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>; +defm : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>; +defm : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>; +defm : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>; +defm : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>; +defm : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>; +defm : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>; +defm : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>; +defm : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>; +defm : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>; +defm : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>; +defm : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>; +defm : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>; +defm : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>; +defm : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>; +defm : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>; + +// Armv8.4-A Translation Lookaside Buffer Instructions (TLBI) +let Requires = ["AArch64::FeatureTLB_RMI"] in { // Armv8.4-A Outer Sharable TLB Maintenance instructions: -let Requires = [{ {AArch64::HasV8_4aOps} }] in { // op1 CRn CRm op2 -def : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>; -def : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>; -def : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>; -def : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>; -def : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>; -def : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>; -def : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>; -def : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>; -def : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>; -def : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>; -def : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>; -def : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>; -def : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>; -def : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>; -def : TLBI<"ALLE1OS", 0b100, 0b1000, 
0b0001, 0b100, 0>; -def : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"VMALLE1OS", 0b000, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"VAE1OS", 0b000, 0b1000, 0b0001, 0b001>; +defm : TLBI<"ASIDE1OS", 0b000, 0b1000, 0b0001, 0b010>; +defm : TLBI<"VAAE1OS", 0b000, 0b1000, 0b0001, 0b011>; +defm : TLBI<"VALE1OS", 0b000, 0b1000, 0b0001, 0b101>; +defm : TLBI<"VAALE1OS", 0b000, 0b1000, 0b0001, 0b111>; +defm : TLBI<"IPAS2E1OS", 0b100, 0b1000, 0b0100, 0b000>; +defm : TLBI<"IPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b100>; +defm : TLBI<"VAE2OS", 0b100, 0b1000, 0b0001, 0b001>; +defm : TLBI<"VALE2OS", 0b100, 0b1000, 0b0001, 0b101>; +defm : TLBI<"VMALLS12E1OS", 0b100, 0b1000, 0b0001, 0b110, 0>; +defm : TLBI<"VAE3OS", 0b110, 0b1000, 0b0001, 0b001>; +defm : TLBI<"VALE3OS", 0b110, 0b1000, 0b0001, 0b101>; +defm : TLBI<"ALLE2OS", 0b100, 0b1000, 0b0001, 0b000, 0>; +defm : TLBI<"ALLE1OS", 0b100, 0b1000, 0b0001, 0b100, 0>; +defm : TLBI<"ALLE3OS", 0b110, 0b1000, 0b0001, 0b000, 0>; // Armv8.4-A TLB Range Maintenance instructions: // op1 CRn CRm op2 -def : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>; -def : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>; -def : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>; -def : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>; -def : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>; -def : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>; -def : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>; -def : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>; -def : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>; -def : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>; -def : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>; -def : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>; -def : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>; -def : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>; -def : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>; -def : TLBI<"RIPAS2LE1", 0b100, 0b1000, 0b0100, 0b110>; -def : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>; -def : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>; -def : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>; -def : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>; -def : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>; -def : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>; -def : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>; -def : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>; -def : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>; -def : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>; -def : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>; -def : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>; -def : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>; -def : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>; +defm : TLBI<"RVAE1", 0b000, 0b1000, 0b0110, 0b001>; +defm : TLBI<"RVAAE1", 0b000, 0b1000, 0b0110, 0b011>; +defm : TLBI<"RVALE1", 0b000, 0b1000, 0b0110, 0b101>; +defm : TLBI<"RVAALE1", 0b000, 0b1000, 0b0110, 0b111>; +defm : TLBI<"RVAE1IS", 0b000, 0b1000, 0b0010, 0b001>; +defm : TLBI<"RVAAE1IS", 0b000, 0b1000, 0b0010, 0b011>; +defm : TLBI<"RVALE1IS", 0b000, 0b1000, 0b0010, 0b101>; +defm : TLBI<"RVAALE1IS", 0b000, 0b1000, 0b0010, 0b111>; +defm : TLBI<"RVAE1OS", 0b000, 0b1000, 0b0101, 0b001>; +defm : TLBI<"RVAAE1OS", 0b000, 0b1000, 0b0101, 0b011>; +defm : TLBI<"RVALE1OS", 0b000, 0b1000, 0b0101, 0b101>; +defm : TLBI<"RVAALE1OS", 0b000, 0b1000, 0b0101, 0b111>; +defm : TLBI<"RIPAS2E1IS", 0b100, 0b1000, 0b0000, 0b010>; +defm : TLBI<"RIPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b110>; +defm : TLBI<"RIPAS2E1", 0b100, 0b1000, 0b0100, 0b010>; +defm : TLBI<"RIPAS2LE1", 
0b100, 0b1000, 0b0100, 0b110>; +defm : TLBI<"RIPAS2E1OS", 0b100, 0b1000, 0b0100, 0b011>; +defm : TLBI<"RIPAS2LE1OS", 0b100, 0b1000, 0b0100, 0b111>; +defm : TLBI<"RVAE2", 0b100, 0b1000, 0b0110, 0b001>; +defm : TLBI<"RVALE2", 0b100, 0b1000, 0b0110, 0b101>; +defm : TLBI<"RVAE2IS", 0b100, 0b1000, 0b0010, 0b001>; +defm : TLBI<"RVALE2IS", 0b100, 0b1000, 0b0010, 0b101>; +defm : TLBI<"RVAE2OS", 0b100, 0b1000, 0b0101, 0b001>; +defm : TLBI<"RVALE2OS", 0b100, 0b1000, 0b0101, 0b101>; +defm : TLBI<"RVAE3", 0b110, 0b1000, 0b0110, 0b001>; +defm : TLBI<"RVALE3", 0b110, 0b1000, 0b0110, 0b101>; +defm : TLBI<"RVAE3IS", 0b110, 0b1000, 0b0010, 0b001>; +defm : TLBI<"RVALE3IS", 0b110, 0b1000, 0b0010, 0b101>; +defm : TLBI<"RVAE3OS", 0b110, 0b1000, 0b0101, 0b001>; +defm : TLBI<"RVALE3OS", 0b110, 0b1000, 0b0101, 0b101>; +} //FeatureTLB_RMI + +// Armv9-A Realm Management Extension TLBI Instructions +let Requires = ["AArch64::FeatureRME"] in { +defm : TLBI<"RPAOS", 0b110, 0b1000, 0b0100, 0b011>; +defm : TLBI<"RPALOS", 0b110, 0b1000, 0b0100, 0b111>; +defm : TLBI<"PAALLOS", 0b110, 0b1000, 0b0001, 0b100, 0>; +defm : TLBI<"PAALL", 0b110, 0b1000, 0b0111, 0b100, 0>; +} + +// Armv8.5-A Prediction Restriction by Context instruction options: +class PRCTX<string name, bits<4> crm> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<11> Encoding; + let Encoding{10-4} = 0b0110111; + let Encoding{3-0} = crm; + bit NeedsReg = 1; + code Requires = [{ {} }]; +} + +let Requires = [{ {AArch64::FeaturePredRes} }] in { +def : PRCTX<"RCTX", 0b0011>; } //===----------------------------------------------------------------------===// @@ -430,6 +586,7 @@ class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, let EnumValueField = "Encoding"; string Name = name; + string AltName = name; bits<16> Encoding; let Encoding{15-14} = op0; let Encoding{13-11} = op1; @@ -476,8 +633,10 @@ def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>; def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; + +//v8.3 CCIDX - extending the CCSIDR number of sets def : ROSysReg<"CCSIDR2_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b010> { - let Requires = [{ {AArch64::HasV8_3aOps} }]; + let Requires = [{ {AArch64::FeatureCCIDX} }]; } def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>; def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>; @@ -487,6 +646,9 @@ def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>; def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>; def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>; def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>; +def : ROSysReg<"ID_PFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b100> { + let Requires = [{ {AArch64::FeatureSpecRestrict} }]; +} def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>; def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>; def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>; @@ -510,11 +672,10 @@ def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>; def : ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>; def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>; def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>; +def : ROSysReg<"ID_AA64ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b010>; def : 
ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>; def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>; -def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> { - let Requires = [{ {AArch64::HasV8_2aOps} }]; -} +def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010>; def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>; def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>; def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>; @@ -525,6 +686,7 @@ def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"ID_MMFR5_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b110>; // Trace registers // Op0 Op1 CRn CRm Op2 @@ -584,7 +746,7 @@ def : ROSysReg<"ID_AA64ZFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b100>; // v8.1a "Limited Ordering Regions" extension-specific system register // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_1aOps} }] in +let Requires = [{ {AArch64::FeatureLOR} }] in def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>; // v8.2a "RAS extension" registers @@ -594,6 +756,35 @@ def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>; def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>; } +// v8.5a "random number" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRandGen} }] in { +def : ROSysReg<"RNDR", 0b11, 0b011, 0b0010, 0b0100, 0b000>; +def : ROSysReg<"RNDRRS", 0b11, 0b011, 0b0010, 0b0100, 0b001>; +} + +// v8.5a Software Context Number registers +let Requires = [{ {AArch64::FeatureSpecRestrict} }] in { +def : RWSysReg<"SCXTNUM_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b111>; +def : RWSysReg<"SCXTNUM_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b111>; +def : RWSysReg<"SCXTNUM_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b111>; +def : RWSysReg<"SCXTNUM_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b111>; +def : RWSysReg<"SCXTNUM_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b111>; +} + +// v9a Realm Management Extension registers +let Requires = [{ {AArch64::FeatureRME} }] in { +def : RWSysReg<"MFAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b101>; +def : RWSysReg<"GPCCR_EL3", 0b11, 0b110, 0b0010, 0b0001, 0b110>; +def : RWSysReg<"GPTBR_EL3", 0b11, 0b110, 0b0010, 0b0001, 0b100>; +} + +// v9-a Scalable Matrix Extension (SME) registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureSME} }] in { +def : ROSysReg<"ID_AA64SMFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b101>; +} + //===---------------------- // Write-only regs //===---------------------- @@ -710,6 +901,9 @@ def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>; def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>; def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>; def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"HCRX_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b010> { + let Requires = [{ {AArch64::FeatureHCX} }]; +} def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>; def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>; def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>; @@ -719,13 +913,19 @@ def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>; def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>; def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>; def : RWSysReg<"TTBR0_EL1", 0b11, 
0b000, 0b0010, 0b0000, 0b000>; -def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>; def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>; + +let Requires = [{ {AArch64::FeatureEL2VMSA} }] in { +def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000> { + let AltName = "VSCTLR_EL2"; +} +def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>; +} + def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>; def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>; def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>; def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>; -def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>; def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>; def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>; def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>; @@ -740,7 +940,7 @@ def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>; def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>; def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>; def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>; -def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; +def : ROSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>; def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>; def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>; @@ -777,6 +977,7 @@ def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>; def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>; def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>; def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>; +def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>; def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>; def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>; def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>; @@ -1063,7 +1264,6 @@ def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>; def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>; def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>; def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>; -def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>; def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>; def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>; def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>; @@ -1081,9 +1281,8 @@ def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>; def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>; def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>; def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>; -def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; +def : ROSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>; -def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>; def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>; def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>; def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>; @@ -1101,24 +1300,74 @@ def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>; def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 
0b110>; def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>; +// v8r system registers +let Requires = [{ {AArch64::HasV8_0rOps} }] in { +//Virtualization System Control Register +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"VSCTLR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000> { + let AltName = "TTBR0_EL2"; +} + +//MPU Type Register +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"MPUIR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"MPUIR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b100>; + +//Protection Region Enable Register +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"PRENR_EL1", 0b11, 0b000, 0b0110, 0b0001, 0b001>; +def : RWSysReg<"PRENR_EL2", 0b11, 0b100, 0b0110, 0b0001, 0b001>; + +//Protection Region Selection Register +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"PRSELR_EL1", 0b11, 0b000, 0b0110, 0b0010, 0b001>; +def : RWSysReg<"PRSELR_EL2", 0b11, 0b100, 0b0110, 0b0010, 0b001>; + +//Protection Region Base Address Register +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"PRBAR_EL1", 0b11, 0b000, 0b0110, 0b1000, 0b000>; +def : RWSysReg<"PRBAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b000>; + +//Protection Region Limit Address Register +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"PRLAR_EL1", 0b11, 0b000, 0b0110, 0b1000, 0b001>; +def : RWSysReg<"PRLAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b001>; + +foreach n = 1-15 in { +foreach x = 1-2 in { +//Direct access to Protection Region Base Address Register for the n-th MPU region + def : RWSysReg<"PRBAR"#n#"_EL"#x, 0b11, 0b000, 0b0110, 0b1000, 0b000>{ + let Encoding{5-2} = n; + let Encoding{13} = !add(x,-1); + } + + def : RWSysReg<"PRLAR"#n#"_EL"#x, 0b11, 0b000, 0b0110, 0b1000, 0b001>{ + let Encoding{5-2} = n; + let Encoding{13} = !add(x,-1); + } +} //foreach x = 1-2 in +} //foreach n = 1-15 in +} //let Requires = [{ {AArch64::HasV8_0rOps} }] in + // v8.1a "Privileged Access Never" extension-specific system registers -let Requires = [{ {AArch64::HasV8_1aOps} }] in +let Requires = [{ {AArch64::FeaturePAN} }] in def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>; // v8.1a "Limited Ordering Regions" extension-specific system registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_1aOps} }] in { +let Requires = [{ {AArch64::FeatureLOR} }] in { def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>; def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>; def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>; def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>; } -// v8.1a "Virtualization hos extensions" system registers +// v8.1a "Virtualization Host extensions" system registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_1aOps} }] in { +let Requires = [{ {AArch64::FeatureVH} }] in { def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>; -def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>; def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>; def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>; @@ -1144,10 +1393,13 @@ def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>; def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>; def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>; def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>; +let Requires = [{ {AArch64::FeatureCONTEXTIDREL2} }] in { + def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; +} } // v8.2a registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_2aOps} }] in +let Requires = [{ {AArch64::FeaturePsUAO} }] in def : RWSysReg<"UAO", 0b11, 0b000, 
0b0100, 0b0010, 0b100>; // v8.2a "Statistical Profiling extension" registers @@ -1156,7 +1408,7 @@ let Requires = [{ {AArch64::FeatureSPE} }] in { def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>; def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>; def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>; -def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; +def : ROSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>; def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>; def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>; @@ -1165,7 +1417,7 @@ def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>; def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>; def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>; def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>; -def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; +def : ROSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; } // v8.2a "RAS extension" registers @@ -1184,7 +1436,7 @@ def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; // v8.3a "Pointer authentication extension" registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::HasV8_3aOps} }] in { +let Requires = [{ {AArch64::FeaturePAuth} }] in { def : RWSysReg<"APIAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b000>; def : RWSysReg<"APIAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b001>; def : RWSysReg<"APIBKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0001, 0b010>; @@ -1197,12 +1449,14 @@ def : RWSysReg<"APGAKeyLo_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b000>; def : RWSysReg<"APGAKeyHi_EL1", 0b11, 0b000, 0b0010, 0b0011, 0b001>; } -let Requires = [{ {AArch64::HasV8_4aOps} }] in { - +// v8.4 "Secure Exception Level 2 extension" +let Requires = [{ {AArch64::FeatureSEL2} }] in { // v8.4a "Virtualization secure second stage translation" registers // Op0 Op1 CRn CRm Op2 def : RWSysReg<"VSTCR_EL2" , 0b11, 0b100, 0b0010, 0b0110, 0b010>; -def : RWSysReg<"VSTTBR_EL2", 0b11, 0b100, 0b0010, 0b0110, 0b000>; +def : RWSysReg<"VSTTBR_EL2", 0b11, 0b100, 0b0010, 0b0110, 0b000> { + let Requires = [{ {AArch64::HasV8_0aOps} }]; +} // v8.4a "Virtualization timer" registers // Op0 Op1 CRn CRm Op2 @@ -1216,18 +1470,19 @@ def : RWSysReg<"CNTHPS_CTL_EL2", 0b11, 0b100, 0b1110, 0b0101, 0b001>; // v8.4a "Virtualization debug state" registers // Op0 Op1 CRn CRm Op2 def : RWSysReg<"SDER32_EL2", 0b11, 0b100, 0b0001, 0b0011, 0b001>; +} // FeatureSEL2 // v8.4a RAS registers -// Op0 Op1 CRn CRm Op2 +// Op0 Op1 CRn CRm Op2 def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>; def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>; -def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>; def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>; def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>; def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>; // v8.4a MPAM registers // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureMPAM} }] in { def : RWSysReg<"MPAM0_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b001>; def : RWSysReg<"MPAM1_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b000>; def : RWSysReg<"MPAM2_EL2", 0b11, 0b100, 0b1010, 0b0101, 0b000>; @@ -1244,9 +1499,11 @@ def : RWSysReg<"MPAMVPM5_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b101>; def : RWSysReg<"MPAMVPM6_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b110>; def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 
0b1010, 0b0110, 0b111>; def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>; +} //FeatureMPAM -// v8.4a Activitiy monitor registers +// v8.4a Activity Monitor registers // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureAM} }] in { def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>; def : ROSysReg<"AMCFGR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b001>; def : ROSysReg<"AMCGCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b010>; @@ -1295,6 +1552,7 @@ def : RWSysReg<"AMEVTYPER112_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b100>; def : RWSysReg<"AMEVTYPER113_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b101>; def : RWSysReg<"AMEVTYPER114_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b110>; def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>; +} //FeatureAM // v8.4a Trace Extension registers // @@ -1303,19 +1561,24 @@ def : RWSysReg<"AMEVTYPER115_EL0", 0b11, 0b011, 0b1101, 0b1111, 0b111>; // but they are already defined above. // // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureTRACEV8_4} }] in { def : RWSysReg<"TRFCR_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b001>; def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>; def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>; +} //FeatureTRACEV8_4 -// v8.4a Timining insensitivity of data processing instructions +// v8.4a Timing insensitivity of data processing instructions +// DIT: Data Independent Timing instructions // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureDIT} }] in { def : RWSysReg<"DIT", 0b11, 0b011, 0b0100, 0b0010, 0b101>; +} //FeatureDIT // v8.4a Enhanced Support for Nested Virtualization // Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureNV} }] in { def : RWSysReg<"VNCR_EL2", 0b11, 0b100, 0b0010, 0b0010, 0b000>; - -} // HasV8_4aOps +} //FeatureNV // SVE control registers // Op0 Op1 CRn CRm Op2 @@ -1326,7 +1589,131 @@ def : RWSysReg<"ZCR_EL3", 0b11, 0b110, 0b0001, 0b0010, 0b000>; def : RWSysReg<"ZCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b000>; } +// V8.5a Spectre mitigation SSBS register +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureSSBS} }] in +def : RWSysReg<"SSBS", 0b11, 0b011, 0b0100, 0b0010, 0b110>; + +// v8.5a Memory Tagging Extension +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureMTE} }] in { +def : RWSysReg<"TCO", 0b11, 0b011, 0b0100, 0b0010, 0b111>; +def : RWSysReg<"GCR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b110>; +def : RWSysReg<"RGSR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b101>; +def : RWSysReg<"TFSR_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL2", 0b11, 0b100, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL3", 0b11, 0b110, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSR_EL12", 0b11, 0b101, 0b0101, 0b0110, 0b000>; +def : RWSysReg<"TFSRE0_EL1", 0b11, 0b000, 0b0101, 0b0110, 0b001>; +def : ROSysReg<"GMID_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b100>; +} // HasMTE + +// Embedded Trace Extension R/W System registers +let Requires = [{ {AArch64::FeatureETE} }] in { +// Name Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCRSR", 0b10, 0b001, 0b0000, 0b1010, 0b000>; +// TRCEXTINSELR0 has the same encoding as ETM TRCEXTINSELR +def : RWSysReg<"TRCEXTINSELR0", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCEXTINSELR1", 0b10, 0b001, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"TRCEXTINSELR2", 0b10, 0b001, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"TRCEXTINSELR3", 0b10, 0b001, 0b0000, 0b1011, 0b100>; +} // FeatureETE + +// Trace Buffer Extension System registers +let Requires = [{ {AArch64::FeatureTRBE} }] in { +// Name Op0 Op1 CRn CRm Op2 +def : 
RWSysReg<"TRBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b000>; +def : RWSysReg<"TRBPTR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b001>; +def : RWSysReg<"TRBBASER_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b010>; +def : RWSysReg<"TRBSR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b011>; +def : RWSysReg<"TRBMAR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b100>; +def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>; +def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>; +} // FeatureTRBE + + +// v8.6a Activity Monitors Virtualization Support +let Requires = [{ {AArch64::FeatureAMVS} }] in { +foreach n = 0-15 in { + foreach x = 0-1 in { + def : RWSysReg<"AMEVCNTVOFF"#x#n#"_EL2", + 0b11, 0b100, 0b1101, 0b1000, 0b000>{ + let Encoding{4} = x; + let Encoding{3-0} = n; + } + } +} +} + +// v8.6a Fine Grained Virtualization Traps +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureFineGrainedTraps} }] in { +def : RWSysReg<"HFGRTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b100>; +def : RWSysReg<"HFGWTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b101>; +def : RWSysReg<"HFGITR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b110>; +def : RWSysReg<"HDFGRTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b100>; +def : RWSysReg<"HDFGWTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b101>; +} + +// v8.6a Enhanced Counter Virtualization +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureEnhancedCounterVirtualization} }] in { +def : RWSysReg<"CNTSCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b100>; +def : RWSysReg<"CNTISCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b101>; +def : RWSysReg<"CNTPOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b110>; +def : RWSysReg<"CNTVFRQ_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b111>; +def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>; +def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>; +} + +// v8.7a LD64B/ST64B Accelerator Extension system register +let Requires = [{ {AArch64::FeatureLS64} }] in +def : RWSysReg<"ACCDATA_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b101>; + +// Branch Record Buffer system registers +let Requires = [{ {AArch64::FeatureBRBE} }] in { +def : RWSysReg<"BRBCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b000>; +def : RWSysReg<"BRBCR_EL12", 0b10, 0b101, 0b1001, 0b0000, 0b000>; +def : RWSysReg<"BRBCR_EL2", 0b10, 0b100, 0b1001, 0b0000, 0b000>; +def : RWSysReg<"BRBFCR_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b001>; +def : ROSysReg<"BRBIDR0_EL1", 0b10, 0b001, 0b1001, 0b0010, 0b000>; +def : RWSysReg<"BRBINFINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b000>; +def : RWSysReg<"BRBSRCINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b001>; +def : RWSysReg<"BRBTGTINJ_EL1", 0b10, 0b001, 0b1001, 0b0001, 0b010>; +def : RWSysReg<"BRBTS_EL1", 0b10, 0b001, 0b1001, 0b0000, 0b010>; +foreach n = 0-31 in { + defvar nb = !cast>(n); + def : ROSysReg<"BRBINF"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b00}>; + def : ROSysReg<"BRBSRC"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b01}>; + def : ROSysReg<"BRBTGT"#n#"_EL1", 0b10, 0b001, 0b1000, nb{3-0}, {nb{4},0b10}>; +} +} + +// Statistical Profiling Extension system register +let Requires = [{ {AArch64::FeatureSPE_EEF} }] in +def : RWSysReg<"PMSNEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b001>; + // Cyclone specific system registers // Op0 Op1 CRn CRm Op2 -let Requires = [{ {AArch64::ProcCyclone} }] in +let Requires = [{ {AArch64::FeatureAppleA7SysReg} }] in def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>; + +// Scalable Matrix Extension (SME) +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureSME} }] in { +def : RWSysReg<"SMCR_EL1", 0b11, 0b000, 
0b0001, 0b0010, 0b110>; +def : RWSysReg<"SMCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b110>; +def : RWSysReg<"SMCR_EL3", 0b11, 0b110, 0b0001, 0b0010, 0b110>; +def : RWSysReg<"SMCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b110>; +def : RWSysReg<"SVCR", 0b11, 0b011, 0b0100, 0b0010, 0b010>; +def : RWSysReg<"SMPRI_EL1", 0b11, 0b000, 0b0001, 0b0010, 0b100>; +def : RWSysReg<"SMPRIMAP_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b101>; +def : ROSysReg<"SMIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b110>; +def : RWSysReg<"TPIDR2_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b101>; +} // HasSME + +// v8.4a MPAM and SME registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureMPAM, AArch64::FeatureSME} }] in { +def : RWSysReg<"MPAMSM_EL1", 0b11, 0b000, 0b1010, 0b0101, 0b011>; +} // HasMPAM, HasSME diff --git a/suite/synctools/tablegen/AArch64/SMEInstrFormats.td b/suite/synctools/tablegen/AArch64/SMEInstrFormats.td new file mode 100644 index 00000000..41f2cead --- /dev/null +++ b/suite/synctools/tablegen/AArch64/SMEInstrFormats.td @@ -0,0 +1,726 @@ +//=-- SMEInstrFormats.td - AArch64 SME Instruction classes -*- tablegen -*--=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// AArch64 Scalable Matrix Extension (SME) Instruction Class Definitions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SME Outer Products +//===----------------------------------------------------------------------===// + +class sme_fp_outer_product_inst + : I<(outs za_ty:$ZAda), + (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), + mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", + "", []>, + Sched<[]> { + bits<5> Zm; + bits<3> Pm; + bits<3> Pn; + bits<5> Zn; + let Inst{31-23} = 0b100000001; + let Inst{22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-13} = Pm; + let Inst{12-10} = Pn; + let Inst{9-5} = Zn; + let Inst{4} = S; + let Inst{3} = 0b0; +} + +class sme_outer_product_fp32 + : sme_fp_outer_product_inst { + bits<2> ZAda; + let Inst{1-0} = ZAda; + let Inst{2} = 0b0; +} + +class sme_outer_product_fp64 + : sme_fp_outer_product_inst { + bits<3> ZAda; + let Inst{2-0} = ZAda; +} + +class sme_int_outer_product_inst + : I<(outs za_ty:$ZAda), + (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn, zpr_ty:$Zm), + mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", + "", []>, + Sched<[]> { + bits<5> Zm; + bits<3> Pm; + bits<3> Pn; + bits<5> Zn; + let Inst{31-25} = 0b1010000; + let Inst{24} = u0; + let Inst{23} = 0b1; + let Inst{22} = sz; + let Inst{21} = u1; + let Inst{20-16} = Zm; + let Inst{15-13} = Pm; + let Inst{12-10} = Pn; + let Inst{9-5} = Zn; + let Inst{4} = S; + let Inst{3} = 0b0; +} + +class sme_int_outer_product_i32 opc, string mnemonic> + : sme_int_outer_product_inst { + bits<2> ZAda; + let Inst{1-0} = ZAda; + let Inst{2} = 0b0; +} + +class sme_int_outer_product_i64 opc, string mnemonic> + : sme_int_outer_product_inst { + bits<3> ZAda; + let Inst{2-0} = ZAda; +} + +class sme_outer_product_widening_inst + : I<(outs TileOp32:$ZAda), + (ins PPR3bAny:$Pn, PPR3bAny:$Pm, ZPR16:$Zn, ZPR16:$Zm), + mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn, $Zm", + "", []>, + Sched<[]> { + bits<5> Zm; + bits<3> Pm; + bits<3> Pn; + bits<5> Zn; + bits<2> ZAda; + let Inst{31-22} = 
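// Worked example (illustrative): with the sme_outer_product_fp32 layout
// above, "fmopa za1.s, p0/m, p1/m, z2.s, z3.s" sets sz=0, Zm=3, Pm=1, Pn=0,
// Zn=2, S=0 (the accumulating form) and ZAda=1, assembling to
// 0b1000_0000_1000_0011_0010_0000_0100_0001 = 0x80832041.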
0b1000000110; + let Inst{21} = op; + let Inst{20-16} = Zm; + let Inst{15-13} = Pm; + let Inst{12-10} = Pn; + let Inst{9-5} = Zn; + let Inst{4} = S; + let Inst{3-2} = 0b00; + let Inst{1-0} = ZAda; +} + +multiclass sme_bf16_outer_product { + def : sme_outer_product_widening_inst<0b0, S, mnemonic>; +} + +multiclass sme_f16_outer_product { + def : sme_outer_product_widening_inst<0b1, S, mnemonic>; +} + +//===----------------------------------------------------------------------===// +// SME Add Vector to Tile +//===----------------------------------------------------------------------===// + +class sme_add_vector_to_tile_inst + : I<(outs tile_ty:$ZAda), + (ins PPR3bAny:$Pn, PPR3bAny:$Pm, zpr_ty:$Zn), + mnemonic, "\t$ZAda, $Pn/m, $Pm/m, $Zn", + "", []>, Sched<[]> { + bits<3> Pm; + bits<3> Pn; + bits<5> Zn; + let Inst{31-23} = 0b110000001; + let Inst{22} = op; + let Inst{21-17} = 0b01000; + let Inst{16} = V; + let Inst{15-13} = Pm; + let Inst{12-10} = Pn; + let Inst{9-5} = Zn; + let Inst{4-3} = 0b00; +} + +class sme_add_vector_to_tile_u32 + : sme_add_vector_to_tile_inst<0b0, V, TileOp32, ZPR32, mnemonic> { + bits<2> ZAda; + let Inst{2} = 0b0; + let Inst{1-0} = ZAda; +} + +class sme_add_vector_to_tile_u64 + : sme_add_vector_to_tile_inst<0b1, V, TileOp64, ZPR64, mnemonic> { + bits<3> ZAda; + let Inst{2-0} = ZAda; +} + +//===----------------------------------------------------------------------===// +// SME Contiguous Loads +//===----------------------------------------------------------------------===// + +class sme_mem_ld_ss_base msz, dag outs, dag ins, + string mnemonic, string argstr> + : I, Sched<[]> { + bits<5> Rm; + bits<2> Rv; + bits<3> Pg; + bits<5> Rn; + let Inst{31-25} = 0b1110000; + let Inst{24} = Q; + let Inst{23-22} = msz; + let Inst{21} = 0b0; + let Inst{20-16} = Rm; + let Inst{15} = V; + let Inst{14-13} = Rv; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + + let mayLoad = 1; +} + +class sme_mem_ld_ss_inst msz, string mnemonic, + MatrixTileVectorOperand tile_ty, bit is_col, + Operand imm_ty, RegisterOperand gpr_ty> + : sme_mem_ld_ss_base< + Q, is_col, msz, (outs tile_ty:$ZAt), + (ins MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, GPR64sp:$Rn, + gpr_ty:$Rm), + mnemonic, "\t\\{$ZAt[$Rv, $imm]\\}, $Pg/z, [$Rn, $Rm]">; + +multiclass sme_mem_ss_aliases_base { + def : InstAlias; + // Default XZR offset aliases + def : InstAlias; + def : InstAlias; +} + +multiclass sme_mem_ss_aliases { + defm : sme_mem_ss_aliases_base(inst # _B), + !if(is_col, TileVectorOpV8, TileVectorOpH8), + sme_elm_idx0_15, GPR64shifted8, pg_suffix>; + defm : sme_mem_ss_aliases_base(inst # _H), + !if(is_col, TileVectorOpV16, TileVectorOpH16), + sme_elm_idx0_7, GPR64shifted16, pg_suffix>; + defm : sme_mem_ss_aliases_base(inst # _S), + !if(is_col, TileVectorOpV32, TileVectorOpH32), + sme_elm_idx0_3, GPR64shifted32, pg_suffix>; + defm : sme_mem_ss_aliases_base(inst # _D), + !if(is_col, TileVectorOpV64, TileVectorOpH64), + sme_elm_idx0_1, GPR64shifted64, pg_suffix>; + defm : sme_mem_ss_aliases_base(inst # _Q), + !if(is_col, TileVectorOpV128, TileVectorOpH128), + sme_elm_idx0_0, GPR64shifted128, pg_suffix>; +} + +multiclass sme_mem_ld_ss_aliases { + defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">; +} + +multiclass sme_mem_ld_v_ss { + def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b", + !if(is_col, TileVectorOpV8, TileVectorOpH8), + is_col, sme_elm_idx0_15, GPR64shifted8> { + bits<4> imm; + let Inst{3-0} = imm; + } + def _H : sme_mem_ld_ss_inst<0b0, 0b01, mnemonic # "h", + 
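// Note: is_col selects the vertical (za0v.h) rather than horizontal (za0h.h)
// tile-slice operand, e.g. "ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1]";
// the offset register is always element-scaled (GPR64shifted16 for this
// halfword form).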
!if(is_col, TileVectorOpV16, TileVectorOpH16), + is_col, sme_elm_idx0_7, GPR64shifted16> { + bits<1> ZAt; + bits<3> imm; + let Inst{3} = ZAt; + let Inst{2-0} = imm; + } + def _S : sme_mem_ld_ss_inst<0b0, 0b10, mnemonic # "w", + !if(is_col, TileVectorOpV32, TileVectorOpH32), + is_col, sme_elm_idx0_3, GPR64shifted32> { + bits<2> ZAt; + bits<2> imm; + let Inst{3-2} = ZAt; + let Inst{1-0} = imm; + } + def _D : sme_mem_ld_ss_inst<0b0, 0b11, mnemonic # "d", + !if(is_col, TileVectorOpV64, TileVectorOpH64), + is_col, sme_elm_idx0_1, GPR64shifted64> { + bits<3> ZAt; + bits<1> imm; + let Inst{3-1} = ZAt; + let Inst{0} = imm; + } + def _Q : sme_mem_ld_ss_inst<0b1, 0b11, mnemonic # "q", + !if(is_col, TileVectorOpV128, TileVectorOpH128), + is_col, sme_elm_idx0_0, GPR64shifted128> { + bits<4> ZAt; + let Inst{3-0} = ZAt; + } + + defm : sme_mem_ld_ss_aliases; +} + +multiclass sme_mem_ld_ss { + defm _H : sme_mem_ld_v_ss; + defm _V : sme_mem_ld_v_ss; +} + +//===----------------------------------------------------------------------===// +// SME Contiguous Stores +//===----------------------------------------------------------------------===// + +class sme_mem_st_ss_base msz, dag ins, + string mnemonic, string argstr> + : I<(outs), ins, mnemonic, argstr, "", []>, Sched<[]> { + bits<5> Rm; + bits<2> Rv; + bits<3> Pg; + bits<5> Rn; + let Inst{31-25} = 0b1110000; + let Inst{24} = Q; + let Inst{23-22} = msz; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15} = V; + let Inst{14-13} = Rv; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + + let mayStore = 1; + let hasSideEffects = 1; +} + +class sme_mem_st_ss_inst msz, string mnemonic, + MatrixTileVectorOperand tile_ty, bit is_col, + Operand imm_ty, RegisterOperand gpr_ty> + : sme_mem_st_ss_base< + Q, is_col, msz, + (ins tile_ty:$ZAt, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, + GPR64sp:$Rn, gpr_ty:$Rm), + mnemonic, "\t\\{$ZAt[$Rv, $imm]\\}, $Pg, [$Rn, $Rm]">; + +multiclass sme_mem_st_ss_aliases { + defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>; +} + +multiclass sme_mem_st_v_ss { + def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b", + !if(is_col, TileVectorOpV8, TileVectorOpH8), + is_col, sme_elm_idx0_15, GPR64shifted8> { + bits<4> imm; + let Inst{3-0} = imm; + } + def _H : sme_mem_st_ss_inst<0b0, 0b01, mnemonic # "h", + !if(is_col, TileVectorOpV16, TileVectorOpH16), + is_col, sme_elm_idx0_7, GPR64shifted16> { + bits<1> ZAt; + bits<3> imm; + let Inst{3} = ZAt; + let Inst{2-0} = imm; + } + def _S : sme_mem_st_ss_inst<0b0, 0b10, mnemonic # "w", + !if(is_col, TileVectorOpV32, TileVectorOpH32), + is_col, sme_elm_idx0_3, GPR64shifted32> { + bits<2> ZAt; + bits<2> imm; + let Inst{3-2} = ZAt; + let Inst{1-0} = imm; + } + def _D : sme_mem_st_ss_inst<0b0, 0b11, mnemonic # "d", + !if(is_col, TileVectorOpV64, TileVectorOpH64), + is_col, sme_elm_idx0_1, GPR64shifted64> { + bits<3> ZAt; + bits<1> imm; + let Inst{3-1} = ZAt; + let Inst{0} = imm; + } + def _Q : sme_mem_st_ss_inst<0b1, 0b11, mnemonic # "q", + !if(is_col, TileVectorOpV128, TileVectorOpH128), + is_col, sme_elm_idx0_0, GPR64shifted128> { + bits<4> ZAt; + let Inst{3-0} = ZAt; + } + + defm : sme_mem_st_ss_aliases; +} + +multiclass sme_mem_st_ss { + defm _H : sme_mem_st_v_ss; + defm _V : sme_mem_st_v_ss; +} + +//===----------------------------------------------------------------------===// +// SME Save and Restore Array +//===----------------------------------------------------------------------===// + +class sme_spill_fill_inst + : I, + Sched<[]> { + bits<2> Rv; + 
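// For reference: the resulting spill/fill forms move one vector of ZA per
// instruction, e.g. "str za[w12, 7], [x0, #7, mul vl]"; the single 4-bit
// imm4 is encoded once but written twice in the assembly, as both the
// vector-select offset and the vl-scaled memory offset (the InstAlias
// below covers the zero-offset spelling).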
bits<5> Rn; + bits<4> imm4; + let Inst{31-22} = 0b1110000100; + let Inst{21} = isStore; + let Inst{20-15} = 0b000000; + let Inst{14-13} = Rv; + let Inst{12-10} = 0b000; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = imm4; + + let mayLoad = !not(isStore); + let mayStore = isStore; +} + +multiclass sme_spill_fill { + def NAME : sme_spill_fill_inst; + + def : InstAlias(NAME) MatrixOp:$ZAt, + MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; +} + +multiclass sme_spill { + defm NAME : sme_spill_fill<0b1, (outs), + (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, + sme_elm_idx0_15:$imm4, GPR64sp:$Rn, + imm0_15:$offset), + opcodestr>; +} + +multiclass sme_fill { + defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt), + (ins MatrixIndexGPR32Op12_15:$Rv, + sme_elm_idx0_15:$imm4, GPR64sp:$Rn, + imm0_15:$offset), + opcodestr>; +} + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +class sme_vector_to_tile_base sz, dag outs, dag ins, + string mnemonic, string argstr> + : I, Sched<[]> { + bits<2> Rv; + bits<3> Pg; + bits<5> Zn; + let Inst{31-24} = 0b11000000; + let Inst{23-22} = sz; + let Inst{21-17} = 0b00000; + let Inst{16} = Q; + let Inst{15} = V; + let Inst{14-13} = Rv; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4} = 0b0; +} + +class sme_vector_to_tile_inst sz, MatrixTileVectorOperand tile_ty, + bit is_col, Operand imm_ty, ZPRRegOp zpr_ty, + string mnemonic> + : sme_vector_to_tile_base; + +multiclass sme_vector_to_tile_aliases { + def : InstAlias<"mov\t$ZAd[$Rv, $imm], $Pg/m, $Zn", + (inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm, PPR3bAny:$Pg, zpr_ty:$Zn), 1>; +} + +multiclass sme_vector_v_to_tile { + def _B : sme_vector_to_tile_inst<0b0, 0b00, !if(is_col, TileVectorOpV8, + TileVectorOpH8), + is_col, sme_elm_idx0_15, ZPR8, mnemonic> { + bits<4> imm; + let Inst{3-0} = imm; + } + def _H : sme_vector_to_tile_inst<0b0, 0b01, !if(is_col, TileVectorOpV16, + TileVectorOpH16), + is_col, sme_elm_idx0_7, ZPR16, mnemonic> { + bits<1> ZAd; + bits<3> imm; + let Inst{3} = ZAd; + let Inst{2-0} = imm; + } + def _S : sme_vector_to_tile_inst<0b0, 0b10, !if(is_col, TileVectorOpV32, + TileVectorOpH32), + is_col, sme_elm_idx0_3, ZPR32, mnemonic> { + bits<2> ZAd; + bits<2> imm; + let Inst{3-2} = ZAd; + let Inst{1-0} = imm; + } + def _D : sme_vector_to_tile_inst<0b0, 0b11, !if(is_col, TileVectorOpV64, + TileVectorOpH64), + is_col, sme_elm_idx0_1, ZPR64, mnemonic> { + bits<3> ZAd; + bits<1> imm; + let Inst{3-1} = ZAd; + let Inst{0} = imm; + } + def _Q : sme_vector_to_tile_inst<0b1, 0b11, !if(is_col, TileVectorOpV128, + TileVectorOpH128), + is_col, sme_elm_idx0_0, ZPR128, mnemonic> { + bits<4> ZAd; + bits<1> imm; + let Inst{3-0} = ZAd; + } + + defm : sme_vector_to_tile_aliases(NAME # _B), + !if(is_col, TileVectorOpV8, + TileVectorOpH8), + ZPR8, sme_elm_idx0_15>; + defm : sme_vector_to_tile_aliases(NAME # _H), + !if(is_col, TileVectorOpV16, + TileVectorOpH16), + ZPR16, sme_elm_idx0_7>; + defm : sme_vector_to_tile_aliases(NAME # _S), + !if(is_col, TileVectorOpV32, + TileVectorOpH32), + ZPR32, sme_elm_idx0_3>; + defm : sme_vector_to_tile_aliases(NAME # _D), + !if(is_col, TileVectorOpV64, + TileVectorOpH64), + ZPR64, sme_elm_idx0_1>; + defm : sme_vector_to_tile_aliases(NAME # _Q), + !if(is_col, TileVectorOpV128, + TileVectorOpH128), + ZPR128, sme_elm_idx0_0>; +} + +multiclass sme_vector_to_tile { + defm _H : 
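// As with the loads and stores above, "mova" is the canonical mnemonic here
// and the "mov" InstAliases are the preferred disassembly, e.g.
//   mova za0h.s[w12, 1], p0/m, z1.s   <=>   mov za0h.s[w12, 1], p0/m, z1.s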
sme_vector_v_to_tile; + defm _V : sme_vector_v_to_tile; +} + +class sme_tile_to_vector_base sz, dag outs, dag ins, + string mnemonic, string argstr> + : I, Sched<[]> { + bits<2> Rv; + bits<3> Pg; + bits<5> Zd; + let Inst{31-24} = 0b11000000; + let Inst{23-22} = sz; + let Inst{21-17} = 0b00001; + let Inst{16} = Q; + let Inst{15} = V; + let Inst{14-13} = Rv; + let Inst{12-10} = Pg; + let Inst{9} = 0b0; + let Inst{4-0} = Zd; +} + +class sme_tile_to_vector_inst sz, ZPRRegOp zpr_ty, + MatrixTileVectorOperand tile_ty, + bit is_col, Operand imm_ty, string mnemonic> + : sme_tile_to_vector_base; + +multiclass sme_tile_to_vector_aliases { + def : InstAlias<"mov\t$Zd, $Pg/m, $ZAn[$Rv, $imm]", + (inst zpr_ty:$Zd, PPR3bAny:$Pg, tile_ty:$ZAn, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), 1>; +} + +multiclass sme_tile_to_vector_v { + def _B : sme_tile_to_vector_inst<0b0, 0b00, ZPR8, !if(is_col, TileVectorOpV8, + TileVectorOpH8), + is_col, sme_elm_idx0_15, mnemonic> { + bits<4> imm; + let Inst{8-5} = imm; + } + def _H : sme_tile_to_vector_inst<0b0, 0b01, ZPR16, !if(is_col, TileVectorOpV16, + TileVectorOpH16), + is_col, sme_elm_idx0_7, mnemonic> { + bits<1> ZAn; + bits<3> imm; + let Inst{8} = ZAn; + let Inst{7-5} = imm; + } + def _S : sme_tile_to_vector_inst<0b0, 0b10, ZPR32, !if(is_col, TileVectorOpV32, + TileVectorOpH32), + is_col, sme_elm_idx0_3, mnemonic> { + bits<2> ZAn; + bits<2> imm; + let Inst{8-7} = ZAn; + let Inst{6-5} = imm; + } + def _D : sme_tile_to_vector_inst<0b0, 0b11, ZPR64, !if(is_col, TileVectorOpV64, + TileVectorOpH64), + is_col, sme_elm_idx0_1, mnemonic> { + bits<3> ZAn; + bits<1> imm; + let Inst{8-6} = ZAn; + let Inst{5} = imm; + } + def _Q : sme_tile_to_vector_inst<0b1, 0b11, ZPR128, !if(is_col, TileVectorOpV128, + TileVectorOpH128), + is_col, sme_elm_idx0_0, mnemonic> { + bits<4> ZAn; + let Inst{8-5} = ZAn; + } + + defm : sme_tile_to_vector_aliases(NAME # _B), ZPR8, + !if(is_col, TileVectorOpV8, + TileVectorOpH8), sme_elm_idx0_15>; + defm : sme_tile_to_vector_aliases(NAME # _H), ZPR16, + !if(is_col, TileVectorOpV16, + TileVectorOpH16), sme_elm_idx0_7>; + defm : sme_tile_to_vector_aliases(NAME # _S), ZPR32, + !if(is_col, TileVectorOpV32, + TileVectorOpH32), sme_elm_idx0_3>; + defm : sme_tile_to_vector_aliases(NAME # _D), ZPR64, + !if(is_col, TileVectorOpV64, + TileVectorOpH64), sme_elm_idx0_1>; + defm : sme_tile_to_vector_aliases(NAME # _Q), ZPR128, + !if(is_col, TileVectorOpV128, + TileVectorOpH128), sme_elm_idx0_0>; +} + +multiclass sme_tile_to_vector { + defm _H : sme_tile_to_vector_v; + defm _V : sme_tile_to_vector_v; +} + +//===----------------------------------------------------------------------===// +// SME Zero +//===----------------------------------------------------------------------===// + +class sme_zero_inst + : I<(outs MatrixTileList:$imm), (ins), + mnemonic, "\t$imm", "", []>, Sched<[]> { + bits<8> imm; + let Inst{31-8} = 0b110000000000100000000000; + let Inst{7-0} = imm; +} + +multiclass sme_zero { + def NAME : sme_zero_inst; + + def : InstAlias<"zero\t\\{za\\}", (!cast(NAME) 0b11111111), 1>; + def : InstAlias<"zero\t\\{za0.h\\}", (!cast(NAME) 0b01010101), 1>; + def : InstAlias<"zero\t\\{za1.h\\}", (!cast(NAME) 0b10101010), 1>; + def : InstAlias<"zero\t\\{za0.s\\}", (!cast(NAME) 0b00010001), 1>; + def : InstAlias<"zero\t\\{za1.s\\}", (!cast(NAME) 0b00100010), 1>; + def : InstAlias<"zero\t\\{za2.s\\}", (!cast(NAME) 0b01000100), 1>; + def : InstAlias<"zero\t\\{za3.s\\}", (!cast(NAME) 0b10001000), 1>; + def : InstAlias<"zero\t\\{za0.s,za1.s\\}", (!cast(NAME) 0b00110011), 
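// Reading the masks (illustrative): each imm8 bit zeroes one 64-bit tile
// za0.d..za7.d, and a wider tile expands to every 64-bit tile it overlaps.
// za0.s covers {za0.d, za4.d} = 0b00010001 and za1.s covers {za1.d, za5.d}
// = 0b00100010, so "zero {za0.s,za1.s}" here ORs the two into 0b00110011.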
1>;
+  def : InstAlias<"zero\t\\{za0.s,za3.s\\}", (!cast<Instruction>(NAME) 0b10011001), 1>;
+  def : InstAlias<"zero\t\\{za1.s,za2.s\\}", (!cast<Instruction>(NAME) 0b01100110), 1>;
+  def : InstAlias<"zero\t\\{za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11001100), 1>;
+  def : InstAlias<"zero\t\\{za0.s,za1.s,za2.s\\}", (!cast<Instruction>(NAME) 0b01110111), 1>;
+  def : InstAlias<"zero\t\\{za0.s,za1.s,za3.s\\}", (!cast<Instruction>(NAME) 0b10111011), 1>;
+  def : InstAlias<"zero\t\\{za0.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11011101), 1>;
+  def : InstAlias<"zero\t\\{za1.s,za2.s,za3.s\\}", (!cast<Instruction>(NAME) 0b11101110), 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Instructions
+//===----------------------------------------------------------------------===//
+
+class sve2_int_perm_revd<string asm>
+  : I<(outs ZPR128:$Zd), (ins ZPR128:$_Zd, PPR3bAny:$Pg, ZPR128:$Zn),
+      asm, "\t$Zd, $Pg/m, $Zn", "", []>,
+    Sched<[]> {
+  bits<5> Zd;
+  bits<3> Pg;
+  bits<5> Zn;
+  let Inst{31-24} = 0b00000101;
+  let Inst{23-22} = 0b00; // size
+  let Inst{21-13} = 0b101110100;
+  let Inst{12-10} = Pg;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+  let DestructiveInstType = DestructiveUnary;
+  let ElementSize = ZPR128.ElementSize;
+}
+
+class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
+  : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd),
+      asm, "\t$Zd, $Zn, $Zm", "", []>,
+    Sched<[]> {
+  bits<5> Zm;
+  bits<5> Zn;
+  bits<5> Zd;
+  let Inst{31-24} = 0b01000100;
+  let Inst{23-22} = sz;
+  let Inst{21}    = 0b0;
+  let Inst{20-16} = Zm;
+  let Inst{15-11} = 0b11000;
+  let Inst{10}    = U;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = zpr_ty.ElementSize;
+}
+
+multiclass sve2_clamp<string asm, bit U> {
+  def _B : sve2_clamp<asm, 0b00, U, ZPR8>;
+  def _H : sve2_clamp<asm, 0b01, U, ZPR16>;
+  def _S : sve2_clamp<asm, 0b10, U, ZPR32>;
+  def _D : sve2_clamp<asm, 0b11, U, ZPR64>;
+}
+
+class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
+  : I<(outs PPRAny:$Pd), (ins PPRAny:$Pn, ppr_ty:$Pm,
+      MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm),
+      asm, "\t$Pd, $Pn, $Pm[$Rv, $imm]", "", []>,
+    Sched<[]> {
+  bits<2> Rv;
+  bits<4> Pn;
+  bits<4> Pm;
+  bits<4> Pd;
+  let Inst{31-24} = 0b00100101;
+  let Inst{21}    = 0b1;
+  let Inst{17-16} = Rv;
+  let Inst{15-14} = 0b01;
+  let Inst{13-10} = Pn;
+  let Inst{9}     = 0b0;
+  let Inst{8-5}   = Pm;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = Pd;
+}
+
+multiclass sve2_int_perm_sel_p<string asm> {
+  def _B : sve2_int_perm_sel_p<asm, PPR8, sme_elm_idx0_15> {
+    bits<4> imm;
+    let Inst{23-22} = imm{3-2};
+    let Inst{20-19} = imm{1-0};
+    let Inst{18}    = 0b1;
+  }
+  def _H : sve2_int_perm_sel_p<asm, PPR16, sme_elm_idx0_7> {
+    bits<3> imm;
+    let Inst{23-22} = imm{2-1};
+    let Inst{20}    = imm{0};
+    let Inst{19-18} = 0b10;
+  }
+  def _S : sve2_int_perm_sel_p<asm, PPR32, sme_elm_idx0_3> {
+    bits<2> imm;
+    let Inst{23-22} = imm{1-0};
+    let Inst{20-18} = 0b100;
+  }
+  def _D : sve2_int_perm_sel_p<asm, PPR64, sme_elm_idx0_1> {
+    bits<1> imm;
+    let Inst{23}    = imm;
+    let Inst{22}    = 0b1;
+    let Inst{20-18} = 0b000;
+  }
+}
diff --git a/suite/synctools/tablegen/AArch64/SVEInstrFormats.td b/suite/synctools/tablegen/AArch64/SVEInstrFormats.td
index 7a8dd8bc..37b2ac4d 100644
--- a/suite/synctools/tablegen/AArch64/SVEInstrFormats.td
+++ b/suite/synctools/tablegen/AArch64/SVEInstrFormats.td
@@ -1,9 +1,8 @@
 //=-- SVEInstrFormats.td - AArch64 SVE Instruction classes -*- tablegen -*--=//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -11,6 +10,14 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64Setcc : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, + SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>, + SDTCisVT<4, OtherVT> +]>; + +def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>; + def SVEPatternOperand : AsmOperandClass { let Name = "SVEPattern"; let ParserMethod = "tryParseSVEPattern"; @@ -19,7 +26,7 @@ def SVEPatternOperand : AsmOperandClass { let DiagnosticType = "InvalidSVEPattern"; } -def sve_pred_enum : Operand, ImmLeaf, TImmLeaf { @@ -34,7 +41,7 @@ def SVEPrefetchOperand : AsmOperandClass { let RenderMethod = "addPrefetchOperands"; } -def sve_prfop : Operand, ImmLeaf, TImmLeaf { let PrintMethod = "printPrefetchOp"; @@ -168,8 +175,8 @@ def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm">; class imm8_opt_lsl - : Operand, ImmLeaf { + AsmOperandClass OpndClass> + : Operand { let EncoderMethod = "getImm8OptLsl"; let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">"; let PrintMethod = "printImm8OptLsl<" # printType # ">"; @@ -177,31 +184,57 @@ class imm8_opt_lsl(Imm); -}]>; -def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{ - return AArch64_AM::isSVECpyImm(Imm); -}]>; -def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{ - return AArch64_AM::isSVECpyImm(Imm); -}]>; -def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{ - return AArch64_AM::isSVECpyImm(Imm); -}]>; +def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>; +def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>; +def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>; +def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>; -def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; -def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; -def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; -def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{ - return AArch64_AM::isSVEAddSubImm(Imm); -}]>; +def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>; +def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>; +def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>; +def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>; + +def SVEAddSubImm8Pat : ComplexPattern", []>; +def SVEAddSubImm16Pat : ComplexPattern", []>; +def SVEAddSubImm32Pat : ComplexPattern", []>; +def SVEAddSubImm64Pat : ComplexPattern", []>; + +def SVELogicalImm8Pat : ComplexPattern", []>; +def SVELogicalImm16Pat : ComplexPattern", []>; +def SVELogicalImm32Pat : ComplexPattern", []>; +def SVELogicalImm64Pat : ComplexPattern", []>; + +def SVELogicalImm8NotPat : ComplexPattern", []>; +def SVELogicalImm16NotPat : ComplexPattern", []>; +def SVELogicalImm32NotPat : ComplexPattern", []>; +def SVELogicalImm64NotPat : ComplexPattern", []>; + +def SVE8BitLslImm32 : ComplexPattern; +def 
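// The move from ImmLeaf predicates to ComplexPatterns here shifts immediate
// legality checks (isSVEAddSubImm, isSVECpyImm and friends) into C++
// selection, which can also return an optional shift: e.g. an "add z0.b"
// with constant 16 selects imm=16, shift=0, while an "add z0.h" with
// constant 4096 selects imm=16, shift=8 ("add z0.h, z0.h, #16, lsl #8").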
SVE8BitLslImm64 : ComplexPattern; +class SVE8BitLslImm { + ComplexPattern Pat = !cond( + !eq(ty, i32): SVE8BitLslImm32, + !eq(ty, i64): SVE8BitLslImm64); +} + +def SVEArithUImm8Pat : ComplexPattern", []>; +def SVEArithUImm16Pat : ComplexPattern", []>; +def SVEArithUImm32Pat : ComplexPattern", []>; +def SVEArithUImm64Pat : ComplexPattern", []>; + +def SVEArithSImmPat32 : ComplexPattern; +def SVEArithSImmPat64 : ComplexPattern; + +def SVEShiftImmL8 : ComplexPattern", []>; +def SVEShiftImmL16 : ComplexPattern", []>; +def SVEShiftImmL32 : ComplexPattern", []>; +def SVEShiftImmL64 : ComplexPattern", []>; +def SVEShiftImmR8 : ComplexPattern", []>; +def SVEShiftImmR16 : ComplexPattern", []>; +def SVEShiftImmR32 : ComplexPattern", []>; +def SVEShiftImmR64 : ComplexPattern", []>; + +def SVEAllActive : ComplexPattern; class SVEExactFPImm : AsmOperandClass { let Name = "SVEExactFPImmOperand" # Suffix; @@ -226,7 +259,7 @@ def sve_fpimm_zero_one : SVEExactFPImmOperand<"ZeroOne", "AArch64ExactFPImm::zero", "AArch64ExactFPImm::one">; -def sve_incdec_imm : Operand, ImmLeaf, TImmLeaf 0) && (((uint32_t)Imm) < 17); }]> { let ParserMatchClass = Imm1_16Operand; @@ -234,16 +267,38 @@ def sve_incdec_imm : Operand, ImmLeaf">; +def sve_cnt_mul_imm_i64 : ComplexPattern">; +def sve_cnt_shl_imm : ComplexPattern">; + +def sve_ext_imm_0_31 : ComplexPattern">; +def sve_ext_imm_0_63 : ComplexPattern">; +def sve_ext_imm_0_127 : ComplexPattern">; +def sve_ext_imm_0_255 : ComplexPattern">; + +def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, node:$src2), + (int_aarch64_sve_cntp node:$pred, node:$src2), [{ + return N->hasOneUse(); +}]>; + +def step_vector_oneuse : PatFrag<(ops node:$idx), + (step_vector node:$idx), [{ + return N->hasOneUse(); +}]>; + + //===----------------------------------------------------------------------===// // SVE PTrue - These are used extensively throughout the pattern matching so // it's important we define them first. 
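// The _oneuse PatFrags above only match when the counted value has a single
// consumer; folding a standalone CNTP into an increment or decrement later
// on is only a win if the count is not needed again elsewhere.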
//===----------------------------------------------------------------------===// -class sve_int_ptrue sz8_64, bits<3> opc, string asm, PPRRegOp pprty> +class sve_int_ptrue sz8_64, bits<3> opc, string asm, PPRRegOp pprty, + ValueType vt, SDPatternOperator op> : I<(outs pprty:$Pd), (ins sve_pred_enum:$pattern), asm, "\t$Pd, $pattern", "", - []>, Sched<[]> { + [(set (vt pprty:$Pd), (op sve_pred_enum:$pattern))]>, Sched<[]> { bits<4> Pd; bits<5> pattern; let Inst{31-24} = 0b00100101; @@ -257,13 +312,15 @@ class sve_int_ptrue sz8_64, bits<3> opc, string asm, PPRRegOp pprty> let Inst{3-0} = Pd; let Defs = !if(!eq (opc{0}, 1), [NZCV], []); + let ElementSize = pprty.ElementSize; + let isReMaterializable = 1; } -multiclass sve_int_ptrue opc, string asm> { - def _B : sve_int_ptrue<0b00, opc, asm, PPR8>; - def _H : sve_int_ptrue<0b01, opc, asm, PPR16>; - def _S : sve_int_ptrue<0b10, opc, asm, PPR32>; - def _D : sve_int_ptrue<0b11, opc, asm, PPR64>; +multiclass sve_int_ptrue opc, string asm, SDPatternOperator op> { + def _B : sve_int_ptrue<0b00, opc, asm, PPR8, nxv16i1, op>; + def _H : sve_int_ptrue<0b01, opc, asm, PPR16, nxv8i1, op>; + def _S : sve_int_ptrue<0b10, opc, asm, PPR32, nxv4i1, op>; + def _D : sve_int_ptrue<0b11, opc, asm, PPR64, nxv2i1, op>; def : InstAlias(NAME # _B) PPR8:$Pd, 0b11111), 1>; @@ -275,11 +332,268 @@ multiclass sve_int_ptrue opc, string asm> { (!cast(NAME # _D) PPR64:$Pd, 0b11111), 1>; } -let Predicates = [HasSVE] in { - defm PTRUE : sve_int_ptrue<0b000, "ptrue">; - defm PTRUES : sve_int_ptrue<0b001, "ptrues">; +def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; +def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>; + +let Predicates = [HasSVEorStreamingSVE] in { + defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>; + defm PTRUES : sve_int_ptrue<0b001, "ptrues", null_frag>; } +//===----------------------------------------------------------------------===// +// SVE pattern match helpers. +//===----------------------------------------------------------------------===// + +class SVE_1_Op_Pat +: Pat<(vtd (op vt1:$Op1)), + (inst $Op1)>; + +class SVE_1_Op_Passthru_Pat +: Pat<(vtd (op pg:$Op1, vts:$Op2, vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; + + +multiclass SVE_1_Op_PassthruUndef_Pat { + def : Pat<(vtd (op pg:$Op1, vts:$Op2, (vtd undef))), + (inst (IMPLICIT_DEF), $Op1, $Op2)>; + def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; +} + +// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the +// type of rounding. This is matched by timm0_1 in pattern below and ignored. 
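// For instance (operand types illustrative), instantiating the class below
// as SVE_1_Op_Passthru_Round_Pat<nxv2f32, op, nxv2i1, nxv2f64, FCVT_ZPmZ_DtoS>
// maps the merging round node onto FCVT_ZPmZ_DtoS while the timm rounding
// flag is dropped from the selected instruction.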
+class SVE_1_Op_Passthru_Round_Pat<ValueType vtd, SDPatternOperator op,
+                                  ValueType pg, ValueType vts,
+                                  Instruction inst>
+: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)),
+      (inst $Op3, $Op1, $Op2)>;
+
+multiclass SVE_1_Op_PassthruUndef_Round_Pat<ValueType vtd, SDPatternOperator op,
+                                            ValueType pg, ValueType vts,
+                                            Instruction inst>{
+  def : Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), (vtd undef))),
+            (inst (IMPLICIT_DEF), $Op1, $Op2)>;
+  def : Pat<(vtd (op (pg (SVEAllActive:$Op1)), vts:$Op2, (i64 timm0_1), vtd:$Op3)),
+            (inst $Op3, $Op1, $Op2)>;
+}
+
+class SVE_1_Op_Imm_OptLsl_Reverse_Pat<ValueType vt, SDPatternOperator op,
+                                      ZPRRegOp zprty, ValueType it,
+                                      ComplexPattern cpx, Instruction inst>
+  : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))),
+        (inst $Op1, i32:$imm, i32:$shift)>;
+
+class SVE_1_Op_Imm_OptLsl_Pat<ValueType vt, SDPatternOperator op,
+                              ZPRRegOp zprty, ValueType it,
+                              ComplexPattern cpx, Instruction inst>
+  : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))))),
+        (inst $Op1, i32:$imm, i32:$shift)>;
+
+class SVE_1_Op_Imm_Arith_All_Active<ValueType vt, ValueType pt,
+                                    SDPatternOperator op, ZPRRegOp zprty,
+                                    ValueType it, ComplexPattern cpx,
+                                    Instruction inst>
+  : Pat<(vt (op (pt (SVEAllActive)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+        (inst $Op1, i32:$imm)>;
+
+class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
+                           ValueType it, ComplexPattern cpx, Instruction inst>
+  : Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
+        (inst $Op1, i64:$imm)>;
+
+class SVE_2_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                   ValueType vt2, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
+      (inst $Op1, $Op2)>;
+
+class SVE_2_Op_Pred_All_Active<ValueType vtd, SDPatternOperator op,
+                               ValueType pt, ValueType vt1, ValueType vt2,
+                               Instruction inst>
+: Pat<(vtd (op (pt (SVEAllActive)), vt1:$Op1, vt2:$Op2)),
+      (inst $Op1, $Op2)>;
+
+class SVE_2_Op_Pred_All_Active_Pt<ValueType vtd, SDPatternOperator op,
+                                  ValueType pt, ValueType vt1, ValueType vt2,
+                                  Instruction inst>
+: Pat<(vtd (op (pt (SVEAllActive:$Op1)), vt1:$Op2, vt2:$Op3)),
+      (inst $Op1, $Op2, $Op3)>;
+
+class SVE_3_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                   ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3)),
+      (inst $Op1, $Op2, $Op3)>;
+
+multiclass SVE_3_Op_Undef_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                              ValueType vt2, ValueType vt3, Instruction inst> {
+  def : Pat<(vtd (op (vt1 undef), vt2:$Op1, vt3:$Op2)),
+            (inst (IMPLICIT_DEF), $Op1, $Op2)>;
+  def : Pat<(vtd (op vt1:$Op1, (vt2 (SVEAllActive:$Op2)), vt3:$Op3)),
+            (inst $Op1, $Op2, $Op3)>;
+}
+
+class SVE_4_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                   ValueType vt2, ValueType vt3, ValueType vt4,
+                   Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, vt4:$Op4)),
+      (inst $Op1, $Op2, $Op3, $Op4)>;
+
+class SVE_2_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, Operand ImmTy, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, (vt2 ImmTy:$Op2))),
+      (inst $Op1, ImmTy:$Op2)>;
+
+class SVE_3_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, ValueType vt3, Operand ImmTy,
+                       Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, (vt3 ImmTy:$Op3))),
+      (inst $Op1, $Op2, ImmTy:$Op3)>;
+
+class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, ValueType vt3, ValueType vt4,
+                       Operand ImmTy, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
+      (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
+
+def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
+def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>;
+
+let AddedComplexity = 1 in {
+class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                           ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
+      (inst $Op1, $Op2, $Op3)>;
+
+class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
+                                     ValueType vt1, ValueType vt2,
+                                     Operand vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))),
+      (inst $Op1, $Op2, vt3:$Op3)>;
+}
+
+//
+// Common but less generic patterns.
+// + +class SVE_1_Op_AllActive_Pat +: Pat<(vtd (op vt1:$Op1)), + (inst (IMPLICIT_DEF), (ptrue 31), $Op1)>; + +class SVE_2_Op_AllActive_Pat +: Pat<(vtd (op vt1:$Op1, vt2:$Op2)), + (inst (ptrue 31), $Op1, $Op2)>; + +class SVE_InReg_Extend +: Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, vt:$PassThru)), + (inst $PassThru, $Pg, $Src)>; + +multiclass SVE_InReg_Extend_PassthruUndef { + def : Pat<(vt (op pt:$Pg, vt:$Src, inreg_vt, (vt undef))), + (inst (IMPLICIT_DEF), $Pg, $Src)>; + def : Pat<(vt (op (pt (SVEAllActive:$Pg)), vt:$Src, inreg_vt, vt:$PassThru)), + (inst $PassThru, $Pg, $Src)>; +} + +class SVE_Shift_DupImm_Pred_Pat +: Pat<(vt (op pt:$Pg, vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), + (inst $Pg, $Rn, i32:$imm)>; + +class SVE_Shift_DupImm_All_Active_Pat +: Pat<(vt (op (pt (SVEAllActive)), vt:$Rn, (vt (AArch64dup (it (cast i32:$imm)))))), + (inst $Rn, i32:$imm)>; + +class SVE_2_Op_Fp_Imm_Pat +: Pat<(vt (op (pt PPR_3b:$Pg), (vt ZPR:$Zs1), (vt (AArch64dup (it immL))))), + (inst $Pg, $Zs1, imm)>; + +class SVE_2_Op_Fp_Imm_Pat_Zero +: Pat<(vt (op pt:$Pg, (vselect pt:$Pg, vt:$Zs1, (SVEDup0)), + (vt (AArch64dup (it immL))))), + (inst $Pg, $Zs1, imm)>; + +// +// Pseudo -> Instruction mappings +// +def getSVEPseudoMap : InstrMapping { + let FilterClass = "SVEPseudo2Instr"; + let RowFields = ["PseudoName"]; + let ColFields = ["IsInstr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +class SVEPseudo2Instr { + string PseudoName = name; + bit IsInstr = instr; +} + +// Lookup e.g. DIV -> DIVR +def getSVERevInstr : InstrMapping { + let FilterClass = "SVEInstr2Rev"; + let RowFields = ["InstrName"]; + let ColFields = ["isReverseInstr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + +// Lookup e.g. DIVR -> DIV +def getSVENonRevInstr : InstrMapping { + let FilterClass = "SVEInstr2Rev"; + let RowFields = ["InstrName"]; + let ColFields = ["isReverseInstr"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + +class SVEInstr2Rev { + string InstrName = !if(name1IsReverseInstr, name1, name2); + bit isReverseInstr = name1IsReverseInstr; +} + +// +// Pseudos for destructive operands +// +let hasNoSchedulingInfo = 1 in { + class PredTwoOpPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> { + let FalseLanes = flags; + } + + class PredTwoOpImmPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { + let FalseLanes = flags; + } + + class PredThreeOpPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zs3), []> { + let FalseLanes = flags; + } +} + +// +// Pseudos for passthru operands +// +let hasNoSchedulingInfo = 1 in { + class PredOneOpPassthruPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins zprty:$Passthru, PPR3bAny:$Pg, zprty:$Zs), []>; +} //===----------------------------------------------------------------------===// // SVE Predicate Misc Group @@ -299,6 +613,17 @@ class sve_int_pfalse opc, string asm> let Inst{9} = opc{0}; let Inst{8-4} = 0b00000; let Inst{3-0} = Pd; + + let isReMaterializable = 1; +} + +multiclass sve_int_pfalse opc, string asm> { + def NAME : sve_int_pfalse; + + def : Pat<(nxv16i1 (splat_vector (i32 0))), (!cast(NAME))>; + def : Pat<(nxv8i1 (splat_vector (i32 0))), (!cast(NAME))>; + def : Pat<(nxv4i1 (splat_vector (i32 0))), (!cast(NAME))>; + def : Pat<(nxv2i1 (splat_vector (i32 0))), (!cast(NAME))>; } class sve_int_ptest opc, string asm> @@ -319,6 +644,7 @@ class sve_int_ptest opc, string asm> let Inst{4-0} = 
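// Concretely (names for illustration): a pseudo tagged
// SVEPseudo2Instr<"SDIV_ZPmZ_S", 0> pairs with the real instruction's
// SVEPseudo2Instr<"SDIV_ZPmZ_S", 1> via getSVEPseudoMap, and SVEInstr2Rev
// lets the expansion swap SDIV for SDIVR when the tied operand sits on the
// other side of the destructive operation.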
0b00000; let Defs = [NZCV]; + let isCompare = 1; } class sve_int_pfirst_next sz8_64, bits<5> opc, string asm, @@ -341,17 +667,26 @@ class sve_int_pfirst_next sz8_64, bits<5> opc, string asm, let Constraints = "$Pdn = $_Pdn"; let Defs = [NZCV]; + let isPTestLike = 1; + let ElementSize = pprty.ElementSize; } -multiclass sve_int_pfirst opc, string asm> { - def : sve_int_pfirst_next<0b01, opc, asm, PPR8>; +multiclass sve_int_pfirst opc, string asm, SDPatternOperator op> { + def _B : sve_int_pfirst_next<0b01, opc, asm, PPR8>; + + def : SVE_2_Op_Pat(NAME # _B)>; } -multiclass sve_int_pnext opc, string asm> { +multiclass sve_int_pnext opc, string asm, SDPatternOperator op> { def _B : sve_int_pfirst_next<0b00, opc, asm, PPR8>; def _H : sve_int_pfirst_next<0b01, opc, asm, PPR16>; def _S : sve_int_pfirst_next<0b10, opc, asm, PPR32>; def _D : sve_int_pfirst_next<0b11, opc, asm, PPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -382,34 +717,96 @@ class sve_int_count_r sz8_64, bits<5> opc, string asm, let Constraints = "$Rdn = $_Rdn"; } -multiclass sve_int_count_r_s32 opc, string asm> { +multiclass sve_int_count_r_s32 opc, string asm, + SDPatternOperator op> { def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64as32>; def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64as32>; def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64as32>; def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64as32>; + + def : Pat<(i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))), + (EXTRACT_SUBREG (!cast(NAME # _B) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>; + def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))))), + (!cast(NAME # _B) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>; + + def : Pat<(i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))), + (EXTRACT_SUBREG (!cast(NAME # _H) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>; + def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))))), + (!cast(NAME # _H) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>; + + def : Pat<(i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))), + (EXTRACT_SUBREG (!cast(NAME # _S) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>; + def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))))), + (!cast(NAME # _S) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>; + + def : Pat<(i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))), + (EXTRACT_SUBREG (!cast(NAME # _D) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32)), sub_32)>; + def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (nxv2i1 PPRAny:$Pg))))), + (!cast(NAME # _D) PPRAny:$Pg, (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32))>; } -multiclass sve_int_count_r_u32 opc, string asm> { +multiclass sve_int_count_r_u32 opc, string asm, + SDPatternOperator op> { def _B : sve_int_count_r<0b00, opc, asm, GPR32z, PPR8, GPR32z>; def _H : sve_int_count_r<0b01, opc, asm, GPR32z, PPR16, GPR32z>; def _S : sve_int_count_r<0b10, opc, asm, GPR32z, PPR32, GPR32z>; def _D : sve_int_count_r<0b11, opc, asm, GPR32z, PPR64, GPR32z>; + + def : Pat<(i32 (op GPR32:$Rn, (nxv16i1 PPRAny:$Pg))), + (!cast(NAME # _B) PPRAny:$Pg, $Rn)>; + def : Pat<(i32 (op GPR32:$Rn, (nxv8i1 PPRAny:$Pg))), + (!cast(NAME # _H) PPRAny:$Pg, $Rn)>; + def : Pat<(i32 (op GPR32:$Rn, (nxv4i1 PPRAny:$Pg))), + (!cast(NAME # _S) PPRAny:$Pg, $Rn)>; + def : Pat<(i32 (op 
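// The INSERT_SUBREG/EXTRACT_SUBREG pairs in these i32 patterns exist
// because the underlying instructions only read and write X registers: the
// 32-bit input is placed in the sub_32 lane of a fresh 64-bit register and
// the low half of the result is extracted again, which usually folds away
// after register allocation.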
GPR32:$Rn, (nxv2i1 PPRAny:$Pg))), + (!cast(NAME # _D) PPRAny:$Pg, $Rn)>; } -multiclass sve_int_count_r_x64 opc, string asm> { +multiclass sve_int_count_r_x64 opc, string asm, + SDPatternOperator op, + SDPatternOperator combine_op = null_frag> { def _B : sve_int_count_r<0b00, opc, asm, GPR64z, PPR8, GPR64z>; def _H : sve_int_count_r<0b01, opc, asm, GPR64z, PPR16, GPR64z>; def _S : sve_int_count_r<0b10, opc, asm, GPR64z, PPR32, GPR64z>; def _D : sve_int_count_r<0b11, opc, asm, GPR64z, PPR64, GPR64z>; + + def : Pat<(i64 (op GPR64:$Rn, (nxv16i1 PPRAny:$Pg))), + (!cast(NAME # _B) PPRAny:$Pg, $Rn)>; + def : Pat<(i64 (op GPR64:$Rn, (nxv8i1 PPRAny:$Pg))), + (!cast(NAME # _H) PPRAny:$Pg, $Rn)>; + def : Pat<(i64 (op GPR64:$Rn, (nxv4i1 PPRAny:$Pg))), + (!cast(NAME # _S) PPRAny:$Pg, $Rn)>; + def : Pat<(i64 (op GPR64:$Rn, (nxv2i1 PPRAny:$Pg))), + (!cast(NAME # _D) PPRAny:$Pg, $Rn)>; + + // combine_op(x, cntp(all_active, p)) ==> inst p, x + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 (SVEAllActive)), (nxv16i1 PPRAny:$pred)))), + (!cast(NAME # _B) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 (SVEAllActive)), (nxv8i1 PPRAny:$pred)))), + (!cast(NAME # _H) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv4i1 (SVEAllActive)), (nxv4i1 PPRAny:$pred)))), + (!cast(NAME # _S) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 (SVEAllActive)), (nxv2i1 PPRAny:$pred)))), + (!cast(NAME # _D) PPRAny:$pred, $Rn)>; + + // combine_op(x, cntp(p, p)) ==> inst p, x + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv16i1 PPRAny:$pred), (nxv16i1 PPRAny:$pred)))), + (!cast(NAME # _B) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv8i1 PPRAny:$pred), (nxv8i1 PPRAny:$pred)))), + (!cast(NAME # _H) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv4i1 PPRAny:$pred), (nxv4i1 PPRAny:$pred)))), + (!cast(NAME # _S) PPRAny:$pred, $Rn)>; + def : Pat<(i64 (combine_op GPR64:$Rn, (int_aarch64_sve_cntp_oneuse (nxv2i1 PPRAny:$pred), (nxv2i1 PPRAny:$pred)))), + (!cast(NAME # _D) PPRAny:$pred, $Rn)>; } class sve_int_count_v sz8_64, bits<5> opc, string asm, - ZPRRegOp zprty> -: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, PPRAny:$Pg), - asm, "\t$Zdn, $Pg", + ZPRRegOp zprty, PPRRegOp pprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, pprty:$Pm), + asm, "\t$Zdn, $Pm", "", []>, Sched<[]> { - bits<4> Pg; + bits<4> Pm; bits<5> Zdn; let Inst{31-24} = 0b00100101; let Inst{23-22} = sz8_64; @@ -417,18 +814,30 @@ class sve_int_count_v sz8_64, bits<5> opc, string asm, let Inst{18-16} = opc{4-2}; let Inst{15-11} = 0b10000; let Inst{10-9} = opc{1-0}; - let Inst{8-5} = Pg; + let Inst{8-5} = Pm; let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_count_v opc, string asm> { - def _H : sve_int_count_v<0b01, opc, asm, ZPR16>; - def _S : sve_int_count_v<0b10, opc, asm, ZPR32>; - def _D : sve_int_count_v<0b11, opc, asm, ZPR64>; +multiclass sve_int_count_v opc, string asm, + SDPatternOperator op = null_frag> { + def _H : sve_int_count_v<0b01, opc, asm, ZPR16, PPR16>; + def _S : sve_int_count_v<0b10, opc, asm, ZPR32, PPR32>; + def _D : sve_int_count_v<0b11, opc, asm, ZPR64, PPR64>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # 
_S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : InstAlias(NAME # "_H") ZPR16:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias(NAME # "_S") ZPR32:$Zdn, PPRAny:$Pm), 0>; + def : InstAlias(NAME # "_D") ZPR64:$Zdn, PPRAny:$Pm), 0>; } class sve_int_pcount_pred sz8_64, bits<4> opc, string asm, @@ -451,11 +860,17 @@ class sve_int_pcount_pred sz8_64, bits<4> opc, string asm, let Inst{4-0} = Rd; } -multiclass sve_int_pcount_pred opc, string asm> { +multiclass sve_int_pcount_pred opc, string asm, + SDPatternOperator int_op> { def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>; def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>; def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>; def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -480,13 +895,22 @@ class sve_int_count opc, string asm> let Inst{4-0} = Rd; } -multiclass sve_int_count opc, string asm> { +multiclass sve_int_count opc, string asm, SDPatternOperator op> { def NAME : sve_int_count; def : InstAlias(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>; def : InstAlias(NAME) GPR64:$Rd, 0b11111, 1), 2>; + + def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm_i64 i32:$imm))), + (!cast(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>; + + def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (sve_cnt_shl_imm i32:$imm))), + (!cast(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>; + + def : Pat<(i64 (op sve_pred_enum:$pattern)), + (!cast(NAME) sve_pred_enum:$pattern, 1)>; } class sve_int_countvlv opc, string asm, ZPRRegOp zprty> @@ -508,17 +932,22 @@ class sve_int_countvlv opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_countvlv opc, string asm, ZPRRegOp zprty> { +multiclass sve_int_countvlv opc, string asm, ZPRRegOp zprty, + SDPatternOperator op = null_frag, + ValueType vt = OtherVT> { def NAME : sve_int_countvlv; def : InstAlias(NAME) zprty:$Zdn, sve_pred_enum:$pattern, 1), 1>; def : InstAlias(NAME) zprty:$Zdn, 0b11111, 1), 2>; + + def : Pat<(vt (op (vt zprty:$Zn), (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))), + (!cast(NAME) $Zn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>; } class sve_int_pred_pattern_a opc, string asm> @@ -541,13 +970,43 @@ class sve_int_pred_pattern_a opc, string asm> let Constraints = "$Rdn = $_Rdn"; } -multiclass sve_int_pred_pattern_a opc, string asm> { - def NAME : sve_int_pred_pattern_a; +multiclass sve_int_pred_pattern_a opc, string asm, + SDPatternOperator op, + SDPatternOperator opcnt> { + let Predicates = [HasSVEorStreamingSVE] in { + def NAME : sve_int_pred_pattern_a; - def : InstAlias(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>; - def : InstAlias(NAME) GPR64:$Rdn, 0b11111, 1), 2>; + def : InstAlias(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1), 1>; + def : InstAlias(NAME) GPR64:$Rdn, 0b11111, 1), 2>; + } + + let Predicates = [HasSVEorStreamingSVE, UseScalarIncVL] in { + def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))), + (!cast(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>; + + def : Pat<(i64 (op GPR64:$Rdn, (mul (opcnt sve_pred_enum:$pattern), (sve_cnt_mul_imm_i64 i32:$imm)))), + (!cast(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>; + + def : Pat<(i64 (op GPR64:$Rdn, (shl 
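// Net effect of the CNTP combine patterns above, roughly:
//   add x0, x0, <number of active lanes in p0>   ==>   incp x0, p0.b
// (and likewise for the subtracting/decrementing forms), with the one-use
// PatFrag making sure the standalone count disappears.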
(opcnt sve_pred_enum:$pattern), (sve_cnt_shl_imm i32:$imm)))), + (!cast(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>; + + def : Pat<(i32 (op GPR32:$Rdn, (i32 (trunc (opcnt (sve_pred_enum:$pattern)))))), + (i32 (EXTRACT_SUBREG (!cast(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, 1), + sub_32))>; + + def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm_i32 i32:$imm)))), + (i32 (EXTRACT_SUBREG (!cast(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm), + sub_32))>; + + def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_shl_imm i32:$imm)))), + (i32 (EXTRACT_SUBREG (!cast(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm), + sub_32))>; + } } class sve_int_pred_pattern_b opc, string asm, RegisterOperand dt, @@ -577,31 +1036,48 @@ class sve_int_pred_pattern_b opc, string asm, RegisterOperand dt, let Constraints = "$Rdn = $_Rdn"; } -multiclass sve_int_pred_pattern_b_s32 opc, string asm> { +multiclass sve_int_pred_pattern_b_s32 opc, string asm, + SDPatternOperator op> { def NAME : sve_int_pred_pattern_b; def : InstAlias(NAME) GPR64z:$Rd, GPR64as32:$Rn, sve_pred_enum:$pattern, 1), 1>; def : InstAlias(NAME) GPR64z:$Rd, GPR64as32:$Rn, 0b11111, 1), 2>; + + // NOTE: Register allocation doesn't like tied operands of differing register + // class, hence the extra INSERT_SUBREG complication. + + def : Pat<(i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))), + (EXTRACT_SUBREG (!cast(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32), sve_pred_enum:$pattern, sve_incdec_imm:$imm4), sub_32)>; + def : Pat<(i64 (sext (i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))))), + (!cast(NAME) (INSERT_SUBREG (IMPLICIT_DEF), $Rn, sub_32), sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>; } -multiclass sve_int_pred_pattern_b_u32 opc, string asm> { +multiclass sve_int_pred_pattern_b_u32 opc, string asm, + SDPatternOperator op> { def NAME : sve_int_pred_pattern_b; def : InstAlias(NAME) GPR32z:$Rdn, sve_pred_enum:$pattern, 1), 1>; def : InstAlias(NAME) GPR32z:$Rdn, 0b11111, 1), 2>; + + def : Pat<(i32 (op GPR32:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))), + (!cast(NAME) $Rn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>; } -multiclass sve_int_pred_pattern_b_x64 opc, string asm> { +multiclass sve_int_pred_pattern_b_x64 opc, string asm, + SDPatternOperator op> { def NAME : sve_int_pred_pattern_b; def : InstAlias(NAME) GPR64z:$Rdn, sve_pred_enum:$pattern, 1), 1>; def : InstAlias(NAME) GPR64z:$Rdn, 0b11111, 1), 2>; + + def : Pat<(i64 (op GPR64:$Rn, (sve_pred_enum:$pattern), (sve_incdec_imm:$imm4))), + (!cast(NAME) $Rn, sve_pred_enum:$pattern, sve_incdec_imm:$imm4)>; } @@ -610,11 +1086,12 @@ multiclass sve_int_pred_pattern_b_x64 opc, string asm> { //===----------------------------------------------------------------------===// class sve_int_perm_dup_r sz8_64, string asm, ZPRRegOp zprty, - RegisterClass srcRegType> + ValueType vt, RegisterClass srcRegType, + SDPatternOperator op> : I<(outs zprty:$Zd), (ins srcRegType:$Rn), asm, "\t$Zd, $Rn", "", - []>, Sched<[]> { + [(set (vt zprty:$Zd), (op srcRegType:$Rn))]>, Sched<[]> { bits<5> Rn; bits<5> Zd; let Inst{31-24} = 0b00000101; @@ -624,11 +1101,11 @@ class sve_int_perm_dup_r sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_perm_dup_r { - def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, 
GPR32sp>; - def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, GPR32sp>; - def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, GPR32sp>; - def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, GPR64sp>; +multiclass sve_int_perm_dup_r { + def _B : sve_int_perm_dup_r<0b00, asm, ZPR8, nxv16i8, GPR32sp, op>; + def _H : sve_int_perm_dup_r<0b01, asm, ZPR16, nxv8i16, GPR32sp, op>; + def _S : sve_int_perm_dup_r<0b10, asm, ZPR32, nxv4i32, GPR32sp, op>; + def _D : sve_int_perm_dup_r<0b11, asm, ZPR64, nxv2i64, GPR64sp, op>; def : InstAlias<"mov $Zd, $Rn", (!cast(NAME # _B) ZPR8:$Zd, GPR32sp:$Rn), 1>; @@ -699,9 +1176,33 @@ multiclass sve_int_perm_dup_i { (!cast(NAME # _D) ZPR64:$Zd, FPR64asZPR:$Dn, 0), 2>; def : InstAlias<"mov $Zd, $Qn", (!cast(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>; + + // Duplicate extracted element of vector into all vector elements + def : Pat<(nxv16i8 (AArch64dup (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), + (!cast(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>; + def : Pat<(nxv8i16 (AArch64dup (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(nxv4i32 (AArch64dup (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(nxv2i64 (AArch64dup (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv8f16 (AArch64dup (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(nxv8bf16 (AArch64dup (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(nxv4f16 (AArch64dup (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(nxv2f16 (AArch64dup (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv4f32 (AArch64dup (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(nxv2f32 (AArch64dup (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv2f64 (AArch64dup (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; } -class sve_int_perm_tbl sz8_64, string asm, ZPRRegOp zprty, +class sve_int_perm_tbl sz8_64, bits<2> opc, string asm, ZPRRegOp zprty, RegisterOperand VecList> : I<(outs zprty:$Zd), (ins VecList:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", @@ -714,16 +1215,18 @@ class sve_int_perm_tbl sz8_64, string asm, ZPRRegOp zprty, let Inst{23-22} = sz8_64; let Inst{21} = 0b1; let Inst{20-16} = Zm; - let Inst{15-10} = 0b001100; + let Inst{15-13} = 0b001; + let Inst{12-11} = opc; + let Inst{10} = 0b0; let Inst{9-5} = Zn; let Inst{4-0} = Zd; } -multiclass sve_int_perm_tbl { - def _B : sve_int_perm_tbl<0b00, asm, ZPR8, Z_b>; - def _H : sve_int_perm_tbl<0b01, asm, ZPR16, Z_h>; - def _S : sve_int_perm_tbl<0b10, asm, ZPR32, Z_s>; - def _D : sve_int_perm_tbl<0b11, asm, ZPR64, Z_d>; +multiclass sve_int_perm_tbl { + def _B : sve_int_perm_tbl<0b00, 0b10, asm, ZPR8, Z_b>; + 
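// E.g. the extracted-element patterns above fold a broadcast of lane 3 of
// z1.s into "dup z0.s, z1.s[3]" instead of bouncing through a GPR, with
// sve_elm_idx_extdup_s bounding the immediate index.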
def _H : sve_int_perm_tbl<0b01, 0b10, asm, ZPR16, Z_h>; + def _S : sve_int_perm_tbl<0b10, 0b10, asm, ZPR32, Z_s>; + def _D : sve_int_perm_tbl<0b11, 0b10, asm, ZPR64, Z_d>; def : InstAlias(NAME # _B) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 0>; @@ -733,6 +1236,101 @@ multiclass sve_int_perm_tbl { (!cast(NAME # _S) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 0>; def : InstAlias(NAME # _D) ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zm), 0>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; +} + +multiclass sve2_int_perm_tbl { + def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>; + def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>; + def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>; + def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>; + + def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)), + (nxv16i8 (!cast(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0, + nxv16i8:$Op2, zsub1), + nxv16i8:$Op3))>; + + def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)), + (nxv8i16 (!cast(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0, + nxv8i16:$Op2, zsub1), + nxv8i16:$Op3))>; + + def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)), + (nxv4i32 (!cast(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0, + nxv4i32:$Op2, zsub1), + nxv4i32:$Op3))>; + + def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)), + (nxv2i64 (!cast(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0, + nxv2i64:$Op2, zsub1), + nxv2i64:$Op3))>; + + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)), + (nxv8f16 (!cast(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0, + nxv8f16:$Op2, zsub1), + nxv8i16:$Op3))>; + + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)), + (nxv4f32 (!cast(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0, + nxv4f32:$Op2, zsub1), + nxv4i32:$Op3))>; + + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)), + (nxv2f64 (!cast(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0, + nxv2f64:$Op2, zsub1), + nxv2i64:$Op3))>; + + def : Pat<(nxv8bf16 (op nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)), + (nxv8bf16 (!cast(NAME # _H) (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, + nxv8bf16:$Op2, zsub1), + nxv8i16:$Op3))>; +} + +class sve2_int_perm_tbx sz8_64, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-24} = 0b00000101; + let Inst{23-22} = sz8_64; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b001011; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_perm_tbx { + def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>; + def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>; + def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>; + def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; } class sve_int_perm_reverse_z sz8_64, string asm, ZPRRegOp zprty> @@ -749,11 +1347,27 @@ class sve_int_perm_reverse_z sz8_64, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; 
} -multiclass sve_int_perm_reverse_z { +multiclass sve_int_perm_reverse_z { def _B : sve_int_perm_reverse_z<0b00, asm, ZPR8>; def _H : sve_int_perm_reverse_z<0b01, asm, ZPR16>; def _S : sve_int_perm_reverse_z<0b10, asm, ZPR32>; def _D : sve_int_perm_reverse_z<0b11, asm, ZPR64>; + + def : SVE_1_Op_Pat(NAME # _B)>; + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; + + def : SVE_1_Op_Pat(NAME # _D)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _D)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; + + def : SVE_1_Op_Pat(NAME # _D)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _H)>; } class sve_int_perm_reverse_p sz8_64, string asm, PPRRegOp pprty> @@ -771,11 +1385,16 @@ class sve_int_perm_reverse_p sz8_64, string asm, PPRRegOp pprty> let Inst{3-0} = Pd; } -multiclass sve_int_perm_reverse_p { +multiclass sve_int_perm_reverse_p { def _B : sve_int_perm_reverse_p<0b00, asm, PPR8>; def _H : sve_int_perm_reverse_p<0b01, asm, PPR16>; def _S : sve_int_perm_reverse_p<0b10, asm, PPR32>; def _D : sve_int_perm_reverse_p<0b11, asm, PPR64>; + + def : SVE_1_Op_Pat(NAME # _B)>; + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; } class sve_int_perm_unpk sz16_64, bits<2> opc, string asm, @@ -794,10 +1413,14 @@ class sve_int_perm_unpk sz16_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_perm_unpk opc, string asm> { +multiclass sve_int_perm_unpk opc, string asm, SDPatternOperator op> { def _H : sve_int_perm_unpk<0b01, opc, asm, ZPR16, ZPR8>; def _S : sve_int_perm_unpk<0b10, opc, asm, ZPR32, ZPR16>; def _D : sve_int_perm_unpk<0b11, opc, asm, ZPR64, ZPR32>; + + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; } class sve_int_perm_insrs sz8_64, string asm, ZPRRegOp zprty, @@ -815,20 +1438,24 @@ class sve_int_perm_insrs sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; - let ElementSize = ElementSizeNone; + let DestructiveInstType = DestructiveOther; } -multiclass sve_int_perm_insrs { +multiclass sve_int_perm_insrs { def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>; def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>; def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>; def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve_int_perm_insrv sz8_64, string asm, ZPRRegOp zprty, - RegisterClass srcRegType> -: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcRegType:$Vm), + FPRasZPROperand srcOpType> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, srcOpType:$Vm), asm, "\t$Zdn, $Vm", "", []>, Sched<[]> { @@ -841,15 +1468,35 @@ class sve_int_perm_insrv sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; - let ElementSize = ElementSizeNone; + let DestructiveInstType = DestructiveOther; } -multiclass sve_int_perm_insrv { - def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>; - def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>; - def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>; - def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>; +multiclass sve_int_perm_insrv { + def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8asZPR>; + def _H : sve_int_perm_insrv<0b01, 
asm, ZPR16, FPR16asZPR>; + def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32asZPR>; + def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64asZPR>; + + def : Pat<(nxv8f16 (op nxv8f16:$Zn, f16:$Vm)), + (!cast(NAME # _H) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, hsub))>; + def : Pat<(nxv4f32 (op nxv4f32:$Zn, f32:$Vm)), + (!cast(NAME # _S) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, ssub))>; + def : Pat<(nxv2f64 (op nxv2f64:$Zn, f64:$Vm)), + (!cast(NAME # _D) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, dsub))>; + + def : Pat<(nxv8bf16 (op nxv8bf16:$Zn, bf16:$Vm)), + (!cast(NAME # _H) $Zn, (INSERT_SUBREG (IMPLICIT_DEF), $Vm, hsub))>; + + // Keep integer insertions within the vector unit. + def : Pat<(nxv16i8 (op (nxv16i8 ZPR:$Zn), (i32 (vector_extract (nxv16i8 ZPR:$Vm), 0)))), + (!cast(NAME # _B) $Zn, ZPR:$Vm)>; + def : Pat<(nxv8i16 (op (nxv8i16 ZPR:$Zn), (i32 (vector_extract (nxv8i16 ZPR:$Vm), 0)))), + (!cast(NAME # _H) $Zn, ZPR:$Vm)>; + def : Pat<(nxv4i32 (op (nxv4i32 ZPR:$Zn), (i32 (vector_extract (nxv4i32 ZPR:$Vm), 0)))), + (!cast(NAME # _S) $Zn, ZPR: $Vm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR:$Zn), (i64 (vector_extract (nxv2i64 ZPR:$Vm), 0)))), + (!cast(NAME # _D) $Zn, ZPR:$Vm)>; + } //===----------------------------------------------------------------------===// @@ -871,10 +1518,32 @@ class sve_int_perm_extract_i let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } +multiclass sve_int_perm_extract_i { + def NAME : sve_int_perm_extract_i; + + def : SVE_3_Op_Imm_Pat(NAME)>; +} + +class sve2_int_perm_extract_i_cons +: I<(outs ZPR8:$Zd), (ins ZZ_b:$Zn, imm0_255:$imm8), + asm, "\t$Zd, $Zn, $imm8", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<8> imm8; + let Inst{31-21} = 0b00000101011; + let Inst{20-16} = imm8{7-3}; + let Inst{15-13} = 0b000; + let Inst{12-10} = imm8{2-0}; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + //===----------------------------------------------------------------------===// // SVE Vector Select Group //===----------------------------------------------------------------------===// @@ -898,12 +1567,26 @@ class sve_int_sel_vvv sz8_64, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve_int_sel_vvv { +multiclass sve_int_sel_vvv { def _B : sve_int_sel_vvv<0b00, asm, ZPR8>; def _H : sve_int_sel_vvv<0b01, asm, ZPR16>; def _S : sve_int_sel_vvv<0b10, asm, ZPR32>; def _D : sve_int_sel_vvv<0b11, asm, ZPR64>; + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + def : SVE_3_Op_Pat(NAME # _D)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : InstAlias<"mov $Zd, $Pg/m, $Zn", (!cast(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, ZPR8:$Zn, ZPR8:$Zd), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Zn", @@ -945,8 +1628,41 @@ class sve_int_pred_log opc, string asm> !strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm")); let Defs = !if(!eq (opc{2}, 1), [NZCV], []); + } +multiclass sve_int_pred_log opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred = null_frag> { + def NAME : sve_int_pred_log; + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_B>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_H>; + def : 
SVE_2_Op_AllActive_Pat(NAME), PTRUE_S>; + def : SVE_2_Op_AllActive_Pat(NAME), PTRUE_D>; +} + +// An instance of sve_int_pred_log_and but uses op_nopred's first operand as the +// general predicate. +multiclass sve_int_pred_log_v2 opc, string asm, SDPatternOperator op, + SDPatternOperator op_nopred> : + sve_int_pred_log { + def : Pat<(nxv16i1 (op_nopred nxv16i1:$Op1, nxv16i1:$Op2)), + (!cast(NAME) $Op1, $Op1, $Op2)>; + def : Pat<(nxv8i1 (op_nopred nxv8i1:$Op1, nxv8i1:$Op2)), + (!cast(NAME) $Op1, $Op1, $Op2)>; + def : Pat<(nxv4i1 (op_nopred nxv4i1:$Op1, nxv4i1:$Op2)), + (!cast(NAME) $Op1, $Op1, $Op2)>; + def : Pat<(nxv2i1 (op_nopred nxv2i1:$Op1, nxv2i1:$Op2)), + (!cast(NAME) $Op1, $Op1, $Op2)>; +} //===----------------------------------------------------------------------===// // SVE Logical Mask Immediate Group @@ -966,13 +1682,18 @@ class sve_int_log_imm opc, string asm> let Constraints = "$Zdn = $_Zdn"; let DecoderMethod = "DecodeSVELogicalImmInstruction"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_log_imm opc, string asm, string alias> { +multiclass sve_int_log_imm opc, string asm, string alias, SDPatternOperator op> { def NAME : sve_int_log_imm; + def : SVE_1_Op_Imm_Log_Pat(NAME)>; + def : SVE_1_Op_Imm_Log_Pat(NAME)>; + def : SVE_1_Op_Imm_Log_Pat(NAME)>; + def : SVE_1_Op_Imm_Log_Pat(NAME)>; + def : InstAlias(NAME) ZPR8:$Zdn, sve_logical_imm8:$imm), 4>; def : InstAlias opc, string asm, string alias> { (!cast(NAME) ZPR64:$Zdn, logical_imm64_not:$imm), 0>; } +multiclass sve_int_log_imm_bic { + def : SVE_1_Op_Imm_Log_Pat("AND_ZI")>; + def : SVE_1_Op_Imm_Log_Pat("AND_ZI")>; + def : SVE_1_Op_Imm_Log_Pat("AND_ZI")>; + def : SVE_1_Op_Imm_Log_Pat("AND_ZI")>; +} + class sve_int_dup_mask_imm : I<(outs ZPR64:$Zd), (ins logical_imm64:$imms), asm, "\t$Zd, $imms", @@ -1022,6 +1750,9 @@ multiclass sve_int_dup_mask_imm { (!cast(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>; def : InstAlias<"mov $Zd, $imm", (!cast(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>; + + def : Pat<(nxv2i64 (AArch64dup (i64 logical_imm64:$imm))), + (!cast(NAME) logical_imm64:$imm)>; } //===----------------------------------------------------------------------===// @@ -1046,11 +1777,16 @@ class sve_int_bin_cons_arit_0 sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_arit_0 opc, string asm> { +multiclass sve_int_bin_cons_arit_0 opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>; def _D : sve_int_bin_cons_arit_0<0b11, opc, asm, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1078,14 +1814,23 @@ class sve_fp_2op_i_p_zds sz, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_2op_i_p_zds opc, string asm, Operand imm_ty> { - def _H : sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>; - def _S : sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>; - def _D : sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>; +multiclass sve_fp_2op_i_p_zds opc, string asm, string Ps, Operand 
imm_ty, FPImmLeaf A, FPImmLeaf B, SDPatternOperator op> { + let DestructiveInstType = DestructiveBinaryImm in { + def _H : SVEPseudo2Instr, sve_fp_2op_i_p_zds<0b01, opc, asm, ZPR16, imm_ty>; + def _S : SVEPseudo2Instr, sve_fp_2op_i_p_zds<0b10, opc, asm, ZPR32, imm_ty>; + def _D : SVEPseudo2Instr, sve_fp_2op_i_p_zds<0b11, opc, asm, ZPR64, imm_ty>; + } + + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_S")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_S")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_D")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_D")>; } class sve_fp_2op_p_zds sz, bits<4> opc, string asm, @@ -1107,18 +1852,50 @@ class sve_fp_2op_p_zds sz, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_2op_p_zds opc, string asm> { +multiclass sve_fp_2op_p_zds opc, string asm, string Ps, + SDPatternOperator op, DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve_fp_2op_p_zds_fscale opc, string asm, + SDPatternOperator op> { def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>; def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>; def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve_fp_2op_p_zds_zeroing_hsd { + def _ZERO_H : PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; } class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> -: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm0_7:$imm3), +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, timm32_0_7:$imm3), asm, "\t$Zdn, $_Zdn, $Zm, $imm3", "", []>, Sched<[]> { @@ -1134,26 +1911,66 @@ class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_fp_ftmad { +multiclass sve_fp_ftmad { def _H : sve_fp_ftmad<0b01, asm, ZPR16>; def _S : sve_fp_ftmad<0b10, asm, ZPR32>; def _D : sve_fp_ftmad<0b11, asm, ZPR64>; + + def : Pat<(nxv8f16 (op (nxv8f16 ZPR16:$Zn), (nxv8f16 ZPR16:$Zm), (i32 timm32_0_7:$imm))), + (!cast(NAME # _H) ZPR16:$Zn, ZPR16:$Zm, timm32_0_7:$imm)>; + def : Pat<(nxv4f32 (op (nxv4f32 ZPR32:$Zn), (nxv4f32 ZPR32:$Zm), (i32 timm32_0_7:$imm))), + (!cast(NAME # _S) ZPR32:$Zn, ZPR32:$Zm, timm32_0_7:$imm)>; + def : Pat<(nxv2f64 (op (nxv2f64 ZPR64:$Zn), (nxv2f64 ZPR64:$Zm), (i32 timm32_0_7:$imm))), + (!cast(NAME # _D) ZPR64:$Zn, ZPR64:$Zm, timm32_0_7:$imm)>; } +multiclass sve_fp_2op_i_p_zds_hfd { + def _UNDEF_H : PredTwoOpImmPseudo; + def _UNDEF_S : PredTwoOpImmPseudo; + def _UNDEF_D : PredTwoOpImmPseudo; + + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_H")>; + def 
: SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_H")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_S")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_S")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_S")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_S")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_D")>; + def : SVE_2_Op_Fp_Imm_Pat(NAME # "_UNDEF_D")>; +} + +multiclass sve_fp_2op_i_p_zds_zeroing_hfd { + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + let AddedComplexity = 2 in { + def : SVE_2_Op_Fp_Imm_Pat_Zero(NAME # "_ZERO_H")>; + def : SVE_2_Op_Fp_Imm_Pat_Zero(NAME # "_ZERO_H")>; + def : SVE_2_Op_Fp_Imm_Pat_Zero(NAME # "_ZERO_S")>; + def : SVE_2_Op_Fp_Imm_Pat_Zero(NAME # "_ZERO_S")>; + def : SVE_2_Op_Fp_Imm_Pat_Zero(NAME # "_ZERO_D")>; + def : SVE_2_Op_Fp_Imm_Pat_Zero(NAME # "_ZERO_D")>; + } +} //===----------------------------------------------------------------------===// // SVE Floating Point Arithmetic - Unpredicated Group //===----------------------------------------------------------------------===// -class sve_fp_3op_u_zd sz, bits<3> opc, string asm, - ZPRRegOp zprty> +class sve_fp_3op_u_zd sz, bits<3> opc, string asm, ZPRRegOp zprty> : I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", - "", []>, Sched<[]> { + "", + []>, Sched<[]> { bits<5> Zd; bits<5> Zm; bits<5> Zn; @@ -1167,10 +1984,29 @@ class sve_fp_3op_u_zd sz, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_fp_3op_u_zd opc, string asm> { +multiclass sve_fp_3op_u_zd opc, string asm, SDPatternOperator op, + SDPatternOperator predicated_op = null_frag> { def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pred_All_Active(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active(NAME # _S)>; + def : SVE_2_Op_Pred_All_Active(NAME # _D)>; +} + +multiclass sve_fp_3op_u_zd_ftsmul opc, string asm, SDPatternOperator op> { + def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; + def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; + def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1197,14 +2033,24 @@ class sve_fp_3op_p_zds_a sz, bits<2> opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_3op_p_zds_a opc, string asm> { - def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>; - def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_p_zds_a opc, string asm, string Ps, + SDPatternOperator op, string revname, + bit isReverseInstr=0> { + let DestructiveInstType = DestructiveTernaryCommWithRev in { + def _H : sve_fp_3op_p_zds_a<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_fp_3op_p_zds_a<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_fp_3op_p_zds_a<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } + + def : SVE_4_Op_Pat(NAME # _H)>; + def : SVE_4_Op_Pat(NAME # _S)>; + def 
: SVE_4_Op_Pat(NAME # _D)>; } class sve_fp_3op_p_zds_b sz, bits<2> opc, string asm, @@ -1228,14 +2074,28 @@ class sve_fp_3op_p_zds_b sz, bits<2> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_3op_p_zds_b opc, string asm> { - def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>; - def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_p_zds_b opc, string asm, SDPatternOperator op, + string revname, bit isReverseInstr> { + def _H : sve_fp_3op_p_zds_b<0b01, opc, asm, ZPR16>, + SVEInstr2Rev; + def _S : sve_fp_3op_p_zds_b<0b10, opc, asm, ZPR32>, + SVEInstr2Rev; + def _D : sve_fp_3op_p_zds_b<0b11, opc, asm, ZPR64>, + SVEInstr2Rev; + + def : SVE_4_Op_Pat(NAME # _H)>; + def : SVE_4_Op_Pat(NAME # _S)>; + def : SVE_4_Op_Pat(NAME # _D)>; +} + +multiclass sve_fp_3op_p_zds_zx { + def _UNDEF_H : PredThreeOpPseudo; + def _UNDEF_S : PredThreeOpPseudo; + def _UNDEF_D : PredThreeOpPseudo; } //===----------------------------------------------------------------------===// @@ -1258,30 +2118,38 @@ class sve_fp_fma_by_indexed_elem sz, bit opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_fp_fma_by_indexed_elem { - def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH> { +multiclass sve_fp_fma_by_indexed_elem { + def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } - def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS> { + def _S : sve_fp_fma_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR3b32, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD> { + def _D : sve_fp_fma_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR4b64, VectorIndexD32b> { bits<4> Zm; bit iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))), + (!cast(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))), + (!cast(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>; } @@ -1303,26 +2171,33 @@ class sve_fp_fmul_by_indexed_elem sz, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_fp_fmul_by_indexed_elem { - def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH> { +multiclass sve_fp_fmul_by_indexed_elem { + def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, asm, ZPR16, ZPR3b16, VectorIndexH32b> { bits<3> Zm; bits<3> iop; let Inst{22} = iop{2}; let Inst{20-19} = iop{1-0}; let Inst{18-16} = Zm; } - def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS> { + def _S : sve_fp_fmul_by_indexed_elem<0b10, asm, ZPR32, ZPR3b32, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : 
sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD> { + def _D : sve_fp_fmul_by_indexed_elem<0b11, asm, ZPR64, ZPR4b64, VectorIndexD32b> { bits<4> Zm; bit iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))), + (!cast(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))), + (!cast(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>; + def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))), + (!cast(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>; } //===----------------------------------------------------------------------===// @@ -1350,14 +2225,21 @@ class sve_fp_fcmla sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_fcmla { +multiclass sve_fp_fcmla { def _H : sve_fp_fcmla<0b01, asm, ZPR16>; def _S : sve_fp_fcmla<0b10, asm, ZPR32>; def _D : sve_fp_fcmla<0b11, asm, ZPR64>; + + def : Pat<(nxv8f16 (op nxv8i1:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, nxv8f16:$Op4, (i32 complexrotateop:$imm))), + (!cast(NAME # _H) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>; + def : Pat<(nxv4f32 (op nxv4i1:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, nxv4f32:$Op4, (i32 complexrotateop:$imm))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>; + def : Pat<(nxv2f64 (op nxv2i1:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, nxv2f64:$Op4, (i32 complexrotateop:$imm))), + (!cast(NAME # _D) $Op1, $Op2, $Op3, $Op4, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// @@ -1383,23 +2265,28 @@ class sve_fp_fcmla_by_indexed_elem sz, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_fp_fcmla_by_indexed_elem { - def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS> { +multiclass sve_fp_fcmla_by_indexed_elem { + def _H : sve_fp_fcmla_by_indexed_elem<0b10, asm, ZPR16, ZPR3b16, VectorIndexS32b> { bits<3> Zm; bits<2> iop; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD> { + def _S : sve_fp_fcmla_by_indexed_elem<0b11, asm, ZPR32, ZPR4b32, VectorIndexD32b> { bits<4> Zm; bits<1> iop; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; } //===----------------------------------------------------------------------===// @@ -1426,14 +2313,173 @@ class sve_fp_fcadd sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_fp_fcadd { +multiclass sve_fp_fcadd { def _H : sve_fp_fcadd<0b01, asm, ZPR16>; def _S : sve_fp_fcadd<0b10, asm, ZPR32>; def _D : 
sve_fp_fcadd<0b11, asm, ZPR64>; + + def : Pat<(nxv8f16 (op nxv8i1:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 complexrotateopodd:$imm))), + (!cast(NAME # _H) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>; + def : Pat<(nxv4f32 (op nxv4i1:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 complexrotateopodd:$imm))), + (!cast(NAME # _S) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>; + def : Pat<(nxv2f64 (op nxv2i1:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 complexrotateopodd:$imm))), + (!cast(NAME # _D) $Op1, $Op2, $Op3, complexrotateopodd:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Convert Group +//===----------------------------------------------------------------------===// + +class sve2_fp_convert_precision opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, PPR3bAny:$Pg, zprty2:$Zn), + asm, "\t$Zd, $Pg/m, $Zn", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<3> Pg; + let Inst{31-24} = 0b01100100; + let Inst{23-22} = opc{3-2}; + let Inst{21-18} = 0b0010; + let Inst{17-16} = opc{1-0}; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_fp_convert_down_narrow { + def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>; + def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast(NAME # _StoH)>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; +} + +multiclass sve2_fp_convert_up_long { + def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>; + def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast(NAME # _HtoS)>; + def : SVE_3_Op_Pat(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast(NAME # _StoD)>; +} + +multiclass sve2_fp_convert_down_odd_rounding_top { + def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Pairwise Group +//===----------------------------------------------------------------------===// + +class sve2_fp_pairwise_pred sz, bits<3> opc, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm), + asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zm; + bits<5> Zdn; + let Inst{31-24} = 0b01100100; + let Inst{23-22} = sz; + let Inst{21-19} = 0b010; + let Inst{18-16} = opc; + let Inst{15-13} = 0b100; + let Inst{12-10} = Pg; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty.ElementSize; +} + +multiclass sve2_fp_pairwise_pred opc, string asm, + SDPatternOperator op> { + def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>; + def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>; + def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Widening Multiply-Add - Indexed Group +//===----------------------------------------------------------------------===// + +class 
sve2_fp_mla_long_by_indexed_elem opc, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, + VectorIndexH32b:$iop), + asm, "\t$Zda, $Zn, $Zm$iop", + "", + []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<3> iop; + let Inst{31-21} = 0b01100100101; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{15-14} = 0b01; + let Inst{13} = opc{1}; + let Inst{12} = 0b0; + let Inst{11} = iop{0}; + let Inst{10} = opc{0}; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_fp_mla_long_by_indexed_elem opc, string asm, + SDPatternOperator op> { + def NAME : sve2_fp_mla_long_by_indexed_elem; + def : SVE_4_Op_Imm_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Floating Point Widening Multiply-Add Group +//===----------------------------------------------------------------------===// + +class sve2_fp_mla_long opc, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), + asm, "\t$Zda, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01100100101; + let Inst{20-16} = Zm; + let Inst{15-14} = 0b10; + let Inst{13} = opc{1}; + let Inst{12-11} = 0b00; + let Inst{10} = opc{0}; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_fp_mla_long opc, string asm, SDPatternOperator op> { + def NAME : sve2_fp_mla_long; + def : SVE_3_Op_Pat(NAME)>; } //===----------------------------------------------------------------------===// @@ -1496,11 +2542,26 @@ class sve_int_perm_bin_perm_zz opc, bits<2> sz8_64, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_perm_bin_perm_zz opc, string asm> { +multiclass sve_int_perm_bin_perm_zz opc, string asm, + SDPatternOperator op> { def _B : sve_int_perm_bin_perm_zz; def _H : sve_int_perm_bin_perm_zz; def _S : sve_int_perm_bin_perm_zz; def _D : sve_int_perm_bin_perm_zz; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + def : SVE_2_Op_Pat(NAME # _D)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; } //===----------------------------------------------------------------------===// @@ -1508,7 +2569,7 @@ multiclass sve_int_perm_bin_perm_zz opc, string asm> { //===----------------------------------------------------------------------===// class sve_fp_2op_p_zd opc, string asm, RegisterOperand i_zprtype, - RegisterOperand o_zprtype, ElementSizeEnum size> + RegisterOperand o_zprtype, ElementSizeEnum Sz> : I<(outs o_zprtype:$Zd), (ins i_zprtype:$_Zd, PPR3bAny:$Pg, i_zprtype:$Zn), asm, "\t$Zd, $Pg/m, $Zn", "", @@ -1526,16 +2587,103 @@ class sve_fp_2op_p_zd opc, string asm, RegisterOperand i_zprtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; - let ElementSize = size; + let DestructiveInstType = DestructiveUnaryPassthru; + let ElementSize = Sz; } -multiclass sve_fp_2op_p_zd_HSD opc, string asm> { - def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; - def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; - def 
_D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>; +multiclass sve_fp_2op_p_zd opc, string asm, + RegisterOperand i_zprtype, + RegisterOperand o_zprtype, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, + ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { + def NAME : sve_fp_2op_p_zd, + SVEPseudo2Instr; + // convert vt1 to a packed type for the intrinsic patterns + defvar packedvt1 = !cond(!eq(!cast(vt1), "nxv2f16"): nxv8f16, + !eq(!cast(vt1), "nxv4f16"): nxv8f16, + !eq(!cast(vt1), "nxv2f32"): nxv4f32, + 1 : vt1); + + // convert vt3 to a packed type for the intrinsic patterns + defvar packedvt3 = !cond(!eq(!cast(vt3), "nxv2f16"): nxv8f16, + !eq(!cast(vt3), "nxv4f16"): nxv8f16, + !eq(!cast(vt3), "nxv2f32"): nxv4f32, + 1 : vt3); + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_1_Op_Passthru_Pat(NAME)>; + + def _UNDEF : PredOneOpPassthruPseudo(i_zprtype)>; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF)>; } - + +multiclass sve_fp_2op_p_zdr opc, string asm, + RegisterOperand i_zprtype, + RegisterOperand o_zprtype, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, + ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { + def NAME : sve_fp_2op_p_zd, + SVEPseudo2Instr; + + // convert vt1 to a packed type for the intrinsic patterns + defvar packedvt1 = !cond(!eq(!cast(vt1), "nxv2f16"): nxv8f16, + !eq(!cast(vt1), "nxv4f16"): nxv8f16, + !eq(!cast(vt1), "nxv2f32"): nxv4f32, + 1 : vt1); + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; + + def _UNDEF : PredOneOpPassthruPseudo(i_zprtype)>; + + defm : SVE_1_Op_PassthruUndef_Round_Pat(NAME # _UNDEF)>; +} + +multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { + def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>, + SVEPseudo2Instr; + def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>, + SVEPseudo2Instr; + def _D : sve_fp_2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64, ElementSizeD>, + SVEPseudo2Instr; + + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; + + def _UNDEF_H : PredOneOpPassthruPseudo; + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_S)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_S)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_D)>; +} + +multiclass sve2_fp_flogb { + def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>; + def _S : sve_fp_2op_p_zd<0b0011100, asm, ZPR32, ZPR32, ElementSizeS>; + def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve2_fp_convert_down_odd_rounding { + def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast(NAME # _DtoS)>; +} + //===----------------------------------------------------------------------===// // SVE Floating Point Unary Operations - Unpredicated Group //===----------------------------------------------------------------------===// @@ -1557,10 +2705,14 @@ class 
sve_fp_2op_u_zd sz, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_fp_2op_u_zd opc, string asm> { +multiclass sve_fp_2op_u_zd opc, string asm, SDPatternOperator op> { def _H : sve_fp_2op_u_zd<0b01, opc, asm, ZPR16>; def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>; def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>; + + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1585,42 +2737,97 @@ class sve_int_bin_pred_arit_log sz8_64, bits<2> fmt, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_log opc, string asm> { +multiclass sve_int_bin_pred_log opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_pred_arit_log<0b00, 0b11, opc, asm, ZPR8>; def _H : sve_int_bin_pred_arit_log<0b01, 0b11, opc, asm, ZPR16>; def _S : sve_int_bin_pred_arit_log<0b10, 0b11, opc, asm, ZPR32>; def _D : sve_int_bin_pred_arit_log<0b11, 0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_0 opc, string asm> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_0 opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_1 opc, string asm> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>; - def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_1 opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b01, opc, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve_int_bin_pred_arit_log<0b01, 0b01, opc, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_bin_pred_arit_log<0b10, 0b01, opc, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_bin_pred_arit_log<0b11, 0b01, opc, asm, ZPR64>, + SVEPseudo2Instr; + } + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_arit_2 opc, string asm> { - def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>; - def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>; - def _S : 
sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_2 opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags> { + let DestructiveInstType = flags in { + def _B : sve_int_bin_pred_arit_log<0b00, 0b10, opc, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve_int_bin_pred_arit_log<0b01, 0b10, opc, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>, + SVEPseudo2Instr; + } + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } // Special case for divides which are not defined for 8b/16b elements. -multiclass sve_int_bin_pred_arit_2_div opc, string asm> { - def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>; - def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>; +multiclass sve_int_bin_pred_arit_2_div opc, string asm, string Ps, + SDPatternOperator op, + DestructiveInstTypeEnum flags, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1648,15 +2855,20 @@ class sve_int_mladdsub_vvv_pred sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_mladdsub_vvv_pred opc, string asm> { +multiclass sve_int_mladdsub_vvv_pred opc, string asm, SDPatternOperator op> { def _B : sve_int_mladdsub_vvv_pred<0b00, opc, asm, ZPR8>; def _H : sve_int_mladdsub_vvv_pred<0b01, opc, asm, ZPR16>; def _S : sve_int_mladdsub_vvv_pred<0b10, opc, asm, ZPR32>; def _D : sve_int_mladdsub_vvv_pred<0b11, opc, asm, ZPR64>; + + def : SVE_4_Op_Pat(NAME # _B)>; + def : SVE_4_Op_Pat(NAME # _H)>; + def : SVE_4_Op_Pat(NAME # _S)>; + def : SVE_4_Op_Pat(NAME # _D)>; } class sve_int_mlas_vvv_pred sz8_64, bits<1> opc, string asm, @@ -1680,15 +2892,154 @@ class sve_int_mlas_vvv_pred sz8_64, bits<1> opc, string asm, let Inst{4-0} = Zda; let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_mlas_vvv_pred opc, string asm> { +multiclass sve_int_mlas_vvv_pred opc, string asm, SDPatternOperator op, + SDPatternOperator outerop, SDPatternOperator mulop> { def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>; def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>; def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>; def _D : sve_int_mlas_vvv_pred<0b11, opc, asm, ZPR64>; + + def : SVE_4_Op_Pat(NAME # _B)>; + def : SVE_4_Op_Pat(NAME # _H)>; + def : SVE_4_Op_Pat(NAME # _S)>; + def : SVE_4_Op_Pat(NAME # _D)>; + + def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)), + (!cast(NAME # _B) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)), + (!cast(NAME # _H) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv4i32:$Op1, 
(mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)), + (!cast(NAME # _S) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)), + (!cast(NAME # _D) $pred, $Op1, $Op2, $Op3)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply-Add - Unpredicated Group +//===----------------------------------------------------------------------===// + +class sve2_int_mla sz, bits<5> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15} = 0b0; + let Inst{14-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_mla { + def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>; + def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>; + def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>; + def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve2_int_mla_long opc, string asm, SDPatternOperator op> { + def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>; + def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>; + def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply-Add - Indexed Group +//===----------------------------------------------------------------------===// + +class sve2_int_mla_by_indexed_elem sz, bits<6> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + ZPRRegOp zprty3, Operand itype> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop), + asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{15-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_mla_by_indexed_elem opc, bit S, string asm, + SDPatternOperator op> { + def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> { + bits<3> Zm; + bits<3> iop; + let Inst{22} = iop{2}; + let Inst{20-19} = iop{1-0}; + let Inst{18-16} = Zm; + } + def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> { + bits<3> Zm; + bits<2> iop; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> { + bits<4> Zm; + bit iop; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } + + def : SVE_4_Op_Imm_Pat(NAME # _H)>; + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply-Add Long - Indexed Group 
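+//
+// The widening multiply-adds in this group select the 16- or 32-bit
+// multiplicand lane with an immediate index that is not stored contiguously:
+// in the _S form below, iop{2-1} lands in Inst{20-19} while iop{0} lands in
+// Inst{11}; in the _D form, iop{1} goes to Inst{20} with iop{0} again in
+// Inst{11}.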
+//===----------------------------------------------------------------------===// + +multiclass sve2_int_mla_long_by_indexed_elem opc, string asm, + SDPatternOperator op> { + def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, + asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { + bits<3> Zm; + bits<3> iop; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{11} = iop{0}; + } + def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} }, + asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { + bits<4> Zm; + bits<2> iop; + let Inst{20} = iop{1}; + let Inst{19-16} = Zm; + let Inst{11} = iop{0}; + } + + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1712,13 +3063,15 @@ class sve_intx_dot { +multiclass sve_intx_dot { def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>; def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>; + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1742,23 +3095,986 @@ class sve_intx_dot_by_indexed_elem { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> { +multiclass sve_intx_dot_by_indexed_elem { + def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> { + def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { bits<1> iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } + + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Complex Integer Dot Product Group +//===----------------------------------------------------------------------===// + +class sve2_complex_int_arith sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm, + complexrotateop:$rot), + asm, "\t$Zda, $Zn, $Zm, $rot", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + bits<2> rot; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-12} = opc; + let Inst{11-10} = rot; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_cintx_dot { + def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>; + def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>; + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3), + (i32 complexrotateop:$imm))), + (!cast(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 complexrotateop:$imm))), + (!cast(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Complex Multiply-Add Group +//===----------------------------------------------------------------------===// + +multiclass sve2_int_cmla { + def _B : sve2_complex_int_arith<0b00, { 0b001, 
opc }, asm, ZPR8, ZPR8>; + def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>; + def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>; + def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>; + + def : SVE_4_Op_Imm_Pat(NAME # _B)>; + def : SVE_4_Op_Imm_Pat(NAME # _H)>; + def : SVE_4_Op_Imm_Pat(NAME # _S)>; + def : SVE_4_Op_Imm_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Complex Integer Dot Product - Indexed Group +//===----------------------------------------------------------------------===// + +class sve2_complex_int_arith_indexed sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + ZPRRegOp zprty3, Operand itype> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty3:$Zm, itype:$iop, + complexrotateop:$rot), + asm, "\t$Zda, $Zn, $Zm$iop, $rot", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<2> rot; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{15-12} = opc; + let Inst{11-10} = rot; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_cintx_dot_by_indexed_elem { + def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> { + bits<2> iop; + bits<3> Zm; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> { + bit iop; + bits<4> Zm; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3), + (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Complex Multiply-Add - Indexed Group +//===----------------------------------------------------------------------===// + +multiclass sve2_cmla_by_indexed_elem { + def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> { + bits<2> iop; + bits<3> Zm; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> { + bit iop; + bits<4> Zm; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } + + def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3), + (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>; + + def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3), + (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))), + (!cast(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply - Unpredicated Group 
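+//
+// sve2_int_mul below takes the SDPatternOperator for the unpredicated
+// operation plus an optional op_pred (defaulting to null_frag). When op_pred
+// is supplied, SVE_2_Op_Pred_All_Active additionally folds the predicated
+// node onto the same unpredicated encoding whenever its governing predicate
+// is known to be all active.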
+//===----------------------------------------------------------------------===// + +class sve2_int_mul sz, bits<3> opc, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_mul opc, string asm, SDPatternOperator op, + SDPatternOperator op_pred = null_frag> { + def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; + def _H : sve2_int_mul<0b01, opc, asm, ZPR16>; + def _S : sve2_int_mul<0b10, opc, asm, ZPR32>; + def _D : sve2_int_mul<0b11, opc, asm, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pred_All_Active(NAME # _B)>; + def : SVE_2_Op_Pred_All_Active(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active(NAME # _S)>; + def : SVE_2_Op_Pred_All_Active(NAME # _D)>; +} + +multiclass sve2_int_mul_single opc, string asm, SDPatternOperator op> { + def _B : sve2_int_mul<0b00, opc, asm, ZPR8>; + + def : SVE_2_Op_Pat(NAME # _B)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer Multiply - Indexed Group +//===----------------------------------------------------------------------===// + +class sve2_int_mul_by_indexed_elem sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + ZPRRegOp zprty3, Operand itype> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm, itype:$iop), + asm, "\t$Zd, $Zn, $Zm$iop", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{15-14} = 0b11; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_mul_by_indexed_elem opc, string asm, + SDPatternOperator op> { + def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> { + bits<3> Zm; + bits<3> iop; + let Inst{22} = iop{2}; + let Inst{20-19} = iop{1-0}; + let Inst{18-16} = Zm; + } + def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> { + bits<3> Zm; + bits<2> iop; + let Inst{20-19} = iop; + let Inst{18-16} = Zm; + } + def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> { + bits<4> Zm; + bit iop; + let Inst{20} = iop; + let Inst{19-16} = Zm; + } + + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +multiclass sve2_int_mul_long_by_indexed_elem opc, string asm, + SDPatternOperator op> { + def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm, + ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> { + bits<3> Zm; + bits<3> iop; + let Inst{20-19} = iop{2-1}; + let Inst{18-16} = Zm; + let Inst{11} = iop{0}; + } + def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm, + ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> { + bits<4> Zm; + bits<2> iop; + let Inst{20} = iop{1}; + let Inst{19-16} = Zm; + let Inst{11} = iop{0}; + } + + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Integer - Predicated Group +//===----------------------------------------------------------------------===// + 
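+// A sketch of the plumbing the predicated multiclasses below automate: the
+// SVE_3_Op_Pat helper is a thin Pat<> wrapper, so each _B pattern emitted by
+// an instance of sve2_int_arith_pred is equivalent to hand-writing
+// (my_op and MY_INST_B are placeholder names, not defs from this patch):
+//
+//   def : Pat<(nxv16i8 (my_op nxv16i1:$Pg, nxv16i8:$Zn, nxv16i8:$Zm)),
+//             (MY_INST_B $Pg, $Zn, $Zm)>;
+//
+// repeated for _H/_S/_D with the nxv8i16/nxv4i32/nxv2i64 types. The optional
+// Ps and revname arguments feed SVEPseudo2Instr and SVEInstr2Rev, which pair
+// each instruction with its movprfx-able pseudo and with its reversed-operand
+// counterpart for later destructive-operand expansion.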
+class sve2_int_arith_pred sz, bits<6> opc, string asm, + ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty:$Zm), + asm, "\t$Zdn, $Pg/m, $_Zdn, $Zm", "", []>, Sched<[]> { + bits<3> Pg; + bits<5> Zm; + bits<5> Zdn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21-20} = 0b01; + let Inst{20-16} = opc{5-1}; + let Inst{15-14} = 0b10; + let Inst{13} = opc{0}; + let Inst{12-10} = Pg; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty.ElementSize; +} + +multiclass sve2_int_arith_pred opc, string asm, SDPatternOperator op, + string Ps = "", + DestructiveInstTypeEnum flags=DestructiveOther, + string revname="", bit isReverseInstr=0> { + let DestructiveInstType = flags in { + def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +class sve2_int_sadd_long_accum_pairwise sz, bit U, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins PPR3bAny:$Pg, zprty1:$_Zda, zprty2:$Zn), + asm, "\t$Zda, $Pg/m, $Zn", "", []>, Sched<[]> { + bits<3> Pg; + bits<5> Zn; + bits<5> Zda; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21-17} = 0b00010; + let Inst{16} = U; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty1.ElementSize; +} + +multiclass sve2_int_sadd_long_accum_pairwise { + def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>; + def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>; + def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +class sve2_int_un_pred_arit sz, bit Q, bits<2> opc, + string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zd), (ins zprty:$_Zd, PPR3bAny:$Pg, zprty:$Zn), + asm, "\t$Zd, $Pg/m, $Zn", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zd; + bits<5> Zn; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; + let Inst{21-20} = 0b00; + let Inst{19} = Q; + let Inst{18} = 0b0; + let Inst{17-16} = opc; + let Inst{15-13} = 0b101; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = DestructiveUnaryPassthru; + let ElementSize = zprty.ElementSize; +} + +multiclass sve2_int_un_pred_arit_s opc, string asm, + SDPatternOperator op> { + def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>, + SVEPseudo2Instr; + + def : SVE_3_Op_Pat(NAME # _S)>; + + def _UNDEF_S : PredOneOpPassthruPseudo; + + defm : SVE_3_Op_Undef_Pat(NAME # _UNDEF_S)>; +} + +multiclass sve2_int_un_pred_arit opc, string asm, SDPatternOperator op> { + def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>, + 
SVEPseudo2Instr; + def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def _UNDEF_B : PredOneOpPassthruPseudo; + def _UNDEF_H : PredOneOpPassthruPseudo; + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_3_Op_Undef_Pat(NAME # _UNDEF_B)>; + defm : SVE_3_Op_Undef_Pat(NAME # _UNDEF_H)>; + defm : SVE_3_Op_Undef_Pat(NAME # _UNDEF_S)>; + defm : SVE_3_Op_Undef_Pat(NAME # _UNDEF_D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Widening Integer Arithmetic Group +//===----------------------------------------------------------------------===// + +class sve2_wide_int_arith sz, bits<5> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, ZPRRegOp zprty3> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty3:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15} = 0b0; + let Inst{14-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_wide_int_arith_long opc, string asm, + SDPatternOperator op> { + def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>; + def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>; + def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +multiclass sve2_wide_int_arith_wide opc, string asm, + SDPatternOperator op> { + def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>; + def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>; + def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +multiclass sve2_wide_int_arith_pmul sz, bits<5> opc, string asm, + SDPatternOperator op> { + def NAME : sve2_wide_int_arith; + + // To avoid using 128 bit elements in the IR, the pattern below works with + // llvm intrinsics with the _pair suffix, to reflect that + // _Q is implemented as a pair of _D. + def : SVE_2_Op_Pat(NAME)>; +} + +multiclass sve2_pmul_long opc, string asm, SDPatternOperator op> { + def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>; + def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>; + + // To avoid using 128 bit elements in the IR, the patterns below work with + // llvm intrinsics with the _pair suffix, to reflect that + // _H is implemented as a pair of _B and _D is implemented as a pair of _S. 
+ def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Misc Group +//===----------------------------------------------------------------------===// + +class sve2_misc sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-14} = 0b10; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_misc_bitwise opc, string asm, SDPatternOperator op> { + def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>; + def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>; + def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>; + def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +multiclass sve2_misc_int_addsub_long_interleaved opc, string asm, + SDPatternOperator op> { + def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; + def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; + def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} + +class sve2_bitwise_xor_interleaved sz, bits<1> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-11} = 0b10010; + let Inst{10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_bitwise_xor_interleaved { + def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>; + def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>; + def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>; + def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +class sve2_bitwise_shift_left_long tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2, + Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b0; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-12} = 0b1010; + let Inst{11-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_bitwise_shift_left_long opc, string asm, + SDPatternOperator op> { + def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm, + ZPR16, ZPR8, vecshiftL8>; + def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm, + ZPR32, ZPR16, vecshiftL16> { + let Inst{19} = imm{3}; + } + def _D : sve2_bitwise_shift_left_long<{1,?,?}, opc, asm, + ZPR64, ZPR32, vecshiftL32> { + let Inst{20-19} = imm{4-3}; + } + def : SVE_2_Op_Imm_Pat(NAME # _H)>; + def : SVE_2_Op_Imm_Pat(NAME # 
_S)>; + def : SVE_2_Op_Imm_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Accumulate Group +//===----------------------------------------------------------------------===// + +class sve2_int_bin_shift_imm tsz8_64, bit opc, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zd), (ins zprty:$_Zd, zprty:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<6> imm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = tsz8_64{3-2}; + let Inst{21} = 0b0; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-11} = 0b11110; + let Inst{10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_bin_shift_imm_left { + def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +multiclass sve2_int_bin_shift_imm_right { + def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_bin_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +class sve2_int_bin_accum_shift_imm tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, immtype:$imm), + asm, "\t$Zda, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<6> imm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = tsz8_64{3-2}; + let Inst{21} = 0b0; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-12} = 0b1110; + let Inst{11-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_bin_accum_shift_imm_right opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_accum_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_bin_accum_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +class sve2_int_cadd sz, bit opc, string asm, ZPRRegOp zprty> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, complexrotateopodd:$rot), + asm, 
"\t$Zdn, $_Zdn, $Zm, $rot", "", []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zm; + bit rot; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21-17} = 0b00000; + let Inst{16} = opc; + let Inst{15-11} = 0b11011; + let Inst{10} = rot; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_cadd { + def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>; + def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>; + def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>; + def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +class sve2_int_absdiff_accum sz, bits<4> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b0; + let Inst{20-16} = Zm; + let Inst{15-14} = 0b11; + let Inst{13-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_absdiff_accum { + def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>; + def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>; + def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>; + def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve2_int_absdiff_accum_long opc, string asm, + SDPatternOperator op> { + def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>; + def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>; + def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve2_int_addsub_long_carry opc, string asm, + SDPatternOperator op> { + def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm, + ZPR32, ZPR32>; + def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm, + ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Narrowing Group +//===----------------------------------------------------------------------===// + +class sve2_int_bin_shift_imm_narrow_bottom tsz8_64, bits<3> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-14} = 0b00; + let Inst{13-11} = opc; + let Inst{10} = 0b0; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_bin_shift_imm_right_narrow_bottom opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16, + 
tvecshiftR8>; + def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32, + tvecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64, + tvecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def : SVE_2_Op_Imm_Pat(NAME # _B)>; + def : SVE_2_Op_Imm_Pat(NAME # _H)>; + def : SVE_2_Op_Imm_Pat(NAME # _S)>; +} + +class sve2_int_bin_shift_imm_narrow_top tsz8_64, bits<3> opc, + string asm, ZPRRegOp zprty1, + ZPRRegOp zprty2, Operand immtype> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, immtype:$imm), + asm, "\t$Zd, $Zn, $imm", + "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> imm; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-14} = 0b00; + let Inst{13-11} = opc; + let Inst{10} = 0b1; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_bin_shift_imm_right_narrow_top opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16, + tvecshiftR8>; + def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32, + tvecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64, + tvecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; +} + +class sve2_int_addsub_narrow_high_bottom sz, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-11} = opc; // S, R + let Inst{10} = 0b0; // Top + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_addsub_narrow_high_bottom opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_addsub_narrow_high_bottom<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high_bottom<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high_bottom<0b11, opc, asm, ZPR32, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; +} + +class sve2_int_addsub_narrow_high_top sz, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn, zprty2:$Zm), + asm, "\t$Zd, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b011; + let Inst{12-11} = opc; // S, R + let Inst{10} = 0b1; // Top + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_addsub_narrow_high_top opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_addsub_narrow_high_top<0b01, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_addsub_narrow_high_top<0b10, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_addsub_narrow_high_top<0b11, opc, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; +} + +class sve2_int_sat_extract_narrow_bottom tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: 
I<(outs zprty1:$Zd), (ins zprty2:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-13} = 0b000010; + let Inst{12-11} = opc; + let Inst{10} = 0b0; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_sat_extract_narrow_bottom opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_sat_extract_narrow_bottom<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow_bottom<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow_bottom<0b100, opc, asm, ZPR32, ZPR64>; + + def : SVE_1_Op_Pat(NAME # _B)>; + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; +} + +class sve2_int_sat_extract_narrow_top tsz8_64, bits<2> opc, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> +: I<(outs zprty1:$Zd), (ins zprty1:$_Zd, zprty2:$Zn), + asm, "\t$Zd, $Zn", "", []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = tsz8_64{2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-13} = 0b000010; + let Inst{12-11} = opc; + let Inst{10} = 0b1; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; + + let Constraints = "$Zd = $_Zd"; +} + +multiclass sve2_int_sat_extract_narrow_top opc, string asm, + SDPatternOperator op> { + def _B : sve2_int_sat_extract_narrow_top<0b001, opc, asm, ZPR8, ZPR16>; + def _H : sve2_int_sat_extract_narrow_top<0b010, opc, asm, ZPR16, ZPR32>; + def _S : sve2_int_sat_extract_narrow_top<0b100, opc, asm, ZPR32, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; } //===----------------------------------------------------------------------===// @@ -1785,43 +4101,140 @@ class sve_int_un_pred_arit sz8_64, bits<4> opc, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveUnaryPassthru; let ElementSize = zprty.ElementSize; } -multiclass sve_int_un_pred_arit_0 opc, string asm> { - def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>; - def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>; - def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; - def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; +multiclass sve_int_un_pred_arit_0 opc, string asm, + SDPatternOperator op> { + def _B : sve_int_un_pred_arit<0b00, { opc, 0b0 }, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_1_Op_Passthru_Pat(NAME # _B)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; + + def _UNDEF_B : PredOneOpPassthruPseudo; + def _UNDEF_H : PredOneOpPassthruPseudo; + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_B)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_S)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_D)>; } -multiclass sve_int_un_pred_arit_0_h opc, string asm> { - def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>; - def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; - def _D : 
sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; +multiclass sve_int_un_pred_arit_0_h opc, string asm, + SDPatternOperator op> { + def _H : sve_int_un_pred_arit<0b01, { opc, 0b0 }, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_InReg_Extend(NAME # _H)>; + def : SVE_InReg_Extend(NAME # _S)>; + def : SVE_InReg_Extend(NAME # _D)>; + + def _UNDEF_H : PredOneOpPassthruPseudo; + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_InReg_Extend_PassthruUndef(NAME # _UNDEF_H)>; + defm : SVE_InReg_Extend_PassthruUndef(NAME # _UNDEF_S)>; + defm : SVE_InReg_Extend_PassthruUndef(NAME # _UNDEF_D)>; } -multiclass sve_int_un_pred_arit_0_w opc, string asm> { - def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>; - def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; +multiclass sve_int_un_pred_arit_0_w opc, string asm, + SDPatternOperator op> { + def _S : sve_int_un_pred_arit<0b10, { opc, 0b0 }, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_InReg_Extend(NAME # _S)>; + def : SVE_InReg_Extend(NAME # _D)>; + + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_InReg_Extend_PassthruUndef(NAME # _UNDEF_S)>; + defm : SVE_InReg_Extend_PassthruUndef(NAME # _UNDEF_D)>; } -multiclass sve_int_un_pred_arit_0_d opc, string asm> { - def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>; +multiclass sve_int_un_pred_arit_0_d opc, string asm, + SDPatternOperator op> { + def _D : sve_int_un_pred_arit<0b11, { opc, 0b0 }, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_InReg_Extend(NAME # _D)>; + + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_InReg_Extend_PassthruUndef(NAME # _UNDEF_D)>; } -multiclass sve_int_un_pred_arit_1 opc, string asm> { - def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>; - def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; - def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; - def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; +multiclass sve_int_un_pred_arit_1 opc, string asm, + SDPatternOperator op> { + def _B : sve_int_un_pred_arit<0b00, { opc, 0b1 }, asm, ZPR8>, + SVEPseudo2Instr; + def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_1_Op_Passthru_Pat(NAME # _B)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; + + def _UNDEF_B : PredOneOpPassthruPseudo; + def _UNDEF_H : PredOneOpPassthruPseudo; + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_B)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_S)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_D)>; } -multiclass sve_int_un_pred_arit_1_fp opc, string asm> { - def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>; - def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>; - def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>; +multiclass sve_int_un_pred_arit_1_fp opc, 
string asm, SDPatternOperator op> { + def _H : sve_int_un_pred_arit<0b01, { opc, 0b1 }, asm, ZPR16>, + SVEPseudo2Instr; + def _S : sve_int_un_pred_arit<0b10, { opc, 0b1 }, asm, ZPR32>, + SVEPseudo2Instr; + def _D : sve_int_un_pred_arit<0b11, { opc, 0b1 }, asm, ZPR64>, + SVEPseudo2Instr; + + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; + + def _UNDEF_H : PredOneOpPassthruPseudo; + def _UNDEF_S : PredOneOpPassthruPseudo; + def _UNDEF_D : PredOneOpPassthruPseudo; + + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_H)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_S)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_S)>; + defm : SVE_1_Op_PassthruUndef_Pat(NAME # _UNDEF_D)>; } //===----------------------------------------------------------------------===// @@ -1917,15 +4330,32 @@ class sve_int_arith_imm0 sz8_64, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_arith_imm0 opc, string asm> { - def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; +multiclass sve_int_arith_imm0 opc, string asm, SDPatternOperator op> { + def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>; + + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_OptLsl_Pat(NAME # _D)>; +} + +multiclass sve_int_arith_imm0_subr opc, string asm, SDPatternOperator op> { + def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>; + def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>; + def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>; + def _D : sve_int_arith_imm0<0b11, opc, asm, ZPR64, addsub_imm8_opt_lsl_i64>; + + def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _B)>; + def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _H)>; + def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _S)>; + def : SVE_1_Op_Imm_OptLsl_Reverse_Pat(NAME # _D)>; } class sve_int_arith_imm sz8_64, bits<6> opc, string asm, @@ -1944,22 +4374,44 @@ class sve_int_arith_imm sz8_64, bits<6> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_arith_imm1 opc, string asm, Operand immtype> { - def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, immtype>; - def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, immtype>; - def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, immtype>; - def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, immtype>; +multiclass sve_int_arith_imm1 opc, string asm, SDPatternOperator op> { + def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, simm8>; + def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, simm8>; + def 
_S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; + def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; + + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } -multiclass sve_int_arith_imm2 { +multiclass sve_int_arith_imm1_unsigned opc, string asm, SDPatternOperator op> { + def _B : sve_int_arith_imm<0b00, { 0b1010, opc }, asm, ZPR8, imm0_255>; + def _H : sve_int_arith_imm<0b01, { 0b1010, opc }, asm, ZPR16, imm0_255>; + def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>; + def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>; + + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; +} + +multiclass sve_int_arith_imm2 { def _B : sve_int_arith_imm<0b00, 0b110000, asm, ZPR8, simm8>; def _H : sve_int_arith_imm<0b01, 0b110000, asm, ZPR16, simm8>; def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>; + + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1983,6 +4435,101 @@ class sve_int_bin_cons_log opc, string asm> let Inst{4-0} = Zd; } +multiclass sve_int_bin_cons_log opc, string asm, SDPatternOperator op> { + def NAME : sve_int_bin_cons_log; + + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + def : SVE_2_Op_Pat(NAME)>; + + def : InstAlias(NAME) ZPR8:$Zd, ZPR8:$Zn, ZPR8:$Zm), 1>; + def : InstAlias(NAME) ZPR16:$Zd, ZPR16:$Zn, ZPR16:$Zm), 1>; + def : InstAlias(NAME) ZPR32:$Zd, ZPR32:$Zn, ZPR32:$Zm), 1>; +} + +class sve2_int_bitwise_ternary_op_d opc, string asm> +: I<(outs ZPR64:$Zdn), (ins ZPR64:$_Zdn, ZPR64:$Zm, ZPR64:$Zk), + asm, "\t$Zdn, $_Zdn, $Zm, $Zk", + "", + []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zk; + bits<5> Zm; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = opc{2-1}; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-11} = 0b00111; + let Inst{10} = opc{0}; + let Inst{9-5} = Zk; + let Inst{4-0} = Zdn; + + let Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_bitwise_ternary_op opc, string asm, SDPatternOperator op> { + def NAME : sve2_int_bitwise_ternary_op_d; + + def : InstAlias(NAME) ZPR8:$Zdn, ZPR8:$Zm, ZPR8:$Zk), 1>; + def : InstAlias(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>; + def : InstAlias(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>; + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; +} + +class sve2_int_rotate_right_imm tsz8_64, string asm, + ZPRRegOp zprty, Operand immtype> +: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, immtype:$imm), + asm, "\t$Zdn, $_Zdn, $Zm, $imm", + "", + []>, Sched<[]> { + bits<5> Zdn; + bits<5> Zm; + bits<6> imm; + let Inst{31-24} = 0b00000100; + let Inst{23-22} = tsz8_64{3-2}; + let Inst{21} = 0b1; + let Inst{20-19} = tsz8_64{1-0}; + let Inst{18-16} = imm{2-0}; // imm3 + let Inst{15-10} = 0b001101; + let Inst{9-5} = Zm; + let Inst{4-0} = Zdn; + + let 
Constraints = "$Zdn = $_Zdn"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ElementSizeNone; +} + +multiclass sve2_int_rotate_right_imm { + def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>; + def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> { + let Inst{19} = imm{3}; + } + def _S : sve2_int_rotate_right_imm<{0,1,?,?}, asm, ZPR32, vecshiftR32> { + let Inst{20-19} = imm{4-3}; + } + def _D : sve2_int_rotate_right_imm<{1,?,?,?}, asm, ZPR64, vecshiftR64> { + let Inst{22} = imm{5}; + let Inst{20-19} = imm{4-3}; + } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} //===----------------------------------------------------------------------===// // SVE Integer Wide Immediate - Predicated Group @@ -2006,7 +4553,7 @@ class sve_int_dup_fpimm_pred sz, Operand fpimmtype, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } @@ -2041,26 +4588,34 @@ class sve_int_dup_imm_pred sz8_64, bit m, string asm, let Inst{12-5} = imm{7-0}; // imm8 let Inst{4-0} = Zd; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_dup_imm_pred_merge { - let Constraints = "$Zd = $_Zd" in { - def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>; - def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>; - def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>; - def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>; - } +multiclass sve_int_dup_imm_pred_merge_inst< + bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, + ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + let Constraints = "$Zd = $_Zd" in + def NAME : sve_int_dup_imm_pred; + def : InstAlias<"mov $Zd, $Pg/m, $imm", + (!cast(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; + def : Pat<(intty + (vselect predty:$Pg, + (intty (AArch64dup (scalarty (SVE8BitLslImm.Pat i32:$imm, i32:$shift)))), + intty:$Zd)), + (!cast(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; +} - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/m, $imm", - (!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>; +multiclass sve_int_dup_imm_pred_merge { + defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, + i32, cpy_imm8_opt_lsl_i8>; + defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, + i32, cpy_imm8_opt_lsl_i16>; + defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, + i32, cpy_imm8_opt_lsl_i32>; + defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, + i64, cpy_imm8_opt_lsl_i64>; def : InstAlias<"fmov $Zd, $Pg/m, #0.0", (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>; @@ -2070,20 +4625,35 @@ multiclass sve_int_dup_imm_pred_merge { 
(!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>; } -multiclass sve_int_dup_imm_pred_zero { - def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>; - def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>; - def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>; - def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>; +multiclass sve_int_dup_imm_pred_zero_inst< + bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty, + ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> { + def NAME : sve_int_dup_imm_pred; + def : InstAlias<"mov $Zd, $Pg/z, $imm", + (!cast(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; + def : Pat<(intty (zext (predty PPRAny:$Ps1))), + (!cast(NAME) PPRAny:$Ps1, 1, 0)>; + def : Pat<(intty (sext (predty PPRAny:$Ps1))), + (!cast(NAME) PPRAny:$Ps1, -1, 0)>; + def : Pat<(intty (anyext (predty PPRAny:$Ps1))), + (!cast(NAME) PPRAny:$Ps1, 1, 0)>; + def : Pat<(intty + (vselect predty:$Pg, + (intty (AArch64dup (scalarty (SVE8BitLslImm.Pat i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty 0))))), + (!cast(NAME) $Pg, i32:$imm, i32:$shift)>; +} - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>; - def : InstAlias<"mov $Zd, $Pg/z, $imm", - (!cast(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>; +multiclass sve_int_dup_imm_pred_zero { + defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1, + i32, cpy_imm8_opt_lsl_i8>; + defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1, + i32, cpy_imm8_opt_lsl_i16>; + defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1, + i32, cpy_imm8_opt_lsl_i32>; + defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1, + i64, cpy_imm8_opt_lsl_i64>; } //===----------------------------------------------------------------------===// @@ -2113,25 +4683,60 @@ class sve_int_cmp sz8_64, bits<3> opc, string asm, let Inst{3-0} = Pd; let Defs = [NZCV]; + let ElementSize = pprty.ElementSize; + let isPTestLike = 1; } -multiclass sve_int_cmp_0 opc, string asm> { +multiclass SVE_SETCC_Pat { + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)), + (cmp $Op1, $Op2, $Op3)>; + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)), + (cmp $Op1, $Op3, $Op2)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, cc))), + (cmp $Pg, $Op2, $Op3)>; + def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, invcc))), + (cmp $Pg, $Op3, $Op2)>; +} + +multiclass SVE_SETCC_Pat_With_Zero { + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, (SVEDup0), cc)), + (cmp $Op1, $Op2)>; + def : Pat<(predvt (AArch64setcc_z predvt:$Op1, (SVEDup0), intvt:$Op2, invcc)), + (cmp $Op1, $Op2)>; +} + +multiclass sve_int_cmp_0 opc, string asm, CondCode cc, CondCode invcc> { def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>; def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>; def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>; 
def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>; + + defm : SVE_SETCC_Pat(NAME # _B)>; + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _S)>; + defm : SVE_SETCC_Pat(NAME # _D)>; } -multiclass sve_int_cmp_0_wide opc, string asm> { +multiclass sve_int_cmp_0_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR64>; def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR64>; def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; } -multiclass sve_int_cmp_1_wide opc, string asm> { +multiclass sve_int_cmp_1_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_cmp<0b1, 0b00, opc, asm, PPR8, ZPR8, ZPR64>; def _H : sve_int_cmp<0b1, 0b01, opc, asm, PPR16, ZPR16, ZPR64>; def _S : sve_int_cmp<0b1, 0b10, opc, asm, PPR32, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; } @@ -2163,13 +4768,39 @@ class sve_int_scmp_vi sz8_64, bits<3> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; let Defs = [NZCV]; + let ElementSize = pprty.ElementSize; + let isPTestLike = 1; } -multiclass sve_int_scmp_vi opc, string asm> { +multiclass SVE_SETCC_Imm_Pat { + def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), + (intvt ZPR:$Zs1), + (intvt (AArch64dup (immtype:$imm))), + cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; + def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg), + (intvt (AArch64dup (immtype:$imm))), + (intvt ZPR:$Zs1), + commuted_cc)), + (cmp $Pg, $Zs1, immtype:$imm)>; +} + +multiclass sve_int_scmp_vi opc, string asm, CondCode cc, CondCode commuted_cc> { def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>; def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>; def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>; def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>; + + defm : SVE_SETCC_Imm_Pat(NAME # _B)>; + defm : SVE_SETCC_Imm_Pat(NAME # _H)>; + defm : SVE_SETCC_Imm_Pat(NAME # _S)>; + defm : SVE_SETCC_Imm_Pat(NAME # _D)>; } @@ -2198,13 +4829,25 @@ class sve_int_ucmp_vi sz8_64, bits<2> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; let Defs = [NZCV]; + let ElementSize = pprty.ElementSize; + let isPTestLike = 1; } -multiclass sve_int_ucmp_vi opc, string asm> { +multiclass sve_int_ucmp_vi opc, string asm, CondCode cc, + CondCode commuted_cc> { def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>; def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>; def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>; - def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127>; + def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>; + + defm : SVE_SETCC_Imm_Pat(NAME # _B)>; + defm : SVE_SETCC_Imm_Pat(NAME # _H)>; + defm : SVE_SETCC_Imm_Pat(NAME # _S)>; + defm : SVE_SETCC_Imm_Pat(NAME # _D)>; } @@ -2250,30 +4893,75 @@ class sve_int_while_rr sz8_64, bits<4> opc, string asm, let Inst{3-0} = Pd; let Defs = [NZCV]; + let ElementSize = pprty.ElementSize; + let isWhile = 1; } -multiclass sve_int_while4_rr opc, string asm> { +multiclass sve_int_while4_rr opc, string asm, SDPatternOperator op> { def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>; def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>; def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>; def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>; + + 
def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve_int_while8_rr opc, string asm> { +multiclass sve_int_while8_rr opc, string asm, SDPatternOperator op> { def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>; def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>; def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>; def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } +class sve2_int_while_rr sz8_64, bits<1> rw, string asm, + PPRRegOp pprty> +: I<(outs pprty:$Pd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Pd, $Rn, $Rm", + "", []>, Sched<[]> { + bits<4> Pd; + bits<5> Rm; + bits<5> Rn; + let Inst{31-24} = 0b00100101; + let Inst{23-22} = sz8_64; + let Inst{21} = 0b1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001100; + let Inst{9-5} = Rn; + let Inst{4} = rw; + let Inst{3-0} = Pd; + + let Defs = [NZCV]; + let ElementSize = pprty.ElementSize; + let isWhile = 1; +} + +multiclass sve2_int_while_rr rw, string asm, string op> { + def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>; + def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>; + def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>; + def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>; + + def : SVE_2_Op_Pat(op # _b), i64, i64, !cast(NAME # _B)>; + def : SVE_2_Op_Pat(op # _h), i64, i64, !cast(NAME # _H)>; + def : SVE_2_Op_Pat(op # _s), i64, i64, !cast(NAME # _S)>; + def : SVE_2_Op_Pat(op # _d), i64, i64, !cast(NAME # _D)>; +} //===----------------------------------------------------------------------===// // SVE Floating Point Fast Reduction Group //===----------------------------------------------------------------------===// class sve_fp_fast_red sz, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass dstRegClass> -: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), asm, "\t$Vd, $Pg, $Zn", "", []>, Sched<[]> { @@ -2290,20 +4978,26 @@ class sve_fp_fast_red sz, bits<3> opc, string asm, let Inst{4-0} = Vd; } -multiclass sve_fp_fast_red opc, string asm> { - def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>; - def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>; -} +multiclass sve_fp_fast_red opc, string asm, SDPatternOperator op> { + def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} //===----------------------------------------------------------------------===// // SVE Floating Point Accumulating Reduction Group //===----------------------------------------------------------------------===// class sve_fp_2op_p_vd sz, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass dstRegClass> -: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm), asm, "\t$Vdn, $Pg, $_Vdn, $Zm", "", []>, @@ -2323,10 +5017,17 @@ class 
sve_fp_2op_p_vd sz, bits<3> opc, string asm, let Constraints = "$Vdn = $_Vdn"; } -multiclass sve_fp_2op_p_vd opc, string asm> { - def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>; - def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>; +multiclass sve_fp_2op_p_vd opc, string asm, SDPatternOperator op> { + def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -2356,12 +5057,37 @@ class sve_fp_3op_p_pd sz, bits<3> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; } -multiclass sve_fp_3op_p_pd opc, string asm> { +multiclass sve_fp_3op_p_pd opc, string asm, SDPatternOperator op> { def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>; def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>; def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve_fp_3op_p_pd_cc opc, string asm, + CondCode cc1, CondCode cc2, + CondCode invcc1, CondCode invcc2> { + def _H : sve_fp_3op_p_pd<0b01, opc, asm, PPR16, ZPR16>; + def _S : sve_fp_3op_p_pd<0b10, opc, asm, PPR32, ZPR32>; + def _D : sve_fp_3op_p_pd<0b11, opc, asm, PPR64, ZPR64>; + + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _S)>; + defm : SVE_SETCC_Pat(NAME # _S)>; + defm : SVE_SETCC_Pat(NAME # _D)>; + + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _H)>; + defm : SVE_SETCC_Pat(NAME # _S)>; + defm : SVE_SETCC_Pat(NAME # _S)>; + defm : SVE_SETCC_Pat(NAME # _D)>; +} //===----------------------------------------------------------------------===// // SVE Floating Point Compare - with Zero Group @@ -2387,10 +5113,26 @@ class sve_fp_2op_p_pd sz, bits<3> opc, string asm, PPRRegOp pprty, let Inst{3-0} = Pd; } -multiclass sve_fp_2op_p_pd opc, string asm> { +multiclass sve_fp_2op_p_pd opc, string asm, + CondCode cc1, CondCode cc2, + CondCode invcc1, CondCode invcc2> { def _H : sve_fp_2op_p_pd<0b01, opc, asm, PPR16, ZPR16>; def _S : sve_fp_2op_p_pd<0b10, opc, asm, PPR32, ZPR32>; def _D : sve_fp_2op_p_pd<0b11, opc, asm, PPR64, ZPR64>; + + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _D)>; + + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _H)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _S)>; + defm : SVE_SETCC_Pat_With_Zero(NAME # _D)>; } @@ -2398,6 +5140,14 @@ multiclass sve_fp_2op_p_pd opc, string asm> { //SVE Index Generation Group //===----------------------------------------------------------------------===// +def simm5_8b_tgt : TImmLeaf= -16 && (int8_t)Imm < 16; }]>; +def simm5_16b_tgt : TImmLeaf= -16 && (int16_t)Imm < 16; }]>; +def simm5_32b_tgt : TImmLeaf= -16 && 
(int32_t)Imm < 16; }]>; +def simm5_64b_tgt : TImmLeaf= -16 && (int64_t)Imm < 16; }]>; +def i64imm_32bit_tgt : TImmLeaf(Imm); +}]>; + class sve_int_index_ii sz8_64, string asm, ZPRRegOp zprty, Operand imm_ty> : I<(outs zprty:$Zd), (ins imm_ty:$imm5, imm_ty:$imm5b), @@ -2416,10 +5166,29 @@ class sve_int_index_ii sz8_64, string asm, ZPRRegOp zprty, } multiclass sve_int_index_ii { - def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>; - def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>; + def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>; + def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>; def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>; def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>; + + def : Pat<(nxv16i8 (step_vector simm5_8b_tgt:$imm5b)), + (!cast(NAME # "_B") (i32 0), (!cast("trunc_imm") $imm5b))>; + def : Pat<(nxv8i16 (step_vector simm5_16b_tgt:$imm5b)), + (!cast(NAME # "_H") (i32 0), (!cast("trunc_imm") $imm5b))>; + def : Pat<(nxv4i32 (step_vector simm5_32b_tgt:$imm5b)), + (!cast(NAME # "_S") (i32 0), simm5_32b:$imm5b)>; + def : Pat<(nxv2i64 (step_vector simm5_64b_tgt:$imm5b)), + (!cast(NAME # "_D") (i64 0), simm5_64b:$imm5b)>; + + // add(step_vector(step), dup(X)) -> index(X, step). + def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5b)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + (!cast(NAME # "_B") simm5_8b:$imm5, (!cast("trunc_imm") $imm5b))>; + def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5b)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + (!cast(NAME # "_H") simm5_16b:$imm5, (!cast("trunc_imm") $imm5b))>; + def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5b)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + (!cast(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>; + def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5b)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + (!cast(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>; } class sve_int_index_ir sz8_64, string asm, ZPRRegOp zprty, @@ -2439,11 +5208,54 @@ class sve_int_index_ir sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_ir { - def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>; - def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>; +multiclass sve_int_index_ir { + def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>; + def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>; def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>; def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>; + + def : Pat<(nxv16i8 (step_vector i8:$imm)), + (!cast(NAME # "_B") (i32 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; + def : Pat<(nxv8i16 (step_vector i16:$imm)), + (!cast(NAME # "_H") (i32 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; + def : Pat<(nxv4i32 (step_vector i32:$imm)), + (!cast(NAME # "_S") (i32 0), (!cast("MOVi32imm") $imm))>; + def : Pat<(nxv2i64 (step_vector i64:$imm)), + (!cast(NAME # "_D") (i64 0), (!cast("MOVi64imm") $imm))>; + def : Pat<(nxv2i64 (step_vector i64imm_32bit_tgt:$imm)), + (!cast(NAME # "_D") (i64 0), (SUBREG_TO_REG (i64 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)), sub_32))>; + + // add(step_vector(step), dup(X)) -> index(X, step). 
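+  // e.g. for nxv4i32, step_vector(2) is <0,2,4,...> and dup(5) is <5,5,5,...>,
+  // so their sum <5,7,9,...> is exactly an INDEX with base 5 and stride 2 (in
+  // this form the stride is first materialized into a GPR).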
+ def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + (!cast(NAME # "_B") simm5_8b:$imm5, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; + def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + (!cast(NAME # "_H") simm5_16b:$imm5, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; + def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + (!cast(NAME # "_S") simm5_32b:$imm5, (!cast("MOVi32imm") $imm))>; + def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + (!cast(NAME # "_D") simm5_64b:$imm5, (!cast("MOVi64imm") $imm))>; + def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + (!cast(NAME # "_D") simm5_64b:$imm5, (SUBREG_TO_REG (i64 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)), sub_32))>; + + // mul(step_vector(1), dup(Y)) -> index(0, Y). + def : Pat<(mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), + (!cast(NAME # "_B") (i32 0), GPR32:$Rm)>; + def : Pat<(mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), + (!cast(NAME # "_H") (i32 0), GPR32:$Rm)>; + def : Pat<(mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), + (!cast(NAME # "_S") (i32 0), GPR32:$Rm)>; + def : Pat<(mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), + (!cast(NAME # "_D") (i64 0), GPR64:$Rm)>; + + // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y). + def : Pat<(add (muloneuseop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(simm5_8b:$imm5)))), + (!cast(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>; + def : Pat<(add (muloneuseop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), (nxv8i16 (AArch64dup(simm5_16b:$imm5)))), + (!cast(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>; + def : Pat<(add (muloneuseop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), (nxv4i32 (AArch64dup(simm5_32b:$imm5)))), + (!cast(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>; + def : Pat<(add (muloneuseop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), (nxv2i64 (AArch64dup(simm5_64b:$imm5)))), + (!cast(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>; } class sve_int_index_ri sz8_64, string asm, ZPRRegOp zprty, @@ -2464,10 +5276,20 @@ class sve_int_index_ri sz8_64, string asm, ZPRRegOp zprty, } multiclass sve_int_index_ri { - def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>; - def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>; + def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>; + def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>; def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>; def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>; + + // add(step_vector(step), dup(X)) -> index(X, step). 
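+  // (Same fold as the _ir form above, but here the broadcast base X comes
+  // from a GPR and the step is an immediate, matching INDEX's reg+imm form.)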
+ def : Pat<(add (nxv16i8 (step_vector_oneuse simm5_8b_tgt:$imm5)), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), + (!cast(NAME # "_B") GPR32:$Rm, (!cast("trunc_imm") $imm5))>; + def : Pat<(add (nxv8i16 (step_vector_oneuse simm5_16b_tgt:$imm5)), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))), + (!cast(NAME # "_H") GPR32:$Rm, (!cast("trunc_imm") $imm5))>; + def : Pat<(add (nxv4i32 (step_vector_oneuse simm5_32b_tgt:$imm5)), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))), + (!cast(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>; + def : Pat<(add (nxv2i64 (step_vector_oneuse simm5_64b_tgt:$imm5)), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))), + (!cast(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>; } class sve_int_index_rr sz8_64, string asm, ZPRRegOp zprty, @@ -2487,19 +5309,41 @@ class sve_int_index_rr sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; } -multiclass sve_int_index_rr { +multiclass sve_int_index_rr { def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>; def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>; def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>; def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>; + + // add(step_vector(step), dup(X)) -> index(X, step). + def : Pat<(add (nxv16i8 (step_vector_oneuse i8:$imm)), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))), + (!cast(NAME # "_B") GPR32:$Rn, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; + def : Pat<(add (nxv8i16 (step_vector_oneuse i16:$imm)), (nxv8i16 (AArch64dup(i32 GPR32:$Rn)))), + (!cast(NAME # "_H") GPR32:$Rn, (!cast("MOVi32imm") (!cast("trunc_imm") $imm)))>; + def : Pat<(add (nxv4i32 (step_vector_oneuse i32:$imm)), (nxv4i32 (AArch64dup(i32 GPR32:$Rn)))), + (!cast(NAME # "_S") GPR32:$Rn, (!cast("MOVi32imm") $imm))>; + def : Pat<(add (nxv2i64 (step_vector_oneuse i64:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + (!cast(NAME # "_D") GPR64:$Rn, (!cast("MOVi64imm") $imm))>; + def : Pat<(add (nxv2i64 (step_vector_oneuse i64imm_32bit_tgt:$imm)), (nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + (!cast(NAME # "_D") GPR64:$Rn, (SUBREG_TO_REG (i64 0), (!cast("MOVi32imm") (!cast("trunc_imm") $imm)), sub_32))>; + + // add(mul(step_vector(1), dup(Y)), dup(X)) -> index(X, Y). 
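+  // e.g. <0,1,2,...> * dup(y) + dup(x) = <x, x+y, x+2*y, ...>, which is an
+  // INDEX with both the base x and the stride y taken from registers.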
+ def : Pat<(add (mulop (nxv16i1 (AArch64ptrue 31)), (nxv16i8 (step_vector_oneuse (i8 1))), (nxv16i8 (AArch64dup(i32 GPR32:$Rm)))), (nxv16i8 (AArch64dup(i32 GPR32:$Rn)))), + (!cast(NAME # "_B") GPR32:$Rn, GPR32:$Rm)>; + def : Pat<(add (mulop (nxv8i1 (AArch64ptrue 31)), (nxv8i16 (step_vector_oneuse (i16 1))), (nxv8i16 (AArch64dup(i32 GPR32:$Rm)))),(nxv8i16 (AArch64dup(i32 GPR32:$Rn)))), + (!cast(NAME # "_H") GPR32:$Rn, GPR32:$Rm)>; + def : Pat<(add (mulop (nxv4i1 (AArch64ptrue 31)), (nxv4i32 (step_vector_oneuse (i32 1))), (nxv4i32 (AArch64dup(i32 GPR32:$Rm)))),(nxv4i32 (AArch64dup(i32 GPR32:$Rn)))), + (!cast(NAME # "_S") GPR32:$Rn, GPR32:$Rm)>; + def : Pat<(add (mulop (nxv2i1 (AArch64ptrue 31)), (nxv2i64 (step_vector_oneuse (i64 1))), (nxv2i64 (AArch64dup(i64 GPR64:$Rm)))),(nxv2i64 (AArch64dup(i64 GPR64:$Rn)))), + (!cast(NAME # "_D") GPR64:$Rn, GPR64:$Rm)>; } -// + //===----------------------------------------------------------------------===// // SVE Bitwise Shift - Predicated Group //===----------------------------------------------------------------------===// -class sve_int_bin_pred_shift_imm tsz8_64, bits<3> opc, string asm, - ZPRRegOp zprty, Operand immtype, - ElementSizeEnum size> + +class sve_int_bin_pred_shift_imm tsz8_64, bits<4> opc, string asm, + ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm), asm, "\t$Zdn, $Pg/m, $_Zdn, $imm", "", @@ -2509,8 +5353,8 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<3> opc, string asm, bits<6> imm; let Inst{31-24} = 0b00000100; let Inst{23-22} = tsz8_64{3-2}; - let Inst{21-19} = 0b000; - let Inst{18-16} = opc; + let Inst{21-20} = 0b00; + let Inst{19-16} = opc; let Inst{15-13} = 0b100; let Inst{12-10} = Pg; let Inst{9-8} = tsz8_64{1-0}; @@ -2518,44 +5362,100 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<3> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; - let ElementSize = size; + let DestructiveInstType = DestructiveBinaryImm; + let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, - ElementSizeB>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, - ElementSizeH> { +multiclass sve_int_bin_pred_shift_imm_left opc, string asm, string Ps, + SDPatternOperator op = null_frag> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, - ElementSizeS> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, - ElementSizeD> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_imm_right opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, - ElementSizeB>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, - ElementSizeH> { +// As above but 
shift amount takes the form of a "vector immediate". +multiclass sve_int_bin_pred_shift_imm_left_dup opc, string asm, + string Ps, SDPatternOperator op> +: sve_int_bin_pred_shift_imm_left { + def : SVE_Shift_DupImm_Pred_Pat(NAME # _B)>; + def : SVE_Shift_DupImm_Pred_Pat(NAME # _H)>; + def : SVE_Shift_DupImm_Pred_Pat(NAME # _S)>; + def : SVE_Shift_DupImm_Pred_Pat(NAME # _D)>; +} + +multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd { + def _ZERO_B : PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; +} + +multiclass sve_int_bin_pred_shift_imm_right opc, string asm, string Ps, + SDPatternOperator op = null_frag> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, - ElementSizeS> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, - ElementSizeD> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; +} + +// As above but shift amount takes the form of a "vector immediate". 
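+// E.g. (illustrative sketch; register choices hypothetical): a predicated
+// shift whose amount is a splatted constant, such as srl(pg, z0, (dup 3)) on
+// nxv8i16, is matched by the _dup patterns and emitted in the immediate form
+// "lsr z0.h, p0/m, z0.h, #3", so no vector register is spent on the splat.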
+multiclass sve_int_bin_pred_shift_imm_right_dup opc, string asm, + string Ps, SDPatternOperator op> +: sve_int_bin_pred_shift_imm_right { + def : SVE_Shift_DupImm_Pred_Pat(NAME # _B)>; + def : SVE_Shift_DupImm_Pred_Pat(NAME # _H)>; + def : SVE_Shift_DupImm_Pred_Pat(NAME # _S)>; + def : SVE_Shift_DupImm_Pred_Pat(NAME # _D)>; +} + +multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd { + def _ZERO_B : PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; } class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, @@ -2578,21 +5478,49 @@ class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift opc, string asm> { - def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>; - def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>; - def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>; - def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>; +multiclass sve_int_bin_pred_shift opc, string asm, string Ps, + SDPatternOperator op, string revname, bit isReverseInstr = 0> { + let DestructiveInstType = DestructiveBinaryCommWithRev in { + def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_wide opc, string asm> { +multiclass sve_int_bin_pred_zeroing_bhsd { + def _ZERO_B : PredTwoOpPseudo; + def _ZERO_H : PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; +} + +multiclass sve_int_bin_pred_shift_wide opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>; def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>; def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; } //===----------------------------------------------------------------------===// @@ -2618,17 +5546,22 @@ class sve_int_bin_cons_shift_wide sz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_wide opc, string asm> { +multiclass sve_int_bin_cons_shift_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_cons_shift_wide<0b00, opc, asm, ZPR8>; def _H : sve_int_bin_cons_shift_wide<0b01, opc, asm, ZPR16>; def _S : sve_int_bin_cons_shift_wide<0b10, opc, asm, ZPR32>; + + def : SVE_2_Op_Pred_All_Active(NAME # _B)>; 
+ def : SVE_2_Op_Pred_All_Active(NAME # _H)>; + def : SVE_2_Op_Pred_All_Active(NAME # _S)>; } class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, ZPRRegOp zprty, Operand immtype> : I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", - "", []>, Sched<[]> { + "", + []>, Sched<[]> { bits<5> Zd; bits<5> Zn; bits<6> imm; @@ -2643,7 +5576,8 @@ class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_imm_left opc, string asm> { +multiclass sve_int_bin_cons_shift_imm_left opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{19} = imm{3}; @@ -2655,9 +5589,15 @@ multiclass sve_int_bin_cons_shift_imm_left opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _B)>; + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _H)>; + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _S)>; + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _D)>; } -multiclass sve_int_bin_cons_shift_imm_right opc, string asm> { +multiclass sve_int_bin_cons_shift_imm_right opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{19} = imm{3}; @@ -2669,7 +5609,13 @@ multiclass sve_int_bin_cons_shift_imm_right opc, string asm> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } + + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _B)>; + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _H)>; + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _S)>; + def : SVE_Shift_DupImm_All_Active_Pat(NAME # _D)>; } + //===----------------------------------------------------------------------===// // SVE Memory - Store Group //===----------------------------------------------------------------------===// @@ -2856,6 +5802,60 @@ multiclass sve_mem_cstnt_ss msz, string asm, RegisterOperand listty, (!cast(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; } +class sve2_mem_sstnt_vs_base opc, string asm, + RegisterOperand listty, ZPRRegOp zprty> +: I<(outs), (ins listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), + asm, "\t$Zt, $Pg, [$Zn, $Rm]", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Rm; + bits<5> Zn; + bits<5> Zt; + let Inst{31-25} = 0b1110010; + let Inst{24-22} = opc; + let Inst{21} = 0b0; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zt; + + let mayStore = 1; +} + +multiclass sve2_mem_sstnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; + def : 
InstAlias(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; +} + class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, RegisterOperand VecList, RegisterOperand zprext> : I<(outs), (ins VecList:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), @@ -2880,32 +5880,84 @@ class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, let mayStore = 1; } -multiclass sve_mem_sst_sv_32_scaled opc, string asm, - RegisterOperand listty, - ZPRRegOp zprty, +multiclass sve_mem_32b_sst_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd > { - def _UXTW_SCALED : sve_mem_sst_sv; - def _SXTW_SCALED : sve_mem_sst_sv; + RegisterOperand uxtw_opnd, + ValueType vt > { + def _UXTW_SCALED : sve_mem_sst_sv; + def _SXTW_SCALED : sve_mem_sst_sv; def : InstAlias(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + (!cast(NAME # _UXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + (!cast(NAME # _SXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } -multiclass sve_mem_sst_sv_32_unscaled opc, string asm, - RegisterOperand listty, - ZPRRegOp zprty, - RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { - def _UXTW : sve_mem_sst_sv; - def _SXTW : sve_mem_sst_sv; +multiclass sve_mem_64b_sst_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt > { + def _UXTW_SCALED : sve_mem_sst_sv; + def _SXTW_SCALED : sve_mem_sst_sv; def : InstAlias(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + (!cast(NAME # _UXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + (!cast(NAME # _SXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; +} + +multiclass sve_mem_64b_sst_sv_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt> { + def _UXTW : sve_mem_sst_sv; + def _SXTW : sve_mem_sst_sv; + + def : InstAlias(NAME # _UXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + def : InstAlias(NAME # _SXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, 
ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; +} + +multiclass sve_mem_32b_sst_sv_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt> { + def _UXTW : sve_mem_sst_sv; + def _SXTW : sve_mem_sst_sv; + + def : InstAlias(NAME # _UXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + def : InstAlias(NAME # _SXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_sv2 msz, bit scaled, string asm, @@ -2932,19 +5984,28 @@ class sve_mem_sst_sv2 msz, bit scaled, string asm, } multiclass sve_mem_sst_sv_64_scaled msz, string asm, - RegisterOperand zprext> { - def "" : sve_mem_sst_sv2; + SDPatternOperator op, + RegisterOperand zprext, + ValueType vt> { + def _SCALED_REAL : sve_mem_sst_sv2; def : InstAlias(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + (!cast(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt), + (!cast(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_sst_sv_64_unscaled msz, string asm> { - def "" : sve_mem_sst_sv2; +multiclass sve_mem_sst_sv_64_unscaled msz, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve_mem_sst_sv2; def : InstAlias(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_vi opc, string asm, ZPRRegOp zprty, @@ -2970,16 +6031,38 @@ class sve_mem_sst_vi opc, string asm, ZPRRegOp zprty, let mayStore = 1; } -multiclass sve_mem_sst_vi_ptrs opc, string asm, RegisterOperand listty, - ZPRRegOp zprty, Operand imm_ty> { - def _IMM : sve_mem_sst_vi; +multiclass sve_mem_32b_sst_vi_ptrs opc, string asm, + Operand imm_ty, + SDPatternOperator op, + ValueType vt> { + def _IMM : sve_mem_sst_vi; def : InstAlias(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>; + (!cast(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>; def : InstAlias(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>; + (!cast(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt), + (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; +} + +multiclass sve_mem_64b_sst_vi_ptrs opc, string asm, + Operand imm_ty, + SDPatternOperator op, + ValueType vt> { + def _IMM : sve_mem_sst_vi; + + def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>; + def : InstAlias(NAME # _IMM) ZPR64:$Zt, 
PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; + def : InstAlias(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), + (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_z_spill @@ -3041,8 +6124,7 @@ class sve_int_perm_bin_perm_pp opc, bits<2> sz8_64, string asm, PPRRegOp pprty> : I<(outs pprty:$Pd), (ins pprty:$Pn, pprty:$Pm), asm, "\t$Pd, $Pn, $Pm", - "", - []>, Sched<[]> { + "", []>, Sched<[]> { bits<4> Pd; bits<4> Pm; bits<4> Pn; @@ -3058,11 +6140,17 @@ class sve_int_perm_bin_perm_pp opc, bits<2> sz8_64, string asm, let Inst{3-0} = Pd; } -multiclass sve_int_perm_bin_perm_pp opc, string asm> { +multiclass sve_int_perm_bin_perm_pp opc, string asm, + SDPatternOperator op> { def _B : sve_int_perm_bin_perm_pp; def _H : sve_int_perm_bin_perm_pp; def _S : sve_int_perm_bin_perm_pp; def _D : sve_int_perm_bin_perm_pp; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve_int_perm_punpk @@ -3080,6 +6168,14 @@ class sve_int_perm_punpk let Inst{3-0} = Pd; } +multiclass sve_int_perm_punpk { + def NAME : sve_int_perm_punpk; + + def : SVE_1_Op_Pat(NAME)>; + def : SVE_1_Op_Pat(NAME)>; + def : SVE_1_Op_Pat(NAME)>; +} + class sve_int_rdffr_pred : I<(outs PPR8:$Pd), (ins PPRAny:$Pg), asm, "\t$Pd, $Pg/z", @@ -3094,10 +6190,21 @@ class sve_int_rdffr_pred let Inst{4} = 0; let Inst{3-0} = Pd; - let Defs = !if(!eq (s, 1), [NZCV], []); + let Defs = !if(s, [NZCV], []); let Uses = [FFR]; } +multiclass sve_int_rdffr_pred { + def _REAL : sve_int_rdffr_pred; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>, + PseudoInstExpansion<(!cast(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>; + } +} + class sve_int_rdffr_unpred : I< (outs PPR8:$Pd), (ins), asm, "\t$Pd", @@ -3110,11 +6217,22 @@ class sve_int_rdffr_unpred : I< let Uses = [FFR]; } -class sve_int_wrffr +multiclass sve_int_rdffr_unpred { + def _REAL : sve_int_rdffr_unpred; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
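+  // Sketch of the idiom used here and for the first-faulting/non-faulting
+  // loads further down (names hypothetical):
+  //   def FOO_REAL : <real encoding, with implicit FFR use/def>;
+  //   def FOO      : Pseudo<(outs ...), (ins ...), [(set ..., (op ...))]>,
+  //                  PseudoInstExpansion<(FOO_REAL ...)>;
+  // Instruction selection only ever creates FOO; it is rewritten to FOO_REAL
+  // late, after the early machine passes that would complain about an FFR
+  // use with no preceding definition have already run.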
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>, + PseudoInstExpansion<(!cast(NAME # _REAL) PPR8:$Pd)>; + } +} + +class sve_int_wrffr : I<(outs), (ins PPR8:$Pn), asm, "\t$Pn", "", - []>, Sched<[]> { + [(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> { bits<4> Pn; let Inst{31-9} = 0b00100101001010001001000; let Inst{8-5} = Pn; @@ -3124,11 +6242,11 @@ class sve_int_wrffr let Defs = [FFR]; } -class sve_int_setffr +class sve_int_setffr : I<(outs), (ins), asm, "", "", - []>, Sched<[]> { + [(op)]>, Sched<[]> { let Inst{31-0} = 0b00100101001011001001000000000000; let hasSideEffects = 1; @@ -3160,11 +6278,16 @@ class sve_int_perm_clast_rz sz8_64, bit ab, string asm, let Constraints = "$Rdn = $_Rdn"; } -multiclass sve_int_perm_clast_rz { +multiclass sve_int_perm_clast_rz { def _B : sve_int_perm_clast_rz<0b00, ab, asm, ZPR8, GPR32>; def _H : sve_int_perm_clast_rz<0b01, ab, asm, ZPR16, GPR32>; def _S : sve_int_perm_clast_rz<0b10, ab, asm, ZPR32, GPR32>; def _D : sve_int_perm_clast_rz<0b11, ab, asm, ZPR64, GPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } class sve_int_perm_clast_vz sz8_64, bit ab, string asm, @@ -3188,11 +6311,17 @@ class sve_int_perm_clast_vz sz8_64, bit ab, string asm, let Constraints = "$Vdn = $_Vdn"; } -multiclass sve_int_perm_clast_vz { +multiclass sve_int_perm_clast_vz { def _B : sve_int_perm_clast_vz<0b00, ab, asm, ZPR8, FPR8>; def _H : sve_int_perm_clast_vz<0b01, ab, asm, ZPR16, FPR16>; def _S : sve_int_perm_clast_vz<0b10, ab, asm, ZPR32, FPR32>; def _D : sve_int_perm_clast_vz<0b11, ab, asm, ZPR64, FPR64>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; } class sve_int_perm_clast_zz sz8_64, bit ab, string asm, @@ -3214,15 +6343,26 @@ class sve_int_perm_clast_zz sz8_64, bit ab, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_perm_clast_zz { +multiclass sve_int_perm_clast_zz { def _B : sve_int_perm_clast_zz<0b00, ab, asm, ZPR8>; def _H : sve_int_perm_clast_zz<0b01, ab, asm, ZPR16>; def _S : sve_int_perm_clast_zz<0b10, ab, asm, ZPR32>; def _D : sve_int_perm_clast_zz<0b11, ab, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; } class sve_int_perm_last_r sz8_64, bit ab, string asm, @@ -3244,11 +6384,16 @@ class sve_int_perm_last_r sz8_64, bit ab, string asm, let Inst{4-0} = Rd; } -multiclass sve_int_perm_last_r { +multiclass sve_int_perm_last_r { def _B : sve_int_perm_last_r<0b00, ab, asm, ZPR8, GPR32>; def _H : sve_int_perm_last_r<0b01, ab, asm, ZPR16, GPR32>; def _S : sve_int_perm_last_r<0b10, ab, asm, ZPR32, GPR32>; def _D : sve_int_perm_last_r<0b11, ab, asm, ZPR64, GPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve_int_perm_last_v sz8_64, bit ab, string asm, @@ -3270,11 +6415,18 @@ class sve_int_perm_last_v sz8_64, bit ab, string asm, let Inst{4-0} = Vd; } -multiclass sve_int_perm_last_v { +multiclass 
sve_int_perm_last_v { def _B : sve_int_perm_last_v<0b00, ab, asm, ZPR8, FPR8>; def _H : sve_int_perm_last_v<0b01, ab, asm, ZPR16, FPR16>; def _S : sve_int_perm_last_v<0b10, ab, asm, ZPR32, FPR32>; def _D : sve_int_perm_last_v<0b11, ab, asm, ZPR64, FPR64>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + + def : SVE_2_Op_Pat(NAME # _H)>; } class sve_int_perm_splice sz8_64, string asm, ZPRRegOp zprty> @@ -3293,15 +6445,50 @@ class sve_int_perm_splice sz8_64, string asm, ZPRRegOp zprty> let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = ElementSizeNone; } -multiclass sve_int_perm_splice { +multiclass sve_int_perm_splice { def _B : sve_int_perm_splice<0b00, asm, ZPR8>; def _H : sve_int_perm_splice<0b01, asm, ZPR16>; def _S : sve_int_perm_splice<0b10, asm, ZPR32>; def _D : sve_int_perm_splice<0b11, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_3_Op_Pat(NAME # _H)>; +} + +class sve2_int_perm_splice_cons sz8_64, string asm, + ZPRRegOp zprty, RegisterOperand VecList> +: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, VecList:$Zn), + asm, "\t$Zd, $Pg, $Zn", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Zn; + bits<5> Zd; + let Inst{31-24} = 0b00000101; + let Inst{23-22} = sz8_64; + let Inst{21-13} = 0b101101100; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_int_perm_splice_cons { + def _B : sve2_int_perm_splice_cons<0b00, asm, ZPR8, ZZ_b>; + def _H : sve2_int_perm_splice_cons<0b01, asm, ZPR16, ZZ_h>; + def _S : sve2_int_perm_splice_cons<0b10, asm, ZPR32, ZZ_s>; + def _D : sve2_int_perm_splice_cons<0b11, asm, ZPR64, ZZ_d>; } class sve_int_perm_rev sz8_64, bits<2> opc, string asm, @@ -3323,30 +6510,44 @@ class sve_int_perm_rev sz8_64, bits<2> opc, string asm, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_rev_rbit { +multiclass sve_int_perm_rev_rbit { def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>; def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>; def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>; + + def : SVE_1_Op_Passthru_Pat(NAME # _B)>; + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; } -multiclass sve_int_perm_rev_revb { +multiclass sve_int_perm_rev_revb { def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>; def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>; + + def : SVE_1_Op_Passthru_Pat(NAME # _H)>; + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; } -multiclass sve_int_perm_rev_revh { +multiclass sve_int_perm_rev_revh { def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>; + + def : SVE_1_Op_Passthru_Pat(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat(NAME # _D)>; } -multiclass sve_int_perm_rev_revw { +multiclass sve_int_perm_rev_revw { def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>; + + def : SVE_1_Op_Passthru_Pat(NAME # 
_D)>; } class sve_int_perm_cpy_r sz8_64, string asm, ZPRRegOp zprty, @@ -3366,11 +6567,11 @@ class sve_int_perm_cpy_r sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_cpy_r { +multiclass sve_int_perm_cpy_r { def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>; def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>; def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>; @@ -3384,6 +6585,15 @@ multiclass sve_int_perm_cpy_r { (!cast(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Rn", (!cast(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>; + + def : Pat<(nxv16i8 (op nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)), + (!cast(NAME # _B) $passthru, $pg, $splat)>; + def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv4i32 (op nxv4i1:$pg, i32:$splat, nxv4i32:$passthru)), + (!cast(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)), + (!cast(NAME # _D) $passthru, $pg, $splat)>; } class sve_int_perm_cpy_v sz8_64, string asm, ZPRRegOp zprty, @@ -3403,11 +6613,11 @@ class sve_int_perm_cpy_v sz8_64, string asm, ZPRRegOp zprty, let Inst{4-0} = Zd; let Constraints = "$Zd = $_Zd"; - let DestructiveInstType = Destructive; + let DestructiveInstType = DestructiveOther; let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_cpy_v { +multiclass sve_int_perm_cpy_v { def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>; def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>; def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>; @@ -3421,6 +6631,18 @@ multiclass sve_int_perm_cpy_v { (!cast(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>; def : InstAlias<"mov $Zd, $Pg/m, $Vn", (!cast(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>; + + def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; + def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)), + (!cast(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)), + (!cast(NAME # _S) $passthru, $pg, $splat)>; + def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)), + (!cast(NAME # _D) $passthru, $pg, $splat)>; + + def : Pat<(nxv8bf16 (op nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)), + (!cast(NAME # _H) $passthru, $pg, $splat)>; } class sve_int_perm_compact @@ -3439,11 +6661,15 @@ class sve_int_perm_compact let Inst{4-0} = Zd; } -multiclass sve_int_perm_compact { +multiclass sve_int_perm_compact { def _S : sve_int_perm_compact<0b0, asm, ZPR32>; def _D : sve_int_perm_compact<0b1, asm, ZPR64>; -} + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; + def : SVE_2_Op_Pat(NAME # _D)>; +} //===----------------------------------------------------------------------===// // SVE Memory - Contiguous Load Group @@ -3469,8 +6695,8 @@ class sve_mem_cld_si_base dtype, bit nf, string asm, let Inst{4-0} = Zt; let mayLoad = 1; - let Uses = !if(!eq(nf, 1), [FFR], []); - let Defs = !if(!eq(nf, 1), [FFR], []); + let Uses = !if(nf, [FFR], []); + let Defs = !if(nf, [FFR], []); } multiclass sve_mem_cld_si_base dtype, bit nf, string asm, @@ -3483,6 +6709,13 @@ multiclass sve_mem_cld_si_base dtype, bit nf, string asm, (!cast(NAME # _REAL) 
zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>; + } } multiclass sve_mem_cld_si dtype, string asm, RegisterOperand listty, @@ -3665,8 +6898,8 @@ class sve_mem_cld_ss_base dtype, bit ff, dag iops, string asm, let Inst{4-0} = Zt; let mayLoad = 1; - let Uses = !if(!eq(ff, 1), [FFR], []); - let Defs = !if(!eq(ff, 1), [FFR], []); + let Uses = !if(ff, [FFR], []); + let Defs = !if(ff, [FFR], []); } multiclass sve_mem_cld_ss dtype, string asm, RegisterOperand listty, @@ -3691,6 +6924,13 @@ multiclass sve_mem_cldff_ss dtype, string asm, RegisterOperand listty, def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>; + } } multiclass sve_mem_cldnf_si dtype, string asm, RegisterOperand listty, @@ -3783,8 +7023,11 @@ class sve_mem_32b_gld_sv opc, bit xs, bit scaled, string asm, } multiclass sve_mem_32b_gld_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv; def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv; @@ -3792,11 +7035,28 @@ multiclass sve_mem_32b_gld_sv_32_scaled opc, string asm, (!cast(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
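+  // E.g. (illustrative sketch; register choices hypothetical): a gather of
+  // 32-bit elements whose offsets are sign-extended words scaled by the
+  // element size selects to
+  //   ld1w { z0.s }, p0/z, [x0, z1.s, sxtw #2]
+  // via the _SXTW_SCALED pseudo defined just below.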
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), + (!cast(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), + (!cast(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_REAL : sve_mem_32b_gld_sv; def _SXTW_REAL : sve_mem_32b_gld_sv; @@ -3804,6 +7064,20 @@ multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, (!cast(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), + (!cast(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), + (!cast(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } @@ -3831,7 +7105,8 @@ class sve_mem_32b_gld_vi opc, string asm, Operand imm_ty> let Uses = !if(!eq(opc{0}, 1), [FFR], []); } -multiclass sve_mem_32b_gld_vi_32_ptrs opc, string asm, Operand imm_ty> { +multiclass sve_mem_32b_gld_vi_32_ptrs opc, string asm, Operand imm_ty, + SDPatternOperator op, ValueType vt> { def _IMM_REAL : sve_mem_32b_gld_vi; def : InstAlias opc, string asm, Operand imm_ty> { (!cast(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
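+  // E.g. (illustrative sketch; register choices hypothetical): with a vector
+  // of 32-bit pointers in z1.s, a gather at a small constant byte offset
+  // selects to
+  //   ld1w { z0.s }, p0/z, [z1.s, #16]
+  // through the _IMM pseudo defined just below.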
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>; + } + + def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)), + (!cast(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_prfm_si msz, string asm> @@ -3920,9 +7205,17 @@ class sve_mem_32b_prfm_sv msz, bit xs, string asm, multiclass sve_mem_32b_prfm_sv_scaled msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + SDPatternOperator op_sxtw, + SDPatternOperator op_uxtw> { def _UXTW_SCALED : sve_mem_32b_prfm_sv; def _SXTW_SCALED : sve_mem_32b_prfm_sv; + + def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; } class sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> @@ -3945,11 +7238,14 @@ class sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> let Inst{3-0} = prfop; } -multiclass sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> { +multiclass sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_32b_prfm_vi; def : InstAlias(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; } class sve_mem_z_fill @@ -4003,6 +7299,65 @@ multiclass sve_mem_p_fill { (!cast(NAME) PPRAny:$Pt, GPR64sp:$Rn, 0), 1>; } +class sve2_mem_gldnt_vs_base opc, dag iops, string asm, + RegisterOperand VecList> +: I<(outs VecList:$Zt), iops, + asm, "\t$Zt, $Pg/z, [$Zn, $Rm]", + "", + []>, Sched<[]> { + bits<3> Pg; + bits<5> Rm; + bits<5> Zn; + bits<5> Zt; + let Inst{31} = 0b1; + let Inst{30} = opc{4}; + let Inst{29-25} = 0b00010; + let Inst{24-23} = opc{3-2}; + let Inst{22-21} = 0b00; + let Inst{20-16} = Rm; + let Inst{15} = 0b1; + let Inst{14-13} = opc{1-0}; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve2_mem_gldnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; +} + 
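+// E.g. (illustrative sketch; register choices hypothetical): the SVE2
+// non-temporal gathers above take their base addresses from a vector plus a
+// scalar register offset, e.g.
+//   ldnt1w { z0.s }, p0/z, [z1.s, x0]
+// and are matched from the corresponding masked non-temporal gather nodes
+// passed in via the op parameter.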
//===----------------------------------------------------------------------===// // SVE Memory - 64-bit Gather Group //===----------------------------------------------------------------------===// @@ -4037,8 +7392,11 @@ class sve_mem_64b_gld_sv opc, bit xs, bit scaled, bit lsl, string asm, } multiclass sve_mem_64b_gld_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv; def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv; @@ -4046,11 +7404,28 @@ multiclass sve_mem_64b_gld_sv_32_scaled opc, string asm, (!cast(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_REAL : sve_mem_64b_gld_sv; def _SXTW_REAL : sve_mem_64b_gld_sv; @@ -4058,21 +7433,57 @@ multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, (!cast(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } multiclass sve_mem_64b_gld_sv2_64_scaled opc, string asm, - RegisterOperand zprext> { + SDPatternOperator op, + RegisterOperand zprext, ValueType vt> { def _SCALED_REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm> { +multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, + SDPatternOperator op, ValueType vt> { def _REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>; + } + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> @@ -4099,7 +7510,8 @@ class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> let Uses = !if(!eq(opc{0}, 1), [FFR], []); } -multiclass sve_mem_64b_gld_vi_64_ptrs opc, string asm, Operand imm_ty> { +multiclass sve_mem_64b_gld_vi_64_ptrs opc, string asm, Operand imm_ty, + SDPatternOperator op, ValueType vt> { def _IMM_REAL : sve_mem_64b_gld_vi; def : InstAlias opc, string asm, Operand imm_ty> { (!cast(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>; + } + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)), + (!cast(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } // bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl) @@ -4137,16 +7559,28 @@ class sve_mem_64b_prfm_sv msz, bit xs, bit lsl, string asm, multiclass sve_mem_64b_prfm_sv_ext_scaled msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + SDPatternOperator op_sxtw, + SDPatternOperator op_uxtw> { def _UXTW_SCALED : sve_mem_64b_prfm_sv; def _SXTW_SCALED : sve_mem_64b_prfm_sv; + + def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } multiclass sve_mem_64b_prfm_sv_lsl_scaled msz, string asm, - RegisterOperand zprext> { + RegisterOperand zprext, SDPatternOperator frag> { def NAME : sve_mem_64b_prfm_sv; -} + def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + +} class sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty> : I<(outs), (ins sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), @@ -4170,13 +7604,15 @@ class sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty> let hasSideEffects = 1; } -multiclass sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty> { +multiclass sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_64b_prfm_vi; def : InstAlias(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; -} + def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; +} //===----------------------------------------------------------------------===// // SVE Compute Vector Address Group @@ -4229,7 +7665,6 @@ multiclass sve_int_bin_cons_misc_0_a_64_lsl opc, string asm> { def _3 : sve_int_bin_cons_misc_0_a; } - //===----------------------------------------------------------------------===// // SVE Integer Misc - Unpredicated Group //===----------------------------------------------------------------------===// @@ -4251,10 +7686,14 @@ class sve_int_bin_cons_misc_0_b sz, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_misc_0_b { +multiclass sve_int_bin_cons_misc_0_b { def _H : sve_int_bin_cons_misc_0_b<0b01, asm, ZPR16>; def _S : sve_int_bin_cons_misc_0_b<0b10, asm, ZPR32>; def _D : sve_int_bin_cons_misc_0_b<0b11, asm, ZPR64>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve_int_bin_cons_misc_0_c opc, string asm, ZPRRegOp zprty> @@ -4274,13 +7713,23 @@ class sve_int_bin_cons_misc_0_c opc, string asm, ZPRRegOp zprty> let Inst{4-0} = Zd; } +multiclass sve_int_bin_cons_misc_0_c_fexpa { + def _H : sve_int_bin_cons_misc_0_c<0b01000000, asm, ZPR16>; + def _S : sve_int_bin_cons_misc_0_c<0b10000000, asm, ZPR32>; + 
def _D : sve_int_bin_cons_misc_0_c<0b11000000, asm, ZPR64>; + + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; + def : SVE_1_Op_Pat(NAME # _D)>; +} + //===----------------------------------------------------------------------===// // SVE Integer Reduction Group //===----------------------------------------------------------------------===// class sve_int_reduce sz8_32, bits<2> fmt, bits<3> opc, string asm, - ZPRRegOp zprty, RegisterClass regtype> -: I<(outs regtype:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), + ZPRRegOp zprty, FPRasZPROperand dstOpType> +: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn), asm, "\t$Vd, $Pg, $Zn", "", []>, Sched<[]> { @@ -4298,31 +7747,54 @@ class sve_int_reduce sz8_32, bits<2> fmt, bits<3> opc, string asm, let Inst{4-0} = Vd; } -multiclass sve_int_reduce_0_saddv opc, string asm> { - def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>; - def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>; - def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>; +multiclass sve_int_reduce_0_saddv opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>; + def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>; + def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; } -multiclass sve_int_reduce_0_uaddv opc, string asm> { - def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64>; - def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64>; - def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64>; - def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64>; +multiclass sve_int_reduce_0_uaddv opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b00, opc, asm, ZPR8, FPR64asZPR>; + def _H : sve_int_reduce<0b01, 0b00, opc, asm, ZPR16, FPR64asZPR>; + def _S : sve_int_reduce<0b10, 0b00, opc, asm, ZPR32, FPR64asZPR>; + def _D : sve_int_reduce<0b11, 0b00, opc, asm, ZPR64, FPR64asZPR>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve_int_reduce_1 opc, string asm> { - def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8>; - def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16>; - def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32>; - def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64>; +multiclass sve_int_reduce_1 opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b01, opc, asm, ZPR8, FPR8asZPR>; + def _H : sve_int_reduce<0b01, 0b01, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_int_reduce<0b10, 0b01, opc, asm, ZPR32, FPR32asZPR>; + def _D : sve_int_reduce<0b11, 0b01, opc, asm, ZPR64, FPR64asZPR>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve_int_reduce_2 opc, string asm> { - def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8>; - def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16>; - def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, FPR32>; - def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64>; +multiclass sve_int_reduce_2 opc, string asm, + SDPatternOperator op> { + def _B : sve_int_reduce<0b00, 0b11, opc, asm, ZPR8, FPR8asZPR>; + def _H : sve_int_reduce<0b01, 0b11, opc, asm, ZPR16, FPR16asZPR>; + def _S : sve_int_reduce<0b10, 0b11, opc, asm, ZPR32, 
FPR32asZPR>; + def _D : sve_int_reduce<0b11, 0b11, opc, asm, ZPR64, FPR64asZPR>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve_int_movprfx_pred sz8_32, bits<3> opc, string asm, @@ -4398,6 +7870,12 @@ class sve_int_brkp opc, string asm> let Defs = !if(!eq (opc{1}, 1), [NZCV], []); } +multiclass sve_int_brkp opc, string asm, SDPatternOperator op> { + def NAME : sve_int_brkp; + + def : SVE_3_Op_Pat(NAME)>; +} + //===----------------------------------------------------------------------===// // SVE Partition Break Group @@ -4421,7 +7899,13 @@ class sve_int_brkn let Inst{3-0} = Pdm; let Constraints = "$Pdm = $_Pdm"; - let Defs = !if(!eq (S, 0b1), [NZCV], []); + let Defs = !if(S, [NZCV], []); +} + +multiclass sve_int_brkn opc, string asm, SDPatternOperator op> { + def NAME : sve_int_brkn; + + def : SVE_3_Op_Pat(NAME)>; } class sve_int_break opc, string asm, string suffix, dag iops> @@ -4446,11 +7930,586 @@ class sve_int_break opc, string asm, string suffix, dag iops> } -multiclass sve_int_break_m opc, string asm> { +multiclass sve_int_break_m opc, string asm, SDPatternOperator op> { def NAME : sve_int_break; + + def : SVE_3_Op_Pat(NAME)>; } -multiclass sve_int_break_z opc, string asm> { +multiclass sve_int_break_z opc, string asm, SDPatternOperator op> { def NAME : sve_int_break; + + def : SVE_2_Op_Pat(NAME)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 String Processing Group +//===----------------------------------------------------------------------===// + +class sve2_char_match +: I<(outs pprty:$Pd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm), + asm, "\t$Pd, $Pg/z, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<4> Pd; + bits<3> Pg; + bits<5> Zm; + bits<5> Zn; + let Inst{31-23} = 0b010001010; + let Inst{22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b100; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4} = opc; + let Inst{3-0} = Pd; + + let Defs = [NZCV]; + let isPTestLike = 1; +} + +multiclass sve2_char_match { + def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>; + def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; +} + +//===----------------------------------------------------------------------===// +// SVE2 Histogram Computation - Segment Group +//===----------------------------------------------------------------------===// + +class sve2_hist_gen_segment +: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + [(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000101001; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b101000; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +//===----------------------------------------------------------------------===// +// SVE2 Histogram Computation - Vector Group +//===----------------------------------------------------------------------===// + +class sve2_hist_gen_vector +: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm), + asm, "\t$Zd, $Pg/z, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zn; + bits<3> Pg; + bits<5> Zm; + let Inst{31-23} = 0b010001011; + let Inst{22} = sz; + let Inst{21} = 0b1; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b110; + let Inst{12-10} = Pg; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_hist_gen_vector { + def _S : 
+
+//===----------------------------------------------------------------------===//
+// SVE2 Histogram Computation - Vector Group
+//===----------------------------------------------------------------------===//
+
+class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zn, zprty:$Zm),
+  asm, "\t$Zd, $Pg/z, $Zn, $Zm",
+  "",
+  []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<3> Pg;
+  bits<5> Zm;
+  let Inst{31-23} = 0b010001011;
+  let Inst{22}    = sz;
+  let Inst{21}    = 0b1;
+  let Inst{20-16} = Zm;
+  let Inst{15-13} = 0b110;
+  let Inst{12-10} = Pg;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
+multiclass sve2_hist_gen_vector<string asm, SDPatternOperator op> {
+  def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
+  def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE2 Crypto Extensions Group
+//===----------------------------------------------------------------------===//
+
+class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
+  asm, "\t$Zd, $Zn, $Zm",
+  "",
+  []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-21} = 0b01000101001;
+  let Inst{20-16} = Zm;
+  let Inst{15-11} = 0b11110;
+  let Inst{10}    = opc;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
+multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty,
+                                   SDPatternOperator op, ValueType vt> {
+  def NAME : sve2_crypto_cons_bin_op<opc, asm, zprty>;
+  def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
+  asm, "\t$Zdn, $_Zdn, $Zm",
+  "",
+  []>, Sched<[]> {
+  bits<5> Zdn;
+  bits<5> Zm;
+  let Inst{31-17} = 0b010001010010001;
+  let Inst{16}    = opc{1};
+  let Inst{15-11} = 0b11100;
+  let Inst{10}    = opc{0};
+  let Inst{9-5}   = Zm;
+  let Inst{4-0}   = Zdn;
+
+  let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
+                                  SDPatternOperator op, ValueType vt> {
+  def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>;
+  def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn),
+  asm, "\t$Zdn, $_Zdn",
+  "",
+  []>, Sched<[]> {
+  bits<5> Zdn;
+  let Inst{31-11} = 0b010001010010000011100;
+  let Inst{10}    = opc;
+  let Inst{9-5}   = 0b00000;
+  let Inst{4-0}   = Zdn;
+
+  let Constraints = "$Zdn = $_Zdn";
+}
+
+multiclass sve2_crypto_unary_op<bit opc, string asm, SDPatternOperator op> {
+  def NAME : sve2_crypto_unary_op<opc, asm, ZPR8>;
+  def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE BFloat16 Group
+//===----------------------------------------------------------------------===//
+
+class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops>
+: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> {
+  bits<5> Zda;
+  bits<5> Zn;
+  let Inst{31-21} = 0b01100100011;
+  let Inst{15-14} = opc;
+  let Inst{13-10} = 0b0000;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ElementSizeH;
+}
+
+class sve_bfloat_dot<string asm>
+: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm",
+                      (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> {
+  bits<5> Zm;
+  let Inst{20-16} = Zm;
+}
+
+multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_dot<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_dot_indexed<string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+                      (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
+  bits<2> iop;
+  bits<3> Zm;
+  let Inst{20-19} = iop;
+  let Inst{18-16} = Zm;
+}
+
+multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_dot_indexed<asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
+  asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zm;
+  bits<5> Zda;
+  bits<5> Zn;
+  let Inst{31-21} = 0b01100100011;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b111001;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ElementSizeH;
+}
+
+multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul<asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
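+
+// Illustrative instantiations (a sketch; the BF16 defm lines live in
+// AArch64SVEInstrInfo.td behind the BF16 feature predicate):
+//   defm BFDOT_ZZZ  : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
+//   defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;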
+
+class sve_bfloat_matmul_longvecl<bit BT, string asm>
+: sve_bfloat_matmul<asm> {
+  let Inst{23}    = 0b1;
+  let Inst{14-13} = 0b00;
+  let Inst{10}    = BT;
+}
+
+multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+                      (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
+  bits<3> iop;
+  bits<3> Zm;
+  let Inst{23}    = 0b1;
+  let Inst{20-19} = iop{2-1};
+  let Inst{18-16} = Zm;
+  let Inst{11}    = iop{0};
+  let Inst{10}    = BT;
+}
+
+multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_convert<bit N, string asm>
+: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
+  asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
+  bits<5> Zd;
+  bits<3> Pg;
+  bits<5> Zn;
+  let Inst{31-25} = 0b0110010;
+  let Inst{24}    = N;
+  let Inst{23-13} = 0b10001010101;
+  let Inst{12-10} = Pg;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+
+  let Constraints = "$Zd = $_Zd";
+  let DestructiveInstType = DestructiveOther;
+  let hasSideEffects = 1;
+  let ElementSize = ElementSizeS;
+}
+
+multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
+  def NAME : sve_bfloat_convert<N, asm>;
+  def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Matrix Multiply Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_matmul<bits<2> uns, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
+  "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zda;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-24} = 0b01000101;
+  let Inst{23-22} = uns;
+  let Inst{21}    = 0;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b100110;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_matmul<bits<2> uns, string asm, SDPatternOperator op> {
+  def NAME : sve_int_matmul<uns, asm>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Mixed Sign Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dot_mixed<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
+  "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zda;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-21} = 0b01000100100;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b011110;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_dot_mixed<string asm, SDPatternOperator op> {
+  def NAME : sve_int_dot_mixed<asm>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Mixed Sign - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dot_mixed_indexed<bit U, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx),
+  asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> {
+  bits<5> Zda;
+  bits<5> Zn;
+  bits<3> Zm;
+  bits<2> idx;
+  let Inst{31-21} = 0b01000100101;
+  let Inst{20-19} = idx;
+  let Inst{18-16} = Zm;
+  let Inst{15-11} = 0b00011;
+  let Inst{10}    = U;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = ZPR32.ElementSize;
+}
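+
+// Illustrative instantiations (a sketch; the actual defm lines sit behind the
+// I8MM feature predicate in AArch64SVEInstrInfo.td):
+//   defm SMMLA_ZZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
+//   defm USDOT_ZZZ  : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;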
+
+multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {
+  def NAME : sve_int_dot_mixed_indexed<U, asm>;
+
+  def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Matrix Multiply Accumulate Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+  asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+  bits<5> Zda;
+  bits<5> Zn;
+  bits<5> Zm;
+  let Inst{31-23} = 0b011001001;
+  let Inst{22}    = sz;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Zm;
+  let Inst{15-10} = 0b111001;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zda;
+
+  let Constraints = "$Zda = $_Zda";
+  let DestructiveInstType = DestructiveOther;
+  let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty, SDPatternOperator op, ValueType vt> {
+  def NAME : sve_fp_matrix_mla<sz, asm, zprty>;
+
+  def : SVE_3_Op_Pat<vt, op, vt, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - Contiguous Load And Replicate 256-bit Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4),
+  asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
+  bits<5> Zt;
+  bits<5> Rn;
+  bits<3> Pg;
+  bits<4> imm4;
+  let Inst{31-25} = 0b1010010;
+  let Inst{24-23} = sz;
+  let Inst{22-20} = 0b010;
+  let Inst{19-16} = imm4;
+  let Inst{15-13} = 0b001;
+  let Inst{12-10} = Pg;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Zt;
+
+  let mayLoad = 1;
+}
+
+multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
+                           ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
+  def NAME : sve_mem_ldor_si<sz, asm, listty>;
+  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+                  (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
+                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
+
+  // Base addressing mode
+  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)),
+            (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
+  let AddedComplexity = 2 in {
+    // Reg + Imm addressing mode
+    def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))),
+              (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>;
+  }
+}
+
+class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
+                      RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+  asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
+  bits<5> Zt;
+  bits<3> Pg;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-25} = 0b1010010;
+  let Inst{24-23} = sz;
+  let Inst{22-21} = 0b01;
+  let Inst{20-16} = Rm;
+  let Inst{15-13} = 0;
+  let Inst{12-10} = Pg;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Zt;
+
+  let mayLoad = 1;
+}
+
+multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty,
+                           ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty,
+                           ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> {
+  def NAME : sve_mem_ldor_ss<sz, asm, listty, gprty>;
+
+  def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+                  (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+
+  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
+            (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
+}
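+
+// Illustrative instantiation (a sketch only; the exact register-list operand
+// and SDNode names, e.g. Z_b and AArch64ld1ro_z, are taken from
+// AArch64SVEInstrInfo.td and may differ there):
+//   defm LD1RO_B : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8,
+//                                  nxv16i8, nxv16i1, AArch64ld1ro_z>;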
+
+//===----------------------------------------------------------------------===//
+// SVE Interleave 128-bit Elements Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm>
+: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm),
+  asm, "\t$Zd, $Zn, $Zm",
+  "",
+  []>, Sched<[]> {
+  bits<5> Zd;
+  bits<5> Zm;
+  bits<5> Zn;
+  let Inst{31-21} = 0b00000101101;
+  let Inst{20-16} = Zm;
+  let Inst{15-13} = 0b000;
+  let Inst{12-11} = opc;
+  let Inst{10}    = P;
+  let Inst{9-5}   = Zn;
+  let Inst{4-0}   = Zd;
+}
+
+multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatternOperator op> {
+  def NAME : sve_int_perm_bin_perm_128_zz<opc, P, asm>;
+
+  def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>;
+}
+
+/// Addressing modes
+def am_sve_indexed_s4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8, 7>", [], [SDNPWantRoot]>;
+def am_sve_indexed_s6 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32, 31>", [], [SDNPWantRoot]>;
+
+def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>;
+def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>;
+def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>;
+def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>;
+
+// Predicated pseudo floating point two operand instructions.
+multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
+  def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+  def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+  def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+  def : SVE_3_Op_Pat<nxv4f16, op, nxv4i1, nxv4f16, nxv4f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+  def : SVE_3_Op_Pat<nxv2f16, op, nxv2i1, nxv2f16, nxv2f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+  def : SVE_3_Op_Pat<nxv2f32, op, nxv2i1, nxv2f32, nxv2f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+  def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// Predicated pseudo integer two operand instructions.
+multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
+  def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+  def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+  def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// As sve_int_bin_pred but when only i32 and i64 vector types are required.
+multiclass sve_int_bin_pred_sd<SDPatternOperator op> {
+  def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// Predicated pseudo integer two operand instructions. Second operand is an
+// immediate specified by imm_[bhsd].
+multiclass sve_int_shift_pred_bhsd<SDPatternOperator op,
+                                   ComplexPattern imm_b, ComplexPattern imm_h,
+                                   ComplexPattern imm_s, ComplexPattern imm_d> {
+  def _UNDEF_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, Operand<i32>, FalseLanesUndef>;
+  def _UNDEF_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, Operand<i32>, FalseLanesUndef>;
+  def _UNDEF_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, Operand<i32>, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, Operand<i32>, FalseLanesUndef>;
+
+  def : SVE_Shift_DupImm_Pred_Pat<nxv16i8, op, nxv16i1, i32, imm_b, !cast<Instruction>(NAME # _UNDEF_B)>;
+  def : SVE_Shift_DupImm_Pred_Pat<nxv8i16, op, nxv8i1, i32, imm_h, !cast<Instruction>(NAME # _UNDEF_H)>;
+  def : SVE_Shift_DupImm_Pred_Pat<nxv4i32, op, nxv4i1, i32, imm_s, !cast<Instruction>(NAME # _UNDEF_S)>;
+  def : SVE_Shift_DupImm_Pred_Pat<nxv2i64, op, nxv2i1, i64, imm_d, !cast<Instruction>(NAME # _UNDEF_D)>;
+}
+
+multiclass sve_int_bin_pred_all_active_bhsd<SDPatternOperator op> {
+  def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+  def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+  def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+  def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+  def : SVE_2_Op_Pred_All_Active_Pt<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
 }
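+
+// Illustrative pairing (a sketch; names follow AArch64SVEInstrInfo.td and the
+// exact multiclass signature there may differ): a destructive operation
+// defines the real predicated instruction plus an _UNDEF pseudo family that
+// later expands onto it, e.g.:
+//   defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", "MUL_ZPZZ",
+//                                           int_aarch64_sve_mul, DestructiveBinaryComm>;
+//   defm MUL_ZPZZ : sve_int_bin_pred_bhsd<AArch64mul_p>;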