Ryujinx/ARMeilleure/Instructions/InstEmitSimdHelper.cs

using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.State;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using System.Reflection;

using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;

namespace ARMeilleure.Instructions
{
    using Func1I = Func<Operand, Operand>;
    using Func2I = Func<Operand, Operand, Operand>;
    using Func3I = Func<Operand, Operand, Operand, Operand>;

    static class InstEmitSimdHelper
    {
#region "Masks"
        public static readonly long[] EvenMasks = new long[]
        {
            14L << 56 | 12L << 48 | 10L << 40 | 08L << 32 | 06L << 24 | 04L << 16 | 02L << 8 | 00L << 0, // B
            13L << 56 | 12L << 48 | 09L << 40 | 08L << 32 | 05L << 24 | 04L << 16 | 01L << 8 | 00L << 0, // H
            11L << 56 | 10L << 48 | 09L << 40 | 08L << 32 | 03L << 24 | 02L << 16 | 01L << 8 | 00L << 0  // S
        };

        public static readonly long[] OddMasks = new long[]
        {
            15L << 56 | 13L << 48 | 11L << 40 | 09L << 32 | 07L << 24 | 05L << 16 | 03L << 8 | 01L << 0, // B
            15L << 56 | 14L << 48 | 11L << 40 | 10L << 32 | 07L << 24 | 06L << 16 | 03L << 8 | 02L << 0, // H
            15L << 56 | 14L << 48 | 13L << 40 | 12L << 32 | 07L << 24 | 06L << 16 | 05L << 8 | 04L << 0  // S
        };

        public static readonly long ZeroMask = 128L << 56 | 128L << 48 | 128L << 40 | 128L << 32 | 128L << 24 | 128L << 16 | 128L << 8 | 128L << 0;
#endregion

#region "X86 SSE Intrinsics"
        public static readonly Intrinsic[] X86PaddInstruction = new Intrinsic[]
        {
            Intrinsic.X86Paddb,
            Intrinsic.X86Paddw,
            Intrinsic.X86Paddd,
            Intrinsic.X86Paddq
        };

        public static readonly Intrinsic[] X86PcmpeqInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pcmpeqb,
            Intrinsic.X86Pcmpeqw,
            Intrinsic.X86Pcmpeqd,
            Intrinsic.X86Pcmpeqq
        };

        public static readonly Intrinsic[] X86PcmpgtInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pcmpgtb,
            Intrinsic.X86Pcmpgtw,
            Intrinsic.X86Pcmpgtd,
            Intrinsic.X86Pcmpgtq
        };

        public static readonly Intrinsic[] X86PmaxsInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pmaxsb,
            Intrinsic.X86Pmaxsw,
            Intrinsic.X86Pmaxsd
        };

        public static readonly Intrinsic[] X86PmaxuInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pmaxub,
            Intrinsic.X86Pmaxuw,
            Intrinsic.X86Pmaxud
        };

        public static readonly Intrinsic[] X86PminsInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pminsb,
            Intrinsic.X86Pminsw,
            Intrinsic.X86Pminsd
        };

        public static readonly Intrinsic[] X86PminuInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pminub,
            Intrinsic.X86Pminuw,
            Intrinsic.X86Pminud
        };

        public static readonly Intrinsic[] X86PmovsxInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pmovsxbw,
            Intrinsic.X86Pmovsxwd,
            Intrinsic.X86Pmovsxdq
        };

        public static readonly Intrinsic[] X86PmovzxInstruction = new Intrinsic[]
        {
            Intrinsic.X86Pmovzxbw,
            Intrinsic.X86Pmovzxwd,
            Intrinsic.X86Pmovzxdq
        };

        public static readonly Intrinsic[] X86PsllInstruction = new Intrinsic[]
        {
            0,
            Intrinsic.X86Psllw,
            Intrinsic.X86Pslld,
            Intrinsic.X86Psllq
        };

        public static readonly Intrinsic[] X86PsraInstruction = new Intrinsic[]
        {
            0,
            Intrinsic.X86Psraw,
            Intrinsic.X86Psrad
        };

        public static readonly Intrinsic[] X86PsrlInstruction = new Intrinsic[]
        {
            0,
            Intrinsic.X86Psrlw,
            Intrinsic.X86Psrld,
            Intrinsic.X86Psrlq
        };

        public static readonly Intrinsic[] X86PsubInstruction = new Intrinsic[]
        {
            Intrinsic.X86Psubb,
            Intrinsic.X86Psubw,
            Intrinsic.X86Psubd,
            Intrinsic.X86Psubq
        };

        public static readonly Intrinsic[] X86PunpckhInstruction = new Intrinsic[]
        {
            Intrinsic.X86Punpckhbw,
            Intrinsic.X86Punpckhwd,
            Intrinsic.X86Punpckhdq,
            Intrinsic.X86Punpckhqdq
        };

        public static readonly Intrinsic[] X86PunpcklInstruction = new Intrinsic[]
        {
            Intrinsic.X86Punpcklbw,
            Intrinsic.X86Punpcklwd,
            Intrinsic.X86Punpckldq,
            Intrinsic.X86Punpcklqdq
        };
#endregion

        public static int GetImmShl(OpCodeSimdShImm op)
        {
            return op.Imm - (8 << op.Size);
        }

        public static int GetImmShr(OpCodeSimdShImm op)
        {
            return (8 << (op.Size + 1)) - op.Imm;
        }

        public static Operand X86GetScalar(ArmEmitterContext context, float value)
        {
            return X86GetScalar(context, BitConverter.SingleToInt32Bits(value));
        }

        public static Operand X86GetScalar(ArmEmitterContext context, double value)
        {
            return X86GetScalar(context, BitConverter.DoubleToInt64Bits(value));
        }

        public static Operand X86GetScalar(ArmEmitterContext context, int value)
        {
            return context.VectorCreateScalar(Const(value));
        }

        public static Operand X86GetScalar(ArmEmitterContext context, long value)
        {
            return context.VectorCreateScalar(Const(value));
        }

        public static Operand X86GetAllElements(ArmEmitterContext context, float value)
        {
            return X86GetAllElements(context, BitConverter.SingleToInt32Bits(value));
        }

        public static Operand X86GetAllElements(ArmEmitterContext context, double value)
        {
            return X86GetAllElements(context, BitConverter.DoubleToInt64Bits(value));
        }

        public static Operand X86GetAllElements(ArmEmitterContext context, short value)
        {
            ulong value1 = (ushort)value;
            ulong value2 = value1 << 16 | value1;
            ulong value4 = value2 << 32 | value2;

            return X86GetAllElements(context, (long)value4);
        }

        public static Operand X86GetAllElements(ArmEmitterContext context, int value)
        {
            Operand vector = context.VectorCreateScalar(Const(value));

            vector = context.AddIntrinsic(Intrinsic.X86Shufps, vector, vector, Const(0));

            return vector;
        }

        public static Operand X86GetAllElements(ArmEmitterContext context, long value)
        {
            Operand vector = context.VectorCreateScalar(Const(value));

            vector = context.AddIntrinsic(Intrinsic.X86Movlhps, vector, vector);

            return vector;
        }

        public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
        {
            return X86GetElements(context, (ulong)e1, (ulong)e0);
        }

        public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
        {
            Operand vector0 = context.VectorCreateScalar(Const(e0));
            Operand vector1 = context.VectorCreateScalar(Const(e1));

            return context.AddIntrinsic(Intrinsic.X86Punpcklqdq, vector0, vector1);
        }

        public static int X86GetRoundControl(FPRoundingMode roundMode)
        {
            switch (roundMode)
            {
                case FPRoundingMode.ToNearest:            return 8 | 0; // even
                case FPRoundingMode.TowardsPlusInfinity:  return 8 | 2;
                case FPRoundingMode.TowardsMinusInfinity: return 8 | 1;
                case FPRoundingMode.TowardsZero:          return 8 | 3;
            }

            throw new ArgumentException($"Invalid rounding mode \"{roundMode}\".");
        }

        public static Operand EmitCountSetBits8(ArmEmitterContext context, Operand op) // "size" is 8 (SIMD&FP Inst.).
        {
            Debug.Assert(op.Type == OperandType.I32 || op.Type == OperandType.I64);

            Operand op0 = context.Subtract(op, context.BitwiseAnd(context.ShiftRightUI(op, Const(1)), Const(op.Type, 0x55L)));

            Operand c1 = Const(op.Type, 0x33L);
            Operand op1 = context.Add(context.BitwiseAnd(context.ShiftRightUI(op0, Const(2)), c1), context.BitwiseAnd(op0, c1));

            return context.BitwiseAnd(context.Add(op1, context.ShiftRightUI(op1, Const(4))), Const(op.Type, 0x0fL));
        }

        public static void EmitScalarUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand n = GetVec(op.Rn);

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            Operand res = context.AddIntrinsic(inst, n);

            if ((op.Size & 1) != 0)
            {
                res = context.VectorZeroUpper64(res);
            }
            else
            {
                res = context.VectorZeroUpper96(res);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitScalarBinaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand n = GetVec(op.Rn);
            Operand m = GetVec(op.Rm);

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            Operand res = context.AddIntrinsic(inst, n, m);

            if ((op.Size & 1) != 0)
            {
                res = context.VectorZeroUpper64(res);
            }
            else
            {
                res = context.VectorZeroUpper96(res);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorUnaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand n = GetVec(op.Rn);

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            Operand res = context.AddIntrinsic(inst, n);

            if (op.RegisterSize == RegisterSize.Simd64)
            {
                res = context.VectorZeroUpper64(res);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpF(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand n = GetVec(op.Rn);
            Operand m = GetVec(op.Rm);

            Intrinsic inst = (op.Size & 1) != 0 ? inst64 : inst32;

            Operand res = context.AddIntrinsic(inst, n, m);

            if (op.RegisterSize == RegisterSize.Simd64)
            {
                res = context.VectorZeroUpper64(res);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static Operand EmitUnaryMathCall(ArmEmitterContext context, string name, Operand n)
        {
            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;

            MethodInfo info = (op.Size & 1) == 0
                ? typeof(MathF).GetMethod(name, new Type[] { typeof(float) })
                : typeof(Math). GetMethod(name, new Type[] { typeof(double) });

            return context.Call(info, n);
        }

        public static Operand EmitRoundMathCall(ArmEmitterContext context, MidpointRounding roundMode, Operand n)
        {
            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;

            string name = nameof(Math.Round);

            MethodInfo info = (op.Size & 1) == 0
                ? typeof(MathF).GetMethod(name, new Type[] { typeof(float),  typeof(MidpointRounding) })
                : typeof(Math). GetMethod(name, new Type[] { typeof(double), typeof(MidpointRounding) });

            return context.Call(info, n, Const((int)roundMode));
        }

        public static Operand EmitGetRoundingMode(ArmEmitterContext context)
        {
            Operand rMode = context.ShiftLeft(GetFpFlag(FPState.RMode1Flag), Const(1));
                    rMode = context.BitwiseOr(rMode, GetFpFlag(FPState.RMode0Flag));

            return rMode;
        }

        public static Operand EmitRoundByRMode(ArmEmitterContext context, Operand op)
        {
            Debug.Assert(op.Type == OperandType.FP32 || op.Type == OperandType.FP64);

            Operand lbl1 = Label();
            Operand lbl2 = Label();
            Operand lbl3 = Label();
            Operand lblEnd = Label();

            Operand rN = Const((int)FPRoundingMode.ToNearest);
            Operand rP = Const((int)FPRoundingMode.TowardsPlusInfinity);
            Operand rM = Const((int)FPRoundingMode.TowardsMinusInfinity);

            Operand res = context.AllocateLocal(op.Type);

            Operand rMode = EmitGetRoundingMode(context);

            context.BranchIf(lbl1, rMode, rN, Comparison.NotEqual);
            context.Copy(res, EmitRoundMathCall(context, MidpointRounding.ToEven, op));
            context.Branch(lblEnd);

            context.MarkLabel(lbl1);
            context.BranchIf(lbl2, rMode, rP, Comparison.NotEqual);
            context.Copy(res, EmitUnaryMathCall(context, nameof(Math.Ceiling), op));
            context.Branch(lblEnd);

            context.MarkLabel(lbl2);
            context.BranchIf(lbl3, rMode, rM, Comparison.NotEqual);
            context.Copy(res, EmitUnaryMathCall(context, nameof(Math.Floor), op));
            context.Branch(lblEnd);

            context.MarkLabel(lbl3);
            context.Copy(res, EmitUnaryMathCall(context, nameof(Math.Truncate), op));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        public static Operand EmitSoftFloatCall(ArmEmitterContext context, string name, params Operand[] callArgs)
        {
            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;

            MethodInfo info = (op.Size & 1) == 0
                ? typeof(SoftFloat32).GetMethod(name)
                : typeof(SoftFloat64).GetMethod(name);

            context.StoreToContext();
            Operand res = context.Call(info, callArgs);
            context.LoadFromContext();

            return res;
        }

        public static void EmitScalarBinaryOpByElemF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand n = context.VectorExtract(type, GetVec(op.Rn), 0);
            Operand m = context.VectorExtract(type, GetVec(op.Rm), op.Index);

            context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(n, m), 0));
        }

        public static void EmitScalarTernaryOpByElemF(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand d = context.VectorExtract(type, GetVec(op.Rd), 0);
            Operand n = context.VectorExtract(type, GetVec(op.Rn), 0);
            Operand m = context.VectorExtract(type, GetVec(op.Rm), op.Index);

            context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(d, n, m), 0));
        }

        public static void EmitScalarUnaryOpSx(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand n = EmitVectorExtractSx(context, op.Rn, 0, op.Size);

            Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n), 0, op.Size);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitScalarBinaryOpSx(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand n = EmitVectorExtractSx(context, op.Rn, 0, op.Size);
            Operand m = EmitVectorExtractSx(context, op.Rm, 0, op.Size);

            Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n, m), 0, op.Size);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitScalarUnaryOpZx(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand n = EmitVectorExtractZx(context, op.Rn, 0, op.Size);

            Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n), 0, op.Size);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitScalarBinaryOpZx(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand n = EmitVectorExtractZx(context, op.Rn, 0, op.Size);
            Operand m = EmitVectorExtractZx(context, op.Rm, 0, op.Size);

            Operand d = EmitVectorInsert(context, context.VectorZero(), emit(n, m), 0, op.Size);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitScalarTernaryOpZx(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand d = EmitVectorExtractZx(context, op.Rd, 0, op.Size);
            Operand n = EmitVectorExtractZx(context, op.Rn, 0, op.Size);
            Operand m = EmitVectorExtractZx(context, op.Rm, 0, op.Size);

            d = EmitVectorInsert(context, context.VectorZero(), emit(d, n, m), 0, op.Size);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitScalarUnaryOpF(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand n = context.VectorExtract(type, GetVec(op.Rn), 0);

            context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(n), 0));
        }

        public static void EmitScalarBinaryOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand n = context.VectorExtract(type, GetVec(op.Rn), 0);
            Operand m = context.VectorExtract(type, GetVec(op.Rm), 0);

            context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(n, m), 0));
        }

        public static void EmitScalarTernaryRaOpF(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand a = context.VectorExtract(type, GetVec(op.Ra), 0);
            Operand n = context.VectorExtract(type, GetVec(op.Rn), 0);
            Operand m = context.VectorExtract(type, GetVec(op.Rm), 0);

            context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), emit(a, n, m), 0));
        }

        public static void EmitVectorUnaryOpF(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand res = context.VectorZero();

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = context.VectorExtract(type, GetVec(op.Rn), index);

                res = context.VectorInsert(res, emit(ne), index);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = context.VectorExtract(type, GetVec(op.Rn), index);
                Operand me = context.VectorExtract(type, GetVec(op.Rm), index);

                res = context.VectorInsert(res, emit(ne, me), index);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorTernaryOpF(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            for (int index = 0; index < elems; index++)
            {
                Operand de = context.VectorExtract(type, GetVec(op.Rd), index);
                Operand ne = context.VectorExtract(type, GetVec(op.Rn), index);
                Operand me = context.VectorExtract(type, GetVec(op.Rm), index);

                res = context.VectorInsert(res, emit(de, ne, me), index);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpByElemF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

            Operand res = context.VectorZero();

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = context.VectorExtract(type, GetVec(op.Rn), index);
                Operand me = context.VectorExtract(type, GetVec(op.Rm), op.Index);

                res = context.VectorInsert(res, emit(ne, me), index);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorTernaryOpByElemF(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp;

            Operand res = context.VectorZero();

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int elems = op.GetBytesCount() >> sizeF + 2;

            for (int index = 0; index < elems; index++)
            {
                Operand de = context.VectorExtract(type, GetVec(op.Rd), index);
                Operand ne = context.VectorExtract(type, GetVec(op.Rn), index);
                Operand me = context.VectorExtract(type, GetVec(op.Rm), op.Index);

                res = context.VectorInsert(res, emit(de, ne, me), index);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorUnaryOpSx(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);

                res = EmitVectorInsert(context, res, emit(ne), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpSx(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
                Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorTernaryOpSx(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtractSx(context, op.Rd, index, op.Size);
                Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
                Operand me = EmitVectorExtractSx(context, op.Rm, index, op.Size);

                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorUnaryOpZx(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);

                res = EmitVectorInsert(context, res, emit(ne), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpZx(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
                Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorTernaryOpZx(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
                Operand me = EmitVectorExtractZx(context, op.Rm, index, op.Size);

                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpByElemSx(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp;

            Operand res = context.VectorZero();

            Operand me = EmitVectorExtractSx(context, op.Rm, op.Index, op.Size);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorBinaryOpByElemZx(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp;

            Operand res = context.VectorZero();

            Operand me = EmitVectorExtractZx(context, op.Rm, op.Index, op.Size);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorTernaryOpByElemZx(ArmEmitterContext context, Func3I emit)
        {
            OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp;

            Operand res = context.VectorZero();

            Operand me = EmitVectorExtractZx(context, op.Rm, op.Index, op.Size);

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);
                Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);

                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorImmUnaryOp(ArmEmitterContext context, Func1I emit)
        {
            OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp;

            Operand imm = Const(op.Immediate);

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                res = EmitVectorInsert(context, res, emit(imm), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorImmBinaryOp(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdImm op = (OpCodeSimdImm)context.CurrOp;

            Operand imm = Const(op.Immediate);

            Operand res = context.VectorZero();

            int elems = op.GetBytesCount() >> op.Size;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtractZx(context, op.Rd, index, op.Size);

                res = EmitVectorInsert(context, res, emit(de, imm), index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorWidenRmBinaryOpSx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorWidenRmBinaryOp(context, emit, signed: true);
        }

        public static void EmitVectorWidenRmBinaryOpZx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorWidenRmBinaryOp(context, emit, signed: false);
        }

        private static void EmitVectorWidenRmBinaryOp(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = 8 >> op.Size;

            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract(context, op.Rn,        index, op.Size + 1, signed);
                Operand me = EmitVectorExtract(context, op.Rm, part + index, op.Size,     signed);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorWidenRnRmBinaryOpSx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorWidenRnRmBinaryOp(context, emit, signed: true);
        }

        public static void EmitVectorWidenRnRmBinaryOpZx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorWidenRnRmBinaryOp(context, emit, signed: false);
        }

        private static void EmitVectorWidenRnRmBinaryOp(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = 8 >> op.Size;

            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);
                Operand me = EmitVectorExtract(context, op.Rm, part + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorWidenRnRmTernaryOpSx(ArmEmitterContext context, Func3I emit)
        {
            EmitVectorWidenRnRmTernaryOp(context, emit, signed: true);
        }

        public static void EmitVectorWidenRnRmTernaryOpZx(ArmEmitterContext context, Func3I emit)
        {
            EmitVectorWidenRnRmTernaryOp(context, emit, signed: false);
        }

        private static void EmitVectorWidenRnRmTernaryOp(ArmEmitterContext context, Func3I emit, bool signed)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int elems = 8 >> op.Size;

            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtract(context, op.Rd,        index, op.Size + 1, signed);
                Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size,     signed);
                Operand me = EmitVectorExtract(context, op.Rm, part + index, op.Size,     signed);

                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorWidenBinaryOpByElemSx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorWidenBinaryOpByElem(context, emit, signed: true);
        }

        public static void EmitVectorWidenBinaryOpByElemZx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorWidenBinaryOpByElem(context, emit, signed: false);
        }

        private static void EmitVectorWidenBinaryOpByElem(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp;

            Operand res = context.VectorZero();

            Operand me = EmitVectorExtract(context, op.Rm, op.Index, op.Size, signed);

            int elems = 8 >> op.Size;

            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(ne, me), index, op.Size + 1);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorWidenTernaryOpByElemSx(ArmEmitterContext context, Func3I emit)
        {
            EmitVectorWidenTernaryOpByElem(context, emit, signed: true);
        }

        public static void EmitVectorWidenTernaryOpByElemZx(ArmEmitterContext context, Func3I emit)
        {
            EmitVectorWidenTernaryOpByElem(context, emit, signed: false);
        }

        private static void EmitVectorWidenTernaryOpByElem(ArmEmitterContext context, Func3I emit, bool signed)
        {
            OpCodeSimdRegElem op = (OpCodeSimdRegElem)context.CurrOp;

            Operand res = context.VectorZero();

            Operand me = EmitVectorExtract(context, op.Rm, op.Index, op.Size, signed);

            int elems = 8 >> op.Size;

            int part = op.RegisterSize == RegisterSize.Simd128 ? elems : 0;

            for (int index = 0; index < elems; index++)
            {
                Operand de = EmitVectorExtract(context, op.Rd,        index, op.Size + 1, signed);
                Operand ne = EmitVectorExtract(context, op.Rn, part + index, op.Size,     signed);

                res = EmitVectorInsert(context, res, emit(de, ne, me), index, op.Size + 1);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitVectorPairwiseOpSx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorPairwiseOp(context, emit, signed: true);
        }

        public static void EmitVectorPairwiseOpZx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorPairwiseOp(context, emit, signed: false);
        }

        private static void EmitVectorPairwiseOp(ArmEmitterContext context, Func2I emit, bool signed)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int pairs = op.GetPairsCount() >> op.Size;

            for (int index = 0; index < pairs; index++)
            {
                int pairIndex = index << 1;

                Operand n0 = EmitVectorExtract(context, op.Rn, pairIndex,     op.Size, signed);
                Operand n1 = EmitVectorExtract(context, op.Rn, pairIndex + 1, op.Size, signed);

                Operand m0 = EmitVectorExtract(context, op.Rm, pairIndex,     op.Size, signed);
                Operand m1 = EmitVectorExtract(context, op.Rm, pairIndex + 1, op.Size, signed);

                res = EmitVectorInsert(context, res, emit(n0, n1),         index, op.Size);
                res = EmitVectorInsert(context, res, emit(m0, m1), pairs + index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitSsse3VectorPairwiseOp(ArmEmitterContext context, Intrinsic[] inst)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand n = GetVec(op.Rn);
            Operand m = GetVec(op.Rm);

            if (op.RegisterSize == RegisterSize.Simd64)
            {
                Operand zeroEvenMask = X86GetElements(context, ZeroMask, EvenMasks[op.Size]);
                Operand zeroOddMask  = X86GetElements(context, ZeroMask, OddMasks [op.Size]);

                Operand mN = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m); // m:n

                Operand left  = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroEvenMask); // 0:even from m:n
                Operand right = context.AddIntrinsic(Intrinsic.X86Pshufb, mN, zeroOddMask);  // 0:odd  from m:n

                context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst[op.Size], left, right));
            }
            else if (op.Size < 3)
            {
                Operand oddEvenMask = X86GetElements(context, OddMasks[op.Size], EvenMasks[op.Size]);

                Operand oddEvenN = context.AddIntrinsic(Intrinsic.X86Pshufb, n, oddEvenMask); // odd:even from n
                Operand oddEvenM = context.AddIntrinsic(Intrinsic.X86Pshufb, m, oddEvenMask); // odd:even from m

                Operand left  = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, oddEvenN, oddEvenM);
                Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, oddEvenN, oddEvenM);

                context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst[op.Size], left, right));
            }
            else
            {
                Operand left  = context.AddIntrinsic(Intrinsic.X86Punpcklqdq, n, m);
                Operand right = context.AddIntrinsic(Intrinsic.X86Punpckhqdq, n, m);

                context.Copy(GetVec(op.Rd), context.AddIntrinsic(inst[3], left, right));
            }
        }

        public static void EmitVectorAcrossVectorOpSx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorAcrossVectorOp(context, emit, signed: true, isLong: false);
        }

        public static void EmitVectorAcrossVectorOpZx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorAcrossVectorOp(context, emit, signed: false, isLong: false);
        }

        public static void EmitVectorLongAcrossVectorOpSx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorAcrossVectorOp(context, emit, signed: true, isLong: true);
        }

        public static void EmitVectorLongAcrossVectorOpZx(ArmEmitterContext context, Func2I emit)
        {
            EmitVectorAcrossVectorOp(context, emit, signed: false, isLong: true);
        }

        private static void EmitVectorAcrossVectorOp(
            ArmEmitterContext context,
            Func2I emit,
            bool signed,
            bool isLong)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            int elems = op.GetBytesCount() >> op.Size;

            Operand res = EmitVectorExtract(context, op.Rn, 0, op.Size, signed);

            for (int index = 1; index < elems; index++)
            {
                Operand n = EmitVectorExtract(context, op.Rn, index, op.Size, signed);

                res = emit(res, n);
            }

            int size = isLong ? op.Size + 1 : op.Size;

            Operand d = EmitVectorInsert(context, context.VectorZero(), res, 0, size);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitVectorAcrossVectorOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Debug.Assert((op.Size & 1) == 0 && op.RegisterSize == RegisterSize.Simd128);

            Operand res = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);

            for (int index = 1; index < 4; index++)
            {
                Operand n = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), index);

                res = emit(res, n);
            }

            Operand d = context.VectorInsert(context.VectorZero(), res, 0);

            context.Copy(GetVec(op.Rd), d);
        }

        public static void EmitSse2VectorAcrossVectorOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Debug.Assert((op.Size & 1) == 0 && op.RegisterSize == RegisterSize.Simd128);

            const int sm0 = 0 << 6 | 0 << 4 | 0 << 2 | 0 << 0;
            const int sm1 = 1 << 6 | 1 << 4 | 1 << 2 | 1 << 0;
            const int sm2 = 2 << 6 | 2 << 4 | 2 << 2 | 2 << 0;
            const int sm3 = 3 << 6 | 3 << 4 | 3 << 2 | 3 << 0;

            Operand nCopy = context.Copy(GetVec(op.Rn));

            Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm0));
            Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm1));
            Operand part2 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm2));
            Operand part3 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, nCopy, Const(sm3));

            Operand res = emit(emit(part0, part1), emit(part2, part3));

            context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
        }

        public static void EmitScalarPairwiseOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32;

            Operand ne0 = context.VectorExtract(type, GetVec(op.Rn), 0);
            Operand ne1 = context.VectorExtract(type, GetVec(op.Rn), 1);

            Operand res = context.VectorInsert(context.VectorZero(), emit(ne0, ne1), 0);

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitSse2ScalarPairwiseOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand n = GetVec(op.Rn);

            Operand op0, op1;

            if ((op.Size & 1) == 0)
            {
                const int sm0 = 2 << 6 | 2 << 4 | 2 << 2 | 0 << 0;
                const int sm1 = 2 << 6 | 2 << 4 | 2 << 2 | 1 << 0;

                Operand zeroN = context.VectorZeroUpper64(n);

                op0 = context.AddIntrinsic(Intrinsic.X86Pshufd, zeroN, Const(sm0));
                op1 = context.AddIntrinsic(Intrinsic.X86Pshufd, zeroN, Const(sm1));
            }
            else /* if ((op.Size & 1) == 1) */
            {
                Operand zero = context.VectorZero();

                op0 = context.AddIntrinsic(Intrinsic.X86Movlhps, n, zero);
                op1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, n);
            }

            context.Copy(GetVec(op.Rd), emit(op0, op1));
        }

        public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand res = context.VectorZero();

            int sizeF = op.Size & 1;

            OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

            int pairs = op.GetPairsCount() >> sizeF + 2;

            for (int index = 0; index < pairs; index++)
            {
                int pairIndex = index << 1;

                Operand n0 = context.VectorExtract(type, GetVec(op.Rn), pairIndex);
                Operand n1 = context.VectorExtract(type, GetVec(op.Rn), pairIndex + 1);

                Operand m0 = context.VectorExtract(type, GetVec(op.Rm), pairIndex);
                Operand m1 = context.VectorExtract(type, GetVec(op.Rm), pairIndex + 1);

                res = context.VectorInsert(res, emit(n0, n1),         index);
                res = context.VectorInsert(res, emit(m0, m1), pairs + index);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitSse2VectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
        {
            OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

            Operand nCopy = context.Copy(GetVec(op.Rn));
            Operand mCopy = context.Copy(GetVec(op.Rm));

            int sizeF = op.Size & 1;

            if (sizeF == 0)
            {
                if (op.RegisterSize == RegisterSize.Simd64)
                {
                    Operand unpck = context.AddIntrinsic(Intrinsic.X86Unpcklps, nCopy, mCopy);

                    Operand zero = context.VectorZero();

                    Operand part0 = context.AddIntrinsic(Intrinsic.X86Movlhps, unpck, zero);
                    Operand part1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, unpck);

                    context.Copy(GetVec(op.Rd), emit(part0, part1));
                }
                else /* if (op.RegisterSize == RegisterSize.Simd128) */
                {
                    const int sm0 = 2 << 6 | 0 << 4 | 2 << 2 | 0 << 0;
                    const int sm1 = 3 << 6 | 1 << 4 | 3 << 2 | 1 << 0;

                    Operand part0 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm0));
                    Operand part1 = context.AddIntrinsic(Intrinsic.X86Shufps, nCopy, mCopy, Const(sm1));

                    context.Copy(GetVec(op.Rd), emit(part0, part1));
                }
            }
            else /* if (sizeF == 1) */
            {
                Operand part0 = context.AddIntrinsic(Intrinsic.X86Unpcklpd, nCopy, mCopy);
                Operand part1 = context.AddIntrinsic(Intrinsic.X86Unpckhpd, nCopy, mCopy);

                context.Copy(GetVec(op.Rd), emit(part0, part1));
            }
        }

        [Flags]
        public enum Mxcsr
        {
            Ftz = 1 << 15, // Flush To Zero.
            Um  = 1 << 11, // Underflow Mask.
            Dm  = 1 << 8,  // Denormal Mask.
            Daz = 1 << 6   // Denormals Are Zero.
        }

        public static void EmitSseOrAvxEnterFtzAndDazModesOpF(ArmEmitterContext context, out Operand isTrue)
        {
            isTrue = GetFpFlag(FPState.FzFlag);

            Operand lblTrue = Label();
            context.BranchIfFalse(lblTrue, isTrue);

            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrmb, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm | Mxcsr.Daz)));

            context.MarkLabel(lblTrue);
        }

        public static void EmitSseOrAvxExitFtzAndDazModesOpF(ArmEmitterContext context, Operand isTrue = default)
        {
            isTrue = isTrue == default ? GetFpFlag(FPState.FzFlag) : isTrue;

            Operand lblTrue = Label();
            context.BranchIfFalse(lblTrue, isTrue);

            context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrub, Const((int)(Mxcsr.Ftz | Mxcsr.Daz)));

            context.MarkLabel(lblTrue);
        }

        public enum CmpCondition
        {
            // Legacy Sse.
            Equal              = 0, // Ordered, non-signaling.
            LessThan           = 1, // Ordered, signaling.
            LessThanOrEqual    = 2, // Ordered, signaling.
            UnorderedQ         = 3, // Non-signaling.
            NotLessThan        = 5, // Unordered, signaling.
            NotLessThanOrEqual = 6, // Unordered, signaling.
            OrderedQ           = 7, // Non-signaling.

            // Vex.
            GreaterThanOrEqual = 13, // Ordered, signaling.
            GreaterThan        = 14, // Ordered, signaling.
            OrderedS           = 23  // Signaling.
        }

        [Flags]
        public enum SaturatingFlags
        {
            None = 0,

            ByElem = 1 << 0,
            Scalar = 1 << 1,
            Signed = 1 << 2,

            Add = 1 << 3,
            Sub = 1 << 4,

            Accumulate = 1 << 5
        }

        public static void EmitScalarSaturatingUnaryOpSx(ArmEmitterContext context, Func1I emit)
        {
            EmitSaturatingUnaryOpSx(context, emit, SaturatingFlags.Scalar | SaturatingFlags.Signed);
        }

        public static void EmitVectorSaturatingUnaryOpSx(ArmEmitterContext context, Func1I emit)
        {
            EmitSaturatingUnaryOpSx(context, emit, SaturatingFlags.Signed);
        }

        public static void EmitSaturatingUnaryOpSx(ArmEmitterContext context, Func1I emit, SaturatingFlags flags)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand res = context.VectorZero();

            bool scalar = (flags & SaturatingFlags.Scalar) != 0;

            int elems = !scalar ? op.GetBytesCount() >> op.Size : 1;

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtractSx(context, op.Rn, index, op.Size);
                Operand de;

                if (op.Size <= 2)
                {
                    de = EmitSignedSrcSatQ(context, emit(ne), op.Size, signedDst: true);
                }
                else /* if (op.Size == 3) */
                {
                    de = EmitUnarySignedSatQAbsOrNeg(context, emit(ne));
                }

                res = EmitVectorInsert(context, res, de, index, op.Size);
            }

            context.Copy(GetVec(op.Rd), res);
        }

        public static void EmitScalarSaturatingBinaryOpSx(ArmEmitterContext context, Func2I emit = null, SaturatingFlags flags = SaturatingFlags.None)
        {
            EmitSaturatingBinaryOp(context, emit, SaturatingFlags.Scalar | SaturatingFlags.Signed | flags);
        }

        public static void EmitScalarSaturatingBinaryOpZx(ArmEmitterContext context, SaturatingFlags flags)
        {
            EmitSaturatingBinaryOp(context, null, SaturatingFlags.Scalar | flags);
        }

        public static void EmitVectorSaturatingBinaryOpSx(ArmEmitterContext context, Func2I emit = null, SaturatingFlags flags = SaturatingFlags.None)
        {
            EmitSaturatingBinaryOp(context, emit, SaturatingFlags.Signed | flags);
        }

        public static void EmitVectorSaturatingBinaryOpZx(ArmEmitterContext context, SaturatingFlags flags)
        {
            EmitSaturatingBinaryOp(context, null, flags);
        }

        public static void EmitVectorSaturatingBinaryOpByElemSx(ArmEmitterContext context, Func2I emit)
        {
            EmitSaturatingBinaryOp(context, emit, SaturatingFlags.ByElem | SaturatingFlags.Signed);
        }

        public static void EmitSaturatingBinaryOp(ArmEmitterContext context, Func2I emit, SaturatingFlags flags)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            Operand res = context.VectorZero();

            bool byElem = (flags & SaturatingFlags.ByElem) != 0;
            bool scalar = (flags & SaturatingFlags.Scalar) != 0;
            bool signed = (flags & SaturatingFlags.Signed) != 0;

            bool add = (flags & SaturatingFlags.Add) != 0;
            bool sub = (flags & SaturatingFlags.Sub) != 0;

            bool accumulate = (flags & SaturatingFlags.Accumulate) != 0;

            int elems = !scalar ? op.GetBytesCount() >> op.Size : 1;

            if (add || sub)
            {
                for (int index = 0; index < elems; index++)
                {
                    Operand de;
                    Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, signed);
                    Operand me = EmitVectorExtract(context, ((OpCodeSimdReg)op).Rm, index, op.Size, signed);

                    if (op.Size <= 2)
                    {
                        Operand temp = add ? context.Add(ne, me) : context.Subtract(ne, me);

                        de = EmitSignedSrcSatQ(context, temp, op.Size, signedDst: signed);
                    }
                    else /* if (op.Size == 3) */
                    {
                        if (add)
                        {
                            de = signed ? EmitBinarySignedSatQAdd(context, ne, me) : EmitBinaryUnsignedSatQAdd(context, ne, me);
                        }
                        else /* if (sub) */
                        {
                            de = signed ? EmitBinarySignedSatQSub(context, ne, me) : EmitBinaryUnsignedSatQSub(context, ne, me);
                        }
                    }

                    res = EmitVectorInsert(context, res, de, index, op.Size);
                }
            }
            else if (accumulate)
            {
                for (int index = 0; index < elems; index++)
                {
                    Operand de;
                    Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, !signed);
                    Operand me = EmitVectorExtract(context, op.Rd, index, op.Size,  signed);

                    if (op.Size <= 2)
                    {
                        Operand temp = context.Add(ne, me);

                        de = EmitSignedSrcSatQ(context, temp, op.Size, signedDst: signed);
                    }
                    else /* if (op.Size == 3) */
                    {
                        de = signed ? EmitBinarySignedSatQAcc(context, ne, me) : EmitBinaryUnsignedSatQAcc(context, ne, me);
                    }

                    res = EmitVectorInsert(context, res, de, index, op.Size);
                }
            }
            else
            {
                Operand me = default;

                if (byElem)
                {
                    OpCodeSimdRegElem opRegElem = (OpCodeSimdRegElem)op;

                    me = EmitVectorExtract(context, opRegElem.Rm, opRegElem.Index, op.Size, signed);
                }

                for (int index = 0; index < elems; index++)
                {
                    Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size, signed);

                    if (!byElem)
                    {
                        me = EmitVectorExtract(context, ((OpCodeSimdReg)op).Rm, index, op.Size, signed);
                    }

                    Operand de = EmitSignedSrcSatQ(context, emit(ne, me), op.Size, signedDst: signed);

                    res = EmitVectorInsert(context, res, de, index, op.Size);
                }
            }

            context.Copy(GetVec(op.Rd), res);
        }

        [Flags]
        public enum SaturatingNarrowFlags
        {
            Scalar    = 1 << 0,
            SignedSrc = 1 << 1,
            SignedDst = 1 << 2,

            ScalarSxSx = Scalar | SignedSrc | SignedDst,
            ScalarSxZx = Scalar | SignedSrc,
            ScalarZxZx = Scalar,

            VectorSxSx = SignedSrc | SignedDst,
            VectorSxZx = SignedSrc,
            VectorZxZx = 0
        }

        public static void EmitSaturatingNarrowOp(ArmEmitterContext context, SaturatingNarrowFlags flags)
        {
            OpCodeSimd op = (OpCodeSimd)context.CurrOp;

            bool scalar    = (flags & SaturatingNarrowFlags.Scalar)    != 0;
            bool signedSrc = (flags & SaturatingNarrowFlags.SignedSrc) != 0;
            bool signedDst = (flags & SaturatingNarrowFlags.SignedDst) != 0;

            int elems = !scalar ? 8 >> op.Size : 1;

            int part = !scalar && (op.RegisterSize == RegisterSize.Simd128) ? elems : 0;

            Operand d = GetVec(op.Rd);

            Operand res = part == 0 ? context.VectorZero() : context.Copy(d);

            for (int index = 0; index < elems; index++)
            {
                Operand ne = EmitVectorExtract(context, op.Rn, index, op.Size + 1, signedSrc);

                Operand temp = signedSrc
                    ? EmitSignedSrcSatQ(context, ne, op.Size, signedDst)
                    : EmitUnsignedSrcSatQ(context, ne, op.Size, signedDst);

                res = EmitVectorInsert(context, res, temp, part + index, op.Size);
            }

            context.Copy(d, res);
        }

        // long SignedSignSatQ(long op, int size);
        public static Operand EmitSignedSignSatQ(ArmEmitterContext context, Operand op, int size)
        {
            int eSize = 8 << size;

            Debug.Assert(op.Type == OperandType.I64);
            Debug.Assert(eSize == 8 || eSize == 16 || eSize == 32 || eSize == 64);

            Operand lbl1 = Label();
            Operand lblEnd = Label();

            Operand zeroL = Const(0L);
            Operand maxT = Const((1L << (eSize - 1)) - 1L);
            Operand minT = Const(-(1L << (eSize - 1)));

            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), zeroL);

            context.BranchIf(lbl1, op, zeroL, Comparison.LessOrEqual);
            context.Copy(res, maxT);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lbl1);
            context.BranchIf(lblEnd, op, zeroL, Comparison.GreaterOrEqual);
            context.Copy(res, minT);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // private static ulong UnsignedSignSatQ(ulong op, int size);
        public static Operand EmitUnsignedSignSatQ(ArmEmitterContext context, Operand op, int size)
        {
            int eSize = 8 << size;

            Debug.Assert(op.Type == OperandType.I64);
            Debug.Assert(eSize == 8 || eSize == 16 || eSize == 32 || eSize == 64);

            Operand lblEnd = Label();

            Operand zeroUL = Const(0UL);
            Operand maxT = Const(ulong.MaxValue >> (64 - eSize));

            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), zeroUL);

            context.BranchIf(lblEnd, op, zeroUL, Comparison.LessOrEqualUI);
            context.Copy(res, maxT);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // TSrc (16bit, 32bit, 64bit; signed) > TDst (8bit, 16bit, 32bit; signed, unsigned).
        // long SignedSrcSignedDstSatQ(long op, int size); ulong SignedSrcUnsignedDstSatQ(long op, int size);
        public static Operand EmitSignedSrcSatQ(ArmEmitterContext context, Operand op, int sizeDst, bool signedDst)
        {
            int eSizeDst = 8 << sizeDst;

            Debug.Assert(op.Type == OperandType.I64);
            Debug.Assert(eSizeDst == 8 || eSizeDst == 16 || eSizeDst == 32);

            Operand lbl1 = Label();
            Operand lblEnd = Label();

            Operand maxT = signedDst ? Const((1L << (eSizeDst - 1)) - 1L) : Const((1UL << eSizeDst) - 1UL);
            Operand minT = signedDst ? Const(-(1L << (eSizeDst - 1))) : Const(0UL);

            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op);

            context.BranchIf(lbl1, op, maxT, Comparison.LessOrEqual);
            context.Copy(res, maxT);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lbl1);
            context.BranchIf(lblEnd, op, minT, Comparison.GreaterOrEqual);
            context.Copy(res, minT);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // TSrc (16bit, 32bit, 64bit; unsigned) > TDst (8bit, 16bit, 32bit; signed, unsigned).
        // long UnsignedSrcSignedDstSatQ(ulong op, int size); ulong UnsignedSrcUnsignedDstSatQ(ulong op, int size);
        public static Operand EmitUnsignedSrcSatQ(ArmEmitterContext context, Operand op, int sizeDst, bool signedDst)
        {
            int eSizeDst = 8 << sizeDst;

            Debug.Assert(op.Type == OperandType.I64);
            Debug.Assert(eSizeDst == 8 || eSizeDst == 16 || eSizeDst == 32);

            Operand lblEnd = Label();

            Operand maxT = signedDst ? Const((1L << (eSizeDst - 1)) - 1L) : Const((1UL << eSizeDst) - 1UL);

            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op);

            context.BranchIf(lblEnd, op, maxT, Comparison.LessOrEqualUI);
            context.Copy(res, maxT);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // long UnarySignedSatQAbsOrNeg(long op);
        private static Operand EmitUnarySignedSatQAbsOrNeg(ArmEmitterContext context, Operand op)
        {
            Debug.Assert(op.Type == OperandType.I64);

            Operand lblEnd = Label();

            Operand minL = Const(long.MinValue);
            Operand maxL = Const(long.MaxValue);

            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), op);

            context.BranchIf(lblEnd, op, minL, Comparison.NotEqual);
            context.Copy(res, maxL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // long BinarySignedSatQAdd(long op1, long op2);
        public static Operand EmitBinarySignedSatQAdd(ArmEmitterContext context, Operand op1, Operand op2)
        {
            Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);

            Operand lblEnd = Label();

            Operand minL = Const(long.MinValue);
            Operand maxL = Const(long.MaxValue);
            Operand zeroL = Const(0L);

            Operand add = context.Add(op1, op2);
            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add);

            Operand left = context.BitwiseNot(context.BitwiseExclusiveOr(op1, op2));
            Operand right = context.BitwiseExclusiveOr(op1, add);
            context.BranchIf(lblEnd, context.BitwiseAnd(left, right), zeroL, Comparison.GreaterOrEqual);

            Operand isPositive = context.ICompareGreaterOrEqual(op1, zeroL);
            context.Copy(res, context.ConditionalSelect(isPositive, maxL, minL));
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // ulong BinaryUnsignedSatQAdd(ulong op1, ulong op2);
        public static Operand EmitBinaryUnsignedSatQAdd(ArmEmitterContext context, Operand op1, Operand op2)
        {
            Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);

            Operand lblEnd = Label();

            Operand maxUL = Const(ulong.MaxValue);

            Operand add = context.Add(op1, op2);
            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add);

            context.BranchIf(lblEnd, add, op1, Comparison.GreaterOrEqualUI);
            context.Copy(res, maxUL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // long BinarySignedSatQSub(long op1, long op2);
        public static Operand EmitBinarySignedSatQSub(ArmEmitterContext context, Operand op1, Operand op2)
        {
            Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);

            Operand lblEnd = Label();

            Operand minL = Const(long.MinValue);
            Operand maxL = Const(long.MaxValue);
            Operand zeroL = Const(0L);

            Operand sub = context.Subtract(op1, op2);
            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), sub);

            Operand left = context.BitwiseExclusiveOr(op1, op2);
            Operand right = context.BitwiseExclusiveOr(op1, sub);
            context.BranchIf(lblEnd, context.BitwiseAnd(left, right), zeroL, Comparison.GreaterOrEqual);

            Operand isPositive = context.ICompareGreaterOrEqual(op1, zeroL);
            context.Copy(res, context.ConditionalSelect(isPositive, maxL, minL));
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // ulong BinaryUnsignedSatQSub(ulong op1, ulong op2);
        public static Operand EmitBinaryUnsignedSatQSub(ArmEmitterContext context, Operand op1, Operand op2)
        {
            Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);

            Operand lblEnd = Label();

            Operand zeroL = Const(0L);

            Operand sub = context.Subtract(op1, op2);
            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), sub);

            context.BranchIf(lblEnd, op1, op2, Comparison.GreaterOrEqualUI);
            context.Copy(res, zeroL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // long BinarySignedSatQAcc(ulong op1, long op2);
        private static Operand EmitBinarySignedSatQAcc(ArmEmitterContext context, Operand op1, Operand op2)
        {
            Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);

            Operand lbl1 = Label();
            Operand lbl2 = Label();
            Operand lblEnd = Label();

            Operand maxL = Const(long.MaxValue);
            Operand zeroL = Const(0L);

            Operand add = context.Add(op1, op2);
            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add);

            context.BranchIf(lbl1, op1, maxL, Comparison.GreaterUI);
            Operand notOp2AndRes = context.BitwiseAnd(context.BitwiseNot(op2), add);
            context.BranchIf(lblEnd, notOp2AndRes, zeroL, Comparison.GreaterOrEqual);
            context.Copy(res, maxL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lbl1);
            context.BranchIf(lbl2, op2, zeroL, Comparison.Less);
            context.Copy(res, maxL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lbl2);
            context.BranchIf(lblEnd, add, maxL, Comparison.LessOrEqualUI);
            context.Copy(res, maxL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        // ulong BinaryUnsignedSatQAcc(long op1, ulong op2);
        private static Operand EmitBinaryUnsignedSatQAcc(ArmEmitterContext context, Operand op1, Operand op2)
        {
            Debug.Assert(op1.Type == OperandType.I64 && op2.Type == OperandType.I64);

            Operand lbl1 = Label();
            Operand lblEnd = Label();

            Operand maxUL = Const(ulong.MaxValue);
            Operand maxL = Const(long.MaxValue);
            Operand zeroL = Const(0L);

            Operand add = context.Add(op1, op2);
            Operand res = context.Copy(context.AllocateLocal(OperandType.I64), add);

            context.BranchIf(lbl1, op1, zeroL, Comparison.Less);
            context.BranchIf(lblEnd, add, op1, Comparison.GreaterOrEqualUI);
            context.Copy(res, maxUL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lbl1);
            context.BranchIf(lblEnd, op2, maxL, Comparison.GreaterUI);
            context.BranchIf(lblEnd, add, zeroL, Comparison.GreaterOrEqual);
            context.Copy(res, zeroL);
            SetFpFlag(context, FPState.QcFlag, Const(1));
            context.Branch(lblEnd);

            context.MarkLabel(lblEnd);

            return res;
        }

        public static Operand EmitFloatAbs(ArmEmitterContext context, Operand value, bool single, bool vector)
        {
            Operand mask;
            if (single)
            {
                mask = vector ? X86GetAllElements(context, -0f) : X86GetScalar(context, -0f);
            }
            else
            {
                mask = vector ? X86GetAllElements(context, -0d) : X86GetScalar(context, -0d);
            }

            return context.AddIntrinsic(single ? Intrinsic.X86Andnps : Intrinsic.X86Andnpd, mask, value);
        }

        public static Operand EmitVectorExtractSx(ArmEmitterContext context, int reg, int index, int size)
        {
            return EmitVectorExtract(context, reg, index, size, true);
        }

        public static Operand EmitVectorExtractZx(ArmEmitterContext context, int reg, int index, int size)
        {
            return EmitVectorExtract(context, reg, index, size, false);
        }

        public static Operand EmitVectorExtract(ArmEmitterContext context, int reg, int index, int size, bool signed)
        {
            ThrowIfInvalid(index, size);

            Operand res = default;

            switch (size)
            {
                case 0:
                    res = context.VectorExtract8(GetVec(reg), index);
                    break;

                case 1:
                    res = context.VectorExtract16(GetVec(reg), index);
                    break;

                case 2:
                    res = context.VectorExtract(OperandType.I32, GetVec(reg), index);
                    break;

                case 3:
                    res = context.VectorExtract(OperandType.I64, GetVec(reg), index);
                    break;
            }

            if (signed)
            {
                switch (size)
                {
                    case 0: res = context.SignExtend8 (OperandType.I64, res); break;
                    case 1: res = context.SignExtend16(OperandType.I64, res); break;
                    case 2: res = context.SignExtend32(OperandType.I64, res); break;
                }
            }
            else
            {
                switch (size)
                {
                    case 0: res = context.ZeroExtend8 (OperandType.I64, res); break;
                    case 1: res = context.ZeroExtend16(OperandType.I64, res); break;
                    case 2: res = context.ZeroExtend32(OperandType.I64, res); break;
                }
            }

            return res;
        }

        public static Operand EmitVectorInsert(ArmEmitterContext context, Operand vector, Operand value, int index, int size)
        {
            ThrowIfInvalid(index, size);

            if (size < 3 && value.Type == OperandType.I64)
            {
                value = context.ConvertI64ToI32(value);
            }

            switch (size)
            {
                case 0: vector = context.VectorInsert8 (vector, value, index); break;
                case 1: vector = context.VectorInsert16(vector, value, index); break;
                case 2: vector = context.VectorInsert  (vector, value, index); break;
                case 3: vector = context.VectorInsert  (vector, value, index); break;
            }

            return vector;
        }

        public static void ThrowIfInvalid(int index, int size)
        {
            if ((uint)size > 3u)
            {
                throw new ArgumentOutOfRangeException(nameof(size));
            }

            if ((uint)index >= 16u >> size)
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }
        }
    }
}