mirror of
https://github.com/Ryujinx/Ryujinx.git
synced 2025-01-18 18:51:28 +00:00
CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Now HardwareCapabilities uses CpuId. (#1650)
* net5.0
* CPU (A64): Add FP16/FP32 fast paths (F16C Intrinsics) for Fcvt_S, Fcvtl_V & Fcvtn_V Instructions. Switch to .NET 5.0.
Nits.
Tests performed successfully in both debug and release mode (for all instructions involved).
* Address comment.
* Update appveyor.yml
* Revert "Update appveyor.yml"
This reverts commit 27cdd59e8b
.
* Remove Assembler CpuId.
* Update appveyor.yml
* Address comment.
This commit is contained in:
parent
eafee34fee
commit
0679084f11
9 changed files with 136 additions and 62 deletions
|
@ -104,7 +104,6 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(X86Instruction.Cmpxchg8, new InstructionInfo(0x00000fb0, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Reg8Src));
|
||||
Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex));
|
||||
Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, InstructionFlags.RegOnly));
|
||||
Add(X86Instruction.Crc32, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2));
|
||||
Add(X86Instruction.Crc32_16, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Crc32_8, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src));
|
||||
|
@ -270,6 +269,8 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(X86Instruction.Unpcklps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f14, InstructionFlags.Vex));
|
||||
Add(X86Instruction.Vblendvpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4b, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
|
||||
Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
|
@ -386,11 +387,6 @@ namespace ARMeilleure.CodeGen.X86
|
|||
WriteInstruction(src1, null, src2, X86Instruction.Comiss);
|
||||
}
|
||||
|
||||
public void Cpuid()
|
||||
{
|
||||
WriteInstruction(null, null, OperandType.None, X86Instruction.Cpuid);
|
||||
}
|
||||
|
||||
public void Cvtsd2ss(Operand dest, Operand src1, Operand src2)
|
||||
{
|
||||
WriteInstruction(dest, src1, src2, X86Instruction.Cvtsd2ss);
|
||||
|
|
|
@ -1,20 +1,60 @@
|
|||
using System;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
|
||||
namespace ARMeilleure.CodeGen.X86
|
||||
{
|
||||
static class HardwareCapabilities
|
||||
{
|
||||
public static bool SupportsSse => Sse.IsSupported;
|
||||
public static bool SupportsSse2 => Sse2.IsSupported;
|
||||
public static bool SupportsSse3 => Sse3.IsSupported;
|
||||
public static bool SupportsSsse3 => Ssse3.IsSupported;
|
||||
public static bool SupportsSse41 => Sse41.IsSupported;
|
||||
public static bool SupportsSse42 => Sse42.IsSupported;
|
||||
public static bool SupportsPclmulqdq => Pclmulqdq.IsSupported;
|
||||
public static bool SupportsFma => Fma.IsSupported;
|
||||
public static bool SupportsPopcnt => Popcnt.IsSupported;
|
||||
public static bool SupportsAesni => Aes.IsSupported;
|
||||
public static bool SupportsAvx => Avx.IsSupported;
|
||||
static HardwareCapabilities()
|
||||
{
|
||||
if (!X86Base.IsSupported)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
(_, _, int ecx, int edx) = X86Base.CpuId(0x00000001, 0x00000000);
|
||||
|
||||
FeatureInfoEdx = (FeatureFlagsEdx)edx;
|
||||
FeatureInfoEcx = (FeatureFlagsEcx)ecx;
|
||||
}
|
||||
|
||||
[Flags]
|
||||
public enum FeatureFlagsEdx
|
||||
{
|
||||
Sse = 1 << 25,
|
||||
Sse2 = 1 << 26
|
||||
}
|
||||
|
||||
[Flags]
|
||||
public enum FeatureFlagsEcx
|
||||
{
|
||||
Sse3 = 1 << 0,
|
||||
Pclmulqdq = 1 << 1,
|
||||
Ssse3 = 1 << 9,
|
||||
Fma = 1 << 12,
|
||||
Sse41 = 1 << 19,
|
||||
Sse42 = 1 << 20,
|
||||
Popcnt = 1 << 23,
|
||||
Aes = 1 << 25,
|
||||
Avx = 1 << 28,
|
||||
F16c = 1 << 29
|
||||
}
|
||||
|
||||
public static FeatureFlagsEdx FeatureInfoEdx { get; }
|
||||
public static FeatureFlagsEcx FeatureInfoEcx { get; }
|
||||
|
||||
public static bool SupportsSse => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse);
|
||||
public static bool SupportsSse2 => FeatureInfoEdx.HasFlag(FeatureFlagsEdx.Sse2);
|
||||
public static bool SupportsSse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse3);
|
||||
public static bool SupportsPclmulqdq => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Pclmulqdq);
|
||||
public static bool SupportsSsse3 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Ssse3);
|
||||
public static bool SupportsFma => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Fma);
|
||||
public static bool SupportsSse41 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse41);
|
||||
public static bool SupportsSse42 => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Sse42);
|
||||
public static bool SupportsPopcnt => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Popcnt);
|
||||
public static bool SupportsAesni => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Aes);
|
||||
public static bool SupportsAvx => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.Avx);
|
||||
public static bool SupportsF16c => FeatureInfoEcx.HasFlag(FeatureFlagsEcx.F16c);
|
||||
|
||||
public static bool ForceLegacySse { get; set; }
|
||||
|
||||
|
|
|
@ -162,6 +162,8 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(Intrinsic.X86Unpckhps, new IntrinsicInfo(X86Instruction.Unpckhps, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Unpcklpd, new IntrinsicInfo(X86Instruction.Unpcklpd, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
|
||||
Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
|
||||
Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
|
||||
Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary));
|
||||
}
|
||||
|
|
|
@ -33,7 +33,6 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Cmpxchg8,
|
||||
Comisd,
|
||||
Comiss,
|
||||
Cpuid,
|
||||
Crc32,
|
||||
Crc32_16,
|
||||
Crc32_8,
|
||||
|
@ -199,6 +198,8 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Unpcklps,
|
||||
Vblendvpd,
|
||||
Vblendvps,
|
||||
Vcvtph2ps,
|
||||
Vcvtps2ph,
|
||||
Vpblendvb,
|
||||
Xor,
|
||||
Xorpd,
|
||||
|
|
|
@ -60,21 +60,48 @@ namespace ARMeilleure.Instructions
|
|||
}
|
||||
else if (op.Size == 0 && op.Opc == 3) // Single -> Half.
|
||||
{
|
||||
Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
|
||||
if (Optimizations.UseF16c)
|
||||
{
|
||||
Debug.Assert(!Optimizations.ForceLegacySse);
|
||||
|
||||
Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
|
||||
Operand n = GetVec(op.Rn);
|
||||
|
||||
res = context.ZeroExtend16(OperandType.I64, res);
|
||||
Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
|
||||
res = context.AddIntrinsic(Intrinsic.X86Pslldq, res, Const(14)); // VectorZeroUpper112()
|
||||
res = context.AddIntrinsic(Intrinsic.X86Psrldq, res, Const(14));
|
||||
|
||||
context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
|
||||
context.Copy(GetVec(op.Rd), res);
|
||||
}
|
||||
else
|
||||
{
|
||||
Operand ne = context.VectorExtract(OperandType.FP32, GetVec(op.Rn), 0);
|
||||
|
||||
Operand res = context.Call(typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)), ne);
|
||||
|
||||
res = context.ZeroExtend16(OperandType.I64, res);
|
||||
|
||||
context.Copy(GetVec(op.Rd), EmitVectorInsert(context, context.VectorZero(), res, 0, 1));
|
||||
}
|
||||
}
|
||||
else if (op.Size == 3 && op.Opc == 0) // Half -> Single.
|
||||
{
|
||||
Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
|
||||
if (Optimizations.UseF16c)
|
||||
{
|
||||
Debug.Assert(!Optimizations.ForceLegacySse);
|
||||
|
||||
Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
|
||||
Operand res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, GetVec(op.Rn));
|
||||
res = context.VectorZeroUpper96(res);
|
||||
|
||||
context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
|
||||
context.Copy(GetVec(op.Rd), res);
|
||||
}
|
||||
else
|
||||
{
|
||||
Operand ne = EmitVectorExtractZx(context, op.Rn, 0, 1);
|
||||
|
||||
Operand res = context.Call(typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)), ne);
|
||||
|
||||
context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
|
||||
}
|
||||
}
|
||||
else if (op.Size == 1 && op.Opc == 3) // Double -> Half.
|
||||
{
|
||||
|
@ -129,18 +156,20 @@ namespace ARMeilleure.Instructions
|
|||
if (Optimizations.UseSse2 && sizeF == 1)
|
||||
{
|
||||
Operand n = GetVec(op.Rn);
|
||||
Operand res;
|
||||
|
||||
if (op.RegisterSize == RegisterSize.Simd128)
|
||||
{
|
||||
res = context.AddIntrinsic(Intrinsic.X86Movhlps, n, n);
|
||||
}
|
||||
else
|
||||
{
|
||||
res = n;
|
||||
}
|
||||
Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
|
||||
res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
|
||||
|
||||
res = context.AddIntrinsic(Intrinsic.X86Cvtps2pd, res);
|
||||
context.Copy(GetVec(op.Rd), res);
|
||||
}
|
||||
else if (Optimizations.UseF16c && sizeF == 0)
|
||||
{
|
||||
Debug.Assert(!Optimizations.ForceLegacySse);
|
||||
|
||||
Operand n = GetVec(op.Rn);
|
||||
|
||||
Operand res = op.RegisterSize == RegisterSize.Simd128 ? context.AddIntrinsic(Intrinsic.X86Movhlps, n, n) : n;
|
||||
res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);
|
||||
|
||||
context.Copy(GetVec(op.Rd), res);
|
||||
}
|
||||
|
@ -210,17 +239,30 @@ namespace ARMeilleure.Instructions
|
|||
{
|
||||
Operand d = GetVec(op.Rd);
|
||||
|
||||
Operand res = context.VectorZeroUpper64(d);
|
||||
Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
|
||||
|
||||
Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtpd2ps, GetVec(op.Rn));
|
||||
nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
|
||||
|
||||
nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
|
||||
Operand res = context.VectorZeroUpper64(d);
|
||||
res = context.AddIntrinsic(movInst, res, nInt);
|
||||
|
||||
Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128
|
||||
? Intrinsic.X86Movlhps
|
||||
: Intrinsic.X86Movhlps;
|
||||
context.Copy(d, res);
|
||||
}
|
||||
else if (Optimizations.UseF16c && sizeF == 0)
|
||||
{
|
||||
Debug.Assert(!Optimizations.ForceLegacySse);
|
||||
|
||||
res = context.AddIntrinsic(movInst, res, nInt);
|
||||
Operand d = GetVec(op.Rd);
|
||||
Operand n = GetVec(op.Rn);
|
||||
|
||||
Intrinsic movInst = op.RegisterSize == RegisterSize.Simd128 ? Intrinsic.X86Movlhps : Intrinsic.X86Movhlps;
|
||||
|
||||
Operand nInt = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, n, Const(X86GetRoundControl(FPRoundingMode.ToNearest)));
|
||||
nInt = context.AddIntrinsic(Intrinsic.X86Movlhps, nInt, nInt);
|
||||
|
||||
Operand res = context.VectorZeroUpper64(d);
|
||||
res = context.AddIntrinsic(movInst, res, nInt);
|
||||
|
||||
context.Copy(d, res);
|
||||
}
|
||||
|
|
|
@ -151,6 +151,8 @@ namespace ARMeilleure.IntermediateRepresentation
|
|||
X86Unpckhps,
|
||||
X86Unpcklpd,
|
||||
X86Unpcklps,
|
||||
X86Vcvtph2ps,
|
||||
X86Vcvtps2ph,
|
||||
X86Xorpd,
|
||||
X86Xorps
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@ namespace ARMeilleure
|
|||
public static bool UseSse42IfAvailable { get; set; } = true;
|
||||
public static bool UsePopCntIfAvailable { get; set; } = true;
|
||||
public static bool UseAvxIfAvailable { get; set; } = true;
|
||||
public static bool UseF16cIfAvailable { get; set; } = true;
|
||||
public static bool UseAesniIfAvailable { get; set; } = true;
|
||||
public static bool UsePclmulqdqIfAvailable { get; set; } = true;
|
||||
|
||||
|
@ -31,6 +32,7 @@ namespace ARMeilleure
|
|||
internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42;
|
||||
internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
|
||||
internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
|
||||
internal static bool UseF16c => UseF16cIfAvailable && HardwareCapabilities.SupportsF16c;
|
||||
internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
|
||||
internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
using ARMeilleure.CodeGen;
|
||||
using ARMeilleure.CodeGen.Unwinding;
|
||||
using ARMeilleure.CodeGen.X86;
|
||||
using ARMeilleure.Memory;
|
||||
using Ryujinx.Common.Configuration;
|
||||
using Ryujinx.Common.Logging;
|
||||
|
@ -10,7 +11,6 @@ using System.Diagnostics;
|
|||
using System.IO;
|
||||
using System.IO.Compression;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
@ -21,7 +21,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
{
|
||||
private const string HeaderMagic = "PTChd";
|
||||
|
||||
private const int InternalVersion = 1273; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
private const int InternalVersion = 1650; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
|
||||
private const string ActualDir = "0";
|
||||
private const string BackupDir = "1";
|
||||
|
@ -646,21 +646,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
|
||||
private static ulong GetFeatureInfo()
|
||||
{
|
||||
ulong featureInfo = 0ul;
|
||||
|
||||
featureInfo |= (Sse3.IsSupported ? 1ul : 0ul) << 0;
|
||||
featureInfo |= (Pclmulqdq.IsSupported ? 1ul : 0ul) << 1;
|
||||
featureInfo |= (Ssse3.IsSupported ? 1ul : 0ul) << 9;
|
||||
featureInfo |= (Fma.IsSupported ? 1ul : 0ul) << 12;
|
||||
featureInfo |= (Sse41.IsSupported ? 1ul : 0ul) << 19;
|
||||
featureInfo |= (Sse42.IsSupported ? 1ul : 0ul) << 20;
|
||||
featureInfo |= (Popcnt.IsSupported ? 1ul : 0ul) << 23;
|
||||
featureInfo |= (Aes.IsSupported ? 1ul : 0ul) << 25;
|
||||
featureInfo |= (Avx.IsSupported ? 1ul : 0ul) << 28;
|
||||
featureInfo |= (Sse.IsSupported ? 1ul : 0ul) << 57;
|
||||
featureInfo |= (Sse2.IsSupported ? 1ul : 0ul) << 58;
|
||||
|
||||
return featureInfo;
|
||||
return (ulong)HardwareCapabilities.FeatureInfoEdx << 32 | (uint)HardwareCapabilities.FeatureInfoEcx;
|
||||
}
|
||||
|
||||
private struct Header
|
||||
|
|
|
@ -1973,15 +1973,18 @@ namespace Ryujinx.Tests.Cpu
|
|||
CompareAgainstUnicorn();
|
||||
}
|
||||
|
||||
[Test, Pairwise] [Explicit]
|
||||
[Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
|
||||
public void F_Cvt_S_SH([ValueSource("_F_Cvt_S_SH_")] uint opcodes,
|
||||
[ValueSource("_1S_F_")] ulong a)
|
||||
[ValueSource("_1S_F_")] ulong a,
|
||||
[Values(RMode.Rn)] RMode rMode)
|
||||
{
|
||||
ulong z = TestContext.CurrentContext.Random.NextULong();
|
||||
V128 v0 = MakeVectorE0E1(z, z);
|
||||
V128 v1 = MakeVectorE0(a);
|
||||
|
||||
SingleOpcode(opcodes, v0: v0, v1: v1);
|
||||
int fpcr = (int)rMode << (int)Fpcr.RMode;
|
||||
|
||||
SingleOpcode(opcodes, v0: v0, v1: v1, fpcr: fpcr);
|
||||
|
||||
CompareAgainstUnicorn();
|
||||
}
|
||||
|
@ -2134,7 +2137,7 @@ namespace Ryujinx.Tests.Cpu
|
|||
CompareAgainstUnicorn(fpsrMask: Fpsr.Ioc | Fpsr.Ofc | Fpsr.Ufc | Fpsr.Ixc | Fpsr.Idc);
|
||||
}
|
||||
|
||||
[Test, Pairwise] [Explicit] // Unicorn seems to default all rounding modes to RMode.Rn.
|
||||
[Test, Pairwise] [Explicit]
|
||||
public void F_Cvtn_V_2D2S_2D4S([ValueSource("_F_Cvtn_V_2D2S_2D4S_")] uint opcodes,
|
||||
[Values(0u)] uint rd,
|
||||
[Values(1u, 0u)] uint rn,
|
||||
|
|
Loading…
Reference in a new issue