mirror of
https://github.com/Ryujinx/Ryujinx.git
synced 2025-01-07 01:46:03 +00:00
Add VCLZ.* fast path (#1917)
* Add VCLZ fast path
* Add VCLZ.8B/16B SSSE3 fast path
* Add VCLZ.4H/8H SSSE3 fast path
* Add VCLZ.2S/4S SSE2 fast path
* Improve CLZ.4H/8H fast path
* Improve CLZ.2S/4S fast path
* Set PPTC version
This commit is contained in:
parent
f94acdb4ef
commit
ddf1105bcb
3 changed files with 145 additions and 9 deletions
|
@ -120,24 +120,155 @@ namespace ARMeilleure.Instructions
|
||||||
{
|
{
|
||||||
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
||||||
|
|
||||||
Operand res = context.VectorZero();
|
|
||||||
|
|
||||||
int elems = op.GetBytesCount() >> op.Size;
|
|
||||||
|
|
||||||
int eSize = 8 << op.Size;
|
int eSize = 8 << op.Size;
|
||||||
|
|
||||||
for (int index = 0; index < elems; index++)
|
Operand res = eSize switch {
|
||||||
|
8 => Clz_V_I8 (context, GetVec(op.Rn)),
|
||||||
|
16 => Clz_V_I16(context, GetVec(op.Rn)),
|
||||||
|
32 => Clz_V_I32(context, GetVec(op.Rn)),
|
||||||
|
_ => null
|
||||||
|
};
|
||||||
|
|
||||||
|
if (res != null)
|
||||||
{
|
{
|
||||||
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
|
if (op.RegisterSize == RegisterSize.Simd64)
|
||||||
|
{
|
||||||
|
res = context.VectorZeroUpper64(res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
int elems = op.GetBytesCount() >> op.Size;
|
||||||
|
|
||||||
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
|
res = context.VectorZero();
|
||||||
|
|
||||||
res = EmitVectorInsert(context, res, de, index, op.Size);
|
for (int index = 0; index < elems; index++)
|
||||||
|
{
|
||||||
|
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
|
||||||
|
|
||||||
|
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
|
||||||
|
|
||||||
|
res = EmitVectorInsert(context, res, de, index, op.Size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
context.Copy(GetVec(op.Rd), res);
|
context.Copy(GetVec(op.Rd), res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Emits the SSSE3 fast path that counts leading zeros in each 8-bit element of <paramref name="arg"/>.
/// </summary>
/// <param name="context">Emitter context that receives the generated intrinsics.</param>
/// <param name="arg">Vector operand whose 8-bit elements are examined.</param>
/// <returns>
/// The operand produced by the final byte-wise add, or <c>null</c> when
/// <c>Optimizations.UseSsse3</c> is false so the caller can pick another path.
/// </returns>
private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
{
    if (!Optimizations.UseSsse3)
    {
        return null;
    }

    // Constant used as the PSHUFB source: the CLZ value for each 4-bit input.
    Operand nibbleClzTable = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);

    Operand lowNibbleMask = X86GetAllElements(context, 0x0f_0f_0f_0f);
    Operand nibbleWidth   = X86GetAllElements(context, 0x04_04_04_04);

    // Table lookup driven by arg itself: CLZ of the low 4 bits of every element.
    Operand lowCount = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzTable, arg);

    // Shift the high 4 bits of every element down and mask away what the word-wide shift dragged in.
    Operand highNibbles = context.AddIntrinsic(
        Intrinsic.X86Pand,
        context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4)),
        lowNibbleMask);

    // CLZ of the high 4 bits of every element.
    Operand highCount = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzTable, highNibbles);

    // The low-nibble count only contributes when the high-nibble count equals 4 (high nibble all zero).
    Operand highNibbleIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, highCount, nibbleWidth);
    Operand lowContribution  = context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highNibbleIsZero);

    return context.AddIntrinsic(Intrinsic.X86Paddb, lowContribution, highCount);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Emits the SSSE3 fast path that counts leading zeros in each 16-bit element of <paramref name="arg"/>,
/// built on top of the per-byte counts from <c>Clz_V_I8</c>.
/// </summary>
/// <param name="context">Emitter context that receives the generated intrinsics.</param>
/// <param name="arg">Vector operand whose 16-bit elements are examined.</param>
/// <returns>
/// The operand produced by the final word-wise add, or <c>null</c> when
/// <c>Optimizations.UseSsse3</c> is false so the caller can pick another path.
/// </returns>
private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
{
    if (!Optimizations.UseSsse3)
    {
        return null;
    }

    // PSHUFB control used below to obtain the count of the high 8 bits of each pair.
    Operand highByteSelector = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
    Operand lowByteMask      = X86GetAllElements(context, 0x00ff_00ff);
    Operand byteWidth        = X86GetAllElements(context, 0x0008_0008);

    // One CLZ value per byte: a (high 8 bits, low 8 bits) pair for every 16-bit element.
    Operand perByteCount = Clz_V_I8(context, arg);

    // Isolate the count belonging to the low 8 bits of each pair.
    Operand lowCount = context.AddIntrinsic(Intrinsic.X86Pand, perByteCount, lowByteMask);

    // Isolate the count belonging to the high 8 bits of each pair.
    Operand highCount = context.AddIntrinsic(Intrinsic.X86Pshufb, perByteCount, highByteSelector);

    // The low-byte count only contributes when the high-byte count equals 8 (high byte all zero).
    Operand highByteIsZero  = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, highCount, byteWidth);
    Operand lowContribution = context.AddIntrinsic(Intrinsic.X86Pand, lowCount, highByteIsZero);

    return context.AddIntrinsic(Intrinsic.X86Paddw, lowContribution, highCount);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Emits the SSE2 fast path that counts leading zeros in each 32-bit element of <paramref name="arg"/>.
/// The sequence smears the highest set bit downwards, inverts the result and then takes a
/// population count of what remains.
/// </summary>
/// <param name="context">Emitter context that receives the generated intrinsics.</param>
/// <param name="arg">Vector operand whose 32-bit elements are examined.</param>
/// <returns>
/// The operand holding the masked per-element counts, or <c>null</c> when
/// <c>Optimizations.UseSse2</c> is false so the caller can pick another path.
/// </returns>
private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
{
    // TODO: Use vplzcntd when AVX-512 is supported.
    if (!Optimizations.UseSse2)
    {
        return null;
    }

    // Thin wrappers so the bit manipulation below reads like ordinary arithmetic.
    Operand Add(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Paddd, lhs, rhs);
    Operand Sub(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Psubd, lhs, rhs);
    Operand Shr(Operand value, int count) => context.AddIntrinsic(Intrinsic.X86Psrld, value, Const(count));
    Operand Or(Operand lhs, Operand rhs)  => context.AddIntrinsic(Intrinsic.X86Por, lhs, rhs);
    Operand And(Operand lhs, Operand rhs) => context.AddIntrinsic(Intrinsic.X86Pand, lhs, rhs);

    Operand pairMask   = X86GetAllElements(context, 0x55555555);
    Operand nibbleMask = X86GetAllElements(context, 0x33333333);
    Operand byteMask   = X86GetAllElements(context, 0x0f0f0f0f);
    Operand countMask  = X86GetAllElements(context, 0x0000003f);

    // Set all bits after the highest set bit to 1 (shifts of 1, 2, 4, 8 and 16).
    Operand bits = arg;

    for (int shift = 1; shift <= 16; shift <<= 1)
    {
        bits = Or(Shr(bits, shift), bits);
    }

    // Invert (PANDN against an all-ones vector): leading 0s become leading 1s, everything else 0.
    bits = context.AddIntrinsic(Intrinsic.X86Pandn, bits, context.VectorOne());

    // Count the leading 1s, which is now simply the population count.
    bits = Sub(bits, And(Shr(bits, 1), pairMask));

    Operand upperPairs = And(Shr(bits, 2), nibbleMask);
    Operand lowerPairs = And(bits, nibbleMask);
    bits = Add(upperPairs, lowerPairs);

    bits = And(Add(Shr(bits, 4), bits), byteMask);

    bits = Add(Shr(bits, 8), bits);
    bits = Add(Shr(bits, 16), bits);

    // Keep only the low 6 bits of each element, discarding the partial sums above them.
    return And(bits, countMask);
}
|
||||||
|
|
||||||
public static void Cnt_V(ArmEmitterContext context)
|
public static void Cnt_V(ArmEmitterContext context)
|
||||||
{
|
{
|
||||||
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
|
||||||
|
|
|
@ -209,6 +209,11 @@ namespace ARMeilleure.Instructions
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
|
/// <summary>
/// Signed convenience overload: reinterprets both 64-bit values as <c>ulong</c> and forwards
/// to the <c>ulong</c> overload of <c>X86GetElements</c>.
/// </summary>
/// <param name="context">Emitter context passed through unchanged.</param>
/// <param name="e1">First 64-bit value, forwarded as the <c>e1</c> argument.</param>
/// <param name="e0">Second 64-bit value, forwarded as the <c>e0</c> argument.</param>
/// <returns>Whatever the <c>ulong</c> overload returns.</returns>
public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
    => X86GetElements(context, (ulong)e1, (ulong)e0);
|
||||||
|
|
||||||
|
public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
|
||||||
{
|
{
|
||||||
Operand vector0 = context.VectorCreateScalar(Const(e0));
|
Operand vector0 = context.VectorCreateScalar(Const(e0));
|
||||||
Operand vector1 = context.VectorCreateScalar(Const(e1));
|
Operand vector1 = context.VectorCreateScalar(Const(e1));
|
||||||
|
|
|
@ -22,7 +22,7 @@ namespace ARMeilleure.Translation.PTC
|
||||||
{
|
{
|
||||||
private const string HeaderMagic = "PTChd";
|
private const string HeaderMagic = "PTChd";
|
||||||
|
|
||||||
private const int InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
|
private const int InternalVersion = 1917; //! To be incremented manually for each change to the ARMeilleure project.
|
||||||
|
|
||||||
private const string ActualDir = "0";
|
private const string ActualDir = "0";
|
||||||
private const string BackupDir = "1";
|
private const string BackupDir = "1";
|
||||||
|
|
Loading…
Reference in a new issue