1
0
Fork 0
mirror of https://github.com/Ryujinx/Ryujinx.git synced 2025-01-08 19:36:04 +00:00
Ryujinx/ARMeilleure/CodeGen/X86/CodeGenerator.cs

1727 lines
63 KiB
C#
Raw Normal View History

Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
using ARMeilleure.CodeGen.Optimizations;
using ARMeilleure.CodeGen.RegisterAllocators;
using ARMeilleure.CodeGen.Unwinding;
using ARMeilleure.Common;
using ARMeilleure.Diagnostics;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
namespace ARMeilleure.CodeGen.X86
{
static class CodeGenerator
{
private const int PageSize = 0x1000;
private const int StackGuardSize = 0x2000;
private static Action<CodeGenContext, Operation>[] _instTable;
static CodeGenerator()
{
_instTable = new Action<CodeGenContext, Operation>[EnumUtils.GetCount(typeof(Instruction))];
Add(Instruction.Add, GenerateAdd);
Add(Instruction.BitwiseAnd, GenerateBitwiseAnd);
Add(Instruction.BitwiseExclusiveOr, GenerateBitwiseExclusiveOr);
Add(Instruction.BitwiseNot, GenerateBitwiseNot);
Add(Instruction.BitwiseOr, GenerateBitwiseOr);
Add(Instruction.Branch, GenerateBranch);
Add(Instruction.BranchIfFalse, GenerateBranchIfFalse);
Add(Instruction.BranchIfTrue, GenerateBranchIfTrue);
Add(Instruction.ByteSwap, GenerateByteSwap);
Add(Instruction.Call, GenerateCall);
Add(Instruction.Clobber, GenerateClobber);
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
Add(Instruction.CompareAndSwap, GenerateCompareAndSwap);
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
Add(Instruction.CompareEqual, GenerateCompareEqual);
Add(Instruction.CompareGreater, GenerateCompareGreater);
Add(Instruction.CompareGreaterOrEqual, GenerateCompareGreaterOrEqual);
Add(Instruction.CompareGreaterOrEqualUI, GenerateCompareGreaterOrEqualUI);
Add(Instruction.CompareGreaterUI, GenerateCompareGreaterUI);
Add(Instruction.CompareLess, GenerateCompareLess);
Add(Instruction.CompareLessOrEqual, GenerateCompareLessOrEqual);
Add(Instruction.CompareLessOrEqualUI, GenerateCompareLessOrEqualUI);
Add(Instruction.CompareLessUI, GenerateCompareLessUI);
Add(Instruction.CompareNotEqual, GenerateCompareNotEqual);
Add(Instruction.ConditionalSelect, GenerateConditionalSelect);
Add(Instruction.ConvertI64ToI32, GenerateConvertI64ToI32);
Add(Instruction.ConvertToFP, GenerateConvertToFP);
Add(Instruction.Copy, GenerateCopy);
Add(Instruction.CountLeadingZeros, GenerateCountLeadingZeros);
Add(Instruction.CpuId, GenerateCpuId);
Add(Instruction.Divide, GenerateDivide);
Add(Instruction.DivideUI, GenerateDivideUI);
Add(Instruction.Fill, GenerateFill);
Add(Instruction.Load, GenerateLoad);
Add(Instruction.Load16, GenerateLoad16);
Add(Instruction.Load8, GenerateLoad8);
Add(Instruction.Multiply, GenerateMultiply);
Add(Instruction.Multiply64HighSI, GenerateMultiply64HighSI);
Add(Instruction.Multiply64HighUI, GenerateMultiply64HighUI);
Add(Instruction.Negate, GenerateNegate);
Add(Instruction.Return, GenerateReturn);
Add(Instruction.RotateRight, GenerateRotateRight);
Add(Instruction.ShiftLeft, GenerateShiftLeft);
Add(Instruction.ShiftRightSI, GenerateShiftRightSI);
Add(Instruction.ShiftRightUI, GenerateShiftRightUI);
Add(Instruction.SignExtend16, GenerateSignExtend16);
Add(Instruction.SignExtend32, GenerateSignExtend32);
Add(Instruction.SignExtend8, GenerateSignExtend8);
Add(Instruction.Spill, GenerateSpill);
Add(Instruction.SpillArg, GenerateSpillArg);
Add(Instruction.StackAlloc, GenerateStackAlloc);
Add(Instruction.Store, GenerateStore);
Add(Instruction.Store16, GenerateStore16);
Add(Instruction.Store8, GenerateStore8);
Add(Instruction.Subtract, GenerateSubtract);
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
Add(Instruction.Tailcall, GenerateTailcall);
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
Add(Instruction.VectorCreateScalar, GenerateVectorCreateScalar);
Add(Instruction.VectorExtract, GenerateVectorExtract);
Add(Instruction.VectorExtract16, GenerateVectorExtract16);
Add(Instruction.VectorExtract8, GenerateVectorExtract8);
Add(Instruction.VectorInsert, GenerateVectorInsert);
Add(Instruction.VectorInsert16, GenerateVectorInsert16);
Add(Instruction.VectorInsert8, GenerateVectorInsert8);
Add(Instruction.VectorOne, GenerateVectorOne);
Add(Instruction.VectorZero, GenerateVectorZero);
Add(Instruction.VectorZeroUpper64, GenerateVectorZeroUpper64);
Add(Instruction.VectorZeroUpper96, GenerateVectorZeroUpper96);
Add(Instruction.ZeroExtend16, GenerateZeroExtend16);
Add(Instruction.ZeroExtend32, GenerateZeroExtend32);
Add(Instruction.ZeroExtend8, GenerateZeroExtend8);
}
private static void Add(Instruction inst, Action<CodeGenContext, Operation> func)
{
_instTable[(int)inst] = func;
}
public static CompiledFunction Generate(CompilerContext cctx)
{
ControlFlowGraph cfg = cctx.Cfg;
Logger.StartPass(PassName.Optimization);
if ((cctx.Options & CompilerOptions.SsaForm) != 0 &&
(cctx.Options & CompilerOptions.Optimize) != 0)
{
Optimizer.RunPass(cfg);
}
X86Optimizer.RunPass(cfg);
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
Logger.EndPass(PassName.Optimization, cfg);
Logger.StartPass(PassName.PreAllocation);
StackAllocator stackAlloc = new StackAllocator();
PreAllocator.RunPass(cctx, stackAlloc, out int maxCallArgs);
Logger.EndPass(PassName.PreAllocation, cfg);
Logger.StartPass(PassName.RegisterAllocation);
if ((cctx.Options & CompilerOptions.SsaForm) != 0)
{
Ssa.Deconstruct(cfg);
}
IRegisterAllocator regAlloc;
if ((cctx.Options & CompilerOptions.Lsra) != 0)
{
regAlloc = new LinearScanAllocator();
}
else
{
regAlloc = new HybridAllocator();
}
RegisterMasks regMasks = new RegisterMasks(
CallingConvention.GetIntAvailableRegisters(),
CallingConvention.GetVecAvailableRegisters(),
CallingConvention.GetIntCallerSavedRegisters(),
CallingConvention.GetVecCallerSavedRegisters(),
CallingConvention.GetIntCalleeSavedRegisters(),
CallingConvention.GetVecCalleeSavedRegisters());
AllocationResult allocResult = regAlloc.RunPass(cfg, stackAlloc, regMasks);
Logger.EndPass(PassName.RegisterAllocation, cfg);
Logger.StartPass(PassName.CodeGeneration);
using (MemoryStream stream = new MemoryStream())
{
CodeGenContext context = new CodeGenContext(stream, allocResult, maxCallArgs, cfg.Blocks.Count);
UnwindInfo unwindInfo = WritePrologue(context);
for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext)
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
{
context.EnterBlock(block);
for (Node node = block.Operations.First; node != null; node = node.ListNext)
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
{
if (node is Operation operation)
{
GenerateOperation(context, operation);
}
}
}
Logger.EndPass(PassName.CodeGeneration);
return new CompiledFunction(context.GetCode(), unwindInfo);
}
}
private static void GenerateOperation(CodeGenContext context, Operation operation)
{
if (operation.Instruction == Instruction.Extended)
{
IntrinsicOperation intrinOp = (IntrinsicOperation)operation;
IntrinsicInfo info = IntrinsicTable.GetInfo(intrinOp.Intrinsic);
switch (info.Type)
{
case IntrinsicType.Comis_:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
switch (intrinOp.Intrinsic)
{
case Intrinsic.X86Comisdeq:
context.Assembler.Comisd(src1, src2);
context.Assembler.Setcc(dest, X86Condition.Equal);
break;
case Intrinsic.X86Comisdge:
context.Assembler.Comisd(src1, src2);
context.Assembler.Setcc(dest, X86Condition.AboveOrEqual);
break;
case Intrinsic.X86Comisdlt:
context.Assembler.Comisd(src1, src2);
context.Assembler.Setcc(dest, X86Condition.Below);
break;
case Intrinsic.X86Comisseq:
context.Assembler.Comiss(src1, src2);
context.Assembler.Setcc(dest, X86Condition.Equal);
break;
case Intrinsic.X86Comissge:
context.Assembler.Comiss(src1, src2);
context.Assembler.Setcc(dest, X86Condition.AboveOrEqual);
break;
case Intrinsic.X86Comisslt:
context.Assembler.Comiss(src1, src2);
context.Assembler.Setcc(dest, X86Condition.Below);
break;
}
context.Assembler.Movzx8(dest, dest, OperandType.I32);
break;
}
case IntrinsicType.PopCount:
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
EnsureSameType(dest, source);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.Popcnt(dest, source, dest.Type);
break;
}
case IntrinsicType.Unary:
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
EnsureSameType(dest, source);
Debug.Assert(!dest.Type.IsInteger());
context.Assembler.WriteInstruction(info.Inst, dest, source);
break;
}
case IntrinsicType.UnaryToGpr:
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && !source.Type.IsInteger());
if (intrinOp.Intrinsic == Intrinsic.X86Cvtsi2si)
{
if (dest.Type == OperandType.I32)
{
context.Assembler.Movd(dest, source); // int _mm_cvtsi128_si32(__m128i a)
}
else /* if (dest.Type == OperandType.I64) */
{
context.Assembler.Movq(dest, source); // __int64 _mm_cvtsi128_si64(__m128i a)
}
}
else
{
context.Assembler.WriteInstruction(info.Inst, dest, source, dest.Type);
}
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
break;
}
case IntrinsicType.Binary:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
EnsureSameType(dest, src1);
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(!dest.Type.IsInteger());
Debug.Assert(!src2.Type.IsInteger() || src2.Kind == OperandKind.Constant);
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2);
break;
}
case IntrinsicType.BinaryGpr:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
EnsureSameType(dest, src1);
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(!dest.Type.IsInteger() && src2.Type.IsInteger());
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2, src2.Type);
break;
}
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
case IntrinsicType.BinaryImm:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
EnsureSameType(dest, src1);
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(!dest.Type.IsInteger() && src2.Kind == OperandKind.Constant);
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2.AsByte());
break;
}
case IntrinsicType.Ternary:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
EnsureSameType(dest, src1, src2, src3);
Debug.Assert(!dest.Type.IsInteger());
if (info.Inst == X86Instruction.Blendvpd && HardwareCapabilities.SupportsVexEncoding)
{
context.Assembler.WriteInstruction(X86Instruction.Vblendvpd, dest, src1, src2, src3);
}
else if (info.Inst == X86Instruction.Blendvps && HardwareCapabilities.SupportsVexEncoding)
{
context.Assembler.WriteInstruction(X86Instruction.Vblendvps, dest, src1, src2, src3);
}
else if (info.Inst == X86Instruction.Pblendvb && HardwareCapabilities.SupportsVexEncoding)
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
{
context.Assembler.WriteInstruction(X86Instruction.Vpblendvb, dest, src1, src2, src3);
}
else
{
EnsureSameReg(dest, src1);
Debug.Assert(src3.GetRegister().Index == 0);
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2);
}
break;
}
case IntrinsicType.TernaryImm:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
EnsureSameType(dest, src1, src2);
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(!dest.Type.IsInteger() && src3.Kind == OperandKind.Constant);
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2, src3.AsByte());
break;
}
}
}
else
{
Action<CodeGenContext, Operation> func = _instTable[(int)operation.Instruction];
if (func != null)
{
func(context, operation);
}
else
{
throw new ArgumentException($"Invalid instruction \"{operation.Instruction}\".");
}
}
}
private static void GenerateAdd(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateBinOp(dest, src1, src2);
if (dest.Type.IsInteger())
{
context.Assembler.Add(dest, src2, dest.Type);
}
else if (dest.Type == OperandType.FP32)
{
context.Assembler.Addss(dest, src1, src2);
}
else /* if (dest.Type == OperandType.FP64) */
{
context.Assembler.Addsd(dest, src1, src2);
}
}
private static void GenerateBitwiseAnd(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateBinOp(dest, src1, src2);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.And(dest, src2, dest.Type);
}
private static void GenerateBitwiseExclusiveOr(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateBinOp(dest, src1, src2);
if (dest.Type.IsInteger())
{
context.Assembler.Xor(dest, src2, dest.Type);
}
else
{
context.Assembler.Xorps(dest, src1, src2);
}
}
private static void GenerateBitwiseNot(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
ValidateUnOp(dest, source);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.Not(dest);
}
private static void GenerateBitwiseOr(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateBinOp(dest, src1, src2);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.Or(dest, src2, dest.Type);
}
private static void GenerateBranch(CodeGenContext context, Operation operation)
{
context.JumpTo(context.CurrBlock.Branch);
}
private static void GenerateBranchIfFalse(CodeGenContext context, Operation operation)
{
Operand source = operation.GetSource(0);
context.Assembler.Test(source, source, source.Type);
context.JumpTo(X86Condition.Equal, context.CurrBlock.Branch);
}
private static void GenerateBranchIfTrue(CodeGenContext context, Operation operation)
{
Operand source = operation.GetSource(0);
context.Assembler.Test(source, source, source.Type);
context.JumpTo(X86Condition.NotEqual, context.CurrBlock.Branch);
}
private static void GenerateByteSwap(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
ValidateUnOp(dest, source);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.Bswap(dest);
}
private static void GenerateCall(CodeGenContext context, Operation operation)
{
context.Assembler.Call(operation.GetSource(0));
}
private static void GenerateClobber(CodeGenContext context, Operation operation)
{
// This is only used to indicate that a register is clobbered to the
// register allocator, we don't need to produce any code.
}
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
private static void GenerateCompareAndSwap(CodeGenContext context, Operation operation)
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
{
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
Operand src1 = operation.GetSource(0);
if (operation.SourcesCount == 5) // CompareAndSwap128 has 5 sources, compared to CompareAndSwap64/32's 3.
{
MemoryOperand memOp = new MemoryOperand(OperandType.I64, src1);
context.Assembler.Cmpxchg16b(memOp);
}
else
{
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
EnsureSameType(src2, src3);
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
MemoryOperand memOp = new MemoryOperand(src3.Type, src1);
context.Assembler.Cmpxchg(memOp, src3);
}
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
}
private static void GenerateCompareEqual(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.Equal);
}
private static void GenerateCompareGreater(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.Greater);
}
private static void GenerateCompareGreaterOrEqual(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.GreaterOrEqual);
}
private static void GenerateCompareGreaterOrEqualUI(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.AboveOrEqual);
}
private static void GenerateCompareGreaterUI(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.Above);
}
private static void GenerateCompareLess(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.Less);
}
private static void GenerateCompareLessOrEqual(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.LessOrEqual);
}
private static void GenerateCompareLessOrEqualUI(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.BelowOrEqual);
}
private static void GenerateCompareLessUI(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.Below);
}
private static void GenerateCompareNotEqual(CodeGenContext context, Operation operation)
{
GenerateCompare(context, operation, X86Condition.NotEqual);
}
private static void GenerateCompare(CodeGenContext context, Operation operation, X86Condition condition)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
EnsureSameType(src1, src2);
Debug.Assert(dest.Type == OperandType.I32);
context.Assembler.Cmp(src1, src2, src1.Type);
context.Assembler.Setcc(dest, condition);
context.Assembler.Movzx8(dest, dest, OperandType.I32);
}
private static void GenerateConditionalSelect(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
EnsureSameReg (dest, src3);
EnsureSameType(dest, src2, src3);
Debug.Assert(dest.Type.IsInteger());
Debug.Assert(src1.Type == OperandType.I32);
context.Assembler.Test (src1, src1, src1.Type);
context.Assembler.Cmovcc(dest, src2, dest.Type, X86Condition.NotEqual);
}
private static void GenerateConvertI64ToI32(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type == OperandType.I32 && source.Type == OperandType.I64);
context.Assembler.Mov(dest, source, OperandType.I32);
}
private static void GenerateConvertToFP(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type == OperandType.FP32 || dest.Type == OperandType.FP64);
if (dest.Type == OperandType.FP32)
{
Debug.Assert(source.Type.IsInteger() || source.Type == OperandType.FP64);
if (source.Type.IsInteger())
{
context.Assembler.Xorps (dest, dest, dest);
context.Assembler.Cvtsi2ss(dest, dest, source, source.Type);
}
else /* if (source.Type == OperandType.FP64) */
{
context.Assembler.Cvtsd2ss(dest, dest, source);
GenerateZeroUpper96(context, dest, dest);
}
}
else /* if (dest.Type == OperandType.FP64) */
{
Debug.Assert(source.Type.IsInteger() || source.Type == OperandType.FP32);
if (source.Type.IsInteger())
{
context.Assembler.Xorps (dest, dest, dest);
context.Assembler.Cvtsi2sd(dest, dest, source, source.Type);
}
else /* if (source.Type == OperandType.FP32) */
{
context.Assembler.Cvtss2sd(dest, dest, source);
GenerateZeroUpper64(context, dest, dest);
}
}
}
private static void GenerateCopy(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
EnsureSameType(dest, source);
Debug.Assert(dest.Type.IsInteger() || source.Kind != OperandKind.Constant);
// Moves to the same register are useless.
if (dest.Kind == source.Kind && dest.Value == source.Value)
{
return;
}
if (dest.Kind == OperandKind.Register &&
source.Kind == OperandKind.Constant && source.Value == 0)
{
// Assemble "mov reg, 0" as "xor reg, reg" as the later is more efficient.
context.Assembler.Xor(dest, dest, OperandType.I32);
}
else if (dest.Type.IsInteger())
{
context.Assembler.Mov(dest, source, dest.Type);
}
else
{
context.Assembler.Movdqu(dest, source);
}
}
private static void GenerateCountLeadingZeros(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
EnsureSameType(dest, source);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.Bsr(dest, source, dest.Type);
int operandSize = dest.Type == OperandType.I32 ? 32 : 64;
int operandMask = operandSize - 1;
// When the input operand is 0, the result is undefined, however the
// ZF flag is set. We are supposed to return the operand size on that
// case. So, add an additional jump to handle that case, by moving the
// operand size constant to the destination register.
context.JumpToNear(X86Condition.NotEqual);
context.Assembler.Mov(dest, new Operand(operandSize | operandMask), OperandType.I32);
context.JumpHere();
// BSR returns the zero based index of the last bit set on the operand,
// starting from the least significant bit. However we are supposed to
// return the number of 0 bits on the high end. So, we invert the result
// of the BSR using XOR to get the correct value.
context.Assembler.Xor(dest, new Operand(operandMask), OperandType.I32);
}
private static void GenerateCpuId(CodeGenContext context, Operation operation)
{
context.Assembler.Cpuid();
}
private static void GenerateDivide(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand dividend = operation.GetSource(0);
Operand divisor = operation.GetSource(1);
if (!dest.Type.IsInteger())
{
ValidateBinOp(dest, dividend, divisor);
}
if (dest.Type.IsInteger())
{
divisor = operation.GetSource(2);
EnsureSameType(dest, divisor);
if (divisor.Type == OperandType.I32)
{
context.Assembler.Cdq();
}
else
{
context.Assembler.Cqo();
}
context.Assembler.Idiv(divisor);
}
else if (dest.Type == OperandType.FP32)
{
context.Assembler.Divss(dest, dividend, divisor);
}
else /* if (dest.Type == OperandType.FP64) */
{
context.Assembler.Divsd(dest, dividend, divisor);
}
}
private static void GenerateDivideUI(CodeGenContext context, Operation operation)
{
Operand divisor = operation.GetSource(2);
Operand rdx = Register(X86Register.Rdx);
Debug.Assert(divisor.Type.IsInteger());
context.Assembler.Xor(rdx, rdx, OperandType.I32);
context.Assembler.Div(divisor);
}
private static void GenerateFill(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand offset = operation.GetSource(0);
Debug.Assert(offset.Kind == OperandKind.Constant);
int offs = offset.AsInt32() + context.CallArgsRegionSize;
Operand rsp = Register(X86Register.Rsp);
MemoryOperand memOp = new MemoryOperand(dest.Type, rsp, null, Multiplier.x1, offs);
GenerateLoad(context, memOp, dest);
}
private static void GenerateLoad(CodeGenContext context, Operation operation)
{
Operand value = operation.Destination;
Operand address = Memory(operation.GetSource(0), value.Type);
GenerateLoad(context, address, value);
}
private static void GenerateLoad16(CodeGenContext context, Operation operation)
{
Operand value = operation.Destination;
Operand address = Memory(operation.GetSource(0), value.Type);
Debug.Assert(value.Type.IsInteger());
context.Assembler.Movzx16(value, address, value.Type);
}
private static void GenerateLoad8(CodeGenContext context, Operation operation)
{
Operand value = operation.Destination;
Operand address = Memory(operation.GetSource(0), value.Type);
Debug.Assert(value.Type.IsInteger());
context.Assembler.Movzx8(value, address, value.Type);
}
private static void GenerateMultiply(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
if (src2.Kind != OperandKind.Constant)
{
EnsureSameReg(dest, src1);
}
EnsureSameType(dest, src1, src2);
if (dest.Type.IsInteger())
{
if (src2.Kind == OperandKind.Constant)
{
context.Assembler.Imul(dest, src1, src2, dest.Type);
}
else
{
context.Assembler.Imul(dest, src2, dest.Type);
}
}
else if (dest.Type == OperandType.FP32)
{
context.Assembler.Mulss(dest, src1, src2);
}
else /* if (dest.Type == OperandType.FP64) */
{
context.Assembler.Mulsd(dest, src1, src2);
}
}
private static void GenerateMultiply64HighSI(CodeGenContext context, Operation operation)
{
Operand source = operation.GetSource(1);
Debug.Assert(source.Type == OperandType.I64);
context.Assembler.Imul(source);
}
private static void GenerateMultiply64HighUI(CodeGenContext context, Operation operation)
{
Operand source = operation.GetSource(1);
Debug.Assert(source.Type == OperandType.I64);
context.Assembler.Mul(source);
}
private static void GenerateNegate(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
ValidateUnOp(dest, source);
Debug.Assert(dest.Type.IsInteger());
context.Assembler.Neg(dest);
}
private static void GenerateReturn(CodeGenContext context, Operation operation)
{
WriteEpilogue(context);
context.Assembler.Return();
}
private static void GenerateRotateRight(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateShift(dest, src1, src2);
context.Assembler.Ror(dest, src2, dest.Type);
}
private static void GenerateShiftLeft(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateShift(dest, src1, src2);
context.Assembler.Shl(dest, src2, dest.Type);
}
private static void GenerateShiftRightSI(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateShift(dest, src1, src2);
context.Assembler.Sar(dest, src2, dest.Type);
}
private static void GenerateShiftRightUI(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateShift(dest, src1, src2);
context.Assembler.Shr(dest, src2, dest.Type);
}
private static void GenerateSignExtend16(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && source.Type.IsInteger());
context.Assembler.Movsx16(dest, source, dest.Type);
}
private static void GenerateSignExtend32(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && source.Type.IsInteger());
context.Assembler.Movsx32(dest, source, dest.Type);
}
private static void GenerateSignExtend8(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && source.Type.IsInteger());
context.Assembler.Movsx8(dest, source, dest.Type);
}
private static void GenerateSpill(CodeGenContext context, Operation operation)
{
GenerateSpill(context, operation, context.CallArgsRegionSize);
}
private static void GenerateSpillArg(CodeGenContext context, Operation operation)
{
GenerateSpill(context, operation, 0);
}
private static void GenerateSpill(CodeGenContext context, Operation operation, int baseOffset)
{
Operand offset = operation.GetSource(0);
Operand source = operation.GetSource(1);
Debug.Assert(offset.Kind == OperandKind.Constant);
int offs = offset.AsInt32() + baseOffset;
Operand rsp = Register(X86Register.Rsp);
MemoryOperand memOp = new MemoryOperand(source.Type, rsp, null, Multiplier.x1, offs);
GenerateStore(context, memOp, source);
}
private static void GenerateStackAlloc(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand offset = operation.GetSource(0);
Debug.Assert(offset.Kind == OperandKind.Constant);
int offs = offset.AsInt32() + context.CallArgsRegionSize;
Operand rsp = Register(X86Register.Rsp);
MemoryOperand memOp = new MemoryOperand(OperandType.I64, rsp, null, Multiplier.x1, offs);
context.Assembler.Lea(dest, memOp, OperandType.I64);
}
private static void GenerateStore(CodeGenContext context, Operation operation)
{
Operand value = operation.GetSource(1);
Operand address = Memory(operation.GetSource(0), value.Type);
GenerateStore(context, address, value);
}
private static void GenerateStore16(CodeGenContext context, Operation operation)
{
Operand value = operation.GetSource(1);
Operand address = Memory(operation.GetSource(0), value.Type);
Debug.Assert(value.Type.IsInteger());
context.Assembler.Mov16(address, value);
}
private static void GenerateStore8(CodeGenContext context, Operation operation)
{
Operand value = operation.GetSource(1);
Operand address = Memory(operation.GetSource(0), value.Type);
Debug.Assert(value.Type.IsInteger());
context.Assembler.Mov8(address, value);
}
private static void GenerateSubtract(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
ValidateBinOp(dest, src1, src2);
if (dest.Type.IsInteger())
{
context.Assembler.Sub(dest, src2, dest.Type);
}
else if (dest.Type == OperandType.FP32)
{
context.Assembler.Subss(dest, src1, src2);
}
else /* if (dest.Type == OperandType.FP64) */
{
context.Assembler.Subsd(dest, src1, src2);
}
}
Use a Jump Table for direct and indirect calls/jumps, removing transitions to managed (#975) * Implement Jump Table for Native Calls NOTE: this slows down rejit considerably! Not recommended to be used without codegen optimisation or AOT. - Does not work on Linux - A32 needs an additional commit. * A32 Support (WIP) * Actually write Direct Call pointers to the table That would help. * Direct Calls: Rather than returning to the translator, attempt to keep within the native stack frame. A return to the translator can still happen, but only by exceptionally bubbling up to it. Also: - Always translate lowCq as a function. Faster interop with the direct jumps, and this will be useful in future if we want to do speculative translation. - Tail Call Detection: after the decoding stage, detect if we do a tail call, and avoid translating into it. Detected if a jump is made to an address outwith the contiguous sequence of blocks surrounding the entry point. The goal is to reduce code touched by jit and rejit. * A32 Support * Use smaller max function size for lowCq, fix exceptional returns When a return has an unexpected value and there is no code block following this one, we now return the value rather than continuing. * CompareAndSwap (buggy) * Ensure CompareAndSwap does not get optimized away. * Use CompareAndSwap to make the dynamic table thread safe. * Tail call for linux, throw on too many arguments. * Combine CompareAndSwap 128 and 32/64. They emit different IR instructions since their PreAllocator behaviour is different, but now they just have one function on EmitterContext. * Fix issues separating from optimisations. * Use a stub to find and execute missing functions. This allows us to skip doing many runtime comparisons and branches, and reduces the amount of code we need to emit significantly. For the indirect call table, this stub also does the work of moving in the highCq address to the table when one is found. * Make Jump Tables and Jit Cache dynmically resize Reserve virtual memory, commit as needed. * Move TailCallRemover to its own class. * Multithreaded Translation (based on heuristic) A poor one, at that. Need to get core count for a better one, which means a lot of OS specific garbage. * Better priority management for background threads. * Bound core limit a bit more Past a certain point the load is not paralellizable and starts stealing from the main thread. Likely due to GC, memory, heap allocation thread contention. Reduce by one core til optimisations come to improve the situation. * Fix memory management on linux. * Temporary solution to some sync problems. This will make sure threads exit correctly, most of the time. There is a potential race where setting the sync counter to 0 does nothing (counter stays at what it was before, thread could take too long to exit), but we need to find a better way to do this anyways. Synchronization frequency has been tightened as we never enter blockwise segments of code. Essentially this means, check every x functions or loop iterations, before lowcq blocks existed and were worth just as much. Ideally it should be done in a better way, since functions can be anywhere from 1 to 5000 instructions. (maybe based on host timer, or an interrupt flag from a scheduler thread) * Address feedback minus CompareAndSwap change. * Use default ReservedRegion granularity. * Merge CompareAndSwap with its V128 variant. * We already got the source, no need to do it again. * Make sure all background translation threads exit. * Fix CompareAndSwap128 Detection criteria was a bit scuffed. * Address Comments.
2020-03-12 03:20:55 +00:00
private static void GenerateTailcall(CodeGenContext context, Operation operation)
{
WriteEpilogue(context);
context.Assembler.Jmp(operation.GetSource(0));
}
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
private static void GenerateVectorCreateScalar(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(!dest.Type.IsInteger() && source.Type.IsInteger());
if (source.Type == OperandType.I32)
{
context.Assembler.Movd(dest, source); // (__m128i _mm_cvtsi32_si128(int a))
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
}
else /* if (source.Type == OperandType.I64) */
{
context.Assembler.Movq(dest, source); // (__m128i _mm_cvtsi64_si128(__int64 a))
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
}
}
private static void GenerateVectorExtract(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination; //Value
Operand src1 = operation.GetSource(0); //Vector
Operand src2 = operation.GetSource(1); //Index
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src2.Kind == OperandKind.Constant);
byte index = src2.AsByte();
if (dest.Type == OperandType.I32)
{
Debug.Assert(index < 4);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pextrd(dest, src1, index);
}
else
{
if (index != 0)
{
int mask0 = 0b11_10_01_00;
int mask1 = 0b11_10_01_00;
mask0 = BitUtils.RotateRight(mask0, index * 2, 8);
mask1 = BitUtils.RotateRight(mask1, 8 - index * 2, 8);
context.Assembler.Pshufd(src1, src1, (byte)mask0);
context.Assembler.Movd (dest, src1);
context.Assembler.Pshufd(src1, src1, (byte)mask1);
}
else
{
context.Assembler.Movd(dest, src1);
}
}
}
else if (dest.Type == OperandType.I64)
{
Debug.Assert(index < 2);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pextrq(dest, src1, index);
}
else
{
if (index != 0)
{
const byte mask = 0b01_00_11_10;
context.Assembler.Pshufd(src1, src1, mask);
context.Assembler.Movq (dest, src1);
context.Assembler.Pshufd(src1, src1, mask);
}
else
{
context.Assembler.Movq(dest, src1);
}
}
}
else
{
Debug.Assert(index < (dest.Type == OperandType.FP32 ? 4 : 2));
// Floating-point types.
if ((index >= 2 && dest.Type == OperandType.FP32) ||
(index == 1 && dest.Type == OperandType.FP64))
{
context.Assembler.Movhlps(dest, dest, src1);
context.Assembler.Movq (dest, dest);
}
else
{
context.Assembler.Movq(dest, src1);
}
if (dest.Type == OperandType.FP32)
{
context.Assembler.Pshufd(dest, dest, (byte)(0xfc | (index & 1)));
}
}
}
private static void GenerateVectorExtract16(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination; //Value
Operand src1 = operation.GetSource(0); //Vector
Operand src2 = operation.GetSource(1); //Index
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src2.Kind == OperandKind.Constant);
byte index = src2.AsByte();
Debug.Assert(index < 8);
context.Assembler.Pextrw(dest, src1, index);
}
private static void GenerateVectorExtract8(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination; //Value
Operand src1 = operation.GetSource(0); //Vector
Operand src2 = operation.GetSource(1); //Index
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src2.Kind == OperandKind.Constant);
byte index = src2.AsByte();
Debug.Assert(index < 16);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pextrb(dest, src1, index);
}
else
{
context.Assembler.Pextrw(dest, src1, (byte)(index >> 1));
if ((index & 1) != 0)
{
context.Assembler.Shr(dest, new Operand(8), OperandType.I32);
}
else
{
context.Assembler.Movzx8(dest, dest, OperandType.I32);
}
}
}
private static void GenerateVectorInsert(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0); //Vector
Operand src2 = operation.GetSource(1); //Value
Operand src3 = operation.GetSource(2); //Index
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src3.Kind == OperandKind.Constant);
byte index = src3.AsByte();
void InsertIntSse2(int words)
{
if (dest.GetRegister() != src1.GetRegister())
{
context.Assembler.Movdqu(dest, src1);
}
for (int word = 0; word < words; word++)
{
// Insert lower 16-bits.
context.Assembler.Pinsrw(dest, dest, src2, (byte)(index * words + word));
// Move next word down.
context.Assembler.Ror(src2, new Operand(16), src2.Type);
}
}
if (src2.Type == OperandType.I32)
{
Debug.Assert(index < 4);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pinsrd(dest, src1, src2, index);
}
else
{
InsertIntSse2(2);
}
}
else if (src2.Type == OperandType.I64)
{
Debug.Assert(index < 2);
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Pinsrq(dest, src1, src2, index);
}
else
{
InsertIntSse2(4);
}
}
else if (src2.Type == OperandType.FP32)
{
Debug.Assert(index < 4);
if (index != 0)
{
if (HardwareCapabilities.SupportsSse41)
{
context.Assembler.Insertps(dest, src1, src2, (byte)(index << 4));
}
else
{
if (src1.GetRegister() == src2.GetRegister())
{
int mask = 0b11_10_01_00;
mask &= ~(0b11 << index * 2);
context.Assembler.Pshufd(dest, src1, (byte)mask);
}
else
{
int mask0 = 0b11_10_01_00;
int mask1 = 0b11_10_01_00;
mask0 = BitUtils.RotateRight(mask0, index * 2, 8);
mask1 = BitUtils.RotateRight(mask1, 8 - index * 2, 8);
context.Assembler.Pshufd(src1, src1, (byte)mask0); // Lane to be inserted in position 0.
context.Assembler.Movss (dest, src1, src2); // dest[127:0] = src1[127:32] | src2[31:0]
context.Assembler.Pshufd(dest, dest, (byte)mask1); // Inserted lane in original position.
if (dest.GetRegister() != src1.GetRegister())
{
context.Assembler.Pshufd(src1, src1, (byte)mask1); // Restore src1.
}
}
}
}
else
{
context.Assembler.Movss(dest, src1, src2);
}
}
else /* if (src2.Type == OperandType.FP64) */
{
Debug.Assert(index < 2);
if (index != 0)
{
context.Assembler.Movlhps(dest, src1, src2);
}
else
{
context.Assembler.Movsd(dest, src1, src2);
}
}
}
private static void GenerateVectorInsert16(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0); //Vector
Operand src2 = operation.GetSource(1); //Value
Operand src3 = operation.GetSource(2); //Index
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src3.Kind == OperandKind.Constant);
byte index = src3.AsByte();
context.Assembler.Pinsrw(dest, src1, src2, index);
}
private static void GenerateVectorInsert8(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0); //Vector
Operand src2 = operation.GetSource(1); //Value
Operand src3 = operation.GetSource(2); //Index
// It's not possible to emulate this instruction without
// SSE 4.1 support without the use of a temporary register,
// so we instead handle that case on the pre-allocator when
// SSE 4.1 is not supported on the CPU.
Debug.Assert(HardwareCapabilities.SupportsSse41);
if (!HardwareCapabilities.SupportsVexEncoding)
{
EnsureSameReg(dest, src1);
}
Debug.Assert(src1.Type == OperandType.V128);
Debug.Assert(src3.Kind == OperandKind.Constant);
byte index = src3.AsByte();
context.Assembler.Pinsrb(dest, src1, src2, index);
}
private static void GenerateVectorOne(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Debug.Assert(!dest.Type.IsInteger());
context.Assembler.Pcmpeqw(dest, dest, dest);
}
private static void GenerateVectorZero(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Debug.Assert(!dest.Type.IsInteger());
context.Assembler.Xorps(dest, dest, dest);
}
private static void GenerateVectorZeroUpper64(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type == OperandType.V128 && source.Type == OperandType.V128);
GenerateZeroUpper64(context, dest, source);
}
private static void GenerateVectorZeroUpper96(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type == OperandType.V128 && source.Type == OperandType.V128);
GenerateZeroUpper96(context, dest, source);
}
private static void GenerateZeroExtend16(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && source.Type.IsInteger());
context.Assembler.Movzx16(dest, source, OperandType.I32);
}
private static void GenerateZeroExtend32(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && source.Type.IsInteger());
context.Assembler.Mov(dest, source, OperandType.I32);
}
private static void GenerateZeroExtend8(CodeGenContext context, Operation operation)
{
Operand dest = operation.Destination;
Operand source = operation.GetSource(0);
Debug.Assert(dest.Type.IsInteger() && source.Type.IsInteger());
context.Assembler.Movzx8(dest, source, OperandType.I32);
}
private static void GenerateLoad(CodeGenContext context, Operand address, Operand value)
{
switch (value.Type)
{
case OperandType.I32: context.Assembler.Mov (value, address, OperandType.I32); break;
case OperandType.I64: context.Assembler.Mov (value, address, OperandType.I64); break;
case OperandType.FP32: context.Assembler.Movd (value, address); break;
case OperandType.FP64: context.Assembler.Movq (value, address); break;
case OperandType.V128: context.Assembler.Movdqu(value, address); break;
default: Debug.Assert(false); break;
}
}
private static void GenerateStore(CodeGenContext context, Operand address, Operand value)
{
switch (value.Type)
{
case OperandType.I32: context.Assembler.Mov (address, value, OperandType.I32); break;
case OperandType.I64: context.Assembler.Mov (address, value, OperandType.I64); break;
case OperandType.FP32: context.Assembler.Movd (address, value); break;
case OperandType.FP64: context.Assembler.Movq (address, value); break;
case OperandType.V128: context.Assembler.Movdqu(address, value); break;
default: Debug.Assert(false); break;
}
}
private static void GenerateZeroUpper64(CodeGenContext context, Operand dest, Operand source)
{
context.Assembler.Movq(dest, source);
}
private static void GenerateZeroUpper96(CodeGenContext context, Operand dest, Operand source)
{
context.Assembler.Movq(dest, source);
context.Assembler.Pshufd(dest, dest, 0xfc);
}
private static void ValidateUnOp(Operand dest, Operand source)
{
#if DEBUG
EnsureSameReg (dest, source);
EnsureSameType(dest, source);
#endif
}
private static void ValidateBinOp(Operand dest, Operand src1, Operand src2)
{
#if DEBUG
EnsureSameReg (dest, src1);
EnsureSameType(dest, src1, src2);
#endif
}
private static void ValidateShift(Operand dest, Operand src1, Operand src2)
{
#if DEBUG
EnsureSameReg (dest, src1);
EnsureSameType(dest, src1);
Debug.Assert(dest.Type.IsInteger() && src2.Type == OperandType.I32);
#endif
}
private static void EnsureSameReg(Operand op1, Operand op2)
{
if (!op1.Type.IsInteger() && HardwareCapabilities.SupportsVexEncoding)
{
return;
}
Debug.Assert(op1.Kind == OperandKind.Register || op1.Kind == OperandKind.Memory);
Debug.Assert(op1.Kind == op2.Kind);
Debug.Assert(op1.Value == op2.Value);
}
private static void EnsureSameType(Operand op1, Operand op2)
{
Debug.Assert(op1.Type == op2.Type);
}
private static void EnsureSameType(Operand op1, Operand op2, Operand op3)
{
Debug.Assert(op1.Type == op2.Type);
Debug.Assert(op1.Type == op3.Type);
}
private static void EnsureSameType(Operand op1, Operand op2, Operand op3, Operand op4)
{
Debug.Assert(op1.Type == op2.Type);
Debug.Assert(op1.Type == op3.Type);
Debug.Assert(op1.Type == op4.Type);
}
private static UnwindInfo WritePrologue(CodeGenContext context)
{
List<UnwindPushEntry> pushEntries = new List<UnwindPushEntry>();
Operand rsp = Register(X86Register.Rsp);
int mask = CallingConvention.GetIntCalleeSavedRegisters() & context.AllocResult.IntUsedRegisters;
while (mask != 0)
{
int bit = BitUtils.LowestBitSet(mask);
context.Assembler.Push(Register((X86Register)bit));
pushEntries.Add(new UnwindPushEntry(bit, RegisterType.Integer, context.StreamOffset));
mask &= ~(1 << bit);
}
int reservedStackSize = context.CallArgsRegionSize + context.AllocResult.SpillRegionSize;
reservedStackSize += context.XmmSaveRegionSize;
if (reservedStackSize >= StackGuardSize)
{
GenerateInlineStackProbe(context, reservedStackSize);
}
if (reservedStackSize != 0)
{
context.Assembler.Sub(rsp, new Operand(reservedStackSize), OperandType.I64);
}
int offset = reservedStackSize;
mask = CallingConvention.GetVecCalleeSavedRegisters() & context.AllocResult.VecUsedRegisters;
while (mask != 0)
{
int bit = BitUtils.LowestBitSet(mask);
offset -= 16;
MemoryOperand memOp = new MemoryOperand(OperandType.V128, rsp, null, Multiplier.x1, offset);
context.Assembler.Movdqu(memOp, Xmm((X86Register)bit));
pushEntries.Add(new UnwindPushEntry(bit, RegisterType.Vector, context.StreamOffset));
mask &= ~(1 << bit);
}
return new UnwindInfo(pushEntries.ToArray(), context.StreamOffset, reservedStackSize);
}
private static void WriteEpilogue(CodeGenContext context)
{
Operand rsp = Register(X86Register.Rsp);
int reservedStackSize = context.CallArgsRegionSize + context.AllocResult.SpillRegionSize;
reservedStackSize += context.XmmSaveRegionSize;
int offset = reservedStackSize;
int mask = CallingConvention.GetVecCalleeSavedRegisters() & context.AllocResult.VecUsedRegisters;
while (mask != 0)
{
int bit = BitUtils.LowestBitSet(mask);
offset -= 16;
MemoryOperand memOp = new MemoryOperand(OperandType.V128, rsp, null, Multiplier.x1, offset);
context.Assembler.Movdqu(Xmm((X86Register)bit), memOp);
mask &= ~(1 << bit);
}
if (reservedStackSize != 0)
{
context.Assembler.Add(rsp, new Operand(reservedStackSize), OperandType.I64);
}
mask = CallingConvention.GetIntCalleeSavedRegisters() & context.AllocResult.IntUsedRegisters;
while (mask != 0)
{
int bit = BitUtils.HighestBitSet(mask);
context.Assembler.Pop(Register((X86Register)bit));
mask &= ~(1 << bit);
}
}
private static void GenerateInlineStackProbe(CodeGenContext context, int size)
{
// Windows does lazy stack allocation, and there are just 2
// guard pages on the end of the stack. So, if the allocation
// size we make is greater than this guard size, we must ensure
// that the OS will map all pages that we'll use. We do that by
// doing a dummy read on those pages, forcing a page fault and
// the OS to map them. If they are already mapped, nothing happens.
const int pageMask = PageSize - 1;
size = (size + pageMask) & ~pageMask;
Operand rsp = Register(X86Register.Rsp);
Operand temp = Register(CallingConvention.GetIntReturnRegister());
for (int offset = PageSize; offset < size; offset += PageSize)
{
Operand memOp = new MemoryOperand(OperandType.I32, rsp, null, Multiplier.x1, -offset);
Add a new JIT compiler for CPU code (#693) * Start of the ARMeilleure project * Refactoring around the old IRAdapter, now renamed to PreAllocator * Optimize the LowestBitSet method * Add CLZ support and fix CLS implementation * Add missing Equals and GetHashCode overrides on some structs, misc small tweaks * Implement the ByteSwap IR instruction, and some refactoring on the assembler * Implement the DivideUI IR instruction and fix 64-bits IDIV * Correct constant operand type on CSINC * Move division instructions implementation to InstEmitDiv * Fix destination type for the ConditionalSelect IR instruction * Implement UMULH and SMULH, with new IR instructions * Fix some issues with shift instructions * Fix constant types for BFM instructions * Fix up new tests using the new V128 struct * Update tests * Move DIV tests to a separate file * Add support for calls, and some instructions that depends on them * Start adding support for SIMD & FP types, along with some of the related ARM instructions * Fix some typos and the divide instruction with FP operands * Fix wrong method call on Clz_V * Implement ARM FP & SIMD move instructions, Saddlv_V, and misc. fixes * Implement SIMD logical instructions and more misc. fixes * Fix PSRAD x86 instruction encoding, TRN, UABD and UABDL implementations * Implement float conversion instruction, merge in LDj3SNuD fixes, and some other misc. fixes * Implement SIMD shift instruction and fix Dup_V * Add SCVTF and UCVTF (vector, fixed-point) variants to the opcode table * Fix check with tolerance on tester * Implement FP & SIMD comparison instructions, and some fixes * Update FCVT (Scalar) encoding on the table to support the Half-float variants * Support passing V128 structs, some cleanup on the register allocator, merge LDj3SNuD fixes * Use old memory access methods, made a start on SIMD memory insts support, some fixes * Fix float constant passed to functions, save and restore non-volatile XMM registers, other fixes * Fix arguments count with struct return values, other fixes * More instructions * Misc. fixes and integrate LDj3SNuD fixes * Update tests * Add a faster linear scan allocator, unwinding support on windows, and other changes * Update Ryujinx.HLE * Update Ryujinx.Graphics * Fix V128 return pointer passing, RCX is clobbered * Update Ryujinx.Tests * Update ITimeZoneService * Stop using GetFunctionPointer as that can't be called from native code, misc. fixes and tweaks * Use generic GetFunctionPointerForDelegate method and other tweaks * Some refactoring on the code generator, assert on invalid operations and use a separate enum for intrinsics * Remove some unused code on the assembler * Fix REX.W prefix regression on float conversion instructions, add some sort of profiler * Add hardware capability detection * Fix regression on Sha1h and revert Fcm** changes * Add SSE2-only paths on vector extract and insert, some refactoring on the pre-allocator * Fix silly mistake introduced on last commit on CpuId * Generate inline stack probes when the stack allocation is too large * Initial support for the System-V ABI * Support multiple destination operands * Fix SSE2 VectorInsert8 path, and other fixes * Change placement of XMM callee save and restore code to match other compilers * Rename Dest to Destination and Inst to Instruction * Fix a regression related to calls and the V128 type * Add an extra space on comments to match code style * Some refactoring * Fix vector insert FP32 SSE2 path * Port over the ARM32 instructions * Avoid memory protection races on JIT Cache * Another fix on VectorInsert FP32 (thanks to LDj3SNuD * Float operands don't need to use the same register when VEX is supported * Add a new register allocator, higher quality code for hot code (tier up), and other tweaks * Some nits, small improvements on the pre allocator * CpuThreadState is gone * Allow changing CPU emulators with a config entry * Add runtime identifiers on the ARMeilleure project * Allow switching between CPUs through a config entry (pt. 2) * Change win10-x64 to win-x64 on projects * Update the Ryujinx project to use ARMeilleure * Ensure that the selected register is valid on the hybrid allocator * Allow exiting on returns to 0 (should fix test regression) * Remove register assignments for most used variables on the hybrid allocator * Do not use fixed registers as spill temp * Add missing namespace and remove unneeded using * Address PR feedback * Fix types, etc * Enable AssumeStrictAbiCompliance by default * Ensure that Spill and Fill don't load or store any more than necessary
2019-08-08 19:56:22 +01:00
context.Assembler.Mov(temp, memOp, OperandType.I32);
}
}
private static MemoryOperand Memory(Operand operand, OperandType type)
{
if (operand.Kind == OperandKind.Memory)
{
return operand as MemoryOperand;
}
return new MemoryOperand(type, operand);
}
private static Operand Register(X86Register register, OperandType type = OperandType.I64)
{
return new Operand((int)register, RegisterType.Integer, type);
}
private static Operand Xmm(X86Register register)
{
return new Operand((int)register, RegisterType.Vector, OperandType.V128);
}
}
}