From d5610eb26cddf649e231f0096b1fdcdf6dbce82f Mon Sep 17 00:00:00 2001 From: FernandoS27 Date: Wed, 28 Dec 2016 21:28:55 +0000 Subject: [PATCH] Implement UHASX, UHSAX, SHASX and SHSAX (#75) --- src/backend_x64/emit_x64.cpp | 82 +++++++++++++++++++ src/frontend/ir/ir_emitter.cpp | 8 ++ src/frontend/ir/ir_emitter.h | 2 + src/frontend/ir/opcodes.inc | 2 + .../translate/translate_arm/parallel.cpp | 36 ++++++-- 5 files changed, 122 insertions(+), 8 deletions(-) diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp index 96db3312..d26ea9fd 100644 --- a/src/backend_x64/emit_x64.cpp +++ b/src/backend_x64/emit_x64.cpp @@ -2012,6 +2012,88 @@ void EmitX64::EmitPackedHalvingSubS16(IR::Block&, IR::Inst* inst) { code->xor(minuend, carry); } +void EmitX64::EmitPackedHalvingSubAddU16(IR::Block&, IR::Inst* inst) { /* Emits x64 code for the PackedHalvingSubAddU16 IR instruction. Args 0 and 1 are each split into two zero-extended 16-bit halves; one halved sum and one halved difference are formed from crossed halves (a.lo with b.hi, a.hi with b.lo) and repacked into reg_a_hi, the register obtained from UseDefGpr(a, inst) (presumably also where the result of inst lives; confirm against RegAlloc). Arg 2 is read with GetU1() while emitting, so the asx branch below is resolved at emit time and only one of the two sequences ends up in the generated code. */ + IR::Value a = inst->GetArg(0); + IR::Value b = inst->GetArg(1); + + // If asx is true, the high word contains the sum and the low word the difference. + // If false, the high word contains the difference and the low word the sum. + bool asx = inst->GetArg(2).GetU1(); + + Xbyak::Reg32 reg_a_hi = reg_alloc.UseDefGpr(a, inst).cvt32(); + Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(b).cvt32(); + Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32(); + Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32(); + + code->movzx(reg_a_lo, reg_a_hi.cvt16()); + code->movzx(reg_b_lo, reg_b_hi.cvt16()); + code->shr(reg_a_hi, 16); + code->shr(reg_b_hi, 16); /* From here on reg_a_lo and reg_b_lo hold bits 15:0, and reg_a_hi and reg_b_hi hold bits 31:16, of their operand, each zero-extended to 32 bits. */ + + if (asx) { + // Calculate diff such that reg_a_lo<31:16> contains diff<16:1>. + code->sub(reg_a_lo, reg_b_hi); + code->shl(reg_a_lo, 15); + + // Calculate sum such that reg_a_hi<15:0> contains sum<16:1>. + code->add(reg_a_hi, reg_b_lo); + code->shr(reg_a_hi, 1); + } else { + // Calculate sum such that reg_a_lo<31:16> contains sum<16:1>. + code->add(reg_a_lo, reg_b_hi); + code->shl(reg_a_lo, 15); + + // Calculate diff such that reg_a_hi<15:0> contains diff<16:1>. 
+ code->sub(reg_a_hi, reg_b_lo); + code->shr(reg_a_hi, 1); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. + code->shld(reg_a_hi, reg_a_lo, 16); /* shld shifts reg_a_hi left by 16 and fills the vacated low bits with the top 16 bits of reg_a_lo. */ +} + +void EmitX64::EmitPackedHalvingSubAddS16(IR::Block&, IR::Inst* inst) { /* Emits x64 code for the PackedHalvingSubAddS16 IR instruction. Same structure as EmitPackedHalvingSubAddU16, except that the 16-bit halves are sign-extended (movsx / sar) instead of zero-extended before the crossed add and subtract. The packed result is left in reg_a_hi. */ + IR::Value a = inst->GetArg(0); + IR::Value b = inst->GetArg(1); + + // If asx is true, the high word contains the sum and the low word the difference. + // If false, the high word contains the difference and the low word the sum. + bool asx = inst->GetArg(2).GetU1(); + + Xbyak::Reg32 reg_a_hi = reg_alloc.UseDefGpr(a, inst).cvt32(); + Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(b).cvt32(); + Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32(); + Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32(); + + code->movsx(reg_a_lo, reg_a_hi.cvt16()); + code->movsx(reg_b_lo, reg_b_hi.cvt16()); + code->sar(reg_a_hi, 16); + code->sar(reg_b_hi, 16); /* From here on reg_a_lo and reg_b_lo hold bits 15:0, and reg_a_hi and reg_b_hi hold bits 31:16, of their operand, each sign-extended to 32 bits. */ + + if (asx) { + // Calculate diff such that reg_a_lo<31:16> contains diff<16:1>. + code->sub(reg_a_lo, reg_b_hi); + code->shl(reg_a_lo, 15); + + // Calculate sum such that reg_a_hi<15:0> contains sum<16:1>. + code->add(reg_a_hi, reg_b_lo); + code->shr(reg_a_hi, 1); /* A logical shift is sufficient even for the signed result: the final shld by 16 shifts bits 31:16 of reg_a_hi out, so only bits 15:0 survive. */ + } else { + // Calculate sum such that reg_a_lo<31:16> contains sum<16:1>. + code->add(reg_a_lo, reg_b_hi); + code->shl(reg_a_lo, 15); + + // Calculate diff such that reg_a_hi<15:0> contains diff<16:1>. + code->sub(reg_a_hi, reg_b_lo); + code->shr(reg_a_hi, 1); + } + + // reg_a_lo now contains the low word and reg_a_hi now contains the high word. + // Merge them. 
+ code->shld(reg_a_hi, reg_a_lo, 16); /* shld shifts reg_a_hi left by 16 and fills the vacated low bits with the top 16 bits of reg_a_lo. */ +} + static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { IR::Value a = inst->GetArg(0); IR::Value b = inst->GetArg(1); diff --git a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 562c96b7..77010eeb 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -442,6 +442,14 @@ Value IREmitter::PackedHalvingSubS16(const Value& a, const Value& b) { return Inst(Opcode::PackedHalvingSubS16, {a, b}); } +Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx) { /* Creates a PackedHalvingSubAddU16 instruction via Inst() with operands a and b; asx is wrapped by Imm1 and passed as the third argument, which the x64 emitter reads back with GetArg(2).GetU1(). */ + return Inst(Opcode::PackedHalvingSubAddU16, {a, b, Imm1(asx)}); +} + +Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx) { /* Creates a PackedHalvingSubAddS16 instruction via Inst() with operands a and b; asx is wrapped by Imm1 and passed as the third argument. */ + return Inst(Opcode::PackedHalvingSubAddS16, {a, b, Imm1(asx)}); +} + Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) { return Inst(Opcode::PackedSaturatedAddU8, {a, b}); } diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index e229b537..f1928955 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -157,6 +157,8 @@ public: Value PackedHalvingAddS16(const Value& a, const Value& b); Value PackedHalvingSubU16(const Value& a, const Value& b); Value PackedHalvingSubS16(const Value& a, const Value& b); + Value PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx); + Value PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx); Value PackedSaturatedAddU8(const Value& a, const Value& b); Value PackedSaturatedAddS8(const Value& a, const Value& b); Value PackedSaturatedSubU8(const Value& a, const Value& b); diff --git a/src/frontend/ir/opcodes.inc b/src/frontend/ir/opcodes.inc index 3afa58b3..5d8e8bc7 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -99,6 +99,8 @@ OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 
OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32, T::U1 ) /* The trailing T::U1 entry corresponds to the asx flag that IREmitter passes as Imm1(asx). */ +OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32, T::U1 ) /* The trailing T::U1 entry corresponds to the asx flag that IREmitter passes as Imm1(asx). */ OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 ) diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp index 7e5b1ebc..9debd6d0 100644 --- a/src/frontend/translate/translate_arm/parallel.cpp +++ b/src/frontend/translate/translate_arm/parallel.cpp @@ -258,13 +258,23 @@ bool ArmTranslatorVisitor::arm_SHADD16(Cond cond, Reg n, Reg d, Reg m) { } bool ArmTranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) /* SHASX is now translated to IR instead of being handed to the interpreter; PC in any of d, n, m is rejected as unpredictable. */ + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), true); /* asx = true: signed variant with the sum in the high half and the difference in the low half. */ + ir.SetRegister(d, result); + } + return true; } bool ArmTranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) /* SHSAX is now translated to IR instead of being handed to the interpreter; PC in any of d, n, m is rejected as unpredictable. */ + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), false); /* asx = false: signed variant with the difference in the high half and the sum in the low half. */ + ir.SetRegister(d, result); + } + return true; } bool ArmTranslatorVisitor::arm_SHSUB8(Cond cond, Reg n, Reg d, Reg m) { @@ -308,13 +318,23 @@ bool ArmTranslatorVisitor::arm_UHADD16(Cond cond, Reg n, Reg d, Reg m) { } bool ArmTranslatorVisitor::arm_UHASX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) /* UHASX is now translated to IR instead of being handed to the interpreter; PC in any of d, n, m is rejected as unpredictable. */ + return UnpredictableInstruction(); + if 
(ConditionPassed(cond)) { + auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), true); /* asx = true: unsigned variant with the sum in the high half and the difference in the low half. */ + ir.SetRegister(d, result); + } + return true; } bool ArmTranslatorVisitor::arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) /* UHSAX is now translated to IR instead of being handed to the interpreter; PC in any of d, n, m is rejected as unpredictable. */ + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), false); /* asx = false: unsigned variant with the difference in the high half and the sum in the low half. */ + ir.SetRegister(d, result); + } + return true; } bool ArmTranslatorVisitor::arm_UHSUB8(Cond cond, Reg n, Reg d, Reg m) {