diff --git a/libraries/config/arch/arm64/arch.mk b/libraries/config/arch/arm64/arch.mk
index 191b2fe9f..e3162d2dd 100644
--- a/libraries/config/arch/arm64/arch.mk
+++ b/libraries/config/arch/arm64/arch.mk
@@ -5,7 +5,7 @@ endif
 include $(DEVKITPRO)/devkitA64/base_rules
 
 export ATMOSPHERE_DEFINES += -DATMOSPHERE_ARCH_ARM64
-export ATMOSPHERE_SETTINGS += -march=armv8-a -mtp=soft
+export ATMOSPHERE_SETTINGS += -march=armv8-a+crc+crypto -mtp=soft
 export ATMOSPHERE_CFLAGS +=
 export ATMOSPHERE_CXXFLAGS +=
 export ATMOSPHERE_ASFLAGS +=
diff --git a/libraries/libvapours/include/vapours/crypto.hpp b/libraries/libvapours/include/vapours/crypto.hpp
index 45e0df55d..6ef2ed14d 100644
--- a/libraries/libvapours/include/vapours/crypto.hpp
+++ b/libraries/libvapours/include/vapours/crypto.hpp
@@ -21,6 +21,10 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_ctr_encryptor_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_ctr_encryptor_decryptor.hpp
new file mode 100644
index 000000000..1363cfceb
--- /dev/null
+++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_ctr_encryptor_decryptor.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ams::crypto {
+
+    namespace impl {
+
+        template<template<typename> typename _CtrImpl, typename _AesImpl>
+        class AesCtrCryptor {
+            NON_COPYABLE(AesCtrCryptor);
+            NON_MOVEABLE(AesCtrCryptor);
+            private:
+                using AesImpl = _AesImpl;
+                using CtrImpl = _CtrImpl<AesImpl>;
+            public:
+                static constexpr size_t KeySize = AesImpl::KeySize;
+                static constexpr size_t BlockSize = CtrImpl::BlockSize;
+                static constexpr size_t IvSize = CtrImpl::BlockSize;
+            private:
+                AesImpl aes_impl;
+                CtrImpl ctr_impl;
+            public:
+                AesCtrCryptor() { /* ...
*/ } + + void Initialize(const void *key, size_t key_size, const void *iv, size_t iv_size) { + this->Initialize(key, key_size, iv, iv_size, 0); + } + + void Initialize(const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset) { + AMS_ASSERT(key_size == KeySize); + AMS_ASSERT(iv_size == IvSize); + AMS_ASSERT(offset >= 0); + + this->aes_impl.Initialize(key, key_size); + this->ctr_impl.Initialize(std::addressof(this->aes_impl), iv, iv_size, offset); + } + + void SwitchMessage(const void *iv, size_t iv_size) { + return this->ctr_impl.SwitchMessage(iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->ctr_impl.Update(dst, dst_size, src, src_size); + } + }; + + } + + using Aes128CtrEncryptor = impl::AesCtrCryptor; + using Aes192CtrEncryptor = impl::AesCtrCryptor; + using Aes256CtrEncryptor = impl::AesCtrCryptor; + + using Aes128CtrDecryptor = impl::AesCtrCryptor; + using Aes192CtrDecryptor = impl::AesCtrCryptor; + using Aes256CtrDecryptor = impl::AesCtrCryptor; + + size_t EncryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t EncryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t EncryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + + size_t DecryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t DecryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t DecryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + + size_t EncryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t EncryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t EncryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + + size_t DecryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t DecryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t DecryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_decryptor.hpp new file mode 100644 index 000000000..2f1dd4a1d --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_decryptor.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 
2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + template + class AesDecryptor { + NON_COPYABLE(AesDecryptor); + NON_MOVEABLE(AesDecryptor); + private: + using Impl = impl::AesImpl<_KeySize>; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t RoundKeySize = Impl::RoundKeySize; + private: + Impl impl; + public: + AesDecryptor() { /* ... */ } + + void Initialize(const void *key, size_t key_size) { + this->impl.Initialize(key, key_size, false); + } + + void DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + return this->impl.DecryptBlock(dst, dst_size, src, src_size); + } + + const u8 *GetRoundKey() const { + return this->impl.GetRoundKey(); + } + }; + + using AesDecryptor128 = AesDecryptor<16>; + using AesDecryptor192 = AesDecryptor<24>; + using AesDecryptor256 = AesDecryptor<32>; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_encryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_encryptor.hpp new file mode 100644 index 000000000..116ab2dae --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_encryptor.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + template + class AesEncryptor { + NON_COPYABLE(AesEncryptor); + NON_MOVEABLE(AesEncryptor); + private: + using Impl = impl::AesImpl<_KeySize>; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t RoundKeySize = Impl::RoundKeySize; + private: + Impl impl; + public: + AesEncryptor() { /* ... 
*/ } + + void Initialize(const void *key, size_t key_size) { + this->impl.Initialize(key, key_size, true); + } + + void EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + return this->impl.EncryptBlock(dst, dst_size, src, src_size); + } + + const u8 *GetRoundKey() const { + return this->impl.GetRoundKey(); + } + }; + + using AesEncryptor128 = AesEncryptor<16>; + using AesEncryptor192 = AesEncryptor<24>; + using AesEncryptor256 = AesEncryptor<32>; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_xts_encryptor_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_xts_encryptor_decryptor.hpp new file mode 100644 index 000000000..c99660945 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_xts_encryptor_decryptor.hpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace ams::crypto { + + namespace impl { + + template typename _XtsImpl, typename _AesImpl1, typename _AesImpl2> + class AesXtsCryptor { + NON_COPYABLE(AesXtsCryptor); + NON_MOVEABLE(AesXtsCryptor); + private: + using AesImpl1 = _AesImpl1; + using AesImpl2 = _AesImpl2; + using XtsImpl = _XtsImpl; + public: + static constexpr size_t KeySize = AesImpl1::KeySize; + static constexpr size_t BlockSize = AesImpl1::BlockSize; + static constexpr size_t IvSize = AesImpl1::BlockSize; + + static_assert(AesImpl1::KeySize == AesImpl2::KeySize); + static_assert(AesImpl1::BlockSize == AesImpl2::BlockSize); + private: + AesImpl1 aes_impl_1; + AesImpl2 aes_impl_2; + XtsImpl xts_impl; + public: + AesXtsCryptor() { /* ... 
*/ } + + void Initialize(const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size) { + AMS_ASSERT(key_size == KeySize); + AMS_ASSERT(iv_size == IvSize); + + this->aes_impl_1.Initialize(key1, key_size); + this->aes_impl_2.Initialize(key2, key_size); + this->xts_impl.Initialize(std::addressof(this->aes_impl_1), std::addressof(this->aes_impl_2), iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->xts_impl.Update(dst, dst_size, src, src_size); + } + + size_t Finalize(void *dst, size_t dst_size) { + return this->xts_impl.Finalize(dst, dst_size); + } + }; + + } + + using Aes128XtsEncryptor = impl::AesXtsCryptor; + using Aes192XtsEncryptor = impl::AesXtsCryptor; + using Aes256XtsEncryptor = impl::AesXtsCryptor; + + using Aes128XtsDecryptor = impl::AesXtsCryptor; + using Aes192XtsDecryptor = impl::AesXtsCryptor; + using Aes256XtsDecryptor = impl::AesXtsCryptor; + + inline size_t EncryptAes128Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes128XtsEncryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t EncryptAes192Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes192XtsEncryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t EncryptAes256Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes256XtsEncryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t DecryptAes128Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes128XtsDecryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t DecryptAes192Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes192XtsDecryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += 
xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t DecryptAes256Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes256XtsDecryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_ctr_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_ctr_decryptor.hpp new file mode 100644 index 000000000..242233186 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_ctr_decryptor.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class CtrDecryptor { + NON_COPYABLE(CtrDecryptor); + NON_MOVEABLE(CtrDecryptor); + private: + using Impl = impl::CtrModeImpl; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + CtrDecryptor() { /* ... */ } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size) { + this->impl.Initialize(cipher, iv, iv_size); + } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size, s64 offset) { + this->impl.Initialize(cipher, iv, iv_size, offset); + } + + void SwitchMessage(const void *iv, size_t iv_size) { + this->impl.SwitchMessage(iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.Update(dst, dst_size, src, src_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_ctr_encryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_ctr_encryptor.hpp new file mode 100644 index 000000000..ab5661739 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_ctr_encryptor.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class CtrEncryptor { + NON_COPYABLE(CtrEncryptor); + NON_MOVEABLE(CtrEncryptor); + private: + using Impl = impl::CtrModeImpl; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + CtrEncryptor() { /* ... */ } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size) { + this->impl.Initialize(cipher, iv, iv_size); + } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size, s64 offset) { + this->impl.Initialize(cipher, iv, iv_size, offset); + } + + void SwitchMessage(const void *iv, size_t iv_size) { + this->impl.SwitchMessage(iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.Update(dst, dst_size, src, src_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_xts_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_xts_decryptor.hpp new file mode 100644 index 000000000..f51f8fb4d --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_xts_decryptor.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class XtsDecryptor { + NON_COPYABLE(XtsDecryptor); + NON_MOVEABLE(XtsDecryptor); + private: + using Impl = impl::XtsModeImpl; + public: + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + XtsDecryptor() { /* ... */ } + + template + void Initialize(const BlockCipher *cipher1, const BlockCipher2 *cipher2, const void *iv, size_t iv_size) { + this->impl.InitializeDecryption(cipher1, cipher2, iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.template Update(dst, dst_size, src, src_size); + } + + size_t Finalize(void *dst, size_t dst_size) { + return this->impl.FinalizeDecryption(dst, dst_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_xts_encryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_xts_encryptor.hpp new file mode 100644 index 000000000..7d889e4e3 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_xts_encryptor.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class XtsEncryptor { + NON_COPYABLE(XtsEncryptor); + NON_MOVEABLE(XtsEncryptor); + private: + using Impl = impl::XtsModeImpl; + public: + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + XtsEncryptor() { /* ... */ } + + template + void Initialize(const BlockCipher *cipher1, const BlockCipher2 *cipher2, const void *iv, size_t iv_size) { + this->impl.InitializeEncryption(cipher1, cipher2, iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.template Update(dst, dst_size, src, src_size); + } + + size_t Finalize(void *dst, size_t dst_size) { + return this->impl.FinalizeEncryption(dst, dst_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_aes_impl.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_aes_impl.hpp new file mode 100644 index 000000000..d99b14774 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_aes_impl.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#pragma once +#include +#include +#include + + +namespace ams::crypto::impl { + + template + class AesImpl { + public: + static constexpr size_t KeySize = _KeySize; + static constexpr size_t BlockSize = 16; + static constexpr s32 RoundCount = (KeySize / 4) + 6; + static constexpr size_t RoundKeySize = BlockSize * (RoundCount + 1); + private: + u32 round_keys[RoundKeySize / sizeof(u32)]; + public: + ~AesImpl(); + + void Initialize(const void *key, size_t key_size, bool is_encrypt); + void EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const; + void DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const; + const u8 *GetRoundKey() const { + return reinterpret_cast(this->round_keys); + } + }; + + /* static_assert(HashFunction); */ + +} diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_ctr_mode_impl.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_ctr_mode_impl.hpp new file mode 100644 index 000000000..b588bc2c3 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_ctr_mode_impl.hpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include +#include + +namespace ams::crypto::impl { + + template + class CtrModeImpl { + NON_COPYABLE(CtrModeImpl); + NON_MOVEABLE(CtrModeImpl); + public: + static constexpr size_t KeySize = BlockCipher::KeySize; + static constexpr size_t BlockSize = BlockCipher::BlockSize; + static constexpr size_t IvSize = BlockCipher::BlockSize; + private: + enum State { + State_None, + State_Initialized, + }; + private: + const BlockCipher *block_cipher; + u8 counter[IvSize]; + u8 encrypted_counter[BlockSize]; + size_t buffer_offset; + State state; + public: + CtrModeImpl() : state(State_None) { /* ... 
*/ } + + ~CtrModeImpl() { + ClearMemory(this, sizeof(*this)); + } + + void Initialize(const BlockCipher *block_cipher, const void *iv, size_t iv_size) { + this->Initialize(block_cipher, iv, iv_size, 0); + } + + void Initialize(const BlockCipher *block_cipher, const void *iv, size_t iv_size, s64 offset) { + AMS_ASSERT(iv_size == IvSize); + AMS_ASSERT(offset >= 0); + + this->block_cipher = block_cipher; + this->state = State_Initialized; + + this->SwitchMessage(iv, iv_size); + + if (offset >= 0) { + u64 ctr_offset = offset / BlockSize; + if (ctr_offset > 0) { + this->IncrementCounter(ctr_offset); + } + + if (size_t remaining = static_cast(offset % BlockSize); remaining != 0) { + this->block_cipher->EncryptBlock(this->encrypted_counter, sizeof(this->encrypted_counter), this->counter, sizeof(this->counter)); + this->IncrementCounter(); + + this->buffer_offset = remaining; + } + } + } + + void SwitchMessage(const void *iv, size_t iv_size) { + AMS_ASSERT(this->state == State_Initialized); + AMS_ASSERT(iv_size == IvSize); + + std::memcpy(this->counter, iv, iv_size); + this->buffer_offset = 0; + } + + void IncrementCounter() { + for (s32 i = IvSize - 1; i >= 0; --i) { + if (++this->counter[i] != 0) { + break; + } + } + } + + size_t Update(void *_dst, size_t dst_size, const void *_src, size_t src_size) { + AMS_ASSERT(this->state == State_Initialized); + AMS_ASSERT(dst_size >= src_size); + + u8 *dst = static_cast(_dst); + const u8 *src = static_cast(_src); + size_t remaining = src_size; + + if (this->buffer_offset > 0) { + const size_t xor_size = std::min(BlockSize - this->buffer_offset, remaining); + + const u8 *ctr = this->encrypted_counter + this->buffer_offset; + for (size_t i = 0; i < xor_size; i++) { + dst[i] = src[i] ^ ctr[i]; + } + + src += xor_size; + dst += xor_size; + remaining -= xor_size; + this->buffer_offset += xor_size; + + if (this->buffer_offset == BlockSize) { + this->buffer_offset = 0; + } + } + + if (remaining >= BlockSize) { + const size_t num_blocks = remaining / BlockSize; + + this->ProcessBlocks(dst, src, num_blocks); + + const size_t processed_size = num_blocks * BlockSize; + dst += processed_size; + src += processed_size; + remaining -= processed_size; + } + + if (remaining > 0) { + this->ProcessBlock(dst, src, remaining); + this->buffer_offset = remaining; + } + + return src_size; + } + private: + void IncrementCounter(u64 count) { + u64 _block[IvSize / sizeof(u64)] = {}; + util::StoreBigEndian(std::addressof(_block[(IvSize / sizeof(u64)) - 1]), count); + + u16 acc; + const u8 *block = reinterpret_cast(_block); + for (s32 i = IvSize - 1; i >= 0; --i) { + acc += (this->counter[i] + block[i]); + this->counter[i] = acc & 0xFF; + acc >>= 8; + } + } + + void ProcessBlock(u8 *dst, const u8 *src, size_t src_size) { + this->block_cipher->EncryptBlock(this->encrypted_counter, BlockSize, this->counter, IvSize); + this->IncrementCounter(); + + for (size_t i = 0; i < src_size; i++) { + dst[i] = src[i] ^ this->encrypted_counter[i]; + } + } + + void ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); + }; + + template + inline void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + while (num_blocks--) { + this->ProcessBlock(dst, src, BlockSize); + dst += BlockSize; + src += BlockSize; + } + } + + template<> void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); + template<> void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); + template<> void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); 
+ +} diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_xts_mode_impl.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_xts_mode_impl.hpp new file mode 100644 index 000000000..4129e5595 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_xts_mode_impl.hpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include +#include + +namespace ams::crypto::impl { + + class XtsModeImpl { + NON_COPYABLE(XtsModeImpl); + NON_MOVEABLE(XtsModeImpl); + public: + /* TODO: More generic support. */ + static constexpr size_t BlockSize = 16; + static constexpr size_t IvSize = 16; + private: + enum State { + State_None, + State_Initialized, + State_Processing, + State_Done + }; + private: + u8 buffer[BlockSize]; + u8 tweak[BlockSize]; + u8 last_block[BlockSize]; + size_t num_buffered; + const void *cipher_ctx; + void (*cipher_func)(void *dst_block, const void *src_block, const void *cipher_ctx); + State state; + public: + XtsModeImpl() : num_buffered(0), state(State_None) { /* ... */ } + + ~XtsModeImpl() { + ClearMemory(this, sizeof(*this)); + } + private: + template + static void EncryptBlockCallback(void *dst_block, const void *src_block, const void *cipher) { + return static_cast(cipher)->EncryptBlock(dst_block, BlockCipher::BlockSize, src_block, BlockCipher::BlockSize); + } + + template + static void DecryptBlockCallback(void *dst_block, const void *src_block, const void *cipher) { + return static_cast(cipher)->DecryptBlock(dst_block, BlockCipher::BlockSize, src_block, BlockCipher::BlockSize); + } + + template + void Initialize(const BlockCipher *cipher, const void *tweak, size_t tweak_size) { + AMS_ASSERT(tweak_size == IvSize); + + cipher->EncryptBlock(this->tweak, IvSize, tweak, IvSize); + + this->num_buffered = 0; + this->state = State_Initialized; + } + + void ProcessBlock(u8 *dst, const u8 *src); + public: + template + void InitializeEncryption(const BlockCipher1 *cipher1, const BlockCipher2 *cipher2, const void *tweak, size_t tweak_size) { + static_assert(BlockCipher1::BlockSize == BlockSize); + static_assert(BlockCipher2::BlockSize == BlockSize); + + this->cipher_ctx = cipher1; + this->cipher_func = EncryptBlockCallback; + + this->Initialize(cipher2, tweak, tweak_size); + } + + template + void InitializeDecryption(const BlockCipher1 *cipher1, const BlockCipher2 *cipher2, const void *tweak, size_t tweak_size) { + static_assert(BlockCipher1::BlockSize == BlockSize); + static_assert(BlockCipher2::BlockSize == BlockSize); + + this->cipher_ctx = cipher1; + this->cipher_func = DecryptBlockCallback; + + this->Initialize(cipher2, tweak, tweak_size); + } + + template + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->UpdateGeneric(dst, dst_size, src, src_size); + } + + template + size_t ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + return 
this->ProcessBlocksGeneric(dst, src, num_blocks); + } + + size_t GetBufferedDataSize() const { + return this->num_buffered; + } + + constexpr size_t GetBlockSize() const { + return BlockSize; + } + + size_t FinalizeEncryption(void *dst, size_t dst_size); + size_t FinalizeDecryption(void *dst, size_t dst_size); + + size_t UpdateGeneric(void *dst, size_t dst_size, const void *src, size_t src_size); + size_t ProcessBlocksGeneric(u8 *dst, const u8 *src, size_t num_blocks); + + size_t ProcessPartialData(u8 *dst, const u8 *src, size_t size); + size_t ProcessRemainingData(u8 *dst, const u8 *src, size_t size); + }; + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + +} diff --git a/libraries/libvapours/source/crypto/crypto_aes_ctr_encryptor_decryptor.cpp b/libraries/libvapours/source/crypto/crypto_aes_ctr_encryptor_decryptor.cpp new file mode 100644 index 000000000..438dd2891 --- /dev/null +++ b/libraries/libvapours/source/crypto/crypto_aes_ctr_encryptor_decryptor.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include + +namespace ams::crypto { + + size_t EncryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes128CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes192CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes256CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes128CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes192CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes256CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes128CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes192CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes256CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes128CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes192CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes256CtrDecryptor aes; + aes.Initialize(key, key_size, iv, 
iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + +} diff --git a/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp new file mode 100644 index 000000000..8c7465bed --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include + +namespace ams::crypto::impl { + +#ifdef ATMOSPHERE_IS_STRATOSPHERE + + namespace { + + constexpr bool IsSupportedKeySize(size_t size) { + return size == 16 || size == 24 || size == 32; + } + + } + + template + AesImpl::~AesImpl() { + ClearMemory(this, sizeof(*this)); + } + + template + void AesImpl::Initialize(const void *key, size_t key_size, bool is_encrypt) { + static_assert(IsSupportedKeySize(KeySize)); + AMS_ASSERT(key_size == KeySize); + + if constexpr (KeySize == 16) { + /* Aes 128. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes128Context)); + aes128ContextCreate(reinterpret_cast(this->round_keys), key, is_encrypt); + } else if constexpr (KeySize == 24) { + /* Aes 192. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes192Context)); + aes192ContextCreate(reinterpret_cast(this->round_keys), key, is_encrypt); + } else if constexpr (KeySize == 32) { + /* Aes 256. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes256Context)); + aes256ContextCreate(reinterpret_cast(this->round_keys), key, is_encrypt); + } else { + /* Invalid key size. */ + static_assert(!std::is_same, AesImpl>::value); + } + } + + template + void AesImpl::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + static_assert(IsSupportedKeySize(KeySize)); + AMS_ASSERT(src_size >= BlockSize); + AMS_ASSERT(dst_size >= BlockSize); + + if constexpr (KeySize == 16) { + /* Aes 128. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes128Context)); + aes128EncryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 24) { + /* Aes 192. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes192Context)); + aes192EncryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 32) { + /* Aes 256. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes256Context)); + aes256EncryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else { + /* Invalid key size. */ + static_assert(!std::is_same, AesImpl>::value); + } + } + + template + void AesImpl::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + static_assert(IsSupportedKeySize(KeySize)); + AMS_ASSERT(src_size >= BlockSize); + AMS_ASSERT(dst_size >= BlockSize); + + if constexpr (KeySize == 16) { + /* Aes 128. 
*/ + static_assert(sizeof(this->round_keys) == sizeof(::Aes128Context)); + aes128DecryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 24) { + /* Aes 192. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes192Context)); + aes192DecryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 32) { + /* Aes 256. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes256Context)); + aes256DecryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else { + /* Invalid key size. */ + static_assert(!std::is_same, AesImpl>::value); + } + } + + + /* Explicitly instantiate the three supported key sizes. */ + template class AesImpl<16>; + template class AesImpl<24>; + template class AesImpl<32>; + +#else + + /* TODO: Non-EL0 implementation. */ + +#endif + +} \ No newline at end of file diff --git a/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp b/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp new file mode 100644 index 000000000..5d32196a6 --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include + +namespace ams::crypto::impl { + +#ifdef ATMOSPHERE_IS_STRATOSPHERE + + /* Variable management macros. */ + #define DECLARE_ROUND_KEY_VAR(n) \ + const uint8x16_t round_key_##n = vld1q_u8(keys + (BlockSize * n)) + + #define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \ + [tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2) + + #define AES_ENC_DEC_OUTPUT_THREE_CTRS() \ + [ctr0]"+w"(ctr0), [ctr1]"+w"(ctr1), [ctr2]"+w"(ctr2) + + #define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \ + [tmp0]"+w"(tmp0) + + #define AES_ENC_DEC_OUTPUT_ONE_CTR() \ + [ctr0]"+w"(ctr0) + + #define CTR_INCREMENT_OUTPUT_HIGH_LOW() \ + [high]"=&r"(high), [low]"=&r"(low) + + #define CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() \ + [high_tmp]"=&r"(high_tmp), [low_tmp]"=&r"(low_tmp) + + #define CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() \ + [hl_tmp]"=&r"(hl_tmp) + + #define AES_ENC_DEC_INPUT_ROUND_KEY(n) \ + [round_key_##n]"w"(round_key_##n) + + /* AES Encryption macros. */ + #define AES_ENC_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ + "aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + + #define AES_ENC_SECOND_LAST_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + #define AES_ENC_LAST_ROUND(n, i) \ + "eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + namespace { + + ALWAYS_INLINE uint8x16_t IncrementCounterOptimized(const uint8x16_t ctr) { + uint8x16_t inc; + uint64_t high, low; + /* Use ASM. TODO: Better than using intrinsics? 
*/ + __asm__ __volatile__ ( + "mov %[high], %[ctr].d[0]\n" + "mov %[low], %[ctr].d[1]\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "adds %[low], %[low], 1\n" + "cinc %[high], %[high], cs\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "mov %[inc].d[0], %[high]\n" + "mov %[inc].d[1], %[low]\n" + : [inc]"=w"(inc), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : [ctr]"w"(ctr) + : "cc" + ); + return inc; + } + + } + + template<> + void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = this->block_cipher->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t ctr0 = vld1q_u8(this->counter); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = IncrementCounterOptimized(ctr0); + uint8x16_t ctr2 = IncrementCounterOptimized(ctr1); + uint64_t high_tmp, low_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2) + AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + 
AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp1); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp2); + dst += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_SECOND_LAST_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_LAST_ROUND(10, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(this->counter, ctr0); + } + + template<> + void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = this->block_cipher->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t ctr0 = vld1q_u8(this->counter); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = IncrementCounterOptimized(ctr0); + uint8x16_t ctr2 = IncrementCounterOptimized(ctr1); + uint64_t high_tmp, low_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2) + AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp1); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp2); + dst += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_ROUND(10, 0) + AES_ENC_SECOND_LAST_ROUND(11, 0) + AES_ENC_LAST_ROUND(12, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(this->counter, ctr0); + } + + template<> + void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = this->block_cipher->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t ctr0 = vld1q_u8(this->counter); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = IncrementCounterOptimized(ctr0); + uint8x16_t ctr2 = IncrementCounterOptimized(ctr1); + uint64_t hl_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + /* Note: ASM here only uses one temporary u64 instead of two, due to 30 operand limit. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "mov %[ctr0].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(2, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "mov %[ctr1].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(4, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "mov %[ctr2].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(6, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2) + AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2) + AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2) + AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp1); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp2); + dst += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_ROUND(10, 0) + AES_ENC_ROUND(11, 0) + AES_ENC_ROUND(12, 0) + AES_ENC_SECOND_LAST_ROUND(13, 0) + AES_ENC_LAST_ROUND(14, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(this->counter, ctr0); + } + +#else + + /* TODO: Non-EL0 implementation. */ + +#endif + +} \ No newline at end of file diff --git a/libraries/libvapours/source/crypto/impl/crypto_update_impl.hpp b/libraries/libvapours/source/crypto/impl/crypto_update_impl.hpp new file mode 100644 index 000000000..d74abb18b --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_update_impl.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#pragma once +#include + +namespace ams::crypto::impl { + + template + void UpdateImpl(Self *self, const void *src, size_t src_size) { + const size_t BlockSize = self->GetBlockSize(); + + const u8 *src_u8 = static_cast(src); + size_t remaining = src_size; + + if (const size_t buffered = self->GetBufferedDataSize(); buffered > 0) { + const size_t partial = std::min(BlockSize - buffered, remaining); + + self->ProcessPartialData(src_u8, partial); + src_u8 += partial; + remaining -= partial; + } + + if (remaining >= BlockSize) { + const size_t num_blocks = remaining / BlockSize; + + self->template ProcessBlocks(src_u8, num_blocks); + + const size_t processed = num_blocks * BlockSize; + src_u8 += processed; + remaining -= processed; + } + + if (remaining > 0) { + self->ProcessRemainingData(src_u8, remaining); + } + } + + template + size_t UpdateImpl(Self *self, void *dst, size_t dst_size, const void *src, size_t src_size) { + const size_t BlockSize = self->GetBlockSize(); + + const u8 *src_u8 = static_cast(src); + u8 *dst_u8 = static_cast(dst); + size_t remaining = src_size; + size_t total_processed = 0; + + if (const size_t buffered = self->GetBufferedDataSize(); buffered > 0) { + const size_t partial = std::min(BlockSize - buffered, remaining); + + const size_t processed = self->ProcessPartialData(dst_u8, src_u8, partial); + + dst_u8 += processed; + total_processed += processed; + + src_u8 += partial; + remaining -= partial; + } + + if (remaining >= BlockSize) { + const size_t num_blocks = remaining / BlockSize; + const size_t input_size = num_blocks * BlockSize; + + const size_t processed = self->template ProcessBlocks(dst_u8, src_u8, num_blocks); + + dst_u8 += processed; + total_processed += processed; + + src_u8 += input_size; + remaining -= input_size; + } + + if (remaining > 0) { + const size_t processed = self->ProcessRemainingData(dst_u8, src_u8, remaining); + total_processed += processed; + } + + return total_processed; + } + + + +} diff --git a/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.arch.arm64.cpp b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.arch.arm64.cpp new file mode 100644 index 000000000..e526f8507 --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.arch.arm64.cpp @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include +#include "crypto_update_impl.hpp" + +namespace ams::crypto::impl { + +#ifdef ATMOSPHERE_IS_STRATOSPHERE + + /* Variable management macros. 
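They name the inline-asm operands shared by the routines below: the block and tweak vector registers, the general-purpose registers used for the tweak arithmetic, and one input constraint per preloaded round key.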
*/ + #define DECLARE_ROUND_KEY_VAR(n) \ + const uint8x16_t round_key_##n = vld1q_u8(keys + (BlockSize * n)) + + #define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \ + [tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2) + + #define AES_ENC_DEC_OUTPUT_THREE_TWEAKS() \ + [tweak0]"+w"(tweak0), [tweak1]"+w"(tweak1), [tweak2]"+w"(tweak2) + + #define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \ + [tmp0]"+w"(tmp0) + + #define AES_ENC_DEC_OUTPUT_ONE_TWEAK() \ + [tweak0]"+w"(tweak0) + + #define XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() \ + [high]"=&r"(high), [low]"=&r"(low), [mask]"=&r"(mask) + + #define XTS_INCREMENT_INPUT_XOR() \ + [xorv]"r"(xorv) + + #define AES_ENC_DEC_INPUT_ROUND_KEY(n) \ + [round_key_##n]"w"(round_key_##n) + + /* AES Encryption macros. */ + #define AES_ENC_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ + "aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + + #define AES_ENC_SECOND_LAST_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + #define AES_ENC_LAST_ROUND(n, i) \ + "eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + /* AES Decryption macros. */ + #define AES_DEC_ROUND(n, i) \ + "aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ + "aesimc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + + #define AES_DEC_SECOND_LAST_ROUND(n, i) \ + "aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + #define AES_DEC_LAST_ROUND(n, i) \ + "eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + namespace { + + /* TODO: Support non-Nintendo Endianness */ + + ALWAYS_INLINE uint8x16_t MultiplyTweak(const uint8x16_t tweak) { + /* TODO: Is the inline asm better than using intrinsics? */ + #if 1 + uint8x16_t mult; + uint64_t high, low, mask; + constexpr uint64_t xorv = 0x87ul; + /* Use ASM. TODO: Better than using intrinsics? 
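Either way, this is the standard XTS tweak update: the 128-bit tweak is shifted left by one bit and, when the bit shifted out of the top is set, reduced by XORing 0x87 into the low word.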
*/ + __asm__ __volatile__ ( + "mov %[high], %[tweak].d[1]\n" + "mov %[low], %[tweak].d[0]\n" + "and %[mask], %[xorv], %[high], asr 63\n" + "extr %[high], %[high], %[low], 63\n" + "eor %[low], %[mask], %[low], lsl 1\n" + "mov %[mult].d[1], %[high]\n" + "mov %[mult].d[0], %[low]\n" + : [mult]"=w"(mult), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : [tweak]"w"(tweak), + XTS_INCREMENT_INPUT_XOR() + : "cc" + ); + return mult; + #else + constexpr uint64_t XorMask = 0x87ul; + + const uint64x2_t tweak64 = vreinterpretq_u64_u8(tweak); + const uint64_t high = vgetq_lane_u64(tweak64, 1); + const uint64_t low = vgetq_lane_u64(tweak64, 0); + const uint64_t mask = static_cast(high) >> (BITSIZEOF(uint64_t) - 1); + + return vreinterpretq_u8_u64(vcombine_u64(vmov_n_u64((low << 1) ^ (mask & XorMask)), vmov_n_u64((high << 1) | (low >> (BITSIZEOF(uint64_t) - 1))))); + #endif + } + + } + + size_t XtsModeImpl::UpdateGeneric(void *dst, size_t dst_size, const void *src, size_t src_size) { + AMS_ASSERT(this->state == State_Initialized || this->state == State_Processing); + + return UpdateImpl(this, dst, dst_size, src, src_size); + } + + size_t XtsModeImpl::ProcessBlocksGeneric(u8 *dst, const u8 *src, size_t num_blocks) { + size_t processed = BlockSize * (num_blocks - 1); + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + uint8x16_t tweak = vld1q_u8(this->tweak); + + while ((--num_blocks) > 0) { + /* Xor */ + uint8x16_t block = vld1q_u8(src); + src += BlockSize; + block = veorq_u8(block, tweak); + + /* Encrypt */ + vst1q_u8(dst, block); + this->cipher_func(dst, dst, this->cipher_ctx); + block = vld1q_u8(dst); + + /* Xor */ + veorq_u8(block, tweak); + vst1q_u8(dst, block); + dst += BlockSize; + + /* Increment tweak. */ + tweak = MultiplyTweak(tweak); + } + + vst1q_u8(this->tweak, tweak); + + std::memcpy(this->last_block, src, BlockSize); + + this->state = State_Processing; + + return processed; + } + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. 
*/ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n" + AES_ENC_ROUND(0, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(1, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(1, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(1, 2) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(2, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(2, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(2, 2) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(3, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(3, 1) "mov %[tweak1].d[1], %[high]\n" + AES_ENC_ROUND(3, 2) "mov %[tweak1].d[0], %[low]\n" + AES_ENC_ROUND(4, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(4, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n" + AES_ENC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n" + AES_ENC_ROUND(5, 2) + AES_ENC_ROUND(6, 0) AES_ENC_ROUND(6, 1) AES_ENC_ROUND(6, 2) + AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2) + AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. 
*/ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n" + AES_ENC_ROUND(2, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_SECOND_LAST_ROUND(9, 0) + AES_ENC_LAST_ROUND(10, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n" + AES_ENC_ROUND(0, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(1, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(1, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(1, 2) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(2, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(2, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(2, 2) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(3, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(3, 1) "mov %[tweak1].d[1], %[high]\n" + AES_ENC_ROUND(3, 2) "mov %[tweak1].d[0], %[low]\n" + AES_ENC_ROUND(4, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(4, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n" + AES_ENC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n" + AES_ENC_ROUND(5, 2) + AES_ENC_ROUND(6, 0) AES_ENC_ROUND(6, 1) AES_ENC_ROUND(6, 2) + AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2) + AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
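(In this single-block path the interleaved work is the XTS tweak doubling rather than a counter increment.)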
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n" + AES_ENC_ROUND(2, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_ROUND(9, 0) + AES_ENC_ROUND(10, 0) + AES_ENC_SECOND_LAST_ROUND(11, 0) + AES_ENC_LAST_ROUND(12, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
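This AES-256 variant materializes the 0x87 reduction constant with a mov immediate inside the asm rather than taking it as an input operand, presumably to keep the operand count down now that fifteen round keys are in the input list.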
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n" + AES_ENC_ROUND(0, 2) "mov %[mask], #0x87\n" + AES_ENC_ROUND(1, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_ENC_ROUND(1, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(1, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(2, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(2, 1) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[mask], #0x87\n" + AES_ENC_ROUND(3, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_ENC_ROUND(3, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(3, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(4, 0) "mov %[tweak1].d[1], %[high]\n" + AES_ENC_ROUND(4, 1) "mov %[tweak1].d[0], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[mask], #0x87\n" + AES_ENC_ROUND(5, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_ENC_ROUND(5, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(5, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(6, 0) "mov %[tweak2].d[1], %[high]\n" + AES_ENC_ROUND(6, 1) "mov %[tweak2].d[0], %[low]\n" + AES_ENC_ROUND(6, 2) + AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2) + AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2) + AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2) + AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n" + AES_ENC_ROUND(2, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_ROUND(9, 0) + AES_ENC_ROUND(10, 0) + AES_ENC_ROUND(11, 0) + AES_ENC_ROUND(12, 0) + AES_ENC_SECOND_LAST_ROUND(13, 0) + AES_ENC_LAST_ROUND(14, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
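For decryption the round keys are applied in reverse order (aesd/aesimc from key 10 down to key 0), while the tweak sequence is still generated in the forward direction.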
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(10, 0) "mov %[high], %[tweak2].d[1]\n" + AES_DEC_ROUND(10, 1) "mov %[low], %[tweak2].d[0]\n" + AES_DEC_ROUND(10, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(9, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(9, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(9, 2) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(8, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(8, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(8, 2) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(7, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(7, 1) "mov %[tweak1].d[1], %[high]\n" + AES_DEC_ROUND(7, 2) "mov %[tweak1].d[0], %[low]\n" + AES_DEC_ROUND(6, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(6, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(6, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n" + AES_DEC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n" + AES_DEC_ROUND(5, 2) + AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2) + AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2) + AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2) + AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2) + AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(10, 0) "mov %[high], %[tweak0].d[1]\n" + AES_DEC_ROUND(9, 0) "mov %[low], %[tweak0].d[0]\n" + AES_DEC_ROUND(8, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(7, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(6, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(4, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(3, 0) + AES_DEC_ROUND(2, 0) + AES_DEC_SECOND_LAST_ROUND(1, 0) + AES_DEC_LAST_ROUND(0, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(12, 0) "mov %[high], %[tweak2].d[1]\n" + AES_DEC_ROUND(12, 1) "mov %[low], %[tweak2].d[0]\n" + AES_DEC_ROUND(12, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(11, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(11, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(11, 2) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(10, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(10, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(10, 2) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(9, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(9, 1) "mov %[tweak1].d[1], %[high]\n" + AES_DEC_ROUND(9, 2) "mov %[tweak1].d[0], %[low]\n" + AES_DEC_ROUND(8, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(8, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(8, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(7, 0) "mov %[tweak2].d[1], %[high]\n" + AES_DEC_ROUND(7, 1) "mov %[tweak2].d[0], %[low]\n" + AES_DEC_ROUND(7, 2) + AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2) + AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2) + AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2) + AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2) + AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2) + AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2) + AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(12, 0) "mov %[high], %[tweak0].d[1]\n" + AES_DEC_ROUND(11, 0) "mov %[low], %[tweak0].d[0]\n" + AES_DEC_ROUND(10, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(9, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(8, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(7, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(5, 0) + AES_DEC_ROUND(4, 0) + AES_DEC_ROUND(3, 0) + AES_DEC_ROUND(2, 0) + AES_DEC_SECOND_LAST_ROUND(1, 0) + AES_DEC_LAST_ROUND(0, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(14, 0) "mov %[high], %[tweak2].d[1]\n" + AES_DEC_ROUND(14, 1) "mov %[low], %[tweak2].d[0]\n" + AES_DEC_ROUND(14, 2) "mov %[mask], 0x87\n" + AES_DEC_ROUND(13, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_DEC_ROUND(13, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(13, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(12, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(12, 1) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(12, 2) "mov %[mask], 0x87\n" + AES_DEC_ROUND(11, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_DEC_ROUND(11, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(11, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(10, 0) "mov %[tweak1].d[1], %[high]\n" + AES_DEC_ROUND(10, 1) "mov %[tweak1].d[0], %[low]\n" + AES_DEC_ROUND(10, 2) "mov %[mask], 0x87\n" + AES_DEC_ROUND(9, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_DEC_ROUND(9, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(9, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(8, 0) "mov %[tweak2].d[1], %[high]\n" + AES_DEC_ROUND(8, 1) "mov %[tweak2].d[0], %[low]\n" + AES_DEC_ROUND(8, 2) + AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2) + AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2) + AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2) + AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2) + AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2) + AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2) + AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2) + AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(14, 0) "mov %[high], %[tweak0].d[1]\n" + AES_DEC_ROUND(13, 0) "mov %[low], %[tweak0].d[0]\n" + AES_DEC_ROUND(12, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(11, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(10, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(9, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(8, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(7, 0) + AES_DEC_ROUND(6, 0) + AES_DEC_ROUND(5, 0) + AES_DEC_ROUND(4, 0) + AES_DEC_ROUND(3, 0) + AES_DEC_ROUND(2, 0) + AES_DEC_SECOND_LAST_ROUND(1, 0) + AES_DEC_LAST_ROUND(0, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + +#else + + /* TODO: Non-EL0 implementation. */ + +#endif + +} diff --git a/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.cpp b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.cpp new file mode 100644 index 000000000..86d022d71 --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include + +namespace ams::crypto::impl { + + namespace { + + /* TODO: Support non-Nintendo Endianness */ + + void MultiplyTweakGeneric(u64 *tweak) { + const u64 carry = tweak[1] & (static_cast(1) << (BITSIZEOF(u64) - 1)); + + tweak[1] = ((tweak[1] << 1) | (tweak[0] >> (BITSIZEOF(u64) - 1))); + tweak[0] = (tweak[0] << 1); + + if (carry) { + tweak[0] ^= static_cast(0x87); + } + } + + } + + void XtsModeImpl::ProcessBlock(u8 *dst, const u8 *src) { + u8 tmp[BlockSize]; + + /* Xor. */ + for (size_t i = 0; i < BlockSize; i++) { + tmp[i] = this->tweak[i] ^ src[i]; + } + + /* Crypt */ + this->cipher_func(tmp, tmp, this->cipher_ctx); + + /* Xor. 
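XTS is an XEX construction, so the tweak is XORed in again after the block cipher call.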
*/ + for (size_t i = 0; i < BlockSize; i++) { + dst[i] = this->tweak[i] ^ tmp[i]; + } + + MultiplyTweakGeneric(reinterpret_cast(this->tweak)); + } + + size_t XtsModeImpl::FinalizeEncryption(void *dst, size_t dst_size) { + AMS_ASSERT(this->state == State_Processing); + + u8 *dst_u8 = static_cast(dst); + size_t processed = 0; + + if (this->num_buffered == 0) { + this->ProcessBlock(dst_u8, this->last_block); + processed = BlockSize; + } else { + this->ProcessBlock(this->last_block, this->last_block); + + std::memcpy(this->buffer + this->num_buffered, this->last_block + this->num_buffered, BlockSize - this->num_buffered); + + this->ProcessBlock(dst_u8, this->buffer); + + std::memcpy(dst_u8 + BlockSize, this->last_block, this->num_buffered); + + processed = BlockSize + this->num_buffered; + } + + this->state = State_Done; + return processed; + } + + size_t XtsModeImpl::FinalizeDecryption(void *dst, size_t dst_size) { + AMS_ASSERT(this->state == State_Processing); + + u8 *dst_u8 = static_cast(dst); + size_t processed = 0; + + if (this->num_buffered == 0) { + this->ProcessBlock(dst_u8, this->last_block); + processed = BlockSize; + } else { + u8 tmp_tweak[BlockSize]; + std::memcpy(tmp_tweak, this->tweak, BlockSize); + MultiplyTweakGeneric(reinterpret_cast(this->tweak)); + + this->ProcessBlock(this->last_block, this->last_block); + + std::memcpy(this->buffer + this->num_buffered, this->last_block + this->num_buffered, BlockSize - this->num_buffered); + + std::memcpy(this->tweak, tmp_tweak, BlockSize); + + this->ProcessBlock(dst_u8, this->buffer); + + std::memcpy(dst_u8 + BlockSize, this->last_block, this->num_buffered); + + processed = BlockSize + this->num_buffered; + } + + this->state = State_Done; + return processed; + } + + size_t XtsModeImpl::ProcessPartialData(u8 *dst, const u8 *src, size_t size) { + size_t processed = 0; + + std::memcpy(this->buffer + this->num_buffered, src, size); + this->num_buffered += size; + + if (this->num_buffered == BlockSize) { + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + processed += BlockSize; + } + + std::memcpy(this->last_block, this->buffer, BlockSize); + this->num_buffered = 0; + + this->state = State_Processing; + } + + return processed; + } + + size_t XtsModeImpl::ProcessRemainingData(u8 *dst, const u8 *src, size_t size) { + std::memcpy(this->buffer, src, size); + this->num_buffered = size; + + return 0; + } + +}