diff --git a/libraries/config/arch/arm64/arch.mk b/libraries/config/arch/arm64/arch.mk
index 191b2fe9f..e3162d2dd 100644
--- a/libraries/config/arch/arm64/arch.mk
+++ b/libraries/config/arch/arm64/arch.mk
@@ -5,7 +5,7 @@ endif
 include $(DEVKITPRO)/devkitA64/base_rules
 
 export ATMOSPHERE_DEFINES += -DATMOSPHERE_ARCH_ARM64
-export ATMOSPHERE_SETTINGS += -march=armv8-a -mtp=soft
+export ATMOSPHERE_SETTINGS += -march=armv8-a+crc+crypto -mtp=soft
 export ATMOSPHERE_CFLAGS +=
 export ATMOSPHERE_CXXFLAGS +=
 export ATMOSPHERE_ASFLAGS +=
diff --git a/libraries/libvapours/include/vapours/crypto.hpp b/libraries/libvapours/include/vapours/crypto.hpp
index 45e0df55d..6ef2ed14d 100644
--- a/libraries/libvapours/include/vapours/crypto.hpp
+++ b/libraries/libvapours/include/vapours/crypto.hpp
@@ -21,6 +21,10 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_ctr_encryptor_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_ctr_encryptor_decryptor.hpp
new file mode 100644
index 000000000..1363cfceb
--- /dev/null
+++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_ctr_encryptor_decryptor.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2020 Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace ams::crypto {
+
+    namespace impl {
+
+        template<template<typename> typename _CtrImpl, typename _AesImpl>
+        class AesCtrCryptor {
+            NON_COPYABLE(AesCtrCryptor);
+            NON_MOVEABLE(AesCtrCryptor);
+            private:
+                using AesImpl = _AesImpl;
+                using CtrImpl = _CtrImpl<AesImpl>;
+            public:
+                static constexpr size_t KeySize = AesImpl::KeySize;
+                static constexpr size_t BlockSize = CtrImpl::BlockSize;
+                static constexpr size_t IvSize = CtrImpl::BlockSize;
+            private:
+                AesImpl aes_impl;
+                CtrImpl ctr_impl;
+            public:
+                AesCtrCryptor() { /* ...
*/ } + + void Initialize(const void *key, size_t key_size, const void *iv, size_t iv_size) { + this->Initialize(key, key_size, iv, iv_size, 0); + } + + void Initialize(const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset) { + AMS_ASSERT(key_size == KeySize); + AMS_ASSERT(iv_size == IvSize); + AMS_ASSERT(offset >= 0); + + this->aes_impl.Initialize(key, key_size); + this->ctr_impl.Initialize(std::addressof(this->aes_impl), iv, iv_size, offset); + } + + void SwitchMessage(const void *iv, size_t iv_size) { + return this->ctr_impl.SwitchMessage(iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->ctr_impl.Update(dst, dst_size, src, src_size); + } + }; + + } + + using Aes128CtrEncryptor = impl::AesCtrCryptor; + using Aes192CtrEncryptor = impl::AesCtrCryptor; + using Aes256CtrEncryptor = impl::AesCtrCryptor; + + using Aes128CtrDecryptor = impl::AesCtrCryptor; + using Aes192CtrDecryptor = impl::AesCtrCryptor; + using Aes256CtrDecryptor = impl::AesCtrCryptor; + + size_t EncryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t EncryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t EncryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + + size_t DecryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t DecryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + size_t DecryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size); + + size_t EncryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t EncryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t EncryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + + size_t DecryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t DecryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + size_t DecryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size); + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_decryptor.hpp new file mode 100644 index 000000000..2f1dd4a1d --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_decryptor.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 
2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + template + class AesDecryptor { + NON_COPYABLE(AesDecryptor); + NON_MOVEABLE(AesDecryptor); + private: + using Impl = impl::AesImpl<_KeySize>; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t RoundKeySize = Impl::RoundKeySize; + private: + Impl impl; + public: + AesDecryptor() { /* ... */ } + + void Initialize(const void *key, size_t key_size) { + this->impl.Initialize(key, key_size, false); + } + + void DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + return this->impl.DecryptBlock(dst, dst_size, src, src_size); + } + + const u8 *GetRoundKey() const { + return this->impl.GetRoundKey(); + } + }; + + using AesDecryptor128 = AesDecryptor<16>; + using AesDecryptor192 = AesDecryptor<24>; + using AesDecryptor256 = AesDecryptor<32>; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_encryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_encryptor.hpp new file mode 100644 index 000000000..116ab2dae --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_encryptor.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + template + class AesEncryptor { + NON_COPYABLE(AesEncryptor); + NON_MOVEABLE(AesEncryptor); + private: + using Impl = impl::AesImpl<_KeySize>; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t RoundKeySize = Impl::RoundKeySize; + private: + Impl impl; + public: + AesEncryptor() { /* ... 
*/ } + + void Initialize(const void *key, size_t key_size) { + this->impl.Initialize(key, key_size, true); + } + + void EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + return this->impl.EncryptBlock(dst, dst_size, src, src_size); + } + + const u8 *GetRoundKey() const { + return this->impl.GetRoundKey(); + } + }; + + using AesEncryptor128 = AesEncryptor<16>; + using AesEncryptor192 = AesEncryptor<24>; + using AesEncryptor256 = AesEncryptor<32>; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_aes_xts_encryptor_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_aes_xts_encryptor_decryptor.hpp new file mode 100644 index 000000000..c99660945 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_aes_xts_encryptor_decryptor.hpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include +#include +#include + +namespace ams::crypto { + + namespace impl { + + template typename _XtsImpl, typename _AesImpl1, typename _AesImpl2> + class AesXtsCryptor { + NON_COPYABLE(AesXtsCryptor); + NON_MOVEABLE(AesXtsCryptor); + private: + using AesImpl1 = _AesImpl1; + using AesImpl2 = _AesImpl2; + using XtsImpl = _XtsImpl; + public: + static constexpr size_t KeySize = AesImpl1::KeySize; + static constexpr size_t BlockSize = AesImpl1::BlockSize; + static constexpr size_t IvSize = AesImpl1::BlockSize; + + static_assert(AesImpl1::KeySize == AesImpl2::KeySize); + static_assert(AesImpl1::BlockSize == AesImpl2::BlockSize); + private: + AesImpl1 aes_impl_1; + AesImpl2 aes_impl_2; + XtsImpl xts_impl; + public: + AesXtsCryptor() { /* ... 
*/ } + + void Initialize(const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size) { + AMS_ASSERT(key_size == KeySize); + AMS_ASSERT(iv_size == IvSize); + + this->aes_impl_1.Initialize(key1, key_size); + this->aes_impl_2.Initialize(key2, key_size); + this->xts_impl.Initialize(std::addressof(this->aes_impl_1), std::addressof(this->aes_impl_2), iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->xts_impl.Update(dst, dst_size, src, src_size); + } + + size_t Finalize(void *dst, size_t dst_size) { + return this->xts_impl.Finalize(dst, dst_size); + } + }; + + } + + using Aes128XtsEncryptor = impl::AesXtsCryptor; + using Aes192XtsEncryptor = impl::AesXtsCryptor; + using Aes256XtsEncryptor = impl::AesXtsCryptor; + + using Aes128XtsDecryptor = impl::AesXtsCryptor; + using Aes192XtsDecryptor = impl::AesXtsCryptor; + using Aes256XtsDecryptor = impl::AesXtsCryptor; + + inline size_t EncryptAes128Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes128XtsEncryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t EncryptAes192Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes192XtsEncryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t EncryptAes256Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes256XtsEncryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t DecryptAes128Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes128XtsDecryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t DecryptAes192Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes192XtsDecryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += 
xts.Finalize(dst_u8, dst_size); + return processed; + } + + inline size_t DecryptAes256Xts(void *dst, size_t dst_size, const void *key1, const void *key2, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + u8 *dst_u8 = static_cast(dst); + const u8 *src_u8 = static_cast(src); + + Aes256XtsDecryptor xts; + xts.Initialize(key1, key2, key_size, iv, iv_size); + + size_t processed = xts.Update(dst_u8, dst_size, src_u8, src_size); + dst_u8 += processed; + dst_size -= processed; + + processed += xts.Finalize(dst_u8, dst_size); + return processed; + } + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_ctr_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_ctr_decryptor.hpp new file mode 100644 index 000000000..242233186 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_ctr_decryptor.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class CtrDecryptor { + NON_COPYABLE(CtrDecryptor); + NON_MOVEABLE(CtrDecryptor); + private: + using Impl = impl::CtrModeImpl; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + CtrDecryptor() { /* ... */ } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size) { + this->impl.Initialize(cipher, iv, iv_size); + } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size, s64 offset) { + this->impl.Initialize(cipher, iv, iv_size, offset); + } + + void SwitchMessage(const void *iv, size_t iv_size) { + this->impl.SwitchMessage(iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.Update(dst, dst_size, src, src_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_ctr_encryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_ctr_encryptor.hpp new file mode 100644 index 000000000..ab5661739 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_ctr_encryptor.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class CtrEncryptor { + NON_COPYABLE(CtrEncryptor); + NON_MOVEABLE(CtrEncryptor); + private: + using Impl = impl::CtrModeImpl; + public: + static constexpr size_t KeySize = Impl::KeySize; + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + CtrEncryptor() { /* ... */ } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size) { + this->impl.Initialize(cipher, iv, iv_size); + } + + void Initialize(const BlockCipher *cipher, const void *iv, size_t iv_size, s64 offset) { + this->impl.Initialize(cipher, iv, iv_size, offset); + } + + void SwitchMessage(const void *iv, size_t iv_size) { + this->impl.SwitchMessage(iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.Update(dst, dst_size, src, src_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_xts_decryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_xts_decryptor.hpp new file mode 100644 index 000000000..f51f8fb4d --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_xts_decryptor.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class XtsDecryptor { + NON_COPYABLE(XtsDecryptor); + NON_MOVEABLE(XtsDecryptor); + private: + using Impl = impl::XtsModeImpl; + public: + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + XtsDecryptor() { /* ... */ } + + template + void Initialize(const BlockCipher *cipher1, const BlockCipher2 *cipher2, const void *iv, size_t iv_size) { + this->impl.InitializeDecryption(cipher1, cipher2, iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.template Update(dst, dst_size, src, src_size); + } + + size_t Finalize(void *dst, size_t dst_size) { + return this->impl.FinalizeDecryption(dst, dst_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/crypto_xts_encryptor.hpp b/libraries/libvapours/include/vapours/crypto/crypto_xts_encryptor.hpp new file mode 100644 index 000000000..7d889e4e3 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/crypto_xts_encryptor.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include + +namespace ams::crypto { + + /* TODO: C++20 BlockCipher concept */ + + template + class XtsEncryptor { + NON_COPYABLE(XtsEncryptor); + NON_MOVEABLE(XtsEncryptor); + private: + using Impl = impl::XtsModeImpl; + public: + static constexpr size_t BlockSize = Impl::BlockSize; + static constexpr size_t IvSize = Impl::IvSize; + private: + Impl impl; + public: + XtsEncryptor() { /* ... */ } + + template + void Initialize(const BlockCipher *cipher1, const BlockCipher2 *cipher2, const void *iv, size_t iv_size) { + this->impl.InitializeEncryption(cipher1, cipher2, iv, iv_size); + } + + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->impl.template Update(dst, dst_size, src, src_size); + } + + size_t Finalize(void *dst, size_t dst_size) { + return this->impl.FinalizeEncryption(dst, dst_size); + } + }; + +} diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_aes_impl.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_aes_impl.hpp new file mode 100644 index 000000000..d99b14774 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_aes_impl.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#pragma once +#include +#include +#include + + +namespace ams::crypto::impl { + + template + class AesImpl { + public: + static constexpr size_t KeySize = _KeySize; + static constexpr size_t BlockSize = 16; + static constexpr s32 RoundCount = (KeySize / 4) + 6; + static constexpr size_t RoundKeySize = BlockSize * (RoundCount + 1); + private: + u32 round_keys[RoundKeySize / sizeof(u32)]; + public: + ~AesImpl(); + + void Initialize(const void *key, size_t key_size, bool is_encrypt); + void EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const; + void DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const; + const u8 *GetRoundKey() const { + return reinterpret_cast(this->round_keys); + } + }; + + /* static_assert(HashFunction); */ + +} diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_ctr_mode_impl.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_ctr_mode_impl.hpp new file mode 100644 index 000000000..b588bc2c3 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_ctr_mode_impl.hpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include +#include + +namespace ams::crypto::impl { + + template + class CtrModeImpl { + NON_COPYABLE(CtrModeImpl); + NON_MOVEABLE(CtrModeImpl); + public: + static constexpr size_t KeySize = BlockCipher::KeySize; + static constexpr size_t BlockSize = BlockCipher::BlockSize; + static constexpr size_t IvSize = BlockCipher::BlockSize; + private: + enum State { + State_None, + State_Initialized, + }; + private: + const BlockCipher *block_cipher; + u8 counter[IvSize]; + u8 encrypted_counter[BlockSize]; + size_t buffer_offset; + State state; + public: + CtrModeImpl() : state(State_None) { /* ... 
*/ } + + ~CtrModeImpl() { + ClearMemory(this, sizeof(*this)); + } + + void Initialize(const BlockCipher *block_cipher, const void *iv, size_t iv_size) { + this->Initialize(block_cipher, iv, iv_size, 0); + } + + void Initialize(const BlockCipher *block_cipher, const void *iv, size_t iv_size, s64 offset) { + AMS_ASSERT(iv_size == IvSize); + AMS_ASSERT(offset >= 0); + + this->block_cipher = block_cipher; + this->state = State_Initialized; + + this->SwitchMessage(iv, iv_size); + + if (offset >= 0) { + u64 ctr_offset = offset / BlockSize; + if (ctr_offset > 0) { + this->IncrementCounter(ctr_offset); + } + + if (size_t remaining = static_cast(offset % BlockSize); remaining != 0) { + this->block_cipher->EncryptBlock(this->encrypted_counter, sizeof(this->encrypted_counter), this->counter, sizeof(this->counter)); + this->IncrementCounter(); + + this->buffer_offset = remaining; + } + } + } + + void SwitchMessage(const void *iv, size_t iv_size) { + AMS_ASSERT(this->state == State_Initialized); + AMS_ASSERT(iv_size == IvSize); + + std::memcpy(this->counter, iv, iv_size); + this->buffer_offset = 0; + } + + void IncrementCounter() { + for (s32 i = IvSize - 1; i >= 0; --i) { + if (++this->counter[i] != 0) { + break; + } + } + } + + size_t Update(void *_dst, size_t dst_size, const void *_src, size_t src_size) { + AMS_ASSERT(this->state == State_Initialized); + AMS_ASSERT(dst_size >= src_size); + + u8 *dst = static_cast(_dst); + const u8 *src = static_cast(_src); + size_t remaining = src_size; + + if (this->buffer_offset > 0) { + const size_t xor_size = std::min(BlockSize - this->buffer_offset, remaining); + + const u8 *ctr = this->encrypted_counter + this->buffer_offset; + for (size_t i = 0; i < xor_size; i++) { + dst[i] = src[i] ^ ctr[i]; + } + + src += xor_size; + dst += xor_size; + remaining -= xor_size; + this->buffer_offset += xor_size; + + if (this->buffer_offset == BlockSize) { + this->buffer_offset = 0; + } + } + + if (remaining >= BlockSize) { + const size_t num_blocks = remaining / BlockSize; + + this->ProcessBlocks(dst, src, num_blocks); + + const size_t processed_size = num_blocks * BlockSize; + dst += processed_size; + src += processed_size; + remaining -= processed_size; + } + + if (remaining > 0) { + this->ProcessBlock(dst, src, remaining); + this->buffer_offset = remaining; + } + + return src_size; + } + private: + void IncrementCounter(u64 count) { + u64 _block[IvSize / sizeof(u64)] = {}; + util::StoreBigEndian(std::addressof(_block[(IvSize / sizeof(u64)) - 1]), count); + + u16 acc; + const u8 *block = reinterpret_cast(_block); + for (s32 i = IvSize - 1; i >= 0; --i) { + acc += (this->counter[i] + block[i]); + this->counter[i] = acc & 0xFF; + acc >>= 8; + } + } + + void ProcessBlock(u8 *dst, const u8 *src, size_t src_size) { + this->block_cipher->EncryptBlock(this->encrypted_counter, BlockSize, this->counter, IvSize); + this->IncrementCounter(); + + for (size_t i = 0; i < src_size; i++) { + dst[i] = src[i] ^ this->encrypted_counter[i]; + } + } + + void ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); + }; + + template + inline void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + while (num_blocks--) { + this->ProcessBlock(dst, src, BlockSize); + dst += BlockSize; + src += BlockSize; + } + } + + template<> void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); + template<> void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); + template<> void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks); 
+ +} diff --git a/libraries/libvapours/include/vapours/crypto/impl/crypto_xts_mode_impl.hpp b/libraries/libvapours/include/vapours/crypto/impl/crypto_xts_mode_impl.hpp new file mode 100644 index 000000000..4129e5595 --- /dev/null +++ b/libraries/libvapours/include/vapours/crypto/impl/crypto_xts_mode_impl.hpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#pragma once +#include +#include +#include +#include +#include + +namespace ams::crypto::impl { + + class XtsModeImpl { + NON_COPYABLE(XtsModeImpl); + NON_MOVEABLE(XtsModeImpl); + public: + /* TODO: More generic support. */ + static constexpr size_t BlockSize = 16; + static constexpr size_t IvSize = 16; + private: + enum State { + State_None, + State_Initialized, + State_Processing, + State_Done + }; + private: + u8 buffer[BlockSize]; + u8 tweak[BlockSize]; + u8 last_block[BlockSize]; + size_t num_buffered; + const void *cipher_ctx; + void (*cipher_func)(void *dst_block, const void *src_block, const void *cipher_ctx); + State state; + public: + XtsModeImpl() : num_buffered(0), state(State_None) { /* ... */ } + + ~XtsModeImpl() { + ClearMemory(this, sizeof(*this)); + } + private: + template + static void EncryptBlockCallback(void *dst_block, const void *src_block, const void *cipher) { + return static_cast(cipher)->EncryptBlock(dst_block, BlockCipher::BlockSize, src_block, BlockCipher::BlockSize); + } + + template + static void DecryptBlockCallback(void *dst_block, const void *src_block, const void *cipher) { + return static_cast(cipher)->DecryptBlock(dst_block, BlockCipher::BlockSize, src_block, BlockCipher::BlockSize); + } + + template + void Initialize(const BlockCipher *cipher, const void *tweak, size_t tweak_size) { + AMS_ASSERT(tweak_size == IvSize); + + cipher->EncryptBlock(this->tweak, IvSize, tweak, IvSize); + + this->num_buffered = 0; + this->state = State_Initialized; + } + + void ProcessBlock(u8 *dst, const u8 *src); + public: + template + void InitializeEncryption(const BlockCipher1 *cipher1, const BlockCipher2 *cipher2, const void *tweak, size_t tweak_size) { + static_assert(BlockCipher1::BlockSize == BlockSize); + static_assert(BlockCipher2::BlockSize == BlockSize); + + this->cipher_ctx = cipher1; + this->cipher_func = EncryptBlockCallback; + + this->Initialize(cipher2, tweak, tweak_size); + } + + template + void InitializeDecryption(const BlockCipher1 *cipher1, const BlockCipher2 *cipher2, const void *tweak, size_t tweak_size) { + static_assert(BlockCipher1::BlockSize == BlockSize); + static_assert(BlockCipher2::BlockSize == BlockSize); + + this->cipher_ctx = cipher1; + this->cipher_func = DecryptBlockCallback; + + this->Initialize(cipher2, tweak, tweak_size); + } + + template + size_t Update(void *dst, size_t dst_size, const void *src, size_t src_size) { + return this->UpdateGeneric(dst, dst_size, src, src_size); + } + + template + size_t ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + return 
this->ProcessBlocksGeneric(dst, src, num_blocks); + } + + size_t GetBufferedDataSize() const { + return this->num_buffered; + } + + constexpr size_t GetBlockSize() const { + return BlockSize; + } + + size_t FinalizeEncryption(void *dst, size_t dst_size); + size_t FinalizeDecryption(void *dst, size_t dst_size); + + size_t UpdateGeneric(void *dst, size_t dst_size, const void *src, size_t src_size); + size_t ProcessBlocksGeneric(u8 *dst, const u8 *src, size_t num_blocks); + + size_t ProcessPartialData(u8 *dst, const u8 *src, size_t size); + size_t ProcessRemainingData(u8 *dst, const u8 *src, size_t size); + }; + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size); + +} diff --git a/libraries/libvapours/source/crypto/crypto_aes_ctr_encryptor_decryptor.cpp b/libraries/libvapours/source/crypto/crypto_aes_ctr_encryptor_decryptor.cpp new file mode 100644 index 000000000..438dd2891 --- /dev/null +++ b/libraries/libvapours/source/crypto/crypto_aes_ctr_encryptor_decryptor.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#include + +namespace ams::crypto { + + size_t EncryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes128CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes192CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes256CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes128Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes128CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes192Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes192CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes256Ctr(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, const void *src, size_t src_size) { + Aes256CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes128CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes192CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t EncryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes256CtrEncryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes128CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes128CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes192CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes192CtrDecryptor aes; + aes.Initialize(key, key_size, iv, iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + + size_t DecryptAes256CtrPartial(void *dst, size_t dst_size, const void *key, size_t key_size, const void *iv, size_t iv_size, s64 offset, const void *src, size_t src_size) { + Aes256CtrDecryptor aes; + aes.Initialize(key, key_size, iv, 
iv_size, offset); + return aes.Update(dst, dst_size, src, src_size); + } + +} diff --git a/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp new file mode 100644 index 000000000..8c7465bed --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_aes_impl.arch.arm64.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include + +namespace ams::crypto::impl { + +#ifdef ATMOSPHERE_IS_STRATOSPHERE + + namespace { + + constexpr bool IsSupportedKeySize(size_t size) { + return size == 16 || size == 24 || size == 32; + } + + } + + template + AesImpl::~AesImpl() { + ClearMemory(this, sizeof(*this)); + } + + template + void AesImpl::Initialize(const void *key, size_t key_size, bool is_encrypt) { + static_assert(IsSupportedKeySize(KeySize)); + AMS_ASSERT(key_size == KeySize); + + if constexpr (KeySize == 16) { + /* Aes 128. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes128Context)); + aes128ContextCreate(reinterpret_cast(this->round_keys), key, is_encrypt); + } else if constexpr (KeySize == 24) { + /* Aes 192. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes192Context)); + aes192ContextCreate(reinterpret_cast(this->round_keys), key, is_encrypt); + } else if constexpr (KeySize == 32) { + /* Aes 256. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes256Context)); + aes256ContextCreate(reinterpret_cast(this->round_keys), key, is_encrypt); + } else { + /* Invalid key size. */ + static_assert(!std::is_same, AesImpl>::value); + } + } + + template + void AesImpl::EncryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + static_assert(IsSupportedKeySize(KeySize)); + AMS_ASSERT(src_size >= BlockSize); + AMS_ASSERT(dst_size >= BlockSize); + + if constexpr (KeySize == 16) { + /* Aes 128. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes128Context)); + aes128EncryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 24) { + /* Aes 192. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes192Context)); + aes192EncryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 32) { + /* Aes 256. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes256Context)); + aes256EncryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else { + /* Invalid key size. */ + static_assert(!std::is_same, AesImpl>::value); + } + } + + template + void AesImpl::DecryptBlock(void *dst, size_t dst_size, const void *src, size_t src_size) const { + static_assert(IsSupportedKeySize(KeySize)); + AMS_ASSERT(src_size >= BlockSize); + AMS_ASSERT(dst_size >= BlockSize); + + if constexpr (KeySize == 16) { + /* Aes 128. 
*/ + static_assert(sizeof(this->round_keys) == sizeof(::Aes128Context)); + aes128DecryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 24) { + /* Aes 192. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes192Context)); + aes192DecryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else if constexpr (KeySize == 32) { + /* Aes 256. */ + static_assert(sizeof(this->round_keys) == sizeof(::Aes256Context)); + aes256DecryptBlock(reinterpret_cast(this->round_keys), dst, src); + } else { + /* Invalid key size. */ + static_assert(!std::is_same, AesImpl>::value); + } + } + + + /* Explicitly instantiate the three supported key sizes. */ + template class AesImpl<16>; + template class AesImpl<24>; + template class AesImpl<32>; + +#else + + /* TODO: Non-EL0 implementation. */ + +#endif + +} \ No newline at end of file diff --git a/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp b/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp new file mode 100644 index 000000000..5d32196a6 --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_ctr_mode_impl.arch.arm64.cpp @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include + +namespace ams::crypto::impl { + +#ifdef ATMOSPHERE_IS_STRATOSPHERE + + /* Variable management macros. */ + #define DECLARE_ROUND_KEY_VAR(n) \ + const uint8x16_t round_key_##n = vld1q_u8(keys + (BlockSize * n)) + + #define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \ + [tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2) + + #define AES_ENC_DEC_OUTPUT_THREE_CTRS() \ + [ctr0]"+w"(ctr0), [ctr1]"+w"(ctr1), [ctr2]"+w"(ctr2) + + #define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \ + [tmp0]"+w"(tmp0) + + #define AES_ENC_DEC_OUTPUT_ONE_CTR() \ + [ctr0]"+w"(ctr0) + + #define CTR_INCREMENT_OUTPUT_HIGH_LOW() \ + [high]"=&r"(high), [low]"=&r"(low) + + #define CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() \ + [high_tmp]"=&r"(high_tmp), [low_tmp]"=&r"(low_tmp) + + #define CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() \ + [hl_tmp]"=&r"(hl_tmp) + + #define AES_ENC_DEC_INPUT_ROUND_KEY(n) \ + [round_key_##n]"w"(round_key_##n) + + /* AES Encryption macros. */ + #define AES_ENC_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ + "aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + + #define AES_ENC_SECOND_LAST_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + #define AES_ENC_LAST_ROUND(n, i) \ + "eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + namespace { + + ALWAYS_INLINE uint8x16_t IncrementCounterOptimized(const uint8x16_t ctr) { + uint8x16_t inc; + uint64_t high, low; + /* Use ASM. TODO: Better than using intrinsics? 
*/ + __asm__ __volatile__ ( + "mov %[high], %[ctr].d[0]\n" + "mov %[low], %[ctr].d[1]\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "adds %[low], %[low], 1\n" + "cinc %[high], %[high], cs\n" + "rev %[high], %[high]\n" + "rev %[low], %[low]\n" + "mov %[inc].d[0], %[high]\n" + "mov %[inc].d[1], %[low]\n" + : [inc]"=w"(inc), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : [ctr]"w"(ctr) + : "cc" + ); + return inc; + } + + } + + template<> + void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = this->block_cipher->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t ctr0 = vld1q_u8(this->counter); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = IncrementCounterOptimized(ctr0); + uint8x16_t ctr2 = IncrementCounterOptimized(ctr1); + uint64_t high_tmp, low_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2) + AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + 
AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp1); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp2); + dst += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_SECOND_LAST_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_LAST_ROUND(10, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(this->counter, ctr0); + } + + template<> + void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = this->block_cipher->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t ctr0 = vld1q_u8(this->counter); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = IncrementCounterOptimized(ctr0); + uint8x16_t ctr2 = IncrementCounterOptimized(ctr1); + uint64_t high_tmp, low_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[ctr0].d[0], %[high_tmp]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[low_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[ctr1].d[0], %[high_tmp]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[low_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "rev %[low_tmp], %[low]\n" + AES_ENC_ROUND(6, 2) "mov %[ctr2].d[0], %[high_tmp]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[low_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2) + AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HIGH_LOW_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp1); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp2); + dst += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_ROUND(10, 0) + AES_ENC_SECOND_LAST_ROUND(11, 0) + AES_ENC_LAST_ROUND(12, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(this->counter, ctr0); + } + + template<> + void CtrModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = this->block_cipher->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t ctr0 = vld1q_u8(this->counter); + uint64_t high, low; + + /* Process three blocks at a time, when possible. */ + if (num_blocks >= 3) { + /* Increment CTR twice. */ + uint8x16_t ctr1 = IncrementCounterOptimized(ctr0); + uint8x16_t ctr2 = IncrementCounterOptimized(ctr1); + uint64_t hl_tmp; + + while (num_blocks >= 3) { + /* Read blocks in. Keep them in registers for XOR later. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block1 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + const uint8x16_t block2 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the three CTRs. */ + uint8x16_t tmp0 = ctr0, tmp1 = ctr1, tmp2 = ctr2; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + /* Note: ASM here only uses one temporary u64 instead of two, due to 30 operand limit. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr2].d[0]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[ctr2].d[1]\n" + AES_ENC_ROUND(0, 2) "rev %[high], %[high]\n" + AES_ENC_ROUND(1, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(1, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(1, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(2, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(2, 1) "mov %[ctr0].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(2, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(3, 0) "mov %[ctr0].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(3, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(3, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(4, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(4, 1) "mov %[ctr1].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(4, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(5, 0) "mov %[ctr1].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(5, 1) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 2) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[hl_tmp], %[high]\n" + AES_ENC_ROUND(6, 1) "mov %[ctr2].d[0], %[hl_tmp]\n" + AES_ENC_ROUND(6, 2) "rev %[hl_tmp], %[low]\n" + AES_ENC_ROUND(7, 0) "mov %[ctr2].d[1], %[hl_tmp]\n" + AES_ENC_ROUND(7, 1) + AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2) + AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2) + AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2) + AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_CTRS(), + CTR_INCREMENT_OUTPUT_HIGH_LOW(), + CTR_INCREMENT_OUTPUT_HL_SINGLE_TMP() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + tmp1 = veorq_u8(block1, tmp1); + tmp2 = veorq_u8(block2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp1); + dst += AES_BLOCK_SIZE; + vst1q_u8(dst, tmp2); + dst += AES_BLOCK_SIZE; + + num_blocks -= 3; + } + } + + while (num_blocks >= 1) { + /* Read block in, keep in register for XOR. */ + const uint8x16_t block0 = vld1q_u8(src); + src += AES_BLOCK_SIZE; + + /* We'll be encrypting the CTR. */ + uint8x16_t tmp0 = ctr0; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[ctr0].d[0]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[ctr0].d[1]\n" + AES_ENC_ROUND(2, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(3, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(4, 0) "adds %[low], %[low], 1\n" + AES_ENC_ROUND(5, 0) "cinc %[high], %[high], cs\n" + AES_ENC_ROUND(6, 0) "rev %[high], %[high]\n" + AES_ENC_ROUND(7, 0) "rev %[low], %[low]\n" + AES_ENC_ROUND(8, 0) "mov %[ctr0].d[0], %[high]\n" + AES_ENC_ROUND(9, 0) "mov %[ctr0].d[1], %[low]\n" + AES_ENC_ROUND(10, 0) + AES_ENC_ROUND(11, 0) + AES_ENC_ROUND(12, 0) + AES_ENC_SECOND_LAST_ROUND(13, 0) + AES_ENC_LAST_ROUND(14, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_CTR(), + CTR_INCREMENT_OUTPUT_HIGH_LOW() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(block0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += AES_BLOCK_SIZE; + + num_blocks--; + } + + vst1q_u8(this->counter, ctr0); + } + +#else + + /* TODO: Non-EL0 implementation. */ + +#endif + +} \ No newline at end of file diff --git a/libraries/libvapours/source/crypto/impl/crypto_update_impl.hpp b/libraries/libvapours/source/crypto/impl/crypto_update_impl.hpp new file mode 100644 index 000000000..d74abb18b --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_update_impl.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#pragma once +#include + +namespace ams::crypto::impl { + + template + void UpdateImpl(Self *self, const void *src, size_t src_size) { + const size_t BlockSize = self->GetBlockSize(); + + const u8 *src_u8 = static_cast(src); + size_t remaining = src_size; + + if (const size_t buffered = self->GetBufferedDataSize(); buffered > 0) { + const size_t partial = std::min(BlockSize - buffered, remaining); + + self->ProcessPartialData(src_u8, partial); + src_u8 += partial; + remaining -= partial; + } + + if (remaining >= BlockSize) { + const size_t num_blocks = remaining / BlockSize; + + self->template ProcessBlocks(src_u8, num_blocks); + + const size_t processed = num_blocks * BlockSize; + src_u8 += processed; + remaining -= processed; + } + + if (remaining > 0) { + self->ProcessRemainingData(src_u8, remaining); + } + } + + template + size_t UpdateImpl(Self *self, void *dst, size_t dst_size, const void *src, size_t src_size) { + const size_t BlockSize = self->GetBlockSize(); + + const u8 *src_u8 = static_cast(src); + u8 *dst_u8 = static_cast(dst); + size_t remaining = src_size; + size_t total_processed = 0; + + if (const size_t buffered = self->GetBufferedDataSize(); buffered > 0) { + const size_t partial = std::min(BlockSize - buffered, remaining); + + const size_t processed = self->ProcessPartialData(dst_u8, src_u8, partial); + + dst_u8 += processed; + total_processed += processed; + + src_u8 += partial; + remaining -= partial; + } + + if (remaining >= BlockSize) { + const size_t num_blocks = remaining / BlockSize; + const size_t input_size = num_blocks * BlockSize; + + const size_t processed = self->template ProcessBlocks(dst_u8, src_u8, num_blocks); + + dst_u8 += processed; + total_processed += processed; + + src_u8 += input_size; + remaining -= input_size; + } + + if (remaining > 0) { + const size_t processed = self->ProcessRemainingData(dst_u8, src_u8, remaining); + total_processed += processed; + } + + return total_processed; + } + + + +} diff --git a/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.arch.arm64.cpp b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.arch.arm64.cpp new file mode 100644 index 000000000..e526f8507 --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.arch.arm64.cpp @@ -0,0 +1,1187 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include +#include "crypto_update_impl.hpp" + +namespace ams::crypto::impl { + +#ifdef ATMOSPHERE_IS_STRATOSPHERE + + /* Variable management macros. 
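They name the inline-asm operands shared by the routines below: the block and tweak vector registers, the general-purpose registers used for the tweak arithmetic, and one input constraint per preloaded round key.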
*/ + #define DECLARE_ROUND_KEY_VAR(n) \ + const uint8x16_t round_key_##n = vld1q_u8(keys + (BlockSize * n)) + + #define AES_ENC_DEC_OUTPUT_THREE_BLOCKS() \ + [tmp0]"+w"(tmp0), [tmp1]"+w"(tmp1), [tmp2]"+w"(tmp2) + + #define AES_ENC_DEC_OUTPUT_THREE_TWEAKS() \ + [tweak0]"+w"(tweak0), [tweak1]"+w"(tweak1), [tweak2]"+w"(tweak2) + + #define AES_ENC_DEC_OUTPUT_ONE_BLOCK() \ + [tmp0]"+w"(tmp0) + + #define AES_ENC_DEC_OUTPUT_ONE_TWEAK() \ + [tweak0]"+w"(tweak0) + + #define XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() \ + [high]"=&r"(high), [low]"=&r"(low), [mask]"=&r"(mask) + + #define XTS_INCREMENT_INPUT_XOR() \ + [xorv]"r"(xorv) + + #define AES_ENC_DEC_INPUT_ROUND_KEY(n) \ + [round_key_##n]"w"(round_key_##n) + + /* AES Encryption macros. */ + #define AES_ENC_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ + "aesmc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + + #define AES_ENC_SECOND_LAST_ROUND(n, i) \ + "aese %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + #define AES_ENC_LAST_ROUND(n, i) \ + "eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + /* AES Decryption macros. */ + #define AES_DEC_ROUND(n, i) \ + "aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" \ + "aesimc %[tmp" #i "].16b, %[tmp" #i "].16b\n" + + #define AES_DEC_SECOND_LAST_ROUND(n, i) \ + "aesd %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + #define AES_DEC_LAST_ROUND(n, i) \ + "eor %[tmp" #i "].16b, %[tmp" #i "].16b, %[round_key_" #n "].16b\n" + + namespace { + + /* TODO: Support non-Nintendo Endianness */ + + ALWAYS_INLINE uint8x16_t MultiplyTweak(const uint8x16_t tweak) { + /* TODO: Is the inline asm better than using intrinsics? */ + #if 1 + uint8x16_t mult; + uint64_t high, low, mask; + constexpr uint64_t xorv = 0x87ul; + /* Use ASM. TODO: Better than using intrinsics? 
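Either way, this is the standard XTS tweak update: the 128-bit tweak is shifted left by one bit and, when the bit shifted out of the top is set, reduced by XORing 0x87 into the low word.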
*/ + __asm__ __volatile__ ( + "mov %[high], %[tweak].d[1]\n" + "mov %[low], %[tweak].d[0]\n" + "and %[mask], %[xorv], %[high], asr 63\n" + "extr %[high], %[high], %[low], 63\n" + "eor %[low], %[mask], %[low], lsl 1\n" + "mov %[mult].d[1], %[high]\n" + "mov %[mult].d[0], %[low]\n" + : [mult]"=w"(mult), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : [tweak]"w"(tweak), + XTS_INCREMENT_INPUT_XOR() + : "cc" + ); + return mult; + #else + constexpr uint64_t XorMask = 0x87ul; + + const uint64x2_t tweak64 = vreinterpretq_u64_u8(tweak); + const uint64_t high = vgetq_lane_u64(tweak64, 1); + const uint64_t low = vgetq_lane_u64(tweak64, 0); + const uint64_t mask = static_cast(high) >> (BITSIZEOF(uint64_t) - 1); + + return vreinterpretq_u8_u64(vcombine_u64(vmov_n_u64((low << 1) ^ (mask & XorMask)), vmov_n_u64((high << 1) | (low >> (BITSIZEOF(uint64_t) - 1))))); + #endif + } + + } + + size_t XtsModeImpl::UpdateGeneric(void *dst, size_t dst_size, const void *src, size_t src_size) { + AMS_ASSERT(this->state == State_Initialized || this->state == State_Processing); + + return UpdateImpl(this, dst, dst_size, src, src_size); + } + + size_t XtsModeImpl::ProcessBlocksGeneric(u8 *dst, const u8 *src, size_t num_blocks) { + size_t processed = BlockSize * (num_blocks - 1); + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + uint8x16_t tweak = vld1q_u8(this->tweak); + + while ((--num_blocks) > 0) { + /* Xor */ + uint8x16_t block = vld1q_u8(src); + src += BlockSize; + block = veorq_u8(block, tweak); + + /* Encrypt */ + vst1q_u8(dst, block); + this->cipher_func(dst, dst, this->cipher_ctx); + block = vld1q_u8(dst); + + /* Xor */ + veorq_u8(block, tweak); + vst1q_u8(dst, block); + dst += BlockSize; + + /* Increment tweak. */ + tweak = MultiplyTweak(tweak); + } + + vst1q_u8(this->tweak, tweak); + + std::memcpy(this->last_block, src, BlockSize); + + this->state = State_Processing; + + return processed; + } + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + template<> size_t XtsModeImpl::Update(void *dst, size_t dst_size, const void *src, size_t src_size) { return UpdateImpl(this, dst, dst_size, src, src_size); } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. 
*/ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n" + AES_ENC_ROUND(0, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(1, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(1, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(1, 2) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(2, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(2, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(2, 2) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(3, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(3, 1) "mov %[tweak1].d[1], %[high]\n" + AES_ENC_ROUND(3, 2) "mov %[tweak1].d[0], %[low]\n" + AES_ENC_ROUND(4, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(4, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n" + AES_ENC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n" + AES_ENC_ROUND(5, 2) + AES_ENC_ROUND(6, 0) AES_ENC_ROUND(6, 1) AES_ENC_ROUND(6, 2) + AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_SECOND_LAST_ROUND(9, 0) AES_ENC_SECOND_LAST_ROUND(9, 1) AES_ENC_SECOND_LAST_ROUND(9, 2) + AES_ENC_LAST_ROUND(10, 0) AES_ENC_LAST_ROUND(10, 1) AES_ENC_LAST_ROUND(10, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. 
*/ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. */ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n" + AES_ENC_ROUND(2, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_SECOND_LAST_ROUND(9, 0) + AES_ENC_LAST_ROUND(10, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n" + AES_ENC_ROUND(0, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(1, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(1, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(1, 2) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(2, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(2, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(2, 2) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(3, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(3, 1) "mov %[tweak1].d[1], %[high]\n" + AES_ENC_ROUND(3, 2) "mov %[tweak1].d[0], %[low]\n" + AES_ENC_ROUND(4, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(4, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n" + AES_ENC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n" + AES_ENC_ROUND(5, 2) + AES_ENC_ROUND(6, 0) AES_ENC_ROUND(6, 1) AES_ENC_ROUND(6, 2) + AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_SECOND_LAST_ROUND(11, 0) AES_ENC_SECOND_LAST_ROUND(11, 1) AES_ENC_SECOND_LAST_ROUND(11, 2) + AES_ENC_LAST_ROUND(12, 0) AES_ENC_LAST_ROUND(12, 1) AES_ENC_LAST_ROUND(12, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
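(In this single-block path the interleaved work is the XTS tweak doubling rather than a counter increment.)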
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n" + AES_ENC_ROUND(2, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_ROUND(9, 0) + AES_ENC_ROUND(10, 0) + AES_ENC_SECOND_LAST_ROUND(11, 0) + AES_ENC_LAST_ROUND(12, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
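This AES-256 variant materializes the 0x87 reduction constant with a mov immediate inside the asm rather than taking it as an input operand, presumably to keep the operand count down now that fifteen round keys are in the input list.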
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak2].d[1]\n" + AES_ENC_ROUND(0, 1) "mov %[low], %[tweak2].d[0]\n" + AES_ENC_ROUND(0, 2) "mov %[mask], #0x87\n" + AES_ENC_ROUND(1, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_ENC_ROUND(1, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(1, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(2, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(2, 1) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(2, 2) "mov %[mask], #0x87\n" + AES_ENC_ROUND(3, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_ENC_ROUND(3, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(3, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(4, 0) "mov %[tweak1].d[1], %[high]\n" + AES_ENC_ROUND(4, 1) "mov %[tweak1].d[0], %[low]\n" + AES_ENC_ROUND(4, 2) "mov %[mask], #0x87\n" + AES_ENC_ROUND(5, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_ENC_ROUND(5, 1) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(5, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(6, 0) "mov %[tweak2].d[1], %[high]\n" + AES_ENC_ROUND(6, 1) "mov %[tweak2].d[0], %[low]\n" + AES_ENC_ROUND(6, 2) + AES_ENC_ROUND(7, 0) AES_ENC_ROUND(7, 1) AES_ENC_ROUND(7, 2) + AES_ENC_ROUND(8, 0) AES_ENC_ROUND(8, 1) AES_ENC_ROUND(8, 2) + AES_ENC_ROUND(9, 0) AES_ENC_ROUND(9, 1) AES_ENC_ROUND(9, 2) + AES_ENC_ROUND(10, 0) AES_ENC_ROUND(10, 1) AES_ENC_ROUND(10, 2) + AES_ENC_ROUND(11, 0) AES_ENC_ROUND(11, 1) AES_ENC_ROUND(11, 2) + AES_ENC_ROUND(12, 0) AES_ENC_ROUND(12, 1) AES_ENC_ROUND(12, 2) + AES_ENC_SECOND_LAST_ROUND(13, 0) AES_ENC_SECOND_LAST_ROUND(13, 1) AES_ENC_SECOND_LAST_ROUND(13, 2) + AES_ENC_LAST_ROUND(14, 0) AES_ENC_LAST_ROUND(14, 1) AES_ENC_LAST_ROUND(14, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_ENC_ROUND(0, 0) "mov %[high], %[tweak0].d[1]\n" + AES_ENC_ROUND(1, 0) "mov %[low], %[tweak0].d[0]\n" + AES_ENC_ROUND(2, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_ENC_ROUND(3, 0) "extr %[high], %[high], %[low], 63\n" + AES_ENC_ROUND(4, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_ENC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_ENC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_ENC_ROUND(7, 0) + AES_ENC_ROUND(8, 0) + AES_ENC_ROUND(9, 0) + AES_ENC_ROUND(10, 0) + AES_ENC_ROUND(11, 0) + AES_ENC_ROUND(12, 0) + AES_ENC_SECOND_LAST_ROUND(13, 0) + AES_ENC_LAST_ROUND(14, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
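For decryption the round keys are applied in reverse order (aesd/aesimc from key 10 down to key 0), while the tweak sequence is still generated in the forward direction.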
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(10, 0) "mov %[high], %[tweak2].d[1]\n" + AES_DEC_ROUND(10, 1) "mov %[low], %[tweak2].d[0]\n" + AES_DEC_ROUND(10, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(9, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(9, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(9, 2) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(8, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(8, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(8, 2) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(7, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(7, 1) "mov %[tweak1].d[1], %[high]\n" + AES_DEC_ROUND(7, 2) "mov %[tweak1].d[0], %[low]\n" + AES_DEC_ROUND(6, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(6, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(6, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(5, 0) "mov %[tweak2].d[1], %[high]\n" + AES_DEC_ROUND(5, 1) "mov %[tweak2].d[0], %[low]\n" + AES_DEC_ROUND(5, 2) + AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2) + AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2) + AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2) + AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2) + AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(10, 0) "mov %[high], %[tweak0].d[1]\n" + AES_DEC_ROUND(9, 0) "mov %[low], %[tweak0].d[0]\n" + AES_DEC_ROUND(8, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(7, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(6, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(5, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(4, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(3, 0) + AES_DEC_ROUND(2, 0) + AES_DEC_SECOND_LAST_ROUND(1, 0) + AES_DEC_LAST_ROUND(0, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(12, 0) "mov %[high], %[tweak2].d[1]\n" + AES_DEC_ROUND(12, 1) "mov %[low], %[tweak2].d[0]\n" + AES_DEC_ROUND(12, 2) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(11, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(11, 1) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(11, 2) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(10, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(10, 1) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(10, 2) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(9, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(9, 1) "mov %[tweak1].d[1], %[high]\n" + AES_DEC_ROUND(9, 2) "mov %[tweak1].d[0], %[low]\n" + AES_DEC_ROUND(8, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(8, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(8, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(7, 0) "mov %[tweak2].d[1], %[high]\n" + AES_DEC_ROUND(7, 1) "mov %[tweak2].d[0], %[low]\n" + AES_DEC_ROUND(7, 2) + AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2) + AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2) + AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2) + AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2) + AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2) + AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2) + AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(12, 0) "mov %[high], %[tweak0].d[1]\n" + AES_DEC_ROUND(11, 0) "mov %[low], %[tweak0].d[0]\n" + AES_DEC_ROUND(10, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(9, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(8, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(7, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(6, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(5, 0) + AES_DEC_ROUND(4, 0) + AES_DEC_ROUND(3, 0) + AES_DEC_ROUND(2, 0) + AES_DEC_SECOND_LAST_ROUND(1, 0) + AES_DEC_LAST_ROUND(0, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + + template<> + size_t XtsModeImpl::ProcessBlocks(u8 *dst, const u8 *src, size_t num_blocks) { + /* Handle last buffered block. */ + size_t processed = (num_blocks - 1) * BlockSize; + + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + dst += BlockSize; + processed += BlockSize; + } + + /* Preload all round keys + iv into neon registers. */ + const u8 *keys = static_cast(this->cipher_ctx)->GetRoundKey(); + DECLARE_ROUND_KEY_VAR(0); + DECLARE_ROUND_KEY_VAR(1); + DECLARE_ROUND_KEY_VAR(2); + DECLARE_ROUND_KEY_VAR(3); + DECLARE_ROUND_KEY_VAR(4); + DECLARE_ROUND_KEY_VAR(5); + DECLARE_ROUND_KEY_VAR(6); + DECLARE_ROUND_KEY_VAR(7); + DECLARE_ROUND_KEY_VAR(8); + DECLARE_ROUND_KEY_VAR(9); + DECLARE_ROUND_KEY_VAR(10); + DECLARE_ROUND_KEY_VAR(11); + DECLARE_ROUND_KEY_VAR(12); + DECLARE_ROUND_KEY_VAR(13); + DECLARE_ROUND_KEY_VAR(14); + uint8x16_t tweak0 = vld1q_u8(this->tweak); + constexpr uint64_t xorv = 0x87ul; + uint64_t high, low, mask; + + /* Process three blocks at a time, when possible. */ + if (num_blocks > 3) { + /* Multiply tweak twice. */ + uint8x16_t tweak1 = MultiplyTweak(tweak0); + uint8x16_t tweak2 = MultiplyTweak(tweak1); + + do { + /* Save tweaks for xor usage. */ + const uint8x16_t mask0 = tweak0; + const uint8x16_t mask1 = tweak1; + const uint8x16_t mask2 = tweak2; + + /* Read blocks in, XOR with tweaks. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp1 = veorq_u8(mask1, vld1q_u8(src)); src += BlockSize; + uint8x16_t tmp2 = veorq_u8(mask2, vld1q_u8(src)); src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave GF mult calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(14, 0) "mov %[high], %[tweak2].d[1]\n" + AES_DEC_ROUND(14, 1) "mov %[low], %[tweak2].d[0]\n" + AES_DEC_ROUND(14, 2) "mov %[mask], 0x87\n" + AES_DEC_ROUND(13, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_DEC_ROUND(13, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(13, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(12, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(12, 1) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(12, 2) "mov %[mask], 0x87\n" + AES_DEC_ROUND(11, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_DEC_ROUND(11, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(11, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(10, 0) "mov %[tweak1].d[1], %[high]\n" + AES_DEC_ROUND(10, 1) "mov %[tweak1].d[0], %[low]\n" + AES_DEC_ROUND(10, 2) "mov %[mask], 0x87\n" + AES_DEC_ROUND(9, 0) "and %[mask], %[mask], %[high], asr 63\n" + AES_DEC_ROUND(9, 1) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(9, 2) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(8, 0) "mov %[tweak2].d[1], %[high]\n" + AES_DEC_ROUND(8, 1) "mov %[tweak2].d[0], %[low]\n" + AES_DEC_ROUND(8, 2) + AES_DEC_ROUND(7, 0) AES_DEC_ROUND(7, 1) AES_DEC_ROUND(7, 2) + AES_DEC_ROUND(6, 0) AES_DEC_ROUND(6, 1) AES_DEC_ROUND(6, 2) + AES_DEC_ROUND(5, 0) AES_DEC_ROUND(5, 1) AES_DEC_ROUND(5, 2) + AES_DEC_ROUND(4, 0) AES_DEC_ROUND(4, 1) AES_DEC_ROUND(4, 2) + AES_DEC_ROUND(3, 0) AES_DEC_ROUND(3, 1) AES_DEC_ROUND(3, 2) + AES_DEC_ROUND(2, 0) AES_DEC_ROUND(2, 1) AES_DEC_ROUND(2, 2) + AES_DEC_SECOND_LAST_ROUND(1, 0) AES_DEC_SECOND_LAST_ROUND(1, 1) AES_DEC_SECOND_LAST_ROUND(1, 2) + AES_DEC_LAST_ROUND(0, 0) AES_DEC_LAST_ROUND(0, 1) AES_DEC_LAST_ROUND(0, 2) + : AES_ENC_DEC_OUTPUT_THREE_BLOCKS(), + AES_ENC_DEC_OUTPUT_THREE_TWEAKS(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + tmp1 = veorq_u8(mask1, tmp1); + tmp2 = veorq_u8(mask2, tmp2); + + /* Store to output. */ + vst1q_u8(dst, tmp0); dst += BlockSize; + vst1q_u8(dst, tmp1); dst += BlockSize; + vst1q_u8(dst, tmp2); dst += BlockSize; + + num_blocks -= 3; + } while (num_blocks > 3); + } + + while ((--num_blocks) > 0) { + /* Save tweak for xor usage. */ + const uint8x16_t mask0 = tweak0; + + /* Read block in, XOR with tweak. */ + uint8x16_t tmp0 = veorq_u8(mask0, vld1q_u8(src)); + src += BlockSize; + + /* Actually do encryption, use optimized asm. */ + /* Interleave CTR calculations with AES ones, to mask latencies. 
*/ + __asm__ __volatile__ ( + AES_DEC_ROUND(14, 0) "mov %[high], %[tweak0].d[1]\n" + AES_DEC_ROUND(13, 0) "mov %[low], %[tweak0].d[0]\n" + AES_DEC_ROUND(12, 0) "and %[mask], %[xorv], %[high], asr 63\n" + AES_DEC_ROUND(11, 0) "extr %[high], %[high], %[low], 63\n" + AES_DEC_ROUND(10, 0) "eor %[low], %[mask], %[low], lsl 1\n" + AES_DEC_ROUND(9, 0) "mov %[tweak0].d[1], %[high]\n" + AES_DEC_ROUND(8, 0) "mov %[tweak0].d[0], %[low]\n" + AES_DEC_ROUND(7, 0) + AES_DEC_ROUND(6, 0) + AES_DEC_ROUND(5, 0) + AES_DEC_ROUND(4, 0) + AES_DEC_ROUND(3, 0) + AES_DEC_ROUND(2, 0) + AES_DEC_SECOND_LAST_ROUND(1, 0) + AES_DEC_LAST_ROUND(0, 0) + : AES_ENC_DEC_OUTPUT_ONE_BLOCK(), + AES_ENC_DEC_OUTPUT_ONE_TWEAK(), + XTS_INCREMENT_OUTPUT_HIGH_LOW_MASK() + : XTS_INCREMENT_INPUT_XOR(), + AES_ENC_DEC_INPUT_ROUND_KEY(0), + AES_ENC_DEC_INPUT_ROUND_KEY(1), + AES_ENC_DEC_INPUT_ROUND_KEY(2), + AES_ENC_DEC_INPUT_ROUND_KEY(3), + AES_ENC_DEC_INPUT_ROUND_KEY(4), + AES_ENC_DEC_INPUT_ROUND_KEY(5), + AES_ENC_DEC_INPUT_ROUND_KEY(6), + AES_ENC_DEC_INPUT_ROUND_KEY(7), + AES_ENC_DEC_INPUT_ROUND_KEY(8), + AES_ENC_DEC_INPUT_ROUND_KEY(9), + AES_ENC_DEC_INPUT_ROUND_KEY(10), + AES_ENC_DEC_INPUT_ROUND_KEY(11), + AES_ENC_DEC_INPUT_ROUND_KEY(12), + AES_ENC_DEC_INPUT_ROUND_KEY(13), + AES_ENC_DEC_INPUT_ROUND_KEY(14) + : "cc" + ); + + /* XOR blocks. */ + tmp0 = veorq_u8(mask0, tmp0); + + /* Store to output. */ + vst1q_u8(dst, tmp0); + dst += BlockSize; + } + + vst1q_u8(this->tweak, tweak0); + + std::memcpy(this->last_block, src, BlockSize); + this->state = State_Processing; + + return processed; + } + +#else + + /* TODO: Non-EL0 implementation. */ + +#endif + +} diff --git a/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.cpp b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.cpp new file mode 100644 index 000000000..86d022d71 --- /dev/null +++ b/libraries/libvapours/source/crypto/impl/crypto_xts_mode_impl.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018-2020 Atmosphère-NX + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include + +namespace ams::crypto::impl { + + namespace { + + /* TODO: Support non-Nintendo Endianness */ + + void MultiplyTweakGeneric(u64 *tweak) { + const u64 carry = tweak[1] & (static_cast(1) << (BITSIZEOF(u64) - 1)); + + tweak[1] = ((tweak[1] << 1) | (tweak[0] >> (BITSIZEOF(u64) - 1))); + tweak[0] = (tweak[0] << 1); + + if (carry) { + tweak[0] ^= static_cast(0x87); + } + } + + } + + void XtsModeImpl::ProcessBlock(u8 *dst, const u8 *src) { + u8 tmp[BlockSize]; + + /* Xor. */ + for (size_t i = 0; i < BlockSize; i++) { + tmp[i] = this->tweak[i] ^ src[i]; + } + + /* Crypt */ + this->cipher_func(tmp, tmp, this->cipher_ctx); + + /* Xor. 
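XTS is an XEX construction, so the tweak is XORed in again after the block cipher call.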
*/ + for (size_t i = 0; i < BlockSize; i++) { + dst[i] = this->tweak[i] ^ tmp[i]; + } + + MultiplyTweakGeneric(reinterpret_cast(this->tweak)); + } + + size_t XtsModeImpl::FinalizeEncryption(void *dst, size_t dst_size) { + AMS_ASSERT(this->state == State_Processing); + + u8 *dst_u8 = static_cast(dst); + size_t processed = 0; + + if (this->num_buffered == 0) { + this->ProcessBlock(dst_u8, this->last_block); + processed = BlockSize; + } else { + this->ProcessBlock(this->last_block, this->last_block); + + std::memcpy(this->buffer + this->num_buffered, this->last_block + this->num_buffered, BlockSize - this->num_buffered); + + this->ProcessBlock(dst_u8, this->buffer); + + std::memcpy(dst_u8 + BlockSize, this->last_block, this->num_buffered); + + processed = BlockSize + this->num_buffered; + } + + this->state = State_Done; + return processed; + } + + size_t XtsModeImpl::FinalizeDecryption(void *dst, size_t dst_size) { + AMS_ASSERT(this->state == State_Processing); + + u8 *dst_u8 = static_cast(dst); + size_t processed = 0; + + if (this->num_buffered == 0) { + this->ProcessBlock(dst_u8, this->last_block); + processed = BlockSize; + } else { + u8 tmp_tweak[BlockSize]; + std::memcpy(tmp_tweak, this->tweak, BlockSize); + MultiplyTweakGeneric(reinterpret_cast(this->tweak)); + + this->ProcessBlock(this->last_block, this->last_block); + + std::memcpy(this->buffer + this->num_buffered, this->last_block + this->num_buffered, BlockSize - this->num_buffered); + + std::memcpy(this->tweak, tmp_tweak, BlockSize); + + this->ProcessBlock(dst_u8, this->buffer); + + std::memcpy(dst_u8 + BlockSize, this->last_block, this->num_buffered); + + processed = BlockSize + this->num_buffered; + } + + this->state = State_Done; + return processed; + } + + size_t XtsModeImpl::ProcessPartialData(u8 *dst, const u8 *src, size_t size) { + size_t processed = 0; + + std::memcpy(this->buffer + this->num_buffered, src, size); + this->num_buffered += size; + + if (this->num_buffered == BlockSize) { + if (this->state == State_Processing) { + this->ProcessBlock(dst, this->last_block); + processed += BlockSize; + } + + std::memcpy(this->last_block, this->buffer, BlockSize); + this->num_buffered = 0; + + this->state = State_Processing; + } + + return processed; + } + + size_t XtsModeImpl::ProcessRemainingData(u8 *dst, const u8 *src, size_t size) { + std::memcpy(this->buffer, src, size); + this->num_buffered = size; + + return 0; + } + +}