From dd4993dfda662cc6c9eebc1d4533a5d92a7cfb51 Mon Sep 17 00:00:00 2001 From: Michael Scire Date: Tue, 24 Jul 2018 01:26:37 -0700 Subject: [PATCH] Loader: Use HW-acceleration for SHA256 --- stratosphere/loader/source/ldr_nro.cpp | 5 +- stratosphere/loader/source/ldr_nso.cpp | 6 +- stratosphere/loader/source/sha256.c | 227 +++++++++------------- stratosphere/loader/source/sha256.h | 57 +++--- stratosphere/loader/source/sha256_armv8.s | 163 ++++++++++++++++ 5 files changed, 287 insertions(+), 171 deletions(-) create mode 100644 stratosphere/loader/source/sha256_armv8.s diff --git a/stratosphere/loader/source/ldr_nro.cpp b/stratosphere/loader/source/ldr_nro.cpp index 52f1bdf03..127878212 100644 --- a/stratosphere/loader/source/ldr_nro.cpp +++ b/stratosphere/loader/source/ldr_nro.cpp @@ -36,7 +36,7 @@ Result NroUtils::LoadNro(Registration::Process *target_proc, Handle process_h, u unsigned int i; Result rc; u8 nro_hash[0x20]; - SHA256_CTX sha_ctx; + struct sha256_state sha_ctx; /* Ensure there is an available NRO slot. */ if (std::all_of(target_proc->nro_infos.begin(), target_proc->nro_infos.end(), std::mem_fn(&Registration::NroInfo::in_use))) { return 0x6E09; @@ -78,7 +78,8 @@ Result NroUtils::LoadNro(Registration::Process *target_proc, Handle process_h, u sha256_init(&sha_ctx); sha256_update(&sha_ctx, (u8 *)nro, nro->nro_size); - sha256_final(&sha_ctx, nro_hash); + sha256_finalize(&sha_ctx); + sha256_finish(&sha_ctx, nro_hash); if (!Registration::IsNroHashPresent(target_proc->index, nro_hash)) { rc = 0x6C09; diff --git a/stratosphere/loader/source/ldr_nso.cpp b/stratosphere/loader/source/ldr_nso.cpp index 86b696454..b01e5d25f 100644 --- a/stratosphere/loader/source/ldr_nso.cpp +++ b/stratosphere/loader/source/ldr_nso.cpp @@ -199,6 +199,7 @@ Result NsoUtils::CalculateNsoLoadExtents(u32 addspace_type, u32 args_size, NsoLo } + Result NsoUtils::LoadNsoSegment(u64 title_id, unsigned int index, unsigned int segment, FILE *f_nso, u8 *map_base, u8 *map_end) { bool is_compressed = ((g_nso_headers[index].flags >> segment) & 1) != 0; bool check_hash = ((g_nso_headers[index].flags >> (segment + 3)) & 1) != 0; @@ -230,10 +231,11 @@ Result NsoUtils::LoadNsoSegment(u64 title_id, unsigned int index, unsigned int s if (check_hash) { u8 hash[0x20] = {0}; - SHA256_CTX sha_ctx; + struct sha256_state sha_ctx; sha256_init(&sha_ctx); sha256_update(&sha_ctx, dst_addr, out_size); - sha256_final(&sha_ctx, hash); + sha256_finalize(&sha_ctx); + sha256_finish(&sha_ctx, hash); if (std::memcmp(g_nso_headers[index].section_hashes[segment], hash, sizeof(hash))) { return 0xA09; diff --git a/stratosphere/loader/source/sha256.c b/stratosphere/loader/source/sha256.c index eb9c5c073..02d40a758 100644 --- a/stratosphere/loader/source/sha256.c +++ b/stratosphere/loader/source/sha256.c @@ -1,158 +1,113 @@ -/********************************************************************* -* Filename: sha256.c -* Author: Brad Conte (brad AT bradconte.com) -* Copyright: -* Disclaimer: This code is presented "as is" without any guarantees. -* Details: Implementation of the SHA-256 hashing algorithm. - SHA-256 is one of the three algorithms in the SHA2 - specification. The others, SHA-384 and SHA-512, are not - offered in this implementation. - Algorithm specification can be found here: - * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf - This implementation uses little endian byte order. -*********************************************************************/ +/* Based on linux source code */ +/* + * sha256_base.h - core logic for SHA-256 implementations + * + * Copyright (C) 2015 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ -/*************************** HEADER FILES ***************************/ -#include -#include +#ifdef __cplusplus +extern "C" { +#endif + +#include #include "sha256.h" -/****************************** MACROS ******************************/ -#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b)))) -#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b)))) +#define unlikely(x) __builtin_expect(!!(x), 0) -#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z))) -#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) -#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22)) -#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25)) -#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3)) -#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10)) - -/**************************** VARIABLES *****************************/ -static const WORD k[64] = { - 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, - 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, - 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, - 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, - 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, - 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070, - 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, - 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -}; - -/*********************** FUNCTION DEFINITIONS ***********************/ -void sha256_transform(SHA256_CTX *ctx, const BYTE data[]) +void sha256_block_data_order (uint32_t *ctx, const void *in, size_t num); + +int sha256_init(struct sha256_state *sctx) { - WORD a, b, c, d, e, f, g, h, i, j, t1, t2, m[64]; + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; - for (i = 0, j = 0; i < 16; ++i, j += 4) - m[i] = (data[j] << 24) | (data[j + 1] << 16) | (data[j + 2] << 8) | (data[j + 3]); - for ( ; i < 64; ++i) - m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16]; - - a = ctx->state[0]; - b = ctx->state[1]; - c = ctx->state[2]; - d = ctx->state[3]; - e = ctx->state[4]; - f = ctx->state[5]; - g = ctx->state[6]; - h = ctx->state[7]; - - for (i = 0; i < 64; ++i) { - t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i]; - t2 = EP0(a) + MAJ(a,b,c); - h = g; - g = f; - f = e; - e = d + t1; - d = c; - c = b; - b = a; - a = t1 + t2; - } - - ctx->state[0] += a; - ctx->state[1] += b; - ctx->state[2] += c; - ctx->state[3] += d; - ctx->state[4] += e; - ctx->state[5] += f; - ctx->state[6] += g; - ctx->state[7] += h; + return 0; } -void sha256_init(SHA256_CTX *ctx) +int sha256_update(struct sha256_state *sctx, + const void *data, + size_t len) { - ctx->datalen = 0; - ctx->bitlen = 0; - ctx->state[0] = 0x6a09e667; - ctx->state[1] = 0xbb67ae85; - ctx->state[2] = 0x3c6ef372; - ctx->state[3] = 0xa54ff53a; - ctx->state[4] = 0x510e527f; - ctx->state[5] = 0x9b05688c; - ctx->state[6] = 0x1f83d9ab; - ctx->state[7] = 0x5be0cd19; -} + const u8 *data8 = (const u8 *)data; + unsigned int len32 = (unsigned int)len; + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; -void sha256_update(SHA256_CTX *ctx, const BYTE data[], size_t len) -{ - WORD i; + sctx->count += len32; - for (i = 0; i < len; ++i) { - ctx->data[ctx->datalen] = data[i]; - ctx->datalen++; - if (ctx->datalen == 64) { - sha256_transform(ctx, ctx->data); - ctx->bitlen += 512; - ctx->datalen = 0; + if (unlikely((partial + len32) >= SHA256_BLOCK_SIZE)) { + int blocks; + + if (partial) { + int p = SHA256_BLOCK_SIZE - partial; + + memcpy(sctx->buf + partial, data8, p); + data8 += p; + len32 -= p; + + sha256_block_data_order(sctx->state, sctx->buf, 1); } + + blocks = len32 / SHA256_BLOCK_SIZE; + len32 %= SHA256_BLOCK_SIZE; + + if (blocks) { + sha256_block_data_order(sctx->state, data8, blocks); + data8 += blocks * SHA256_BLOCK_SIZE; + } + partial = 0; } + if (len32) + memcpy(sctx->buf + partial, data8, len32); + + return 0; } -void sha256_final(SHA256_CTX *ctx, BYTE hash[]) +int sha256_finalize(struct sha256_state *sctx) { - WORD i; + const int bit_offset = SHA256_BLOCK_SIZE - sizeof(u64); + u64 *bits = (u64 *)(sctx->buf + bit_offset); + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; - i = ctx->datalen; + sctx->buf[partial++] = 0x80; + if (partial > bit_offset) { + memset(sctx->buf + partial, 0x0, SHA256_BLOCK_SIZE - partial); + partial = 0; - // Pad whatever data is left in the buffer. - if (ctx->datalen < 56) { - ctx->data[i++] = 0x80; - while (i < 56) - ctx->data[i++] = 0x00; - } - else { - ctx->data[i++] = 0x80; - while (i < 64) - ctx->data[i++] = 0x00; - sha256_transform(ctx, ctx->data); - memset(ctx->data, 0, 56); + sha256_block_data_order(sctx->state, sctx->buf, 1); } - // Append to the padding the total message's length in bits and transform. - ctx->bitlen += ctx->datalen * 8; - ctx->data[63] = ctx->bitlen; - ctx->data[62] = ctx->bitlen >> 8; - ctx->data[61] = ctx->bitlen >> 16; - ctx->data[60] = ctx->bitlen >> 24; - ctx->data[59] = ctx->bitlen >> 32; - ctx->data[58] = ctx->bitlen >> 40; - ctx->data[57] = ctx->bitlen >> 48; - ctx->data[56] = ctx->bitlen >> 56; - sha256_transform(ctx, ctx->data); + memset(sctx->buf + partial, 0x0, bit_offset - partial); + *bits = __builtin_bswap64(sctx->count << 3); + sha256_block_data_order(sctx->state, sctx->buf, 1); - // Since this implementation uses little endian byte ordering and SHA uses big endian, - // reverse all the bytes when copying the final state to the output hash. - for (i = 0; i < 4; ++i) { - hash[i] = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff; - hash[i + 4] = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff; - hash[i + 8] = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff; - hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff; - hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff; - hash[i + 20] = (ctx->state[5] >> (24 - i * 8)) & 0x000000ff; - hash[i + 24] = (ctx->state[6] >> (24 - i * 8)) & 0x000000ff; - hash[i + 28] = (ctx->state[7] >> (24 - i * 8)) & 0x000000ff; - } + return 0; } + +int sha256_finish(struct sha256_state *sctx, void *out) +{ + unsigned int digest_size = 32; + u32 *digest = (u32 *)out; + int i; + + // Switch: misalignment shouldn't be a problem here... + for (i = 0; digest_size > 0; i++, digest_size -= sizeof(u32)) + *digest++ = __builtin_bswap32(sctx->state[i]); + + *sctx = (struct sha256_state){}; + return 0; +} + +#ifdef __cplusplus +} +#endif diff --git a/stratosphere/loader/source/sha256.h b/stratosphere/loader/source/sha256.h index 870b1dbe0..cfb09f89c 100644 --- a/stratosphere/loader/source/sha256.h +++ b/stratosphere/loader/source/sha256.h @@ -1,41 +1,36 @@ -/********************************************************************* -* Filename: sha256.h -* Author: Brad Conte (brad AT bradconte.com) -* Copyright: -* Disclaimer: This code is presented "as is" without any guarantees. -* Details: Defines the API for the corresponding SHA1 implementation. -*********************************************************************/ +#pragma once -#if defined (__cplusplus) +/* Based on linux source code */ + +#ifdef __cplusplus extern "C" { #endif -#ifndef SHA256_H -#define SHA256_H -/*************************** HEADER FILES ***************************/ -#include +#include -/****************************** MACROS ******************************/ -#define SHA256_BLOCK_SIZE 32 // SHA256 outputs a 32 byte digest +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 -/**************************** DATA TYPES ****************************/ -typedef unsigned char BYTE; // 8-bit byte -typedef unsigned int WORD; // 32-bit word, change to "long" for 16-bit machines +#define SHA256_H0 0x6a09e667UL +#define SHA256_H1 0xbb67ae85UL +#define SHA256_H2 0x3c6ef372UL +#define SHA256_H3 0xa54ff53aUL +#define SHA256_H4 0x510e527fUL +#define SHA256_H5 0x9b05688cUL +#define SHA256_H6 0x1f83d9abUL +#define SHA256_H7 0x5be0cd19UL -typedef struct { - BYTE data[64]; - WORD datalen; - unsigned long long bitlen; - WORD state[8]; -} SHA256_CTX; +struct sha256_state { + u32 state[SHA256_DIGEST_SIZE / 4]; + u64 count; + u8 buf[SHA256_BLOCK_SIZE]; +}; -/*********************** FUNCTION DECLARATIONS **********************/ -void sha256_init(SHA256_CTX *ctx); -void sha256_update(SHA256_CTX *ctx, const BYTE data[], size_t len); -void sha256_final(SHA256_CTX *ctx, BYTE hash[]); +int sha256_init(struct sha256_state *sctx); +int sha256_update(struct sha256_state *sctx, const void *data, size_t len); +int sha256_finalize(struct sha256_state *sctx); +int sha256_finish(struct sha256_state *sctx, void *out); -#endif // SHA256_H - -#if defined (__cplusplus) +#ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/stratosphere/loader/source/sha256_armv8.s b/stratosphere/loader/source/sha256_armv8.s new file mode 100644 index 000000000..0420d38be --- /dev/null +++ b/stratosphere/loader/source/sha256_armv8.s @@ -0,0 +1,163 @@ +.section .text.sha256_armv8, "ax", %progbits +.align 5 +.arch armv8-a+crypto + +# SHA256 assembly implementation for ARMv8 AArch64 (based on linux source code) + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: + +.Lsha256prolog: + + stp x29, x30, [sp,#-64]! + mov x29, sp + adr x3, .LKConstant256 + str q8, [sp, #16] + ld1 {v16.4s-v19.4s}, [x3], #64 + ld1 {v0.4s}, [x0], #16 + ld1 {v20.4s-v23.4s}, [x3], #64 + add x2, x1, x2, lsl #6 + ld1 {v1.4s}, [x0] + ld1 {v24.4s-v27.4s}, [x3], #64 + sub x0, x0, #16 + str q9, [sp, #32] + str q10, [sp, #48] + ld1 {v28.4s-v31.4s}, [x3], #64 + +.Lsha256loop: + + ld1 {v5.16b-v8.16b}, [x1], #64 + mov v2.16b, v0.16b + mov v3.16b, v1.16b + + rev32 v5.16b, v5.16b + rev32 v6.16b, v6.16b + add v9.4s, v5.4s, v16.4s + rev32 v7.16b, v7.16b + add v10.4s, v6.4s, v17.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su0 v5.4s, v6.4s + rev32 v8.16b, v8.16b + add v9.4s, v7.4s, v18.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + sha256su0 v6.4s, v7.4s + sha256su1 v5.4s, v7.4s, v8.4s + add v10.4s, v8.4s, v19.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su0 v7.4s, v8.4s + sha256su1 v6.4s, v8.4s, v5.4s + add v9.4s, v5.4s, v20.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + sha256su0 v8.4s, v5.4s + sha256su1 v7.4s, v5.4s, v6.4s + add v10.4s, v6.4s, v21.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su0 v5.4s, v6.4s + sha256su1 v8.4s, v6.4s, v7.4s + add v9.4s, v7.4s, v22.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + sha256su0 v6.4s, v7.4s + sha256su1 v5.4s, v7.4s, v8.4s + add v10.4s, v8.4s, v23.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su0 v7.4s, v8.4s + sha256su1 v6.4s, v8.4s, v5.4s + add v9.4s, v5.4s, v24.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + sha256su0 v8.4s, v5.4s + sha256su1 v7.4s, v5.4s, v6.4s + add v10.4s, v6.4s, v25.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su0 v5.4s, v6.4s + sha256su1 v8.4s, v6.4s, v7.4s + add v9.4s, v7.4s, v26.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + sha256su0 v6.4s, v7.4s + sha256su1 v5.4s, v7.4s, v8.4s + add v10.4s, v8.4s, v27.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su0 v7.4s, v8.4s + sha256su1 v6.4s, v8.4s, v5.4s + add v9.4s, v5.4s, v28.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + sha256su0 v8.4s, v5.4s + sha256su1 v7.4s, v5.4s, v6.4s + add v10.4s, v6.4s, v29.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + sha256su1 v8.4s, v6.4s, v7.4s + add v9.4s, v7.4s, v30.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + add v10.4s, v8.4s, v31.4s + mov v4.16b, v2.16b + sha256h q2, q3, v9.4s + sha256h2 q3, q4, v9.4s + mov v4.16b, v2.16b + sha256h q2, q3, v10.4s + sha256h2 q3, q4, v10.4s + cmp x1, x2 + add v1.4s, v1.4s, v3.4s + add v0.4s, v0.4s, v2.4s + b.ne .Lsha256loop + +.Lsha256epilog: + + st1 {v0.4s,v1.4s}, [x0] + ldr q10, [sp, #48] + ldr q9, [sp, #32] + ldr q8, [sp, #16] + ldr x29, [sp], #64 + ret + +.align 5 +.LKConstant256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.size sha256_block_data_order,.-sha256_block_data_order +.align 2 + + +