From 850e7d93af208b21eb5b8367d6cb080ce27582aa Mon Sep 17 00:00:00 2001 From: Dan Ware Date: Tue, 22 Dec 2020 11:07:13 +0000 Subject: [PATCH] Optimise _gf256_mul_x_le --- source/libs/fatfs/diskio.c | 199 +++++++++++++++++++++---------------- source/sec/se.c | 18 ++-- 2 files changed, 123 insertions(+), 94 deletions(-) diff --git a/source/libs/fatfs/diskio.c b/source/libs/fatfs/diskio.c index 2cfacd7..c0112dc 100644 --- a/source/libs/fatfs/diskio.c +++ b/source/libs/fatfs/diskio.c @@ -37,19 +37,26 @@ extern sdmmc_storage_t sd_storage; extern sdmmc_storage_t storage; extern emmc_part_t *system_part; -typedef struct { - u32 sector; - u32 visit_count; - u8 align[8]; - u8 tweak[0x10]; - u8 cached_sector[0x200]; -} sector_cache_t; +#define MAX_CLUSTER_CACHE_ENTRIES 128 +#define CLUSTER_LOOKUP_EMPTY_ENTRY 0xFFFFFFFF +#define XTS_CLUSTER_SIZE 0x4000 +#define SECTORS_PER_CLUSTER 0x20 -#define MAX_SEC_CACHE_ENTRIES 256 -static sector_cache_t *sector_cache = (sector_cache_t *)(MIXD_BUF_ALIGNED + 0x100000); //NULL; -u32 secindex = 0; -bool clear_sector_cache = false; -bool lock_sector_cache = false; +typedef struct { + u32 cluster_num; // index of the cluster in the partition + u32 visit_count; // used for debugging/access analysis + u8 dirty; // flag: cluster has been modified without writeback + u8 align[7]; + u8 cluster[XTS_CLUSTER_SIZE]; // the cached cluster itself +} cluster_cache_t; + +static cluster_cache_t *cluster_cache = (cluster_cache_t *)RAM_DISK_ADDR; +u32 cluster_cache_index = 0; +u32 *cluster_lookup = (u32 *)(RAM_DISK_ADDR + MAX_CLUSTER_CACHE_ENTRIES * sizeof(cluster_cache_t)); +u8 *emmc_buffer = (u8 *)(MIXD_BUF_ALIGNED + 0x100000); + +bool clear_cluster_cache = false; +bool lock_cluster_cache = false; DSTATUS disk_status ( BYTE pdrv /* Physical drive number to identify the drive */ @@ -65,21 +72,23 @@ DSTATUS disk_initialize ( return 0; } -static inline void _gf256_mul_x_le(void *block) { - u8 *pdata = (u8 *)block; +static inline void _gf256_mul_x_le(void *block) +{ + 
u32 *pdata = (u32 *)block; /* the 16-byte block is viewed as four 32-bit words; this matches the former byte-wise loop only on a little-endian CPU and needs a 4-byte-aligned block */ u32 carry = 0; - for (u32 i = 0; i < 0x10; i++) { - u8 b = pdata[i]; + for (u32 i = 0; i < 4; i++) { + u32 b = pdata[i]; pdata[i] = (b << 1) | carry; - carry = b >> 7; + carry = b >> 31; } if (carry) pdata[0x0] ^= 0x87; } -static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u64 sec, void *dst, void *src, u32 secsize) { +static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u64 sec, void *dst, void *src, u32 secsize) +{ int res = 0; u8 *temptweak = (u8 *)malloc(0x10); u32 *pdst = (u32 *)dst; @@ -95,13 +104,19 @@ static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_twe goto out; } - for (u32 i = 0; i < tweak_exp * 0x20; i++) + // tweak_exp allows us to use a saved tweak to reduce _gf256_mul_x_le calls + for (u32 i = 0; i < tweak_exp * SECTORS_PER_CLUSTER; i++) /* NOTE(review): the loops below take one tweak step per 0x10 bytes, so 0x20 steps skip 0x200 bytes (one sector, going by the old `count * 0x200`); that factor only happens to equal SECTORS_PER_CLUSTER - confirm the intended constant */ _gf256_mul_x_le(tweak); memcpy(temptweak, tweak, 0x10); - //We are assuming a 0x10-aligned sector size in this implementation. - for (u32 i = 0; i < secsize / 0x10; i++) { + // The reference implementation in IEEE P1619 encrypts once per AES block + // In this environment, doing so produces a lot of overhead + // Instead, we perform one single AES-ECB operation between the sector xors + + // We are assuming a 0x10-aligned sector size in this implementation. 
+ for (u32 i = 0; i < secsize / 0x10; i++) + { for (u32 j = 0; j < 4; j++) pdst[j] = psrc[j] ^ ptweak[j]; _gf256_mul_x_le(tweak); @@ -114,7 +129,8 @@ static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_twe pdst = (u32 *)dst; memcpy(tweak, temptweak, 0x10); - for (u32 i = 0; i < secsize / 0x10; i++) { + for (u32 i = 0; i < secsize / 0x10; i++) + { for (u32 j = 0; j < 4; j++) pdst[j] = pdst[j] ^ ptweak[j]; _gf256_mul_x_le(tweak); @@ -138,74 +154,87 @@ DRESULT disk_read ( switch (pdrv) { case 0: - if (((u32)buff >= DRAM_START) && !((u32)buff % 8)) - return sdmmc_storage_read(&sd_storage, sector, count, buff) ? RES_OK : RES_ERROR; - u8 *buf = (u8 *)SDMMC_UPPER_BUFFER; - if (sdmmc_storage_read(&sd_storage, sector, count, buf)) - { - memcpy(buff, buf, 512 * count); - return RES_OK; - } - return RES_ERROR; + return sdmmc_storage_read(&sd_storage, sector, count, buff) ? RES_OK : RES_ERROR; case 1:; __attribute__ ((aligned (16))) static u8 tweak[0x10]; __attribute__ ((aligned (16))) static u64 prev_cluster = -1; __attribute__ ((aligned (16))) static u32 prev_sector = 0; - bool needs_cache_sector = false; - if (secindex == 0 || clear_sector_cache) { - clear_sector_cache = false; - lock_sector_cache = false; - secindex = 0; + if (cluster_cache_index == 0 || clear_cluster_cache) + { + // memset gets optimized out... + // for (u32 i = 0; i < (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER; i++) + // cluster_lookup[i] = CLUSTER_LOOKUP_EMPTY_ENTRY; + memset(cluster_lookup, -1, (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER * 4); + cluster_cache_index = 0; + clear_cluster_cache = false; + lock_cluster_cache = false; } - u32 s = 0; - // only attempt to cache single-sector reads as these are most likely to be repeated (eg. 
rereading FAT) - if (!lock_sector_cache && count == 1) { - for ( ; s < secindex; s++) { - if (sector_cache[s].sector == sector) { - sector_cache[s].visit_count++; - memcpy(buff, sector_cache[s].cached_sector, 0x200); - memcpy(tweak, sector_cache[s].tweak, 0x10); - prev_sector = sector; - prev_cluster = sector / 0x20; - return RES_OK; - } - } - // add to cache - if (s == secindex && s < MAX_SEC_CACHE_ENTRIES) { - sector_cache[s].sector = sector; - sector_cache[s].visit_count++; - needs_cache_sector = true; - secindex++; - } - } + u32 cluster = sector / SECTORS_PER_CLUSTER; + u32 aligned_sector = cluster * SECTORS_PER_CLUSTER; + u32 sector_index_in_cluster = sector % SECTORS_PER_CLUSTER; + u32 cluster_lookup_index = cluster_lookup[cluster]; - if (nx_emmc_part_read(&storage, system_part, sector, count, buff)) { - u32 tweak_exp = 0; - bool regen_tweak = true; - if (prev_cluster != sector / 0x20) { // sector in different cluster than last read - prev_cluster = sector / 0x20; - tweak_exp = sector % 0x20; - } else if (sector > prev_sector) { // sector in same cluster and past last sector - tweak_exp = sector - prev_sector - 1; - regen_tweak = false; - } else { // sector in same cluster and before or same as last sector - tweak_exp = sector % 0x20; - } - - // fatfs will never pull more than a cluster - _emmc_xts(9, 8, 0, tweak, regen_tweak, tweak_exp, prev_cluster, buff, buff, count * 0x200); - if (needs_cache_sector) { - memcpy(sector_cache[s].cached_sector, buff, 0x200); - memcpy(sector_cache[s].tweak, tweak, 0x10); - } + if (cluster_lookup_index != CLUSTER_LOOKUP_EMPTY_ENTRY) + { + memcpy(buff, cluster_cache[cluster_lookup_index].cluster + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, count * NX_EMMC_BLOCKSIZE); + cluster_cache[cluster_lookup_index].visit_count++; prev_sector = sector + count - 1; + prev_cluster = cluster; /* NOTE(review): tweak is left untouched on a cache hit while prev_cluster/prev_sector are updated; verify a later uncached read of this cluster (e.g. after clear_cluster_cache) cannot take the regen_tweak = false path with a stale tweak */ return RES_OK; } - return RES_ERROR; + + // Only cache single-sector reads as these are most likely to be repeated (eg. 
boot block, FAT directory tables) + if (count == 1 && + !lock_cluster_cache && + cluster_cache_index < MAX_CLUSTER_CACHE_ENTRIES && + cluster_lookup_index == CLUSTER_LOOKUP_EMPTY_ENTRY) + { + cluster_cache[cluster_cache_index].cluster_num = cluster; + cluster_cache[cluster_cache_index].visit_count = 1; + cluster_cache[cluster_cache_index].dirty = 0; + cluster_lookup[cluster] = cluster_cache_index; /* NOTE(review): the lookup entry is published before the read below; if that read fails, it keeps pointing at this unfilled slot */ + + // Read and decrypt the whole cluster the sector resides in + if (!nx_emmc_part_read(&storage, system_part, aligned_sector, SECTORS_PER_CLUSTER, emmc_buffer)) + return RES_ERROR; + _emmc_xts(9, 8, 0, tweak, true, 0, cluster, emmc_buffer, emmc_buffer, XTS_CLUSTER_SIZE); + memcpy(cluster_cache[cluster_cache_index].cluster, emmc_buffer, XTS_CLUSTER_SIZE); + memcpy(buff, emmc_buffer + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, NX_EMMC_BLOCKSIZE); + prev_cluster = -1; + prev_sector = 0; + cluster_cache_index++; + return RES_OK; + } + + if (!nx_emmc_part_read(&storage, system_part, sector, count, buff)) + return RES_ERROR; + u32 tweak_exp = 0; + bool regen_tweak = true; + if (prev_cluster != cluster) + { // Sector is in different cluster than last read + prev_cluster = cluster; + tweak_exp = sector_index_in_cluster; + } + else if (sector > prev_sector) + { // Sector is in same cluster and past last sector + // Calculates the new tweak using the saved one, reducing expensive _gf256_mul_x_le calls + tweak_exp = sector - prev_sector - 1; + regen_tweak = false; + } + else + { // Sector is in same cluster and before or same as last sector + tweak_exp = sector_index_in_cluster; + } + + // FatFs will never pull more than one 16 KiB cluster (XTS_CLUSTER_SIZE, 0x4000 bytes), which is the same as the crypto 'sector' size + _emmc_xts(9, 8, 0, tweak, regen_tweak, tweak_exp, prev_cluster, buff, buff, count * NX_EMMC_BLOCKSIZE); + prev_sector = sector + count - 1; + return RES_OK; } + return RES_ERROR; } @@ -216,15 +245,15 @@ DRESULT disk_write ( UINT count /* Number of sectors to write */ ) { - if (pdrv == 1) - return 
RES_WRPRT; - - if (((u32)buff >= DRAM_START) && !((u32)buff % 8)) + switch (pdrv) + { + case 0: return sdmmc_storage_write(&sd_storage, sector, count, (void *)buff) ? RES_OK : RES_ERROR; - u8 *buf = (u8 *)SDMMC_UPPER_BUFFER; //TODO: define this somewhere. - memcpy(buf, buff, 512 * count); - if (sdmmc_storage_write(&sd_storage, sector, count, buf)) - return RES_OK; + + case 1: + return RES_WRPRT; + } + return RES_ERROR; } diff --git a/source/sec/se.c b/source/sec/se.c index bf1df30..c7730de 100644 --- a/source/sec/se.c +++ b/source/sec/se.c @@ -55,18 +55,18 @@ static void _gf256_mul_x(void *block) static void _gf256_mul_x_le(void *block) { - u8 *pdata = (u8 *)block; - u32 carry = 0; + u32 *pdata = (u32 *)block; /* the 16-byte block is viewed as four 32-bit words; this matches the former byte-wise loop only on a little-endian CPU and needs a 4-byte-aligned block */ + u32 carry = 0; - for (u32 i = 0; i < 0x10; i++) + for (u32 i = 0; i < 4; i++) { - u8 b = pdata[i]; - pdata[i] = (b << 1) | carry; - carry = b >> 7; - } + u32 b = pdata[i]; + pdata[i] = (b << 1) | carry; + carry = b >> 31; + } - if (carry) - pdata[0x0] ^= 0x87; + if (carry) + pdata[0x0] ^= 0x87; } static void _se_ll_init(se_ll_t *ll, u32 addr, u32 size)