1
0
Fork 0
mirror of https://github.com/Scandal-UK/Incognito_RCM.git synced 2024-11-26 05:42:25 +00:00

Optimise _gf256_mul_x_le

This commit is contained in:
Dan Ware 2020-12-22 11:07:13 +00:00
parent d09d402460
commit 850e7d93af
2 changed files with 123 additions and 94 deletions

View file

@ -37,19 +37,26 @@ extern sdmmc_storage_t sd_storage;
extern sdmmc_storage_t storage; extern sdmmc_storage_t storage;
extern emmc_part_t *system_part; extern emmc_part_t *system_part;
// Cluster geometry: 0x20 sectors make up one 0x4000-byte cluster.
#define SECTORS_PER_CLUSTER 0x20
#define XTS_CLUSTER_SIZE 0x4000

// Upper bound on the number of clusters kept in the cache.
#define MAX_CLUSTER_CACHE_ENTRIES 128
// Marker stored in cluster_lookup for a cluster that has no cache entry.
#define CLUSTER_LOOKUP_EMPTY_ENTRY 0xFFFFFFFF

// One cached cluster together with its bookkeeping fields.
typedef struct {
	u32 cluster_num;               // index of the cluster in the partition
	u32 visit_count;               // used for debugging/access analysis
	u8  dirty;                     // has been modified without writeback flag
	u8  align[7];                  // pads the fields above to 16 bytes
	u8  cluster[XTS_CLUSTER_SIZE]; // the cached cluster itself
} cluster_cache_t;

// The cache entries start at RAM_DISK_ADDR...
static cluster_cache_t *cluster_cache = (cluster_cache_t *)RAM_DISK_ADDR;
// ...and the per-cluster lookup table (cluster number -> cache slot) follows them directly.
u32 *cluster_lookup = (u32 *)(RAM_DISK_ADDR + MAX_CLUSTER_CACHE_ENTRIES * sizeof(cluster_cache_t));
// Next free slot in cluster_cache, i.e. the number of entries currently in use.
u32 cluster_cache_index = 0;

// Scratch buffer a whole cluster is read into before it is copied into the cache.
u8 *emmc_buffer = (u8 *)(MIXD_BUF_ALIGNED + 0x100000);

// Set to make the next disk_read on drive 1 reset the lookup table and the cache index.
bool clear_cluster_cache = false;
// Set to stop new clusters from being added; entries already cached are still served.
bool lock_cluster_cache = false;
DSTATUS disk_status ( DSTATUS disk_status (
BYTE pdrv /* Physical drive number to identify the drive */ BYTE pdrv /* Physical drive number to identify the drive */
@ -65,21 +72,23 @@ DSTATUS disk_initialize (
return 0; return 0;
} }
// Shifts the 16-byte block left by one bit, in place, treating it as one
// little-endian 128-bit value. If a bit falls off the top, 0x87 is XORed into
// the lowest byte. The XTS code calls this to step the tweak from one AES
// block to the next.
//
// block: pointer to the 16 bytes to update.
//
// The work is done on four 32-bit words instead of sixteen bytes to reduce the
// number of loop iterations. The words are loaded and stored with memcpy
// rather than through a u32 pointer cast, so the caller may pass a plain u8
// array of any alignment without breaking the effective-type (strict
// aliasing) rules.
//
// NOTE: the word-wise form matches the byte-wise one only on a little-endian CPU.
static inline void _gf256_mul_x_le(void *block)
{
	u32 words[4];
	u32 carry = 0;

	memcpy(words, block, sizeof(words));

	for (u32 i = 0; i < 4; i++)
	{
		u32 w = words[i];
		words[i] = (w << 1) | carry;
		carry = w >> 31; // top bit of this word becomes the low bit of the next
	}

	// A bit shifted out of the last word is folded back into the low byte
	if (carry)
		words[0] ^= 0x87;

	memcpy(block, words, sizeof(words));
}
static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u64 sec, void *dst, void *src, u32 secsize) { static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u64 sec, void *dst, void *src, u32 secsize)
{
int res = 0; int res = 0;
u8 *temptweak = (u8 *)malloc(0x10); u8 *temptweak = (u8 *)malloc(0x10);
u32 *pdst = (u32 *)dst; u32 *pdst = (u32 *)dst;
@ -95,13 +104,19 @@ static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_twe
goto out; goto out;
} }
for (u32 i = 0; i < tweak_exp * 0x20; i++) // tweak_exp allows us to use a saved tweak to reduce _gf256_mul_x_le calls
for (u32 i = 0; i < tweak_exp * SECTORS_PER_CLUSTER; i++)
_gf256_mul_x_le(tweak); _gf256_mul_x_le(tweak);
memcpy(temptweak, tweak, 0x10); memcpy(temptweak, tweak, 0x10);
// The reference implementation in IEEE P1619 encrypts once per AES block
// In this environment, doing so produces a lot of overhead
// Instead, we perform one single AES-ECB operation between the sector xors
// We are assuming a 0x10-aligned sector size in this implementation. // We are assuming a 0x10-aligned sector size in this implementation.
for (u32 i = 0; i < secsize / 0x10; i++) { for (u32 i = 0; i < secsize / 0x10; i++)
{
for (u32 j = 0; j < 4; j++) for (u32 j = 0; j < 4; j++)
pdst[j] = psrc[j] ^ ptweak[j]; pdst[j] = psrc[j] ^ ptweak[j];
_gf256_mul_x_le(tweak); _gf256_mul_x_le(tweak);
@ -114,7 +129,8 @@ static inline int _emmc_xts(u32 ks1, u32 ks2, u32 enc, u8 *tweak, bool regen_twe
pdst = (u32 *)dst; pdst = (u32 *)dst;
memcpy(tweak, temptweak, 0x10); memcpy(tweak, temptweak, 0x10);
for (u32 i = 0; i < secsize / 0x10; i++) { for (u32 i = 0; i < secsize / 0x10; i++)
{
for (u32 j = 0; j < 4; j++) for (u32 j = 0; j < 4; j++)
pdst[j] = pdst[j] ^ ptweak[j]; pdst[j] = pdst[j] ^ ptweak[j];
_gf256_mul_x_le(tweak); _gf256_mul_x_le(tweak);
@ -138,74 +154,87 @@ DRESULT disk_read (
switch (pdrv) switch (pdrv)
{ {
case 0: case 0:
if (((u32)buff >= DRAM_START) && !((u32)buff % 8))
return sdmmc_storage_read(&sd_storage, sector, count, buff) ? RES_OK : RES_ERROR; return sdmmc_storage_read(&sd_storage, sector, count, buff) ? RES_OK : RES_ERROR;
u8 *buf = (u8 *)SDMMC_UPPER_BUFFER;
if (sdmmc_storage_read(&sd_storage, sector, count, buf))
{
memcpy(buff, buf, 512 * count);
return RES_OK;
}
return RES_ERROR;
case 1:; case 1:;
__attribute__ ((aligned (16))) static u8 tweak[0x10]; __attribute__ ((aligned (16))) static u8 tweak[0x10];
__attribute__ ((aligned (16))) static u64 prev_cluster = -1; __attribute__ ((aligned (16))) static u64 prev_cluster = -1;
__attribute__ ((aligned (16))) static u32 prev_sector = 0; __attribute__ ((aligned (16))) static u32 prev_sector = 0;
bool needs_cache_sector = false;
if (secindex == 0 || clear_sector_cache) { if (cluster_cache_index == 0 || clear_cluster_cache)
clear_sector_cache = false; {
lock_sector_cache = false; // memset gets optimized out...
secindex = 0; // for (u32 i = 0; i < (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER; i++)
// cluster_lookup[i] = CLUSTER_LOOKUP_EMPTY_ENTRY;
memset(cluster_lookup, -1, (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER * 4);
cluster_cache_index = 0;
clear_cluster_cache = false;
lock_cluster_cache = false;
} }
u32 s = 0; u32 cluster = sector / SECTORS_PER_CLUSTER;
// only attempt to cache single-sector reads as these are most likely to be repeated (eg. rereading FAT) u32 aligned_sector = cluster * SECTORS_PER_CLUSTER;
if (!lock_sector_cache && count == 1) { u32 sector_index_in_cluster = sector % SECTORS_PER_CLUSTER;
for ( ; s < secindex; s++) { u32 cluster_lookup_index = cluster_lookup[cluster];
if (sector_cache[s].sector == sector) {
sector_cache[s].visit_count++; if (cluster_lookup_index != CLUSTER_LOOKUP_EMPTY_ENTRY)
memcpy(buff, sector_cache[s].cached_sector, 0x200); {
memcpy(tweak, sector_cache[s].tweak, 0x10); memcpy(buff, cluster_cache[cluster_lookup_index].cluster + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, count * NX_EMMC_BLOCKSIZE);
prev_sector = sector; cluster_cache[cluster_lookup_index].visit_count++;
prev_cluster = sector / 0x20; prev_sector = sector + count - 1;
prev_cluster = cluster;
return RES_OK; return RES_OK;
} }
}
// add to cache // Only cache single-sector reads as these are most likely to be repeated (eg. boot block, FAT directory tables)
if (s == secindex && s < MAX_SEC_CACHE_ENTRIES) { if (count == 1 &&
sector_cache[s].sector = sector; !lock_cluster_cache &&
sector_cache[s].visit_count++; cluster_cache_index < MAX_CLUSTER_CACHE_ENTRIES &&
needs_cache_sector = true; cluster_lookup_index == CLUSTER_LOOKUP_EMPTY_ENTRY)
secindex++; {
} cluster_cache[cluster_cache_index].cluster_num = cluster;
cluster_cache[cluster_cache_index].visit_count = 1;
cluster_cache[cluster_cache_index].dirty = 0;
cluster_lookup[cluster] = cluster_cache_index;
// Read and decrypt the whole cluster the sector resides in
if (!nx_emmc_part_read(&storage, system_part, aligned_sector, SECTORS_PER_CLUSTER, emmc_buffer))
return RES_ERROR;
_emmc_xts(9, 8, 0, tweak, true, 0, cluster, emmc_buffer, emmc_buffer, XTS_CLUSTER_SIZE);
memcpy(cluster_cache[cluster_cache_index].cluster, emmc_buffer, XTS_CLUSTER_SIZE);
memcpy(buff, emmc_buffer + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, NX_EMMC_BLOCKSIZE);
prev_cluster = -1;
prev_sector = 0;
cluster_cache_index++;
return RES_OK;
} }
if (nx_emmc_part_read(&storage, system_part, sector, count, buff)) { if (!nx_emmc_part_read(&storage, system_part, sector, count, buff))
return RES_ERROR;
u32 tweak_exp = 0; u32 tweak_exp = 0;
bool regen_tweak = true; bool regen_tweak = true;
if (prev_cluster != sector / 0x20) { // sector in different cluster than last read if (prev_cluster != cluster)
prev_cluster = sector / 0x20; { // Sector is in different cluster than last read
tweak_exp = sector % 0x20; prev_cluster = cluster;
} else if (sector > prev_sector) { // sector in same cluster and past last sector tweak_exp = sector_index_in_cluster;
}
else if (sector > prev_sector)
{ // Sector is in same cluster and past last sector
// Calculates the new tweak using the saved one, reducing expensive _gf256_mul_x_le calls
tweak_exp = sector - prev_sector - 1; tweak_exp = sector - prev_sector - 1;
regen_tweak = false; regen_tweak = false;
} else { // sector in same cluster and before or same as last sector }
tweak_exp = sector % 0x20; else
{ // Sector is in same cluster and before or same as last sector
tweak_exp = sector_index_in_cluster;
} }
// fatfs will never pull more than a cluster // FatFs will never pull more than one 16K cluster, which is the same as the crypto 'sector' size
_emmc_xts(9, 8, 0, tweak, regen_tweak, tweak_exp, prev_cluster, buff, buff, count * 0x200); _emmc_xts(9, 8, 0, tweak, regen_tweak, tweak_exp, prev_cluster, buff, buff, count * NX_EMMC_BLOCKSIZE);
if (needs_cache_sector) {
memcpy(sector_cache[s].cached_sector, buff, 0x200);
memcpy(sector_cache[s].tweak, tweak, 0x10);
}
prev_sector = sector + count - 1; prev_sector = sector + count - 1;
return RES_OK; return RES_OK;
} }
return RES_ERROR;
}
return RES_ERROR; return RES_ERROR;
} }
@ -216,15 +245,15 @@ DRESULT disk_write (
UINT count /* Number of sectors to write */ UINT count /* Number of sectors to write */
) )
{ {
if (pdrv == 1) switch (pdrv)
return RES_WRPRT; {
case 0:
if (((u32)buff >= DRAM_START) && !((u32)buff % 8))
return sdmmc_storage_write(&sd_storage, sector, count, (void *)buff) ? RES_OK : RES_ERROR; return sdmmc_storage_write(&sd_storage, sector, count, (void *)buff) ? RES_OK : RES_ERROR;
u8 *buf = (u8 *)SDMMC_UPPER_BUFFER; //TODO: define this somewhere.
memcpy(buf, buff, 512 * count); case 1:
if (sdmmc_storage_write(&sd_storage, sector, count, buf)) return RES_WRPRT;
return RES_OK; }
return RES_ERROR; return RES_ERROR;
} }

View file

@ -55,14 +55,14 @@ static void _gf256_mul_x(void *block)
static void _gf256_mul_x_le(void *block) static void _gf256_mul_x_le(void *block)
{ {
u8 *pdata = (u8 *)block; u32 *pdata = (u32 *)block;
u32 carry = 0; u32 carry = 0;
for (u32 i = 0; i < 0x10; i++) for (u32 i = 0; i < 4; i++)
{ {
u8 b = pdata[i]; u32 b = pdata[i];
pdata[i] = (b << 1) | carry; pdata[i] = (b << 1) | carry;
carry = b >> 7; carry = b >> 31;
} }
if (carry) if (carry)