2
1
Fork 0
mirror of https://github.com/yuzu-emu/yuzu.git synced 2024-07-04 23:31:19 +01:00

astc_decoder: Compute offset swizzles in-shader

Removes the dependency on the swizzle table buffer and on the bytes_per_block_log2 uniform, which is constant for all ASTC texture sizes.
This commit is contained in:
ameerj 2021-07-31 22:24:15 -04:00
parent b2862e4772
commit 5ab8053511
4 changed files with 25 additions and 109 deletions

View file

@ -10,8 +10,7 @@
#define END_PUSH_CONSTANTS }; #define END_PUSH_CONSTANTS };
#define UNIFORM(n) #define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0 #define BINDING_INPUT_BUFFER 0
#define BINDING_SWIZZLE_BUFFER 1 #define BINDING_OUTPUT_IMAGE 1
#define BINDING_OUTPUT_IMAGE 2
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
@ -19,7 +18,6 @@
#define END_PUSH_CONSTANTS #define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform #define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0 #define BINDING_INPUT_BUFFER 0
#define BINDING_SWIZZLE_BUFFER 1
#define BINDING_OUTPUT_IMAGE 0 #define BINDING_OUTPUT_IMAGE 0
#endif #endif
@ -28,13 +26,11 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
BEGIN_PUSH_CONSTANTS BEGIN_PUSH_CONSTANTS
UNIFORM(1) uvec2 block_dims; UNIFORM(1) uvec2 block_dims;
UNIFORM(2) uint layer_stride;
UNIFORM(2) uint bytes_per_block_log2; UNIFORM(3) uint block_size;
UNIFORM(3) uint layer_stride; UNIFORM(4) uint x_shift;
UNIFORM(4) uint block_size; UNIFORM(5) uint block_height;
UNIFORM(5) uint x_shift; UNIFORM(6) uint block_height_mask;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
END_PUSH_CONSTANTS END_PUSH_CONSTANTS
struct EncodingData { struct EncodingData {
@ -53,35 +49,17 @@ struct TexelWeightParams {
bool void_extent_hdr; bool void_extent_hdr;
}; };
// Swizzle data
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
uvec4 astc_data[]; uvec4 astc_data[];
}; };
layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6; const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3; const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0; const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); const uint BYTES_PER_BLOCK_LOG2 = 4;
const int BLOCK_SIZE_IN_BYTES = 16;
const int BLOCK_INFO_ERROR = 0;
const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
const int BLOCK_INFO_NORMAL = 3;
const int JUST_BITS = 0; const int JUST_BITS = 0;
const int QUINT = 1; const int QUINT = 1;
@ -168,8 +146,10 @@ int texel_vector_index = 0;
uint unquantized_texel_weights[2][144]; uint unquantized_texel_weights[2][144];
uint SwizzleOffset(uvec2 pos) { uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK; uint x = pos.x;
return swizzle_table[pos.y * 64 + pos.x]; uint y = pos.y;
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
(y % 2) * 16 + (x % 16);
} }
// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
@ -1253,7 +1233,7 @@ void DecompressBlock(ivec3 coord) {
void main() { void main() {
uvec3 pos = gl_GlobalInvocationID; uvec3 pos = gl_GlobalInvocationID;
pos.x <<= bytes_per_block_log2; pos.x <<= BYTES_PER_BLOCK_LOG2;
// Read as soon as possible due to its latency // Read as soon as possible due to its latency
const uint swizzle = SwizzleOffset(pos.xy); const uint swizzle = SwizzleOffset(pos.xy);

View file

@ -68,7 +68,6 @@ UtilShaders::~UtilShaders() = default;
void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) { std::span<const VideoCommon::SwizzleParameters> swizzles) {
static constexpr GLuint BINDING_INPUT_BUFFER = 0; static constexpr GLuint BINDING_INPUT_BUFFER = 0;
static constexpr GLuint BINDING_SWIZZLE_BUFFER = 1;
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
const Extent2D tile_size{ const Extent2D tile_size{
@ -76,10 +75,9 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
.height = VideoCore::Surface::DefaultBlockHeight(image.info.format), .height = VideoCore::Surface::DefaultBlockHeight(image.info.format),
}; };
program_manager.BindComputeProgram(astc_decoder_program.handle); program_manager.BindComputeProgram(astc_decoder_program.handle);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
glUniform2ui(1, tile_size.width, tile_size.height); glUniform2ui(1, tile_size.width, tile_size.height);
// Ensure buffer data is valid before dispatching // Ensure buffer data is valid before dispatching
glFlush(); glFlush();
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
@ -90,13 +88,13 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
ASSERT(params.bytes_per_block_log2 == 4);
glUniform1ui(2, params.bytes_per_block_log2); glUniform1ui(2, params.layer_stride);
glUniform1ui(3, params.layer_stride); glUniform1ui(3, params.block_size);
glUniform1ui(4, params.block_size); glUniform1ui(4, params.x_shift);
glUniform1ui(5, params.x_shift); glUniform1ui(5, params.block_height);
glUniform1ui(6, params.block_height); glUniform1ui(6, params.block_height_mask);
glUniform1ui(7, params.block_height_mask);
// ASTC texture data // ASTC texture data
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,

View file

@ -34,9 +34,8 @@ using Tegra::Texture::SWIZZLE_TABLE;
namespace { namespace {
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1; constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1;
constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2; constexpr size_t ASTC_NUM_BINDINGS = 2;
constexpr size_t ASTC_NUM_BINDINGS = 3;
template <size_t size> template <size_t size>
inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{
@ -80,13 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCR
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr, .pImmutableSamplers = nullptr,
}, },
{
.binding = ASTC_BINDING_SWIZZLE_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{ {
.binding = ASTC_BINDING_OUTPUT_IMAGE, .binding = ASTC_BINDING_OUTPUT_IMAGE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@ -98,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCR
constexpr DescriptorBankInfo ASTC_BANK_INFO{ constexpr DescriptorBankInfo ASTC_BANK_INFO{
.uniform_buffers = 0, .uniform_buffers = 0,
.storage_buffers = 2, .storage_buffers = 1,
.texture_buffers = 0, .texture_buffers = 0,
.image_buffers = 0, .image_buffers = 0,
.textures = 0, .textures = 0,
.images = 1, .images = 1,
.score = 3, .score = 2,
}; };
constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{
@ -125,14 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
.offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry),
}, },
{
.dstBinding = ASTC_BINDING_SWIZZLE_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{ {
.dstBinding = ASTC_BINDING_OUTPUT_IMAGE, .dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
.dstArrayElement = 0, .dstArrayElement = 0,
@ -145,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
struct AstcPushConstants { struct AstcPushConstants {
std::array<u32, 2> blocks_dims; std::array<u32, 2> blocks_dims;
u32 bytes_per_block_log2;
u32 layer_stride; u32 layer_stride;
u32 block_size; u32 block_size;
u32 x_shift; u32 x_shift;
@ -336,42 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
ASTCDecoderPass::~ASTCDecoderPass() = default; ASTCDecoderPass::~ASTCDecoderPass() = default;
void ASTCDecoderPass::MakeDataBuffer() {
constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE);
data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = TOTAL_BUFFER_SIZE,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload);
const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload);
std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE));
scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer,
TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) {
static constexpr VkMemoryBarrier write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
};
const VkBufferCopy copy{
.srcOffset = offset,
.dstOffset = 0,
.size = TOTAL_BUFFER_SIZE,
};
cmdbuf.CopyBuffer(src, dst, copy);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0, write_barrier);
});
}
void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) { std::span<const VideoCommon::SwizzleParameters> swizzles) {
using namespace VideoCommon::Accelerated; using namespace VideoCommon::Accelerated;
@ -380,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
VideoCore::Surface::DefaultBlockHeight(image.info.format), VideoCore::Surface::DefaultBlockHeight(image.info.format),
}; };
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
if (!data_buffer) {
MakeDataBuffer();
}
const VkPipeline vk_pipeline = *pipeline; const VkPipeline vk_pipeline = *pipeline;
const VkImageAspectFlags aspect_mask = image.AspectMask(); const VkImageAspectFlags aspect_mask = image.AspectMask();
const VkImage vk_image = image.Handle(); const VkImage vk_image = image.Handle();
@ -421,7 +365,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(map.buffer, input_offset, update_descriptor_queue.AddBuffer(map.buffer, input_offset,
image.guest_size_bytes - swizzle.buffer_offset); image.guest_size_bytes - swizzle.buffer_offset);
update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE));
update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level));
const void* const descriptor_data{update_descriptor_queue.UpdateData()}; const void* const descriptor_data{update_descriptor_queue.UpdateData()};
@ -429,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
ASSERT(params.bytes_per_block_log2 == 4);
scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims, scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims,
params, descriptor_data](vk::CommandBuffer cmdbuf) { params, descriptor_data](vk::CommandBuffer cmdbuf) {
const AstcPushConstants uniforms{ const AstcPushConstants uniforms{
.blocks_dims = block_dims, .blocks_dims = block_dims,
.bytes_per_block_log2 = params.bytes_per_block_log2,
.layer_stride = params.layer_stride, .layer_stride = params.layer_stride,
.block_size = params.block_size, .block_size = params.block_size,
.x_shift = params.x_shift, .x_shift = params.x_shift,

View file

@ -96,15 +96,10 @@ public:
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
private: private:
void MakeDataBuffer();
VKScheduler& scheduler; VKScheduler& scheduler;
StagingBufferPool& staging_buffer_pool; StagingBufferPool& staging_buffer_pool;
VKUpdateDescriptorQueue& update_descriptor_queue; VKUpdateDescriptorQueue& update_descriptor_queue;
MemoryAllocator& memory_allocator; MemoryAllocator& memory_allocator;
vk::Buffer data_buffer;
MemoryCommit data_buffer_commit;
}; };
} // namespace Vulkan } // namespace Vulkan