From fe931ac9761a813c8e7d195cf99bf68ff324839c Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Tue, 28 Apr 2020 02:14:11 -0300 Subject: [PATCH] {maxwell_3d,buffer_cache}: Implement memory barriers using 3D registers Drop MemoryBarrier from the buffer cache and use Maxwell3D's register WaitForIdle. To implement this on OpenGL we just call glMemoryBarrier with the necessary bits. Vulkan lacks this synchronization primitive, so we set an event and immediately wait for it. This is not a pretty solution, but it's what Vulkan can do without submitting the current command buffer to the queue (which ends up being more expensive on the CPU). --- src/video_core/buffer_cache/buffer_cache.h | 6 ----- src/video_core/engines/maxwell_3d.cpp | 4 ++++ src/video_core/engines/maxwell_3d.h | 5 ++++- src/video_core/rasterizer_interface.h | 3 +++ .../renderer_opengl/gl_buffer_cache.cpp | 4 ---- .../renderer_opengl/gl_buffer_cache.h | 2 -- .../renderer_opengl/gl_rasterizer.cpp | 11 ++++++++++ .../renderer_opengl/gl_rasterizer.h | 1 + .../renderer_vulkan/vk_buffer_cache.h | 2 -- .../renderer_vulkan/vk_rasterizer.cpp | 22 ++++++++++++++++++- .../renderer_vulkan/vk_rasterizer.h | 2 ++ src/video_core/renderer_vulkan/wrapper.cpp | 1 + src/video_core/renderer_vulkan/wrapper.h | 10 +++++++++ 13 files changed, 57 insertions(+), 16 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 510f11089f..c86e914bed 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -89,10 +89,6 @@ public: map->MarkAsWritten(true); MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1); } - } else { - if (map->IsWritten()) { - WriteBarrier(); - } } return {ToHandle(block), static_cast(block->GetOffset(cpu_addr))}; @@ -254,8 +250,6 @@ protected: virtual BufferType ToHandle(const OwnerBuffer& storage) = 0; - virtual void WriteBarrier() = 0; - virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0; virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size, diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 39e3b66a23..7db055ea0d 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -184,6 +184,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { } switch (method) { + case MAXWELL3D_REG_INDEX(wait_for_idle): { + rasterizer.WaitForIdle(); + break; + } case MAXWELL3D_REG_INDEX(shadow_ram_control): { shadow_state.shadow_ram_control = static_cast(method_call.argument); break; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 3dfba8197d..b026afabe5 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -709,7 +709,9 @@ public: union { struct { - INSERT_UNION_PADDING_WORDS(0x45); + INSERT_UNION_PADDING_WORDS(0x44); + + u32 wait_for_idle; struct { u32 upload_address; @@ -1535,6 +1537,7 @@ private: static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") +ASSERT_REG_POSITION(wait_for_idle, 0x44); ASSERT_REG_POSITION(macros, 0x45); ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(upload, 0x60); diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 603f619523..3cbdac8e70 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -80,6 +80,9 @@ public: /// and invalidated virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; + /// Notify the host renderer to wait for previous primitive and compute operations. + virtual void WaitForIdle() = 0; + /// Notify the rasterizer to send all written commands to the host GPU. virtual void FlushCommands() = 0; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 4efce0de77..d2cab50bd9 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -51,10 +51,6 @@ Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { return std::make_shared(cpu_addr, size); } -void OGLBufferCache::WriteBarrier() { - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); -} - GLuint OGLBufferCache::ToHandle(const Buffer& buffer) { return buffer->GetHandle(); } diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a748178575..a9e86cfc7c 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -59,8 +59,6 @@ protected: GLuint ToHandle(const Buffer& buffer) override; - void WriteBarrier() override; - void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, const u8* data) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 6fe155bccd..0d07d24227 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -725,6 +725,17 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { InvalidateRegion(addr, size); } +void RasterizerOpenGL::WaitForIdle() { + // Place a barrier on everything that is not framebuffer related. + // This is related to another flag that is not currently implemented. + glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | + GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | + GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | + GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | + GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | + GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); +} + void RasterizerOpenGL::FlushCommands() { // Only flush when we have commands queued to OpenGL. if (num_queued_commands == 0) { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ebd2173eba..a95bc7be27 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -75,6 +75,7 @@ public: void SignalSyncPoint(u32 value) override; void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 3cd2e27741..c05c27b32c 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -55,8 +55,6 @@ public: protected: VkBuffer ToHandle(const Buffer& buffer) override; - void WriteBarrier() override {} - Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override; void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index c821b12299..991ed43850 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -302,7 +302,7 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool), sampler_cache(device), fence_manager(system, *this, device, scheduler, texture_cache, buffer_cache, query_cache), - query_cache(system, *this, device, scheduler) { + query_cache(system, *this, device, scheduler), wfi_event{device.GetLogical().CreateEvent()} { scheduler.SetQueryCache(query_cache); } @@ -576,6 +576,26 @@ void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size) { InvalidateRegion(addr, size); } +void RasterizerVulkan::WaitForIdle() { + // Everything but wait pixel operations. This intentionally includes FRAGMENT_SHADER_BIT because + // fragment shaders can still write storage buffers. + VkPipelineStageFlags flags = + VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | + VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT | + VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | + VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT; + if (device.IsExtTransformFeedbackSupported()) { + flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; + } + + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { + cmdbuf.SetEvent(event, flags); + cmdbuf.WaitEvents(event, flags, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, {}, {}, {}); + }); +} + void RasterizerVulkan::FlushCommands() { if (draw_counter > 0) { draw_counter = 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index d41a7929ea..4f78bbd508 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -127,6 +127,7 @@ public: void SignalSyncPoint(u32 value) override; void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; + void WaitForIdle() override; void FlushCommands() override; void TickFrame() override; bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, @@ -276,6 +277,7 @@ private: vk::Buffer default_buffer; VKMemoryCommit default_buffer_commit; + vk::Event wfi_event; std::array color_attachments; View zeta_attachment; diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp index 7f5bc1404b..2ce9b06264 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/renderer_vulkan/wrapper.cpp @@ -87,6 +87,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdSetStencilReference); X(vkCmdSetStencilWriteMask); X(vkCmdSetViewport); + X(vkCmdWaitEvents); X(vkCreateBuffer); X(vkCreateBufferView); X(vkCreateCommandPool); diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h index bda16a2cb4..98937a77a6 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/renderer_vulkan/wrapper.h @@ -205,6 +205,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCmdSetStencilReference vkCmdSetStencilReference; PFN_vkCmdSetStencilWriteMask vkCmdSetStencilWriteMask; PFN_vkCmdSetViewport vkCmdSetViewport; + PFN_vkCmdWaitEvents vkCmdWaitEvents; PFN_vkCreateBuffer vkCreateBuffer; PFN_vkCreateBufferView vkCreateBufferView; PFN_vkCreateCommandPool vkCreateCommandPool; @@ -958,6 +959,15 @@ public: dld->vkCmdSetEvent(handle, event, stage_flags); } + void WaitEvents(Span events, VkPipelineStageFlags src_stage_mask, + VkPipelineStageFlags dst_stage_mask, Span memory_barriers, + Span buffer_barriers, + Span image_barriers) const noexcept { + dld->vkCmdWaitEvents(handle, events.size(), events.data(), src_stage_mask, dst_stage_mask, + memory_barriers.size(), memory_barriers.data(), buffer_barriers.size(), + buffer_barriers.data(), image_barriers.size(), image_barriers.data()); + } + void BindTransformFeedbackBuffersEXT(u32 first, u32 count, const VkBuffer* buffers, const VkDeviceSize* offsets, const VkDeviceSize* sizes) const noexcept {