Merge pull request #2055 from bunnei/gpu-thread

Asynchronous GPU command processing
2024-07-04 23:31:19 +01:00 · 2019-03-07 10:41:53 -05:00 · 2019-03-07 10:41:53 -05:00 · 4f352833a5
commit 4f352833a5
parent 076c76f4e4 84ad81ee67
26 changed files with 529 additions and 52 deletions
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@ -36,7 +36,8 @@
 #include "frontend/applets/software_keyboard.h"
 #include "frontend/applets/web_browser.h"
 #include "video_core/debug_utils/debug_utils.h"
-#include "video_core/gpu.h"
+#include "video_core/gpu_asynch.h"
 #include "video_core/gpu_synch.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
@ -129,10 +130,16 @@ struct System::Impl {
            return ResultStatus::ErrorVideoCore;
        }
-        gpu_core = std::make_unique<Tegra::GPU>(system, renderer->Rasterizer());
+        is_powered_on = true;
        if (Settings::values.use_asynchronous_gpu_emulation) {
            gpu_core = std::make_unique<VideoCommon::GPUAsynch>(system, *renderer);
        } else {
            gpu_core = std::make_unique<VideoCommon::GPUSynch>(system, *renderer);
        }
        cpu_core_manager.Initialize(system);
-        is_powered_on = true;
+
        LOG_DEBUG(Core, "Initialized OK");
        // Reset counters and set time origin to current frame
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@ -36,7 +36,7 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3
    auto& instance = Core::System::GetInstance();
    instance.GetPerfStats().EndGameFrame();
-    instance.Renderer().SwapBuffers(framebuffer);
+    instance.GPU().SwapBuffers(framebuffer);
 }
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@ -178,7 +178,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
    auto& gpu = system_instance.GPU();
    auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
    ASSERT(cpu_addr);
-    system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
+    gpu.FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
    params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@ -136,16 +136,6 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
    return 0;
 }
 /// Queues a command list on the GPU's DMA pusher and dispatches it immediately.
 /// Empty command lists are ignored.
 static void PushGPUEntries(Tegra::CommandList&& entries) {
    if (!entries.empty()) {
        auto& pusher = Core::System::GetInstance().GPU().DmaPusher();
        pusher.Push(std::move(entries));
        pusher.DispatchCalls();
    }
 }
 u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output) {
    if (input.size() < sizeof(IoctlSubmitGpfifo)) {
        UNIMPLEMENTED();
@ -163,7 +153,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(Tegra::CommandListHeader));
-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
    params.fence_out.id = 0;
    params.fence_out.value = 0;
@ -184,7 +174,7 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
    Memory::ReadBlock(params.address, entries.data(),
                      params.num_entries * sizeof(Tegra::CommandListHeader));
-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
    params.fence_out.id = 0;
    params.fence_out.value = 0;
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@ -186,7 +186,7 @@ void NVFlinger::Compose() {
            // There was no queued buffer to draw, render previous frame
            system_instance.GetPerfStats().EndGameFrame();
-            system_instance.Renderer().SwapBuffers({});
+            system_instance.GPU().SwapBuffers({});
            continue;
        }
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -356,16 +356,16 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {
        const VAddr overlap_end = std::min(end, region_end);
        const VAddr overlap_size = overlap_end - overlap_start;
-        auto& rasterizer = system_instance.Renderer().Rasterizer();
+        auto& gpu = system_instance.GPU();
        switch (mode) {
        case FlushMode::Flush:
-            rasterizer.FlushRegion(overlap_start, overlap_size);
+            gpu.FlushRegion(overlap_start, overlap_size);
            break;
        case FlushMode::Invalidate:
-            rasterizer.InvalidateRegion(overlap_start, overlap_size);
+            gpu.InvalidateRegion(overlap_start, overlap_size);
            break;
        case FlushMode::FlushAndInvalidate:
-            rasterizer.FlushAndInvalidateRegion(overlap_start, overlap_size);
+            gpu.FlushAndInvalidateRegion(overlap_start, overlap_size);
            break;
        }
    };
--- a/src/core/settings.h
+++ b/src/core/settings.h
@ -393,6 +393,7 @@ struct Values {
    u16 frame_limit;
    bool use_disk_shader_cache;
    bool use_accurate_gpu_emulation;
    bool use_asynchronous_gpu_emulation;
    float bg_red;
    float bg_green;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@ -162,6 +162,8 @@ TelemetrySession::TelemetrySession() {
             Settings::values.use_disk_shader_cache);
    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAccurateGpuEmulation",
             Settings::values.use_accurate_gpu_emulation);
    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAsynchronousGpuEmulation",
             Settings::values.use_asynchronous_gpu_emulation);
    AddField(Telemetry::FieldType::UserConfig, "System_UseDockedMode",
             Settings::values.use_docked_mode);
 }
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@ -17,6 +17,12 @@ add_library(video_core STATIC
    engines/shader_header.h
    gpu.cpp
    gpu.h
    gpu_asynch.cpp
    gpu_asynch.h
    gpu_synch.cpp
    gpu_synch.h
    gpu_thread.cpp
    gpu_thread.h
    macro_interpreter.cpp
    macro_interpreter.h
    memory_manager.cpp
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@ -48,7 +48,7 @@ void KeplerMemory::ProcessData(u32 data) {
    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
    // We do this before actually writing the new data because the destination address might contain
    // a dirty surface that will have to be written back to memory.
-    rasterizer.InvalidateRegion(*dest_address, sizeof(u32));
+    Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32));
    Memory::Write32(*dest_address, data);
    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@ -92,12 +92,12 @@ void MaxwellDMA::HandleCopy() {
    const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
        // copying.
-        rasterizer.FlushRegion(*source_cpu, src_size);
+        Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size);
        // We have to invalidate the destination region to evict any outdated surfaces from the
        // cache. We do this before actually writing the new data because the destination address
        // might contain a dirty surface that will have to be written back to memory.
-        rasterizer.InvalidateRegion(*dest_cpu, dst_size);
+        Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size);
    };
    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@ -12,7 +12,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
 namespace Tegra {
@ -28,7 +28,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
    UNREACHABLE();
 }
-GPU::GPU(Core::System& system, VideoCore::RasterizerInterface& rasterizer) {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
    auto& rasterizer{renderer.Rasterizer()};
    memory_manager = std::make_unique<Tegra::MemoryManager>();
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@ -16,8 +16,8 @@ class System;
 }
 namespace VideoCore {
-class RasterizerInterface;
+class RendererBase;
-}
+} // namespace VideoCore
 namespace Tegra {
@ -119,9 +119,10 @@ enum class EngineID {
    MAXWELL_DMA_COPY_A = 0xB0B5,
 };
-class GPU final {
+class GPU {
 public:
-    explicit GPU(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
    /// Virtual: GPU declares pure virtual members and the concrete GPUSynch/GPUAsynch objects
    /// are stored in (and destroyed through) the same gpu_core handle that used to hold a plain
    /// Tegra::GPU. Without a virtual destructor, deleting through the base is undefined
    /// behaviour and the derived members (e.g. the GPU thread manager) would not be torn down.
    virtual ~GPU();
    struct MethodCall {
@ -200,8 +201,42 @@ public:
        };
    } regs{};
    /// Push GPU command entries to be processed
    virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
    /// Swap buffers (render frame)
    virtual void SwapBuffers(
        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
    virtual void FlushRegion(VAddr addr, u64 size) = 0;
    /// Notify rasterizer that any caches of the specified region should be invalidated
    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
 private:
    void ProcessBindMethod(const MethodCall& method_call);
    void ProcessSemaphoreTriggerMethod();
    void ProcessSemaphoreRelease();
    void ProcessSemaphoreAcquire();
    /// Calls a GPU puller method.
    void CallPullerMethod(const MethodCall& method_call);
    /// Calls a GPU engine method.
    void CallEngineMethod(const MethodCall& method_call);
    /// Determines where the method should be executed.
    bool ExecuteMethodOnEngine(const MethodCall& method_call);
 protected:
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
    VideoCore::RendererBase& renderer;
 private:
    std::unique_ptr<Tegra::MemoryManager> memory_manager;
    /// Mapping of command subchannels to their bound engine ids.
@ -217,18 +252,6 @@ private:
    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
    /// Inline memory engine
    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
    void ProcessBindMethod(const MethodCall& method_call);
    void ProcessSemaphoreTriggerMethod();
    void ProcessSemaphoreRelease();
    void ProcessSemaphoreAcquire();
    // Calls a GPU puller method.
    void CallPullerMethod(const MethodCall& method_call);
    // Calls a GPU engine method.
    void CallEngineMethod(const MethodCall& method_call);
    // Determines where the method should be executed.
    bool ExecuteMethodOnEngine(const MethodCall& method_call);
 };
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@ -0,0 +1,37 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include "video_core/gpu_asynch.h"
 #include "video_core/gpu_thread.h"
 #include "video_core/renderer_base.h"
 namespace VideoCommon {
 /// Builds the base GPU first, then starts the GPU thread manager on top of the renderer and the
 /// DMA pusher owned by the base class (dma_pusher is created in the Tegra::GPU constructor).
 GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
    : Tegra::GPU(system, renderer), gpu_thread{renderer, *dma_pusher} {}
 GPUAsynch::~GPUAsynch() = default;
 /// Forwards the command list to the GPU thread manager instead of dispatching it inline
 void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
    gpu_thread.SubmitList(std::move(entries));
 }
 /// Forwards the swap request to the GPU thread manager
 void GPUAsynch::SwapBuffers(
    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
    gpu_thread.SwapBuffers(std::move(framebuffer));
 }
 /// Forwards the flush request to the GPU thread manager
 void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
    gpu_thread.FlushRegion(addr, size);
 }
 /// Forwards the invalidate request to the GPU thread manager
 void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
    gpu_thread.InvalidateRegion(addr, size);
 }
 /// Forwards the flush-and-invalidate request to the GPU thread manager
 void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
    gpu_thread.FlushAndInvalidateRegion(addr, size);
 }
 } // namespace VideoCommon
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@ -0,0 +1,37 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include "video_core/gpu.h"
 #include "video_core/gpu_thread.h"
 namespace VideoCore {
 class RendererBase;
 } // namespace VideoCore
 namespace VideoCommon {
 namespace GPUThread {
 class ThreadManager;
 } // namespace GPUThread
 /// Implementation of GPU interface that runs the GPU asynchronously
 ///
 /// Every overridden operation is delegated to the owned GPUThread::ThreadManager.
 class GPUAsynch : public Tegra::GPU {
 public:
    explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer);
    ~GPUAsynch();
    /// Push GPU command entries to be processed
    void PushGPUEntries(Tegra::CommandList&& entries) override;
    /// Swap buffers (render frame)
    void SwapBuffers(
        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
    /// Notify rasterizer that any caches of the specified region should be flushed
    void FlushRegion(VAddr addr, u64 size) override;
    /// Notify rasterizer that any caches of the specified region should be invalidated
    void InvalidateRegion(VAddr addr, u64 size) override;
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
 private:
    /// Owns the GPU worker thread; stored by value, so its lifetime matches this object's
    GPUThread::ThreadManager gpu_thread;
 };
 } // namespace VideoCommon
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@ -0,0 +1,37 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include "video_core/gpu_synch.h"
 #include "video_core/renderer_base.h"
 namespace VideoCommon {
 /// The synchronous GPU adds no state of its own on top of the base class
 GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
    : Tegra::GPU(system, renderer) {}
 GPUSynch::~GPUSynch() = default;
 /// Queues the entries on the DMA pusher and dispatches them on the calling thread
 void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
    auto& pusher = *dma_pusher;
    pusher.Push(std::move(entries));
    pusher.DispatchCalls();
 }
 /// Presents the frame through the renderer on the calling thread
 void GPUSynch::SwapBuffers(
    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
    renderer.SwapBuffers(std::move(framebuffer));
 }
 /// Flushes the region through the renderer's rasterizer
 void GPUSynch::FlushRegion(VAddr addr, u64 size) {
    auto& rasterizer = renderer.Rasterizer();
    rasterizer.FlushRegion(addr, size);
 }
 /// Invalidates the region through the renderer's rasterizer
 void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
    auto& rasterizer = renderer.Rasterizer();
    rasterizer.InvalidateRegion(addr, size);
 }
 /// Flushes and invalidates the region through the renderer's rasterizer
 void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
    auto& rasterizer = renderer.Rasterizer();
    rasterizer.FlushAndInvalidateRegion(addr, size);
 }
 } // namespace VideoCommon
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@ -0,0 +1,29 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include "video_core/gpu.h"
 namespace VideoCore {
 class RendererBase;
 } // namespace VideoCore
 namespace VideoCommon {
 /// Implementation of GPU interface that runs the GPU synchronously
 ///
 /// Each operation is carried out directly on the calling thread via the base class's
 /// DMA pusher and renderer.
 class GPUSynch : public Tegra::GPU {
 public:
    explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer);
    ~GPUSynch();
    /// Push GPU command entries to be processed
    void PushGPUEntries(Tegra::CommandList&& entries) override;
    /// Swap buffers (render frame)
    void SwapBuffers(
        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
    /// Notify rasterizer that any caches of the specified region should be flushed
    void FlushRegion(VAddr addr, u64 size) override;
    /// Notify rasterizer that any caches of the specified region should be invalidated
    void InvalidateRegion(VAddr addr, u64 size) override;
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
 };
 } // namespace VideoCommon
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@ -0,0 +1,152 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/frontend/scope_acquire_window_context.h"
 #include "core/settings.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
 #include "video_core/gpu_thread.h"
 #include "video_core/renderer_base.h"
 namespace VideoCommon::GPUThread {
 /// Runs one GPU thread command: command lists go to the DMA pusher, everything else is
 /// forwarded to the renderer or its rasterizer, depending on the alternative held by `command`
 static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer,
                           Tegra::DmaPusher& dma_pusher) {
    if (auto* const submit = std::get_if<SubmitListCommand>(command)) {
        dma_pusher.Push(std::move(submit->entries));
        dma_pusher.DispatchCalls();
        return;
    }
    if (auto* const swap = std::get_if<SwapBuffersCommand>(command)) {
        renderer.SwapBuffers(swap->framebuffer);
        return;
    }
    if (auto* const flush = std::get_if<FlushRegionCommand>(command)) {
        renderer.Rasterizer().FlushRegion(flush->addr, flush->size);
        return;
    }
    if (auto* const invalidate = std::get_if<InvalidateRegionCommand>(command)) {
        renderer.Rasterizer().InvalidateRegion(invalidate->addr, invalidate->size);
        return;
    }
    if (auto* const both = std::get_if<FlushAndInvalidateRegionCommand>(command)) {
        renderer.Rasterizer().FlushAndInvalidateRegion(both->addr, both->size);
        return;
    }
    // None of the known alternatives matched
    UNREACHABLE();
 }
 /// Runs the GPU thread
 ///
 /// Entry point of the worker: waits for the first command, acquires the render window context
 /// for this thread, then alternates between draining queued commands and sleeping until
 /// state.is_running is cleared.
 static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
                      SynchState& state) {
    MicroProfileOnThreadCreate("GpuThread");
    // Sleeps until the idle flag has been cleared or a shutdown was requested
    auto WaitForWakeup = [&]() {
        std::unique_lock<std::mutex> lock{state.signal_mutex};
        state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; });
    };
    // Wait for first GPU command before acquiring the window context
    WaitForWakeup();
    // If emulation was stopped during disk shader loading, abort before trying to acquire context
    if (!state.is_running) {
        return;
    }
    Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
    while (state.is_running) {
        // NOTE(review): repeats the loop condition evaluated just above; looks redundant
        if (!state.is_running) {
            return;
        }
        {
            // Thread has been woken up, so make the previous write queue the next read queue
            std::lock_guard<std::mutex> lock{state.signal_mutex};
            std::swap(state.push_queue, state.pop_queue);
        }
        // Execute all of the GPU commands
        while (!state.pop_queue->empty()) {
            ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher);
            state.pop_queue->pop();
        }
        // NOTE(review): UpdateIdleState inspects both queues without holding signal_mutex, while
        // producers append to push_queue under that mutex - confirm this cannot race
        state.UpdateIdleState();
        // Signal that the GPU thread has finished processing commands
        if (state.is_idle) {
            state.idle_condition.notify_one();
        }
        // Wait for CPU thread to send more GPU commands
        WaitForWakeup();
    }
 }
 /// Starts the GPU worker thread running RunThread and records its id, which IsGpuThread()
 /// later compares against the calling thread. The thread is handed the constructor arguments
 /// (not the reference members) together with the shared SynchState.
 ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
    : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer),
                                                         std::ref(dma_pusher), std::ref(state)},
      thread_id{thread.get_id()} {}
 /// Requests shutdown of the GPU thread and blocks until it has exited
 ThreadManager::~ThreadManager() {
    // Clear the running flag under the signal mutex so the worker's wait predicate sees it
    std::unique_lock<std::mutex> shutdown_lock{state.signal_mutex};
    state.is_running = false;
    shutdown_lock.unlock();
    // Wake the worker (it may be asleep waiting for commands) and wait for it to finish
    state.signal_condition.notify_one();
    thread.join();
 }
 /// Queues a command list for the GPU thread without blocking the caller.
 /// Empty lists are dropped.
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
    if (!entries.empty()) {
        constexpr bool wait_for_idle = false;
        constexpr bool allow_on_cpu = false;
        PushCommand(SubmitListCommand(std::move(entries)), wait_for_idle, allow_on_cpu);
    }
 }
 /// Queues a buffer swap and blocks the caller until the GPU thread is idle again
 void ThreadManager::SwapBuffers(
    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
    constexpr bool wait_for_idle = true;
    constexpr bool allow_on_cpu = false;
    PushCommand(SwapBuffersCommand(std::move(framebuffer)), wait_for_idle, allow_on_cpu);
 }
 /// Queues a flush of the given region on the GPU thread
 void ThreadManager::FlushRegion(VAddr addr, u64 size) {
    // Only accurate emulation makes the CPU wait for the flush to complete
    const bool wait_for_idle = Settings::values.use_accurate_gpu_emulation;
    constexpr bool allow_on_cpu = false;
    PushCommand(FlushRegionCommand(addr, size), wait_for_idle, allow_on_cpu);
 }
 /// Invalidates the given region, either inline on the caller (when the GPU thread is idle)
 /// or on the GPU thread while the caller waits for it to become idle
 void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
    constexpr bool wait_for_idle = true;
    constexpr bool allow_on_cpu = true;
    PushCommand(InvalidateRegionCommand(addr, size), wait_for_idle, allow_on_cpu);
 }
 /// Handles a flush-and-invalidate request by performing an invalidate only.
 /// NOTE(review): the flush half is dropped here and no FlushAndInvalidateRegionCommand is
 /// queued - presumably a deliberate trade-off for asynchronous mode; confirm with the author.
 void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
    InvalidateRegion(addr, size);
 }
 /// Hands a command to the GPU thread, or runs it inline when that is permitted.
 ///
 /// @param command_data  Command to execute; consumed by this call.
 /// @param wait_for_idle When true, block the caller until the GPU thread reports idle.
 /// @param allow_on_cpu  When true and the GPU thread is idle, execute on the calling thread.
 void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) {
    {
        std::lock_guard<std::mutex> lock{state.signal_mutex};
        if ((allow_on_cpu && state.is_idle) || IsGpuThread()) {
            // Execute the command synchronously on the current thread
            ExecuteCommand(&command_data, renderer, dma_pusher);
            return;
        }
        // Push the command to the GPU thread. The idle flag must be recomputed *after* the
        // queue has grown: the GPU thread only wakes once is_idle is false, and the wait below
        // returns immediately while is_idle is still true. Updating it first would leave the
        // first command after an idle period unprocessed.
        state.push_queue->emplace(std::move(command_data));
        state.UpdateIdleState();
    }
    // Signal the GPU thread that commands are pending
    state.signal_condition.notify_one();
    if (wait_for_idle) {
        // Wait for the GPU to be idle (all commands to be executed)
        std::unique_lock<std::mutex> lock{state.idle_mutex};
        state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); });
    }
 }
 } // namespace VideoCommon::GPUThread
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@ -0,0 +1,136 @@
 // Copyright 2019 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 #pragma once
 #include <array>
 #include <atomic>
 #include <condition_variable>
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <optional>
 #include <queue>
 #include <thread>
 #include <utility>
 #include <variant>
 namespace Tegra {
 struct FramebufferConfig;
 class DmaPusher;
 } // namespace Tegra
 namespace VideoCore {
 class RendererBase;
 } // namespace VideoCore
 namespace VideoCommon::GPUThread {
 /// Command to signal to the GPU thread that a command list is ready for processing
 struct SubmitListCommand final {
    /// Takes ownership of the given command list by moving it into the command
    explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
    /// Command list handed to the DMA pusher when the command is executed
    Tegra::CommandList entries;
 };
 /// Command to signal to the GPU thread that a swap buffers is pending
 struct SwapBuffersCommand final {
    explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
        : framebuffer{std::move(framebuffer)} {}
    /// Framebuffer configuration held by value inside the command; empty when no framebuffer
    /// was supplied with the swap request
    std::optional<const Tegra::FramebufferConfig> framebuffer;
 };
 /// Command to signal to the GPU thread to flush a region
 struct FlushRegionCommand final {
    explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
    /// Start address of the region, passed through to the rasterizer unchanged
    const VAddr addr;
    /// Size of the region, passed through to the rasterizer unchanged
    const u64 size;
 };
 /// Command to signal to the GPU thread to invalidate a region
 struct InvalidateRegionCommand final {
    explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
    /// Start address of the region, passed through to the rasterizer unchanged
    const VAddr addr;
    /// Size of the region, passed through to the rasterizer unchanged
    const u64 size;
 };
 /// Command to signal to the GPU thread to flush and invalidate a region
 /// NOTE(review): in the visible code ThreadManager::FlushAndInvalidateRegion queues an
 /// InvalidateRegionCommand instead, so nothing shown here constructs this command.
 struct FlushAndInvalidateRegionCommand final {
    explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
        : addr{addr}, size{size} {}
    /// Start address of the region, passed through to the rasterizer unchanged
    const VAddr addr;
    /// Size of the region, passed through to the rasterizer unchanged
    const u64 size;
 };
 using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
                                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
 /// Struct used to synchronize the GPU thread
 struct SynchState final {
    /// Cleared to ask the GPU thread to exit
    std::atomic<bool> is_running{true};
    /// True while both command queues were empty at the last UpdateIdleState() call
    std::atomic<bool> is_idle{true};
    /// Signalled when commands are pending or a shutdown was requested; paired with signal_mutex
    std::condition_variable signal_condition;
    /// Guards the push_queue/pop_queue swap and pushes into push_queue
    std::mutex signal_mutex;
    /// Signalled by the GPU thread once it has become idle; paired with idle_mutex
    std::condition_variable idle_condition;
    /// Guards updates of is_idle and the wait for idleness
    std::mutex idle_mutex;
    // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and
    // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes
    // empty. This allows for efficient thread-safe access, as it does not require any copies.
    using CommandQueue = std::queue<CommandData>;
    std::array<CommandQueue, 2> command_queues;
    CommandQueue* push_queue{&command_queues[0]};
    CommandQueue* pop_queue{&command_queues[1]};
    /// Recomputes is_idle from the current contents of both queues, under idle_mutex.
    /// NOTE(review): the queues themselves are not protected by idle_mutex - callers that do
    /// not hold signal_mutex may read a queue while another thread pushes to it; confirm.
    void UpdateIdleState() {
        std::lock_guard<std::mutex> lock{idle_mutex};
        is_idle = command_queues[0].empty() && command_queues[1].empty();
    }
 };
 /// Class used to manage the GPU thread
 ///
 /// Owns the worker thread and the synchronization state it shares with callers. Commands are
 /// either queued for the worker or, where allowed, executed directly on the calling thread.
 class ThreadManager final {
 public:
    explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
    ~ThreadManager();
    /// Push GPU command entries to be processed
    void SubmitList(Tegra::CommandList&& entries);
    /// Swap buffers (render frame)
    void SwapBuffers(
        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
    void FlushRegion(VAddr addr, u64 size);
    /// Notify rasterizer that any caches of the specified region should be invalidated
    void InvalidateRegion(VAddr addr, u64 size);
    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
    void FlushAndInvalidateRegion(VAddr addr, u64 size);
    /// Waits the caller until the GPU thread is idle, used for synchronization
    void WaitForIdle();
 private:
    /// Pushes a command to be executed by the GPU thread
    void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu);
    /// Returns true if this is called by the GPU thread
    bool IsGpuThread() const {
        return std::this_thread::get_id() == thread_id;
    }
    // Members are initialized in declaration order, so keep this order in sync with the
    // constructor's initializer list: the shared state and both references must be set up
    // before the worker thread is started, and thread_id is derived from the started thread.
    SynchState state;
    VideoCore::RendererBase& renderer;
    Tegra::DmaPusher& dma_pusher;
    std::thread thread;
    std::thread::id thread_id;
 };
 } // namespace VideoCommon::GPUThread
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -749,11 +749,7 @@ void RasterizerOpenGL::FlushAll() {}
 void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-
+    res_cache.FlushRegion(addr, size);
    if (Settings::values.use_accurate_gpu_emulation) {
        // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit
        res_cache.FlushRegion(addr, size);
    }
 }
 void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@ -20,10 +20,7 @@
 EmuThread::EmuThread(GRenderWindow* render_window) : render_window(render_window) {}
 void EmuThread::run() {
-    if (!Settings::values.use_multi_core) {
+    render_window->MakeCurrent();
        // Single core mode must acquire OpenGL context for entire emulation session
        render_window->MakeCurrent();
    }
    MicroProfileOnThreadCreate("EmuThread");
@ -38,6 +35,11 @@ void EmuThread::run() {
    emit LoadProgress(VideoCore::LoadCallbackStage::Complete, 0, 0);
    if (Settings::values.use_asynchronous_gpu_emulation) {
        // Release OpenGL context for the GPU thread
        render_window->DoneCurrent();
    }
    // holds whether the cpu was running during the last iteration,
    // so that the DebugModeLeft signal can be emitted before the
    // next execution step
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@ -374,6 +374,8 @@ void Config::ReadValues() {
        qt_config->value("use_disk_shader_cache", false).toBool();
    Settings::values.use_accurate_gpu_emulation =
        qt_config->value("use_accurate_gpu_emulation", false).toBool();
    Settings::values.use_asynchronous_gpu_emulation =
        qt_config->value("use_asynchronous_gpu_emulation", false).toBool();
    Settings::values.bg_red = qt_config->value("bg_red", 0.0).toFloat();
    Settings::values.bg_green = qt_config->value("bg_green", 0.0).toFloat();
@ -633,6 +635,8 @@ void Config::SaveValues() {
    qt_config->setValue("frame_limit", Settings::values.frame_limit);
    qt_config->setValue("use_disk_shader_cache", Settings::values.use_disk_shader_cache);
    qt_config->setValue("use_accurate_gpu_emulation", Settings::values.use_accurate_gpu_emulation);
    qt_config->setValue("use_asynchronous_gpu_emulation",
                        Settings::values.use_asynchronous_gpu_emulation);
    // Cast to double because Qt's written float values are not human-readable
    qt_config->setValue("bg_red", (double)Settings::values.bg_red);
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@ -75,6 +75,8 @@ void ConfigureGraphics::setConfiguration() {
    ui->frame_limit->setValue(Settings::values.frame_limit);
    ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache);
    ui->use_accurate_gpu_emulation->setChecked(Settings::values.use_accurate_gpu_emulation);
    ui->use_asynchronous_gpu_emulation->setEnabled(!Core::System::GetInstance().IsPoweredOn());
    ui->use_asynchronous_gpu_emulation->setChecked(Settings::values.use_asynchronous_gpu_emulation);
    UpdateBackgroundColorButton(QColor::fromRgbF(Settings::values.bg_red, Settings::values.bg_green,
                                                 Settings::values.bg_blue));
 }
@ -86,6 +88,8 @@ void ConfigureGraphics::applyConfiguration() {
    Settings::values.frame_limit = ui->frame_limit->value();
    Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked();
    Settings::values.use_accurate_gpu_emulation = ui->use_accurate_gpu_emulation->isChecked();
    Settings::values.use_asynchronous_gpu_emulation =
        ui->use_asynchronous_gpu_emulation->isChecked();
    Settings::values.bg_red = static_cast<float>(bg_color.redF());
    Settings::values.bg_green = static_cast<float>(bg_color.greenF());
    Settings::values.bg_blue = static_cast<float>(bg_color.blueF());
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@ -63,6 +63,13 @@
          </property>
         </widget>
        </item>
        <item>
         <widget class="QCheckBox" name="use_asynchronous_gpu_emulation">
          <property name="text">
           <string>Use asynchronous GPU emulation</string>
          </property>
         </widget>
        </item>
        <item>
         <layout class="QHBoxLayout" name="horizontalLayout">
          <item>
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@ -354,6 +354,8 @@ void Config::ReadValues() {
        sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false);
    Settings::values.use_accurate_gpu_emulation =
        sdl2_config->GetBoolean("Renderer", "use_accurate_gpu_emulation", false);
    Settings::values.use_asynchronous_gpu_emulation =
        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
    Settings::values.bg_red = (float)sdl2_config->GetReal("Renderer", "bg_red", 0.0);
    Settings::values.bg_green = (float)sdl2_config->GetReal("Renderer", "bg_green", 0.0);
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@ -118,6 +118,10 @@ use_disk_shader_cache =
 # 0 (default): Off (fast), 1 : On (slow)
 use_accurate_gpu_emulation =
 # Whether to use asynchronous GPU emulation
 # 0 (default): Off (slow), 1 : On (fast)
 use_asynchronous_gpu_emulation =
 # The clear color for the renderer. What shows up on the sides of the bottom screen.
 # Must be in range of 0.0-1.0. Defaults to 1.0 for all.
 bg_red =