kern: audit (and fix) our hardware maintenance instructions to match official kernel

2024-11-22 20:06:40 +00:00 · 2021-10-27 12:31:53 -07:00 · 2021-10-27 12:31:53 -07:00 · e81a1ce5a8
commit e81a1ce5a8
parent fb59d0ad43
16 changed files with 104 additions and 203 deletions
--- a/libraries/libmesosphere/include/mesosphere.hpp
+++ b/libraries/libmesosphere/include/mesosphere.hpp
@ -93,3 +93,4 @@

 /* Deferred includes. */
 #include <mesosphere/kern_k_auto_object_impls.hpp>
+#include <mesosphere/kern_k_scheduler_impls.hpp>
--- a/libraries/libmesosphere/include/mesosphere/arch/arm64/init/kern_k_init_page_table.hpp
+++ b/libraries/libmesosphere/include/mesosphere/arch/arm64/init/kern_k_init_page_table.hpp
@ -279,7 +279,7 @@ namespace ams::kern::arch::arm64::init {

                /* Invalidate the entire tlb. */
                cpu::DataSynchronizationBarrierInnerShareable();
-                cpu::InvalidateEntireTlb();
+                cpu::InvalidateEntireTlbInnerShareable();

                /* Copy data, if we should. */
                const u64 negative_block_size_for_mask = static_cast<u64>(-static_cast<s64>(block_size));
@ -350,7 +350,6 @@ namespace ams::kern::arch::arm64::init {
                    /* If we don't already have an L2 table, we need to make a new one. */
                    if (!l1_entry->IsTable()) {
                        KPhysicalAddress new_table = AllocateNewPageTable(allocator);
-                        ClearNewPageTable(new_table);
                        *l1_entry = L1PageTableEntry(PageTableEntry::TableTag{}, new_table, attr.IsPrivilegedExecuteNever());
                        cpu::DataSynchronizationBarrierInnerShareable();
                    }
@ -361,12 +360,12 @@ namespace ams::kern::arch::arm64::init {
                    if (util::IsAligned(GetInteger(virt_addr), L2ContiguousBlockSize) && util::IsAligned(GetInteger(phys_addr), L2ContiguousBlockSize) && size >= L2ContiguousBlockSize) {
                        for (size_t i = 0; i < L2ContiguousBlockSize / L2BlockSize; i++) {
                            l2_entry[i] = L2PageTableEntry(PageTableEntry::BlockTag{}, phys_addr, attr, PageTableEntry::SoftwareReservedBit_None, true);
-                            cpu::DataSynchronizationBarrierInnerShareable();

                            virt_addr += L2BlockSize;
                            phys_addr += L2BlockSize;
                            size      -= L2BlockSize;
                        }
+                        cpu::DataSynchronizationBarrierInnerShareable();
                        continue;
                    }

@ -384,7 +383,6 @@ namespace ams::kern::arch::arm64::init {
                    /* If we don't already have an L3 table, we need to make a new one. */
                    if (!l2_entry->IsTable()) {
                        KPhysicalAddress new_table = AllocateNewPageTable(allocator);
-                        ClearNewPageTable(new_table);
                        *l2_entry = L2PageTableEntry(PageTableEntry::TableTag{}, new_table, attr.IsPrivilegedExecuteNever());
                        cpu::DataSynchronizationBarrierInnerShareable();
                    }
@ -395,12 +393,12 @@ namespace ams::kern::arch::arm64::init {
                    if (util::IsAligned(GetInteger(virt_addr), L3ContiguousBlockSize) && util::IsAligned(GetInteger(phys_addr), L3ContiguousBlockSize) && size >= L3ContiguousBlockSize) {
                        for (size_t i = 0; i < L3ContiguousBlockSize / L3BlockSize; i++) {
                            l3_entry[i] = L3PageTableEntry(PageTableEntry::BlockTag{}, phys_addr, attr, PageTableEntry::SoftwareReservedBit_None, true);
-                            cpu::DataSynchronizationBarrierInnerShareable();

                            virt_addr += L3BlockSize;
                            phys_addr += L3BlockSize;
                            size      -= L3BlockSize;
                        }
+                        cpu::DataSynchronizationBarrierInnerShareable();
                        continue;
                    }

--- a/libraries/libmesosphere/include/mesosphere/arch/arm64/kern_cpu.hpp
+++ b/libraries/libmesosphere/include/mesosphere/arch/arm64/kern_cpu.hpp
@ -60,6 +60,11 @@ namespace ams::kern::arch::arm64::cpu {
        __asm__ __volatile__("isb" ::: "memory");
    }

+    ALWAYS_INLINE void EnsureInstructionConsistencyInnerShareable() {
+        DataSynchronizationBarrierInnerShareable();
+        InstructionMemoryBarrier();
+    }
+
    ALWAYS_INLINE void EnsureInstructionConsistency() {
        DataSynchronizationBarrier();
        InstructionMemoryBarrier();
@ -177,7 +182,6 @@ namespace ams::kern::arch::arm64::cpu {
    NOINLINE void SynchronizeAllCores();

    /* Cache management helpers. */
-    void ClearPageToZeroImpl(void *);
    void StoreEntireCacheForInit();
    void FlushEntireCacheForInit();

@ -190,10 +194,16 @@ namespace ams::kern::arch::arm64::cpu {

    void InvalidateEntireInstructionCache();

-    ALWAYS_INLINE void ClearPageToZero(void *page) {
+    ALWAYS_INLINE void ClearPageToZero(void * const page) {
        MESOSPHERE_ASSERT(util::IsAligned(reinterpret_cast<uintptr_t>(page), PageSize));
        MESOSPHERE_ASSERT(page != nullptr);
-        ClearPageToZeroImpl(page);
+
+        uintptr_t cur = reinterpret_cast<uintptr_t>(__builtin_assume_aligned(page, PageSize));
+        const uintptr_t last = cur + PageSize - DataCacheLineSize;
+
+        for (/* ... */; cur <= last; cur += DataCacheLineSize) {
+            __asm__ __volatile__("dc zva, %[cur]" :: [cur]"r"(cur) : "memory");
+        }
    }

    ALWAYS_INLINE void InvalidateTlbByAsid(u32 asid) {
@ -213,6 +223,11 @@ namespace ams::kern::arch::arm64::cpu {
        EnsureInstructionConsistency();
    }

+    ALWAYS_INLINE void InvalidateEntireTlbInnerShareable() {
+        __asm__ __volatile__("tlbi vmalle1is" ::: "memory");
+        EnsureInstructionConsistencyInnerShareable();
+    }
+
    ALWAYS_INLINE void InvalidateEntireTlbDataOnly() {
        __asm__ __volatile__("tlbi vmalle1is" ::: "memory");
        DataSynchronizationBarrier();
--- a/libraries/libmesosphere/include/mesosphere/arch/arm64/kern_k_page_table.hpp
+++ b/libraries/libmesosphere/include/mesosphere/arch/arm64/kern_k_page_table.hpp
@ -219,27 +219,27 @@ namespace ams::kern::arch::arm64 {

            Result ChangePermissions(KProcessAddress virt_addr, size_t num_pages, PageTableEntry entry_template, DisableMergeAttribute disable_merge_attr, bool refresh_mapping, PageLinkedList *page_list, bool reuse_ll);

-            static void PteDataSynchronizationBarrier() {
+            static ALWAYS_INLINE void PteDataSynchronizationBarrier() {
                cpu::DataSynchronizationBarrierInnerShareable();
            }

-            static void ClearPageTable(KVirtualAddress table) {
+            static ALWAYS_INLINE void ClearPageTable(KVirtualAddress table) {
                cpu::ClearPageToZero(GetVoidPointer(table));
            }

-            void OnTableUpdated() const {
+            ALWAYS_INLINE void OnTableUpdated() const {
                cpu::InvalidateTlbByAsid(m_asid);
            }

-            void OnKernelTableUpdated() const {
+            ALWAYS_INLINE void OnKernelTableUpdated() const {
                cpu::InvalidateEntireTlbDataOnly();
            }

-            void OnKernelTableSinglePageUpdated(KProcessAddress virt_addr) const {
+            ALWAYS_INLINE void OnKernelTableSinglePageUpdated(KProcessAddress virt_addr) const {
                cpu::InvalidateTlbByVaDataOnly(virt_addr);
            }

-            void NoteUpdated() const {
+            ALWAYS_INLINE void NoteUpdated() const {
                cpu::DataSynchronizationBarrier();

                if (this->IsKernel()) {
@ -249,7 +249,7 @@ namespace ams::kern::arch::arm64 {
                }
            }

-            void NoteSingleKernelPageUpdated(KProcessAddress virt_addr) const {
+            ALWAYS_INLINE void NoteSingleKernelPageUpdated(KProcessAddress virt_addr) const {
                MESOSPHERE_ASSERT(this->IsKernel());

                cpu::DataSynchronizationBarrier();
--- a/libraries/libmesosphere/include/mesosphere/arch/arm64/kern_k_slab_heap_impl.hpp
+++ b/libraries/libmesosphere/include/mesosphere/arch/arm64/kern_k_slab_heap_impl.hpp
@ -45,6 +45,7 @@ namespace ams::kern::arch::arm64 {

            /* Select L1 cache. */
            cpu::SetCsselrEl1(0);
+            cpu::InstructionMemoryBarrier();

            /* Check that the L1 cache is not direct-mapped. */
            return cpu::CacheSizeIdRegisterAccessor().GetAssociativity() != 0;
--- a/libraries/libmesosphere/include/mesosphere/kern_k_dynamic_resource_manager.hpp
+++ b/libraries/libmesosphere/include/mesosphere/kern_k_dynamic_resource_manager.hpp
@ -46,7 +46,7 @@ namespace ams::kern {
                return m_slab_heap->Allocate(m_page_allocator);
            }

-            void Free(T *t) const {
+            ALWAYS_INLINE void Free(T *t) const {
                m_slab_heap->Free(t);
            }
    };
--- a/libraries/libmesosphere/include/mesosphere/kern_k_scheduler.hpp
+++ b/libraries/libmesosphere/include/mesosphere/kern_k_scheduler.hpp
@ -211,18 +211,6 @@ namespace ams::kern {
            static consteval bool ValidateAssemblyOffsets();
    };

-    consteval bool KScheduler::ValidateAssemblyOffsets() {
-        static_assert(AMS_OFFSETOF(KScheduler, m_state.needs_scheduling)        == KSCHEDULER_NEEDS_SCHEDULING);
-        static_assert(AMS_OFFSETOF(KScheduler, m_state.interrupt_task_runnable) == KSCHEDULER_INTERRUPT_TASK_RUNNABLE);
-        static_assert(AMS_OFFSETOF(KScheduler, m_state.highest_priority_thread) == KSCHEDULER_HIGHEST_PRIORITY_THREAD);
-        static_assert(AMS_OFFSETOF(KScheduler, m_state.idle_thread_stack)       == KSCHEDULER_IDLE_THREAD_STACK);
-        static_assert(AMS_OFFSETOF(KScheduler, m_state.prev_thread)             == KSCHEDULER_PREVIOUS_THREAD);
-        static_assert(AMS_OFFSETOF(KScheduler, m_state.interrupt_task_manager)  == KSCHEDULER_INTERRUPT_TASK_MANAGER);
-
-        return true;
-    }
-    static_assert(KScheduler::ValidateAssemblyOffsets());
-
    class KScopedSchedulerLock : KScopedLock<KScheduler::LockType> {
        public:
            explicit ALWAYS_INLINE KScopedSchedulerLock() : KScopedLock(KScheduler::s_scheduler_lock) { /* ... */ }
--- a/libraries/libmesosphere/include/mesosphere/kern_k_scheduler_impls.hpp
+++ b/libraries/libmesosphere/include/mesosphere/kern_k_scheduler_impls.hpp
@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Atmosphère-NX
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+#include <mesosphere/kern_common.hpp>
+#include <mesosphere/kern_k_scheduler.hpp>
+#include <mesosphere/kern_select_interrupt_manager.hpp>
+
+namespace ams::kern {
+
+    /* NOTE: This header is included after all main headers. */
+    consteval bool KScheduler::ValidateAssemblyOffsets() {
+        static_assert(AMS_OFFSETOF(KScheduler, m_state.needs_scheduling)        == KSCHEDULER_NEEDS_SCHEDULING);
+        static_assert(AMS_OFFSETOF(KScheduler, m_state.interrupt_task_runnable) == KSCHEDULER_INTERRUPT_TASK_RUNNABLE);
+        static_assert(AMS_OFFSETOF(KScheduler, m_state.highest_priority_thread) == KSCHEDULER_HIGHEST_PRIORITY_THREAD);
+        static_assert(AMS_OFFSETOF(KScheduler, m_state.idle_thread_stack)       == KSCHEDULER_IDLE_THREAD_STACK);
+        static_assert(AMS_OFFSETOF(KScheduler, m_state.prev_thread)             == KSCHEDULER_PREVIOUS_THREAD);
+        static_assert(AMS_OFFSETOF(KScheduler, m_state.interrupt_task_manager)  == KSCHEDULER_INTERRUPT_TASK_MANAGER);
+
+        return true;
+    }
+    static_assert(KScheduler::ValidateAssemblyOffsets());
+
+    ALWAYS_INLINE void KScheduler::RescheduleOtherCores(u64 cores_needing_scheduling) {
+        if (const u64 core_mask = cores_needing_scheduling & ~(1ul << m_core_id); core_mask != 0) {
+            cpu::DataSynchronizationBarrier();
+            Kernel::GetInterruptManager().SendInterProcessorInterrupt(KInterruptName_Scheduler, core_mask);
+        }
+    }
+
+}
--- a/libraries/libmesosphere/source/arch/arm64/kern_cpu_asm.s
+++ b/libraries/libmesosphere/source/arch/arm64/kern_cpu_asm.s
@ -61,139 +61,3 @@ _ZN3ams4kern4arch5arm643cpu23SynchronizeAllCoresImplEPii:
 5:
    stlr wzr, [x0]
    ret
-
-
-/* ams::kern::arch::arm64::cpu::ClearPageToZero(void *) */
-.section    .text._ZN3ams4kern4arch5arm643cpu19ClearPageToZeroImplEPv, "ax", %progbits
-.global     _ZN3ams4kern4arch5arm643cpu19ClearPageToZeroImplEPv
-.type       _ZN3ams4kern4arch5arm643cpu19ClearPageToZeroImplEPv, %function
-_ZN3ams4kern4arch5arm643cpu19ClearPageToZeroImplEPv:
-    /* Efficiently clear the page using dc zva. */
-    dc      zva, x0
-    add     x8, x0, #0x040
-    dc      zva, x8
-    add     x8, x0, #0x080
-    dc      zva, x8
-    add     x8, x0, #0x0c0
-    dc      zva, x8
-    add     x8, x0, #0x100
-    dc      zva, x8
-    add     x8, x0, #0x140
-    dc      zva, x8
-    add     x8, x0, #0x180
-    dc      zva, x8
-    add     x8, x0, #0x1c0
-    dc      zva, x8
-    add     x8, x0, #0x200
-    dc      zva, x8
-    add     x8, x0, #0x240
-    dc      zva, x8
-    add     x8, x0, #0x280
-    dc      zva, x8
-    add     x8, x0, #0x2c0
-    dc      zva, x8
-    add     x8, x0, #0x300
-    dc      zva, x8
-    add     x8, x0, #0x340
-    dc      zva, x8
-    add     x8, x0, #0x380
-    dc      zva, x8
-    add     x8, x0, #0x3c0
-    dc      zva, x8
-    add     x8, x0, #0x400
-    dc      zva, x8
-    add     x8, x0, #0x440
-    dc      zva, x8
-    add     x8, x0, #0x480
-    dc      zva, x8
-    add     x8, x0, #0x4c0
-    dc      zva, x8
-    add     x8, x0, #0x500
-    dc      zva, x8
-    add     x8, x0, #0x540
-    dc      zva, x8
-    add     x8, x0, #0x580
-    dc      zva, x8
-    add     x8, x0, #0x5c0
-    dc      zva, x8
-    add     x8, x0, #0x600
-    dc      zva, x8
-    add     x8, x0, #0x640
-    dc      zva, x8
-    add     x8, x0, #0x680
-    dc      zva, x8
-    add     x8, x0, #0x6c0
-    dc      zva, x8
-    add     x8, x0, #0x700
-    dc      zva, x8
-    add     x8, x0, #0x740
-    dc      zva, x8
-    add     x8, x0, #0x780
-    dc      zva, x8
-    add     x8, x0, #0x7c0
-    dc      zva, x8
-    add     x8, x0, #0x800
-    dc      zva, x8
-    add     x8, x0, #0x840
-    dc      zva, x8
-    add     x8, x0, #0x880
-    dc      zva, x8
-    add     x8, x0, #0x8c0
-    dc      zva, x8
-    add     x8, x0, #0x900
-    dc      zva, x8
-    add     x8, x0, #0x940
-    dc      zva, x8
-    add     x8, x0, #0x980
-    dc      zva, x8
-    add     x8, x0, #0x9c0
-    dc      zva, x8
-    add     x8, x0, #0xa00
-    dc      zva, x8
-    add     x8, x0, #0xa40
-    dc      zva, x8
-    add     x8, x0, #0xa80
-    dc      zva, x8
-    add     x8, x0, #0xac0
-    dc      zva, x8
-    add     x8, x0, #0xb00
-    dc      zva, x8
-    add     x8, x0, #0xb40
-    dc      zva, x8
-    add     x8, x0, #0xb80
-    dc      zva, x8
-    add     x8, x0, #0xbc0
-    dc      zva, x8
-    add     x8, x0, #0xc00
-    dc      zva, x8
-    add     x8, x0, #0xc40
-    dc      zva, x8
-    add     x8, x0, #0xc80
-    dc      zva, x8
-    add     x8, x0, #0xcc0
-    dc      zva, x8
-    add     x8, x0, #0xd00
-    dc      zva, x8
-    add     x8, x0, #0xd40
-    dc      zva, x8
-    add     x8, x0, #0xd80
-    dc      zva, x8
-    add     x8, x0, #0xdc0
-    dc      zva, x8
-    add     x8, x0, #0xe00
-    dc      zva, x8
-    add     x8, x0, #0xe40
-    dc      zva, x8
-    add     x8, x0, #0xe80
-    dc      zva, x8
-    add     x8, x0, #0xec0
-    dc      zva, x8
-    add     x8, x0, #0xf00
-    dc      zva, x8
-    add     x8, x0, #0xf40
-    dc      zva, x8
-    add     x8, x0, #0xf80
-    dc      zva, x8
-    add     x8, x0, #0xfc0
-    dc      zva, x8
-    ret
--- a/libraries/libmesosphere/source/arch/arm64/kern_exception_handlers.cpp
+++ b/libraries/libmesosphere/source/arch/arm64/kern_exception_handlers.cpp
@ -225,7 +225,7 @@ namespace ams::kern::arch::arm64 {
            if (AMS_UNLIKELY(GetCurrentThread().IsSingleStep())) {
                GetCurrentThread().ClearSingleStep();
                cpu::MonitorDebugSystemControlRegisterAccessor().SetSoftwareStep(false).Store();
-                cpu::EnsureInstructionConsistency();
+                cpu::InstructionMemoryBarrier();
            }
            #endif

--- a/libraries/libmesosphere/source/arch/arm64/kern_k_page_table.cpp
+++ b/libraries/libmesosphere/source/arch/arm64/kern_k_page_table.cpp
@ -169,10 +169,10 @@ namespace ams::kern::arch::arm64 {
        m_manager = std::addressof(Kernel::GetSystemPageTableManager());

        /* Allocate a page for ttbr. */
+        /* NOTE: It is a postcondition of page table manager allocation that the page is all-zero. */
        const u64 asid_tag = (static_cast<u64>(m_asid) << 48ul);
        const KVirtualAddress page = m_manager->Allocate();
        MESOSPHERE_ASSERT(page != Null<KVirtualAddress>);
-        cpu::ClearPageToZero(GetVoidPointer(page));
        m_ttbr = GetInteger(KPageTableBase::GetLinearMappedPhysicalAddress(page)) | asid_tag;

        /* Initialize the base page table. */
@ -1058,7 +1058,7 @@ namespace ams::kern::arch::arm64 {
        auto sw_reserved_bits = PageTableEntry::EncodeSoftwareReservedBits(head_entry->IsHeadMergeDisabled(), head_entry->IsHeadAndBodyMergeDisabled(), tail_entry->IsTailMergeDisabled());

        /* Merge! */
-        PteDataSynchronizationBarrier();
+        /* NOTE: As of 13.1.0, Nintendo does not do: PteDataSynchronizationBarrier(); */
        *l1_entry = L1PageTableEntry(PageTableEntry::BlockTag{}, phys_addr, PageTableEntry(entry_template), sw_reserved_bits, false);

        /* Note that we updated. */
--- a/libraries/libmesosphere/source/board/nintendo/nx/kern_k_device_page_table.cpp
+++ b/libraries/libmesosphere/source/board/nintendo/nx/kern_k_device_page_table.cpp
@ -656,9 +656,8 @@ namespace ams::kern::board::nintendo::nx {
        MESOSPHERE_ASSERT(IsValidPhysicalAddress(table_phys_addr));
        Kernel::GetSystemPageTableManager().Open(table_virt_addr, 1);

-        /* Clear the page and save it. */
+        /* Save the page. Note that pages allocated from the system page table manager are guaranteed to already be cleared. */
        /* NOTE: Nintendo does not check the result of StoreDataCache. */
-        cpu::ClearPageToZero(GetVoidPointer(table_virt_addr));
        cpu::StoreDataCache(GetVoidPointer(table_virt_addr), PageDirectorySize);
        g_reserved_table_phys_addr = table_phys_addr;

--- a/libraries/libmesosphere/source/board/nintendo/nx/kern_k_sleep_manager.cpp
+++ b/libraries/libmesosphere/source/board/nintendo/nx/kern_k_sleep_manager.cpp
@ -341,7 +341,9 @@ namespace ams::kern::board::nintendo::nx {

            /* Restore pmu registers. */
            cpu::SetPmUserEnrEl0(0);
-            cpu::PerformanceMonitorsControlRegisterAccessor().SetEventCounterReset(true).SetCycleCounterReset(true).Store();
+            cpu::PerformanceMonitorsControlRegisterAccessor(0).SetEventCounterReset(true).SetCycleCounterReset(true).Store();
+            cpu::EnsureInstructionConsistency();
+
            cpu::SetPmOvsClrEl0(static_cast<u64>(static_cast<u32>(~u32())));
            cpu::SetPmIntEnClrEl1(static_cast<u64>(static_cast<u32>(~u32())));
            cpu::SetPmCntEnClrEl0(static_cast<u64>(static_cast<u32>(~u32())));
--- a/libraries/libmesosphere/source/kern_k_scheduler.cpp
+++ b/libraries/libmesosphere/source/kern_k_scheduler.cpp
@ -79,13 +79,6 @@ namespace ams::kern {
        RescheduleCurrentCore();
    }

-    void KScheduler::RescheduleOtherCores(u64 cores_needing_scheduling) {
-        if (const u64 core_mask = cores_needing_scheduling & ~(1ul << m_core_id); core_mask != 0) {
-            cpu::DataSynchronizationBarrier();
-            Kernel::GetInterruptManager().SendInterProcessorInterrupt(KInterruptName_Scheduler, core_mask);
-        }
-    }
-
    u64 KScheduler::UpdateHighestPriorityThread(KThread *highest_thread) {
        if (KThread *prev_highest_thread = m_state.highest_priority_thread; AMS_LIKELY(prev_highest_thread != highest_thread)) {
            if (AMS_LIKELY(prev_highest_thread != nullptr)) {
@ -254,9 +247,24 @@ namespace ams::kern {

        MESOSPHERE_KTRACE_THREAD_SWITCH(next_thread);

+        #if defined(MESOSPHERE_ENABLE_HARDWARE_SINGLE_STEP)
+        /* Ensure the single-step bit in mdscr reflects the correct single-step state for the new thread. */
+        cpu::MonitorDebugSystemControlRegisterAccessor().SetSoftwareStep(next_thread->IsSingleStep()).Store();
+        #endif
+
        /* Switch the current process, if we're switching processes. */
        if (KProcess *next_process = next_thread->GetOwnerProcess(); next_process != cur_process) {
            KProcess::Switch(cur_process, next_process);
+        } else {
+            /* The single-step bit set up above requires an instruction synchronization barrier, to ensure */
+            /* the state change takes effect before we actually perform a return which might break-to-step. */
+            /* KProcess::Switch performs an isb incidentally, and so when we're changing process we */
+            /* can piggy-back off of that isb to avoid unnecessarily emptying the pipeline twice. */
+            /* However, this means that when we're switching to a thread in the same process, */
+            /* we must ensure that we still isb. In practice, gcc will deduplicate into a single isb. */
+            #if defined(MESOSPHERE_ENABLE_HARDWARE_SINGLE_STEP)
+            cpu::InstructionMemoryBarrier();
+            #endif
        }

        /* Set the new thread. */
--- a/mesosphere/kernel/source/arch/arm64/init/kern_init_core.cpp
+++ b/mesosphere/kernel/source/arch/arm64/init/kern_init_core.cpp
@ -49,10 +49,9 @@ namespace ams::kern::init {
        constexpr PageTableEntry KernelRwDataUncachedAttribute(PageTableEntry::Permission_KernelRW, PageTableEntry::PageAttribute_NormalMemoryNotCacheable, PageTableEntry::Shareable_InnerShareable, PageTableEntry::MappingFlag_Mapped);

        void StoreDataCache(const void *addr, size_t size) {
-            uintptr_t start = util::AlignDown(reinterpret_cast<uintptr_t>(addr), cpu::DataCacheLineSize);
-            uintptr_t end   = reinterpret_cast<uintptr_t>(addr) + size;
-            for (uintptr_t cur = start; cur < end; cur += cpu::DataCacheLineSize) {
-                __asm__ __volatile__("dc cvac, %[cur]" :: [cur]"r"(cur) : "memory");
+            const uintptr_t start = util::AlignDown(reinterpret_cast<uintptr_t>(addr), cpu::DataCacheLineSize);
+            for (size_t stored = 0; stored < size; stored += cpu::DataCacheLineSize) {
+                __asm__ __volatile__("dc cvac, %[cur]" :: [cur]"r"(start + stored) : "memory");
            }
            cpu::DataSynchronizationBarrier();
        }
@ -594,11 +593,13 @@ namespace ams::kern::init {

        switch (num_watchpoints) {
            FOR_I_IN_15_TO_1(MESOSPHERE_INITIALIZE_WATCHPOINT_CASE, 0)
+            case 0:
+                cpu::SetDbgWcr0El1(0);
+                cpu::SetDbgWvr0El1(0);
+            [[fallthrough]];
            default:
                break;
        }
-        cpu::SetDbgWcr0El1(0);
-        cpu::SetDbgWvr0El1(0);

        switch (num_breakpoints) {
            FOR_I_IN_15_TO_1(MESOSPHERE_INITIALIZE_BREAKPOINT_CASE, 0)
--- a/mesosphere/kernel/source/arch/arm64/kern_k_scheduler_asm.s
+++ b/mesosphere/kernel/source/arch/arm64/kern_k_scheduler_asm.s
@ -227,26 +227,7 @@ _ZN3ams4kern10KScheduler12ScheduleImplEv:
    mov    x0, x22
    RESTORE_THREAD_CONTEXT(x0, x1, x2, 9f)

-9:  /* Configure single-step, if we should. */
-    #if defined(MESOSPHERE_ENABLE_HARDWARE_SINGLE_STEP)
-
-    /* Get a reference to the new thread's stack parameters. */
-    add    x2, sp, #0x1000
-    and    x2, x2, #~(0x1000-1)
-
-    /* Read the single-step flag. */
-    ldurb  w2, [x2, #-(THREAD_STACK_PARAMETERS_SIZE - THREAD_STACK_PARAMETERS_IS_SINGLE_STEP)]
-
-    /* Update the single-step bit in mdscr_el1. */
-    mrs    x1, mdscr_el1
-    bic    x1, x1, #1
-    orr    x1, x1, x2
-    msr    mdscr_el1, x1
-
-    isb
-    #endif
-
-    /* We're done restoring the thread context, and can return safely. */
+9:  /* We're done restoring the thread context, and can return safely. */
    ret

 10: /* Our switch failed. */