Path: blob/master/src/core/cpu_recompiler_arm64.cpp
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <[email protected]>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "cpu_recompiler_arm64.h"
#include "cpu_core_private.h"
#include "cpu_pgxp.h"
#include "gte.h"
#include "settings.h"
#include "timing_event.h"

#include "common/align.h"
#include "common/assert.h"
#include "common/log.h"
#include "common/memmap.h"
#include "common/string_util.h"

#include <limits>

#ifdef CPU_ARCH_ARM64

#include "vixl/aarch64/constants-aarch64.h"

#ifdef ENABLE_HOST_DISASSEMBLY
#include "vixl/aarch64/disasm-aarch64.h"
#endif

LOG_CHANNEL(Recompiler);

#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (((u8*)(x)) - ((u8*)&g_state)))

#define RWRET vixl::aarch64::w0
#define RXRET vixl::aarch64::x0
#define RWARG1 vixl::aarch64::w0
#define RXARG1 vixl::aarch64::x0
#define RWARG2 vixl::aarch64::w1
#define RXARG2 vixl::aarch64::x1
#define RWARG3 vixl::aarch64::w2
#define RXARG3 vixl::aarch64::x2
#define RWSCRATCH vixl::aarch64::w16
#define RXSCRATCH vixl::aarch64::x16
#define RSTATE vixl::aarch64::x19
#define RMEMBASE vixl::aarch64::x20

static bool armIsCallerSavedRegister(u32 id);
static s64 armGetPCDisplacement(const void* current, const void* target);
static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
                           bool sign_extend_word = false);
static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
                            const vixl::aarch64::Register& tempreg = RXSCRATCH);
static u8* armGetJumpTrampoline(const void* target);
static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);

static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
static std::unordered_map<const void*, u32> s_trampoline_targets;
static u8* s_trampoline_start_ptr = nullptr;
static u32 s_trampoline_used = 0;

namespace CPU {

using namespace vixl::aarch64;

static ARM64Recompiler s_instance;
Recompiler* g_compiler = &s_instance;

} // namespace CPU

bool armIsCallerSavedRegister(u32 id)
{
  // same on both linux and windows
  return (id <= 18);
}

void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm)
{
  // From vixl macro assembler.
  DebugAssert(vixl::IsUint32(imm) || vixl::IsInt32(imm) || rd.Is64Bits());
  DebugAssert(rd.GetCode() != vixl::aarch64::sp.GetCode());

  if (imm == 0)
  {
    armAsm->mov(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd));
    return;
  }

  // The worst case for size is mov 64-bit immediate to sp:
  // * up to 4 instructions to materialise the constant
  // * 1 instruction to move to sp

  // Immediates on Aarch64 can be produced using an initial value, and zero to
  // three move keep operations.
  //
  // Initial values can be generated with:
  //  1. 64-bit move zero (movz).
  //  2. 32-bit move inverted (movn).
  //  3. 64-bit move inverted.
  //  4. 32-bit orr immediate.
  //  5. 64-bit orr immediate.
  // Move-keep may then be used to modify each of the 16-bit half words.
  //
  // The code below supports all five initial value generators, and
  // applying move-keep operations to move-zero and move-inverted initial
  // values.

  // Try to move the immediate in one instruction, and if that fails, switch to
  // using multiple instructions.
  const unsigned reg_size = rd.GetSizeInBits();

  if (vixl::aarch64::Assembler::IsImmMovz(imm, reg_size) && !rd.IsSP())
  {
    // Immediate can be represented in a move zero instruction. Movz can't write
    // to the stack pointer.
    armAsm->movz(rd, imm);
    return;
  }
  else if (vixl::aarch64::Assembler::IsImmMovn(imm, reg_size) && !rd.IsSP())
  {
    // Immediate can be represented in a move negative instruction. Movn can't
    // write to the stack pointer.
    armAsm->movn(rd, rd.Is64Bits() ? ~imm : (~imm & vixl::aarch64::kWRegMask));
    return;
  }
  else if (vixl::aarch64::Assembler::IsImmLogical(imm, reg_size))
  {
    // Immediate can be represented in a logical orr instruction.
    DebugAssert(!rd.IsZero());
    armAsm->orr(rd, vixl::aarch64::Assembler::AppropriateZeroRegFor(rd), imm);
    return;
  }

  // Generic immediate case. Imm will be represented by
  // [imm3, imm2, imm1, imm0], where each imm is 16 bits.
  // A move-zero or move-inverted is generated for the first non-zero or
  // non-0xffff immX, and a move-keep for subsequent non-zero immX.

  uint64_t ignored_halfword = 0;
  bool invert_move = false;
  // If the number of 0xffff halfwords is greater than the number of 0x0000
  // halfwords, it's more efficient to use move-inverted.
  if (vixl::CountClearHalfWords(~imm, reg_size) > vixl::CountClearHalfWords(imm, reg_size))
  {
    ignored_halfword = 0xffff;
    invert_move = true;
  }

  // Iterate through the halfwords. Use movn/movz for the first non-ignored
  // halfword, and movk for subsequent halfwords.
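  // Illustrative example (not taken from the original source): materializing
  // 0x0000001200005678 into a 64-bit register takes two instructions, roughly
  //   movz x0, #0x5678                 ; halfword 0
  //   movk x0, #0x12, lsl #32          ; halfword 2
  // since the all-zero halfwords 1 and 3 are skipped entirely.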
  DebugAssert((reg_size % 16) == 0);
  bool first_mov_done = false;
  for (unsigned i = 0; i < (reg_size / 16); i++)
  {
    uint64_t imm16 = (imm >> (16 * i)) & 0xffff;
    if (imm16 != ignored_halfword)
    {
      if (!first_mov_done)
      {
        if (invert_move)
          armAsm->movn(rd, ~imm16 & 0xffff, 16 * i);
        else
          armAsm->movz(rd, imm16, 16 * i);
        first_mov_done = true;
      }
      else
      {
        // Construct a wider constant.
        armAsm->movk(rd, imm16, 16 * i);
      }
    }
  }

  DebugAssert(first_mov_done);
}

s64 armGetPCDisplacement(const void* current, const void* target)
{
  // pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(current), 4));
  // pxAssert(Common::IsAlignedPow2(reinterpret_cast<size_t>(target), 4));
  return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
}

bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);

  return (vixl::IsInt21(page_displacement) && (vixl::aarch64::Assembler::IsImmAddSub(page_offset) ||
                                               vixl::aarch64::Assembler::IsImmLogical(page_offset, 64)));
}

void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr)
{
  DebugAssert(reg.IsX());

  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
  if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmAddSub(page_offset))
  {
    armAsm->adrp(reg, page_displacement);
    armAsm->add(reg, reg, page_offset);
  }
  else if (vixl::IsInt21(page_displacement) && vixl::aarch64::Assembler::IsImmLogical(page_offset, 64))
  {
    armAsm->adrp(reg, page_displacement);
    armAsm->orr(reg, reg, page_offset);
  }
  else
  {
    armEmitMov(armAsm, reg, reinterpret_cast<uintptr_t>(addr));
  }
}

void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  s64 displacement = armGetPCDisplacement(cur, ptr);
  bool use_blr = !vixl::IsInt26(displacement);
  bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
  if (use_blr && use_trampoline && !force_inline)
  {
    if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    {
      displacement = armGetPCDisplacement(cur, trampoline);
      use_blr = !vixl::IsInt26(displacement);
    }
  }

  if (use_blr)
  {
    armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
    armAsm->br(RXSCRATCH);
  }
  else
  {
    armAsm->b(displacement);
  }
}
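// Mirrors armEmitJmp above, but emits a linked call: a direct bl when the
// +/-128MB branch displacement fits, otherwise the address is materialized and
// called through the scratch register with blr.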
void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  s64 displacement = armGetPCDisplacement(cur, ptr);
  bool use_blr = !vixl::IsInt26(displacement);
  bool use_trampoline = use_blr && !armIsInAdrpRange(armAsm, ptr);
  if (use_blr && use_trampoline && !force_inline)
  {
    if (u8* trampoline = armGetJumpTrampoline(ptr); trampoline)
    {
      displacement = armGetPCDisplacement(cur, trampoline);
      use_blr = !vixl::IsInt26(displacement);
    }
  }

  if (use_blr)
  {
    armMoveAddressToReg(armAsm, RXSCRATCH, ptr);
    armAsm->blr(RXSCRATCH);
  }
  else
  {
    armAsm->bl(displacement);
  }
}

void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr)
{
  const s64 jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
                                             reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
  // pxAssert(Common::IsAligned(jump_distance, 4));

  if (vixl::aarch64::Instruction::IsValidImmPCOffset(vixl::aarch64::CondBranchType, jump_distance >> 2))
  {
    armAsm->b(jump_distance >> 2, cond);
  }
  else
  {
    vixl::aarch64::Label branch_not_taken;
    armAsm->b(&branch_not_taken, InvertCondition(cond));

    const s64 new_jump_distance = static_cast<s64>(reinterpret_cast<intptr_t>(ptr) -
                                                   reinterpret_cast<intptr_t>(armAsm->GetCursorAddress<const void*>()));
    armAsm->b(new_jump_distance >> 2);
    armAsm->bind(&branch_not_taken);
  }
}

void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
                    bool sign_extend_word)
{
  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
  vixl::aarch64::MemOperand memop;

  const vixl::aarch64::Register xreg = reg.X();
  if (vixl::IsInt21(page_displacement))
  {
    armAsm->adrp(xreg, page_displacement);
    memop = vixl::aarch64::MemOperand(xreg, static_cast<int64_t>(page_offset));
  }
  else
  {
    armMoveAddressToReg(armAsm, xreg, addr);
    memop = vixl::aarch64::MemOperand(xreg);
  }

  if (sign_extend_word)
    armAsm->ldrsw(reg, memop);
  else
    armAsm->ldr(reg, memop);
}

[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
                                      const void* addr, const vixl::aarch64::Register& tempreg)
{
  DebugAssert(tempreg.IsX());

  const void* cur = armAsm->GetCursorAddress<const void*>();
  const void* current_code_ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(cur) & ~static_cast<uintptr_t>(0xFFF));
  const void* ptr_page =
    reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
  const s64 page_displacement = armGetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
  const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);

  if (vixl::IsInt21(page_displacement))
  {
    armAsm->adrp(tempreg, page_displacement);
    armAsm->str(reg, vixl::aarch64::MemOperand(tempreg, static_cast<int64_t>(page_offset)));
  }
  else
  {
    armMoveAddressToReg(armAsm, tempreg, addr);
    armAsm->str(reg, vixl::aarch64::MemOperand(tempreg));
  }
}
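// Returns (creating on first use) a small trampoline for targets that are out of
// direct branch range: it materializes the target address into the scratch
// register and jumps through it with br. Trampolines live in a fixed-size area
// placed after the ASM dispatcher functions and are reused per target.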
u8* armGetJumpTrampoline(const void* target)
{
  auto it = s_trampoline_targets.find(target);
  if (it != s_trampoline_targets.end())
    return s_trampoline_start_ptr + it->second;

  // align to 16 bytes?
  const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);

  // 4 movs plus a jump
  if (TRAMPOLINE_AREA_SIZE - offset < 20)
  {
    Panic("Ran out of space in constant pool");
    return nullptr;
  }

  u8* start = s_trampoline_start_ptr + offset;
  vixl::aarch64::Assembler armAsm(start, TRAMPOLINE_AREA_SIZE - offset);
#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope armAsmCheck(&armAsm, TRAMPOLINE_AREA_SIZE - offset,
                                         vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif
  armMoveAddressToReg(&armAsm, RXSCRATCH, target);
  armAsm.br(RXSCRATCH);
  armAsm.FinalizeCode();

  const u32 size = static_cast<u32>(armAsm.GetSizeOfCodeGenerated());
  DebugAssert(size < 20);
  s_trampoline_targets.emplace(target, offset);
  s_trampoline_used = offset + static_cast<u32>(size);

  MemMap::FlushInstructionCache(start, size);
  return start;
}

void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
{
  size_t addr = armAsm->GetCursorAddress<size_t>();
  const size_t end_addr = Common::AlignUpPow2(addr, alignment);
  while (addr != end_addr)
  {
    armAsm->nop();
    addr += vixl::aarch64::kInstructionSize;
  }
}

void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
{
#ifdef ENABLE_HOST_DISASSEMBLY
  class MyDisassembler : public vixl::aarch64::Disassembler
  {
  protected:
    void ProcessOutput(const vixl::aarch64::Instruction* instr) override
    {
      DEBUG_LOG("0x{:016X} {:08X}\t\t{}", reinterpret_cast<uint64_t>(instr), instr->GetInstructionBits(), GetOutput());
    }
  };

  vixl::aarch64::Decoder decoder;
  MyDisassembler disas;
  decoder.AppendVisitor(&disas);
  decoder.Decode(static_cast<const vixl::aarch64::Instruction*>(start),
                 reinterpret_cast<const vixl::aarch64::Instruction*>(static_cast<const u8*>(start) + size));
#else
  ERROR_LOG("Not compiled with ENABLE_HOST_DISASSEMBLY.");
#endif
}

u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size)
{
  return size / vixl::aarch64::kInstructionSize;
}

u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
{
  using namespace vixl::aarch64;

  const s64 disp = armGetPCDisplacement(code, dst);
  DebugAssert(vixl::IsInt26(disp));

  const u32 new_code = B | Assembler::ImmUncondBranch(disp);
  std::memcpy(code, &new_code, sizeof(new_code));
  if (flush_icache)
    MemMap::FlushInstructionCache(code, kInstructionSize);

  return kInstructionSize;
}

u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
{
  using namespace vixl::aarch64;

  Assembler actual_asm(static_cast<u8*>(code), code_size);
  Assembler* RESTRICT armAsm = &actual_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  Label dispatch;
  Label run_events_and_dispatch;

  g_enter_recompiler = armAsm->GetCursorAddress<decltype(g_enter_recompiler)>();
  {
    // Need the CPU state for basically everything :-)
    armMoveAddressToReg(armAsm, RSTATE, &g_state);

    // Fastmem setup, oldrec doesn't need it
    if (IsUsingFastmem())
      armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));

    // Fall through to event dispatcher
  }
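  // The stubs below form the dispatcher loop: compare pending_ticks against
  // downcount (running events once it is reached), look the current pc up in
  // g_code_lut, and jump to the block. The compile/revalidate, discard, and
  // interpret stubs all branch back to the same dispatch label.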
  // check events then for frame done
  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RWARG2, PTR(&g_state.downcount));
    armAsm->cmp(RWARG1, RWARG2);
    armAsm->b(&dispatch, lt);

    g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
    armAsm->bind(&run_events_and_dispatch);
    armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_dispatcher = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->bind(&dispatch);

    // x9 <- s_fast_map[pc >> 16]
    armAsm->ldr(RWARG1, PTR(&g_state.pc));
    armMoveAddressToReg(armAsm, RXARG3, g_code_lut.data());
    armAsm->lsr(RWARG2, RWARG1, 16);
    armAsm->ubfx(RWARG1, RWARG1, 2, 14);
    armAsm->ldr(RXARG2, MemOperand(RXARG3, RXARG2, LSL, 3));

    // blr(x9[pc * 2]) (fast_map[pc >> 2])
    armAsm->ldr(RXARG1, MemOperand(RXARG2, RXARG1, LSL, 3));
    armAsm->br(RXARG1);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pc));
    armEmitCall(armAsm, reinterpret_cast<const void*>(&CompileOrRevalidateBlock), true);
    armAsm->b(&dispatch);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pc));
    armEmitCall(armAsm, reinterpret_cast<const void*>(&DiscardAndRecompileBlock), true);
    armAsm->b(&dispatch);
  }

  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
  g_interpret_block = armAsm->GetCursorAddress<const void*>();
  {
    armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RWARG2, PTR(&g_state.downcount));
    armAsm->cmp(RWARG1, RWARG2);
    armAsm->b(&run_events_and_dispatch, ge);
    armAsm->b(&dispatch);
  }

  armAsm->FinalizeCode();

  s_trampoline_targets.clear();
  s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
  s_trampoline_used = 0;

  return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
}

void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
{
  constexpr u8 padding_value = 0x00;
  std::memset(dst, padding_value, size);
}

CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
{
}

CPU::ARM64Recompiler::~ARM64Recompiler() = default;

const void* CPU::ARM64Recompiler::GetCurrentCodePointer()
{
  return armAsm->GetCursorAddress<const void*>();
}

void CPU::ARM64Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer,
                                 u32 far_code_space)
{
  Recompiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space);

  // TODO: don't recreate this every time..
  DebugAssert(!armAsm);
  m_emitter.GetBuffer()->Reset(code_buffer, code_buffer_space);
  m_far_emitter.GetBuffer()->Reset(far_code_buffer, far_code_space);
  armAsm = &m_emitter;

#ifdef VIXL_DEBUG
  m_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(&m_emitter, code_buffer_space,
                                                                 vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
  m_far_emitter_check = std::make_unique<vixl::CodeBufferCheckScope>(
    &m_far_emitter, far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif
  // Need to wipe it out so it's correct when toggling fastmem.
  m_host_regs = {};

  const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS;
  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    HostRegAlloc& ra = m_host_regs[i];

    if (i == RWARG1.GetCode() || i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() ||
        i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30)
    {
      continue;
    }

    ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED);
  }
}

void CPU::ARM64Recompiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond)
{
  DebugAssert(armAsm == &m_emitter);
  if (emit_jump)
  {
    const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
    if (cond != Condition::al)
    {
      if (vixl::IsInt19(disp))
      {
        armAsm->b(disp, cond);
      }
      else
      {
        Label skip;
        armAsm->b(&skip, vixl::aarch64::InvertCondition(cond));
        armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
        armAsm->bind(&skip);
      }
    }
    else
    {
      armAsm->b(disp);
    }
  }
  armAsm = &m_far_emitter;
}

void CPU::ARM64Recompiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit)
{
  const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
  if (vixl::IsInt14(disp))
  {
    armAsm->tbnz(reg, bit, disp);
  }
  else
  {
    Label skip;
    armAsm->tbz(reg, bit, &skip);
    armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    armAsm->bind(&skip);
  }

  armAsm = &m_far_emitter;
}

void CPU::ARM64Recompiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero)
{
  const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>());
  if (vixl::IsInt19(disp))
  {
    nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp);
  }
  else
  {
    Label skip;
    nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip);
    armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter.GetCursorAddress<const void*>()));
    armAsm->bind(&skip);
  }

  armAsm = &m_far_emitter;
}

void CPU::ARM64Recompiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond)
{
  DebugAssert(armAsm == &m_far_emitter);
  if (emit_jump)
  {
    const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter.GetCursorAddress<const void*>());
    (cond != Condition::al) ? armAsm->b(disp, cond) : armAsm->b(disp);
  }
  armAsm = &m_emitter;
}

void CPU::ARM64Recompiler::EmitMov(const vixl::aarch64::Register& dst, u32 val)
{
  armEmitMov(armAsm, dst, val);
}

void CPU::ARM64Recompiler::EmitCall(const void* ptr, bool force_inline /*= false*/)
{
  armEmitCall(armAsm, ptr, force_inline);
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(s32 val)
{
  if (Assembler::IsImmAddSub(val))
    return vixl::aarch64::Operand(static_cast<int64_t>(val));

  EmitMov(RWSCRATCH, static_cast<u32>(val));
  return vixl::aarch64::Operand(RWSCRATCH);
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckAddSubConstant(u32 val)
{
  return armCheckAddSubConstant(static_cast<s32>(val));
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckCompareConstant(s32 val)
{
  if (Assembler::IsImmConditionalCompare(val))
    return vixl::aarch64::Operand(static_cast<int64_t>(val));

  EmitMov(RWSCRATCH, static_cast<u32>(val));
  return vixl::aarch64::Operand(RWSCRATCH);
}

vixl::aarch64::Operand CPU::ARM64Recompiler::armCheckLogicalConstant(u32 val)
{
  if (Assembler::IsImmLogical(val, 32))
    return vixl::aarch64::Operand(static_cast<s64>(static_cast<u64>(val)));

  EmitMov(RWSCRATCH, val);
  return vixl::aarch64::Operand(RWSCRATCH);
}

void CPU::ARM64Recompiler::BeginBlock()
{
  Recompiler::BeginBlock();
}
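// Emits the self-modifying-code check for a block: the block's RAM is compared
// against the shadow copy captured at compile time, 16 bytes at a time with NEON
// (cmeq + uminv) and with scalar compares for any remaining tail; a mismatch
// jumps to g_discard_and_recompile_block.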
void CPU::ARM64Recompiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size)
{
  // store it first to reduce code size, because we can offset
  armMoveAddressToReg(armAsm, RXARG1, ram_ptr);
  armMoveAddressToReg(armAsm, RXARG2, shadow_ptr);

  bool first = true;
  u32 offset = 0;
  Label block_changed;

  while (size >= 16)
  {
    const VRegister vtmp = v2.V4S();
    const VRegister dst = first ? v0.V4S() : v1.V4S();
    armAsm->ldr(dst, MemOperand(RXARG1, offset));
    armAsm->ldr(vtmp, MemOperand(RXARG2, offset));
    armAsm->cmeq(dst, dst, vtmp);
    if (!first)
      armAsm->and_(v0.V16B(), v0.V16B(), dst.V16B());
    else
      first = false;

    offset += 16;
    size -= 16;
  }

  if (!first)
  {
    // TODO: make sure this doesn't choke on ffffffff
    armAsm->uminv(s0, v0.V4S());
    armAsm->fcmp(s0, 0.0);
    armAsm->b(&block_changed, eq);
  }

  while (size >= 8)
  {
    armAsm->ldr(RXARG3, MemOperand(RXARG1, offset));
    armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset));
    armAsm->cmp(RXARG3, RXSCRATCH);
    armAsm->b(&block_changed, ne);
    offset += 8;
    size -= 8;
  }

  while (size >= 4)
  {
    armAsm->ldr(RWARG3, MemOperand(RXARG1, offset));
    armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset));
    armAsm->cmp(RWARG3, RWSCRATCH);
    armAsm->b(&block_changed, ne);
    offset += 4;
    size -= 4;
  }

  DebugAssert(size == 0);

  Label block_unchanged;
  armAsm->b(&block_unchanged);
  armAsm->bind(&block_changed);
  armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false);
  armAsm->bind(&block_unchanged);
}

void CPU::ARM64Recompiler::GenerateICacheCheckAndUpdate()
{
  if (!m_block->HasFlag(CodeCache::BlockFlags::IsUsingICache))
  {
    if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
    {
      armEmitFarLoad(armAsm, RWARG2, GetFetchMemoryAccessTimePtr());
      armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
      armEmitMov(armAsm, RWARG3, m_block->size);
      armAsm->mul(RWARG2, RWARG2, RWARG3);
      armAsm->add(RWARG1, RWARG1, RWARG2);
      armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    }
    else
    {
      armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
      armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast<u32>(m_block->uncached_fetch_ticks)));
      armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    }
  }
  else if (m_block->icache_line_count > 0)
  {
    const auto& ticks_reg = RWARG1;
    const auto& current_tag_reg = RWARG2;
    const auto& existing_tag_reg = RWARG3;
    const auto& fill_ticks_reg = w4;
    const auto& ticks_to_add_reg = w5;

    VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK;
    const TickCount fill_ticks = GetICacheFillTicks(current_pc);
    if (fill_ticks <= 0)
      return;

    armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks));
    armEmitMov(armAsm, current_tag_reg, current_pc);
    armEmitMov(armAsm, fill_ticks_reg, fill_ticks);

    for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE)
    {
      const u32 line = GetICacheLine(current_pc);
      const u32 offset = OFFSETOF(State, icache_tags) + (line * sizeof(u32));

      Label cache_hit;
      armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset));
      armAsm->str(current_tag_reg, MemOperand(RSTATE, offset));
      armAsm->cmp(existing_tag_reg, current_tag_reg);
      armAsm->csel(ticks_to_add_reg, fill_ticks_reg, wzr, ne);
      armAsm->add(ticks_reg, ticks_reg, ticks_to_add_reg);

      if (i != (m_block->icache_line_count - 1))
        armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE));
    }

    armAsm->str(ticks_reg, PTR(&g_state.pending_ticks));
  }
}

void CPU::ARM64Recompiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/,
                                        s32 arg3reg /*= -1*/)
{
  if (arg1reg >= 0 && arg1reg != static_cast<s32>(RXARG1.GetCode()))
    armAsm->mov(RXARG1, XRegister(arg1reg));
  if (arg2reg >= 0 && arg2reg != static_cast<s32>(RXARG2.GetCode()))
    armAsm->mov(RXARG2, XRegister(arg2reg));
  if (arg3reg >= 0 && arg3reg != static_cast<s32>(RXARG3.GetCode()))
    armAsm->mov(RXARG3, XRegister(arg3reg));
  EmitCall(func);
}

void CPU::ARM64Recompiler::EndBlock(const std::optional<u32>& newpc, bool do_event_test)
{
  if (newpc.has_value())
  {
    if (m_dirty_pc || m_compiler_pc != newpc)
    {
      EmitMov(RWSCRATCH, newpc.value());
      armAsm->str(RWSCRATCH, PTR(&g_state.pc));
    }
  }
  m_dirty_pc = false;

  // flush regs
  Flush(FLUSH_END_BLOCK);
  EndAndLinkBlock(newpc, do_event_test, false);
}

void CPU::ARM64Recompiler::EndBlockWithException(Exception excode)
{
  // flush regs, but not pc, it's going to get overwritten
  // flush cycles because of the GTE instruction stuff...
  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // TODO: flush load delay

  EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false,
                                                              inst->cop.cop_n));
  EmitMov(RWARG2, m_current_instruction_pc);
  if (excode != Exception::BP)
  {
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
  }
  else
  {
    EmitMov(RWARG3, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&CPU::RaiseBreakException));
  }
  m_dirty_pc = false;

  EndAndLinkBlock(std::nullopt, true, false);
}

void CPU::ARM64Recompiler::EndAndLinkBlock(const std::optional<u32>& newpc, bool do_event_test, bool force_run_events)
{
  // event test
  // pc should've been flushed
  DebugAssert(!m_dirty_pc && !m_block_ended);
  m_block_ended = true;

  // TODO: try extracting this to a function

  // save cycles for event test
  const TickCount cycles = std::exchange(m_cycles, 0);

  // pending_ticks += cycles
  // if (pending_ticks >= downcount) { dispatch_event(); }
  if (do_event_test || m_gte_done_cycle > cycles || cycles > 0)
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
  if (do_event_test)
    armAsm->ldr(RWARG2, PTR(&g_state.downcount));
  if (cycles > 0)
    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles));
  if (m_gte_done_cycle > cycles)
  {
    armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles));
    armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick));
  }
  if (do_event_test)
    armAsm->cmp(RWARG1, RWARG2);
  if (cycles > 0)
    armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
  if (do_event_test)
    armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch);

  // jump to dispatcher or next block
  if (force_run_events)
  {
    armEmitJmp(armAsm, CodeCache::g_run_events_and_dispatch, false);
  }
  else if (!newpc.has_value())
  {
    armEmitJmp(armAsm, CodeCache::g_dispatcher, false);
  }
  else
  {
    const void* target = (newpc.value() == m_block->pc) ?
                           CodeCache::CreateSelfBlockLink(m_block, armAsm->GetCursorAddress<void*>(),
                                                          armAsm->GetBuffer()->GetStartAddress<const void*>()) :
                           CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress<void*>(), newpc.value());
    armEmitJmp(armAsm, target, true);
  }
}

const void* CPU::ARM64Recompiler::EndCompile(u32* code_size, u32* far_code_size)
{
#ifdef VIXL_DEBUG
  m_emitter_check.reset();
  m_far_emitter_check.reset();
#endif

  m_emitter.FinalizeCode();
  m_far_emitter.FinalizeCode();

  u8* const code = m_emitter.GetBuffer()->GetStartAddress<u8*>();
  *code_size = static_cast<u32>(m_emitter.GetCursorOffset());
  *far_code_size = static_cast<u32>(m_far_emitter.GetCursorOffset());
  armAsm = nullptr;
  return code;
}

const char* CPU::ARM64Recompiler::GetHostRegName(u32 reg) const
{
  static constexpr std::array<const char*, 32> reg64_names = {
    {"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
     "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}};
  return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN";
}

void CPU::ARM64Recompiler::LoadHostRegWithConstant(u32 reg, u32 val)
{
  EmitMov(WRegister(reg), val);
}

void CPU::ARM64Recompiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr)
{
  armAsm->ldr(WRegister(reg), PTR(ptr));
}

void CPU::ARM64Recompiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr)
{
  armAsm->str(WRegister(reg), PTR(ptr));
}

void CPU::ARM64Recompiler::StoreConstantToCPUPointer(u32 val, const void* ptr)
{
  if (val == 0)
  {
    armAsm->str(wzr, PTR(ptr));
    return;
  }

  EmitMov(RWSCRATCH, val);
  armAsm->str(RWSCRATCH, PTR(ptr));
}

void CPU::ARM64Recompiler::CopyHostReg(u32 dst, u32 src)
{
  if (src != dst)
    armAsm->mov(WRegister(dst), WRegister(src));
}

void CPU::ARM64Recompiler::AssertRegOrConstS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s || cf.const_s);
}

void CPU::ARM64Recompiler::AssertRegOrConstT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t || cf.const_t);
}

vixl::aarch64::MemOperand CPU::ARM64Recompiler::MipsPtr(Reg r) const
{
  DebugAssert(r < Reg::count);
  return PTR(&g_state.regs.r[static_cast<u32>(r)]);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegD(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_d);
  return WRegister(cf.host_d);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegS(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_s);
  return WRegister(cf.host_s);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegT(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_t);
  return WRegister(cf.host_t);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegLO(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_lo);
  return WRegister(cf.host_lo);
}

vixl::aarch64::Register CPU::ARM64Recompiler::CFGetRegHI(CompileFlags cf) const
{
  DebugAssert(cf.valid_host_hi);
  return WRegister(cf.host_hi);
}

void CPU::ARM64Recompiler::MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
{
  DebugAssert(dst.IsW());
  if (cf.valid_host_s)
  {
    if (cf.host_s != dst.GetCode())
      armAsm->mov(dst, WRegister(cf.host_s));
  }
  else if (cf.const_s)
  {
    const u32 cv = GetConstantRegU32(cf.MipsS());
    if (cv == 0)
      armAsm->mov(dst, wzr);
    else
      EmitMov(dst, cv);
  }
  else
  {
    WARNING_LOG("Hit memory path in MoveSToReg() for {}", GetRegName(cf.MipsS()));
    armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s]));
  }
}

void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf)
{
  DebugAssert(dst.IsW());
  if (cf.valid_host_t)
  {
    if (cf.host_t != dst.GetCode())
      armAsm->mov(dst, WRegister(cf.host_t));
  }
  else if (cf.const_t)
  {
    const u32 cv = GetConstantRegU32(cf.MipsT());
    if (cv == 0)
      armAsm->mov(dst, wzr);
    else
      EmitMov(dst, cv);
  }
  else
  {
    WARNING_LOG("Hit memory path in MoveTToReg() for {}", GetRegName(cf.MipsT()));
    armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t]));
  }
}
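// Reads a guest (MIPS) register into dst. When ignore_load_delays is set and reg
// is the pending delayed-load target, the in-flight value is forwarded; otherwise
// an allocated host register, then a known constant, then the register file in
// g_state is used, in that order.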
void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays)
{
  DebugAssert(reg < Reg::count && dst.IsW());
  if (ignore_load_delays && m_load_delay_register == reg)
  {
    if (m_load_delay_value_register == NUM_HOST_REGS)
      armAsm->ldr(dst, PTR(&g_state.load_delay_value));
    else
      armAsm->mov(dst, WRegister(m_load_delay_value_register));
  }
  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
  {
    armAsm->mov(dst, WRegister(hreg.value()));
  }
  else if (HasConstantReg(reg))
  {
    EmitMov(dst, GetConstantRegU32(reg));
  }
  else
  {
    armAsm->ldr(dst, MipsPtr(reg));
  }
}

void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
                                                        Reg arg3reg /* = Reg::count */)
{
  DebugAssert(g_settings.gpu_pgxp_enable);

  Flush(FLUSH_FOR_C_CALL);

  if (arg2reg != Reg::count)
    MoveMIPSRegToReg(RWARG2, arg2reg);
  if (arg3reg != Reg::count)
    MoveMIPSRegToReg(RWARG3, arg3reg);

  EmitMov(RWARG1, arg1val);
  EmitCall(func);
}

void CPU::ARM64Recompiler::Flush(u32 flags)
{
  Recompiler::Flush(flags);

  if (flags & FLUSH_PC && m_dirty_pc)
  {
    StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc);
    m_dirty_pc = false;
  }

  if (flags & FLUSH_INSTRUCTION_BITS)
  {
    // This sucks, but it's only used for fallbacks.
    EmitMov(RWARG1, inst->bits);
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
    armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
    armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
    armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
  }

  if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
  {
    // This sucks :(
    // TODO: make it a function?
    armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg));
    armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value));
    EmitMov(RWSCRATCH, OFFSETOF(CPU::State, regs.r[0]));
    armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2));
    armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1));
    EmitMov(RWSCRATCH, static_cast<u8>(Reg::count));
    armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
    m_load_delay_dirty = false;
  }

  if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count)
  {
    if (m_load_delay_value_register != NUM_HOST_REGS)
      FreeHostReg(m_load_delay_value_register);

    EmitMov(RWSCRATCH, static_cast<u8>(m_load_delay_register));
    armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg));
    m_load_delay_register = Reg::count;
    m_load_delay_dirty = true;
  }

  if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle)
  {
    // May as well flush cycles while we're here.
    // GTE spanning blocks is very rare, we _could_ disable this for speed.
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick));
    if (m_cycles > 0)
    {
      armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
      m_cycles = 0;
    }
    armAsm->cmp(RWARG2, RWARG1);
    armAsm->csel(RWARG1, RWARG2, RWARG1, hs);
    armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    m_dirty_gte_done_cycle = false;
  }

  if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));

    // update cycles at the same time
    if (flags & FLUSH_CYCLES && m_cycles > 0)
    {
      armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
      armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
      m_gte_done_cycle -= m_cycles;
      m_cycles = 0;
    }

    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle));
    armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick));
    m_gte_done_cycle = 0;
    m_dirty_gte_done_cycle = true;
  }

  if (flags & FLUSH_CYCLES && m_cycles > 0)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
    armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles));
    armAsm->str(RWARG1, PTR(&g_state.pending_ticks));
    m_gte_done_cycle = std::max<TickCount>(m_gte_done_cycle - m_cycles, 0);
    m_cycles = 0;
  }
}

void CPU::ARM64Recompiler::Compile_Fallback()
{
  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
              inst->bits);

  Flush(FLUSH_FOR_INTERPRETER);

  EmitCall(reinterpret_cast<const void*>(&CPU::RecompilerThunks::InterpretInstruction));

  // TODO: make me less garbage
  // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
  // but nothing should be going through here..
  Label no_load_delay;
  armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
  armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
  armAsm->b(&no_load_delay, eq);
  armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
  armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
  armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
  EmitMov(RWARG1, static_cast<u32>(Reg::count));
  armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
  armAsm->bind(&no_load_delay);

  m_load_delay_dirty = EMULATE_LOAD_DELAYS;
}

void CPU::ARM64Recompiler::CheckBranchTarget(const vixl::aarch64::Register& pcreg)
{
  DebugAssert(pcreg.IsW());
  if (!g_settings.cpu_recompiler_memory_exceptions)
    return;

  armAsm->tst(pcreg, armCheckLogicalConstant(0x3));
  SwitchToFarCode(true, ne);

  BackupHostState();
  EndBlockWithException(Exception::AdEL);

  RestoreHostState();
  SwitchToNearCode(false);
}

void CPU::ARM64Recompiler::Compile_jr(CompileFlags cf)
{
  const Register pcreg = CFGetRegS(cf);
  CheckBranchTarget(pcreg);

  armAsm->str(pcreg, PTR(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}

void CPU::ARM64Recompiler::Compile_jalr(CompileFlags cf)
{
  const Register pcreg = CFGetRegS(cf);
  if (MipsD() != Reg::zero)
    SetConstantReg(MipsD(), GetBranchReturnAddress(cf));

  CheckBranchTarget(pcreg);
  armAsm->str(pcreg, PTR(&g_state.pc));

  CompileBranchDelaySlot(false);
  EndBlock(std::nullopt, true);
}
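// Conditional branches: compare rs against rt (or use cbz/cbnz for compares with
// zero), then compile both outcomes, the fall-through path to m_compiler_pc and
// the taken path to taken_pc, each ending the block separately. The delay slot is
// compiled on both paths unless it was already swapped ahead of the branch.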
void CPU::ARM64Recompiler::Compile_bxx(CompileFlags cf, BranchCondition cond)
{
  AssertRegOrConstS(cf);

  const u32 taken_pc = GetConditionalBranchTarget(cf);

  Flush(FLUSH_FOR_BRANCH);

  DebugAssert(cf.valid_host_s);

  // MipsT() here should equal zero for zero branches.
  DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero);

  Label taken;
  const Register rs = CFGetRegS(cf);
  switch (cond)
  {
    case BranchCondition::Equal:
    case BranchCondition::NotEqual:
    {
      AssertRegOrConstT(cf);
      if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0))
      {
        (cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken);
      }
      else
      {
        if (cf.valid_host_t)
          armAsm->cmp(rs, CFGetRegT(cf));
        else if (cf.const_t)
          armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT())));

        armAsm->b(&taken, (cond == BranchCondition::Equal) ? eq : ne);
      }
    }
    break;

    case BranchCondition::GreaterThanZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, gt);
    }
    break;

    case BranchCondition::GreaterEqualZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, ge);
    }
    break;

    case BranchCondition::LessThanZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, lt);
    }
    break;

    case BranchCondition::LessEqualZero:
    {
      armAsm->cmp(rs, 0);
      armAsm->b(&taken, le);
    }
    break;
  }

  BackupHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(m_compiler_pc, true);

  armAsm->bind(&taken);

  RestoreHostState();
  if (!cf.delay_slot_swapped)
    CompileBranchDelaySlot();

  EndBlock(taken_pc, true);
}

void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf, bool overflow)
{
  const Register rs = CFGetRegS(cf);
  const Register rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_sext32(); imm != 0)
  {
    if (!overflow)
    {
      armAsm->add(rt, rs, armCheckAddSubConstant(imm));
    }
    else
    {
      armAsm->adds(rt, rs, armCheckAddSubConstant(imm));
      TestOverflow(rt);
    }
  }
  else if (rt.GetCode() != rs.GetCode())
  {
    armAsm->mov(rt, rs);
  }
}

void CPU::ARM64Recompiler::Compile_addi(CompileFlags cf)
{
  Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions);
}

void CPU::ARM64Recompiler::Compile_addiu(CompileFlags cf)
{
  Compile_addi(cf, false);
}

void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf)
{
  Compile_slti(cf, true);
}

void CPU::ARM64Recompiler::Compile_sltiu(CompileFlags cf)
{
  Compile_slti(cf, false);
}
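// slti/sltiu share one implementation: compare rs against the sign-extended
// immediate, then materialize the 0/1 result with cset using a signed (lt) or
// unsigned (lo) condition.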
void CPU::ARM64Recompiler::Compile_slti(CompileFlags cf, bool sign)
{
  armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast<s32>(inst->i.imm_sext32())));
  armAsm->cset(CFGetRegT(cf), sign ? lt : lo);
}

void CPU::ARM64Recompiler::Compile_andi(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm));
  else
    armAsm->mov(rt, wzr);
}

void CPU::ARM64Recompiler::Compile_ori(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  const Register rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->orr(rt, rs, armCheckLogicalConstant(imm));
  else if (rt.GetCode() != rs.GetCode())
    armAsm->mov(rt, rs);
}

void CPU::ARM64Recompiler::Compile_xori(CompileFlags cf)
{
  const Register rt = CFGetRegT(cf);
  const Register rs = CFGetRegS(cf);
  if (const u32 imm = inst->i.imm_zext32(); imm != 0)
    armAsm->eor(rt, rs, armCheckLogicalConstant(imm));
  else if (rt.GetCode() != rs.GetCode())
    armAsm->mov(rt, rs);
}

void CPU::ARM64Recompiler::Compile_shift(CompileFlags cf,
                                         void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
                                                                              const vixl::aarch64::Register&, unsigned))
{
  const Register rd = CFGetRegD(cf);
  const Register rt = CFGetRegT(cf);
  if (inst->r.shamt > 0)
    (armAsm->*op)(rd, rt, inst->r.shamt);
  else if (rd.GetCode() != rt.GetCode())
    armAsm->mov(rd, rt);
}

void CPU::ARM64Recompiler::Compile_sll(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::lsl);
}

void CPU::ARM64Recompiler::Compile_srl(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::lsr);
}

void CPU::ARM64Recompiler::Compile_sra(CompileFlags cf)
{
  Compile_shift(cf, &Assembler::asr);
}

void CPU::ARM64Recompiler::Compile_variable_shift(
  CompileFlags cf,
  void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&,
                                       const vixl::aarch64::Register&),
  void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned))
{
  const Register rd = CFGetRegD(cf);

  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  if (cf.const_s)
  {
    if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0)
      (armAsm->*op_const)(rd, rt, shift);
    else if (rd.GetCode() != rt.GetCode())
      armAsm->mov(rd, rt);
  }
  else
  {
    (armAsm->*op)(rd, rt, CFGetRegS(cf));
  }
}

void CPU::ARM64Recompiler::Compile_sllv(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl);
}

void CPU::ARM64Recompiler::Compile_srlv(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr);
}

void CPU::ARM64Recompiler::Compile_srav(CompileFlags cf)
{
  Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr);
}
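// mult/multu: a single 64-bit smull/umull writes the full product into LO's
// 64-bit view, and HI is then extracted with a logical shift right by 32.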
void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf, bool sign)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  // TODO: if lo/hi gets killed, we can use a 32-bit multiply
  const Register lo = CFGetRegLO(cf);
  const Register hi = CFGetRegHI(cf);

  (sign) ? armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt);
  armAsm->lsr(hi.X(), lo.X(), 32);
}

void CPU::ARM64Recompiler::Compile_mult(CompileFlags cf)
{
  Compile_mult(cf, true);
}

void CPU::ARM64Recompiler::Compile_multu(CompileFlags cf)
{
  Compile_mult(cf, false);
}

void CPU::ARM64Recompiler::Compile_div(CompileFlags cf)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Register rlo = CFGetRegLO(cf);
  const Register rhi = CFGetRegHI(cf);

  // TODO: This could be slightly more optimal
  Label done;
  Label not_divide_by_zero;
  armAsm->cbnz(rt, &not_divide_by_zero);
  armAsm->mov(rhi, rs); // hi = num
  EmitMov(rlo, 1);
  EmitMov(RWSCRATCH, static_cast<u32>(-1));
  armAsm->cmp(rs, 0);
  armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1
  armAsm->b(&done);

  armAsm->bind(&not_divide_by_zero);
  Label not_unrepresentable;
  armAsm->cmp(rs, armCheckCompareConstant(static_cast<s32>(0x80000000u)));
  armAsm->b(&not_unrepresentable, ne);
  armAsm->cmp(rt, armCheckCompareConstant(-1));
  armAsm->b(&not_unrepresentable, ne);

  EmitMov(rlo, 0x80000000u);
  EmitMov(rhi, 0);
  armAsm->b(&done);

  armAsm->bind(&not_unrepresentable);

  armAsm->sdiv(rlo, rs, rt);

  // TODO: skip when hi is dead
  armAsm->msub(rhi, rlo, rt, rs);

  armAsm->bind(&done);
}

void CPU::ARM64Recompiler::Compile_divu(CompileFlags cf)
{
  const Register rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1;
  if (!cf.valid_host_s)
    MoveSToReg(rs, cf);

  const Register rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(rt, cf);

  const Register rlo = CFGetRegLO(cf);
  const Register rhi = CFGetRegHI(cf);

  Label done;
  Label not_divide_by_zero;
  armAsm->cbnz(rt, &not_divide_by_zero);
  EmitMov(rlo, static_cast<u32>(-1));
  armAsm->mov(rhi, rs);
  armAsm->b(&done);

  armAsm->bind(&not_divide_by_zero);

  armAsm->udiv(rlo, rs, rt);

  // TODO: skip when hi is dead
  armAsm->msub(rhi, rlo, rt, rs);

  armAsm->bind(&done);
}

void CPU::ARM64Recompiler::TestOverflow(const vixl::aarch64::Register& result)
{
  DebugAssert(result.IsW());
  SwitchToFarCode(true, vs);

  BackupHostState();

  // toss the result
  ClearHostReg(result.GetCode());

  EndBlockWithException(Exception::Ov);

  RestoreHostState();

  SwitchToNearCode(false);
}
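// Shared helper for three-operand ALU ops: uses register/register when both
// operands are in host registers, folds a constant operand into the immediate
// form when possible (swapping sides for commutative ops), and otherwise
// materializes the constant in the scratch register. Optionally emits the
// overflow check afterwards.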
void CPU::ARM64Recompiler::Compile_dst_op(CompileFlags cf,
                                          void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&,
                                                                               const vixl::aarch64::Register&,
                                                                               const vixl::aarch64::Operand&),
                                          bool commutative, bool logical, bool overflow)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register rd = CFGetRegD(cf);
  if (cf.valid_host_s && cf.valid_host_t)
  {
    (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf));
  }
  else if (commutative && (cf.const_s || cf.const_t))
  {
    const Register src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, src, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != src.GetCode())
        armAsm->mov(rd, src);
      overflow = false;
    }
  }
  else if (cf.const_s)
  {
    // TODO: Check where we can use wzr here
    EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS()));
    (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    const Register rs = CFGetRegS(cf);
    if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0)
    {
      (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv));
    }
    else
    {
      if (rd.GetCode() != rs.GetCode())
        armAsm->mov(rd, rs);
      overflow = false;
    }
  }

  if (overflow)
    TestOverflow(rd);
}

void CPU::ARM64Recompiler::Compile_add(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::adds, true, false, true);
  else
    Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_addu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::add, true, false, false);
}

void CPU::ARM64Recompiler::Compile_sub(CompileFlags cf)
{
  if (g_settings.cpu_recompiler_memory_exceptions)
    Compile_dst_op(cf, &Assembler::subs, false, false, true);
  else
    Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_subu(CompileFlags cf)
{
  Compile_dst_op(cf, &Assembler::sub, false, false, false);
}

void CPU::ARM64Recompiler::Compile_and(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // special cases - and with self -> self, and with 0 -> 0
  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    armAsm->mov(regd, CFGetRegS(cf));
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    armAsm->mov(regd, wzr);
    return;
  }

  Compile_dst_op(cf, &Assembler::and_, true, true, false);
}

void CPU::ARM64Recompiler::Compile_or(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // or/nor with 0 -> no effect
  const Register regd = CFGetRegD(cf);
  if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT())
  {
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::orr, true, true, false);
}

void CPU::ARM64Recompiler::Compile_xor(CompileFlags cf)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const Register regd = CFGetRegD(cf);
  if (cf.MipsS() == cf.MipsT())
  {
    // xor with self -> zero
    armAsm->mov(regd, wzr);
    return;
  }
  else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0))
  {
    // xor with zero -> no effect
    cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf);
    return;
  }

  Compile_dst_op(cf, &Assembler::eor, true, true, false);
}

void CPU::ARM64Recompiler::Compile_nor(CompileFlags cf)
{
  Compile_or(cf);
  armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf));
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf)
{
  Compile_slt(cf, true);
}

void CPU::ARM64Recompiler::Compile_sltu(CompileFlags cf)
{
  Compile_slt(cf, false);
}

void CPU::ARM64Recompiler::Compile_slt(CompileFlags cf, bool sign)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  // TODO: swap and reverse op for constants
  if (cf.const_s)
  {
    EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS()));
    armAsm->cmp(RWSCRATCH, CFGetRegT(cf));
  }
  else if (cf.const_t)
  {
    armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT())));
  }
  else
  {
    armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf));
  }

  armAsm->cset(CFGetRegD(cf), sign ? lt : lo);
}

vixl::aarch64::Register
CPU::ARM64Recompiler::ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional<VirtualMemoryAddress>& address,
                                                 const std::optional<const vixl::aarch64::Register>& reg)
{
  const u32 imm = inst->i.imm_sext32();
  if (cf.valid_host_s && imm == 0 && !reg.has_value())
    return CFGetRegS(cf);

  const Register dst = reg.has_value() ? reg.value() : RWARG1;
  if (address.has_value())
  {
    EmitMov(dst, address.value());
  }
  else if (imm == 0)
  {
    if (cf.valid_host_s)
    {
      if (const Register src = CFGetRegS(cf); src.GetCode() != dst.GetCode())
        armAsm->mov(dst, CFGetRegS(cf));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
    }
  }
  else
  {
    if (cf.valid_host_s)
    {
      armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
    else
    {
      armAsm->ldr(dst, MipsPtr(cf.MipsS()));
      armAsm->add(dst, dst, armCheckAddSubConstant(static_cast<s32>(inst->i.imm_sext32())));
    }
  }

  return dst;
}

template<typename RegAllocFn>
vixl::aarch64::Register CPU::ARM64Recompiler::GenerateLoad(const vixl::aarch64::Register& addr_reg,
                                                           MemoryAccessSize size, bool sign, bool use_fastmem,
                                                           const RegAllocFn& dst_reg_alloc)
{
  DebugAssert(addr_reg.IsW());
  if (use_fastmem)
  {
    m_cycles += Bus::RAM_READ_TICKS;

    const Register dst = dst_reg_alloc();

    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem);
        break;

      case MemoryAccessSize::HalfWord:
        sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->ldr(dst, mem);
        break;
    }

    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true);
    return dst;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::ReadMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfBitSet(RXRET, 63);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->neg(temp.X(), RXRET);
    armAsm->lsl(temp, temp, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (-result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }

  const Register dst_reg = dst_reg_alloc();
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET);
    }
    break;
    case MemoryAccessSize::Word:
    {
      if (dst_reg.GetCode() != RWRET.GetCode())
        armAsm->mov(dst_reg, RWRET);
    }
    break;
  }

  return dst_reg;
}

void CPU::ARM64Recompiler::GenerateStore(const vixl::aarch64::Register& addr_reg,
                                         const vixl::aarch64::Register& value_reg, MemoryAccessSize size,
                                         bool use_fastmem)
{
  DebugAssert(addr_reg.IsW() && value_reg.IsW());
  if (use_fastmem)
  {
    if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT)
    {
      DebugAssert(addr_reg.GetCode() != RWARG3.GetCode());
      armAsm->lsr(RXARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT);
      armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 3));
    }

    const MemOperand mem =
      MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X());
    u8* start = armAsm->GetCursorAddress<u8*>();
    switch (size)
    {
      case MemoryAccessSize::Byte:
        armAsm->strb(value_reg, mem);
        break;

      case MemoryAccessSize::HalfWord:
        armAsm->strh(value_reg, mem);
        break;

      case MemoryAccessSize::Word:
        armAsm->str(value_reg, mem);
        break;
    }
    AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false);
    return;
  }

  if (addr_reg.GetCode() != RWARG1.GetCode())
    armAsm->mov(RWARG1, addr_reg);
  if (value_reg.GetCode() != RWARG2.GetCode())
    armAsm->mov(RWARG2, value_reg);

  const bool checked = g_settings.cpu_recompiler_memory_exceptions;
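  // Slow path for stores: address in RWARG1 (w0), value in RWARG2 (w1). When memory exceptions are enabled, the
  // checked write thunks appear to return a non-zero exception code that the far-code block below converts into a
  // RaiseException() call; the unchecked variants are assumed to always succeed.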
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryByte) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte));
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryHalfWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord));
    }
    break;
    case MemoryAccessSize::Word:
    {
      EmitCall(checked ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::WriteMemoryWord) :
                         reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord));
    }
    break;
  }

  // TODO: turn this into an asm function instead
  if (checked)
  {
    SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true);
    BackupHostState();

    // Need to stash this in a temp because of the flush.
    const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
    armAsm->lsl(temp, RWRET, 2);

    Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION);

    // cause_bits = (result << 2) | BD | cop_n
    armAsm->orr(RWARG1, temp,
                armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException(
                  static_cast<Exception>(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)));
    EmitMov(RWARG2, m_current_instruction_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    FreeHostReg(temp.GetCode());
    EndBlock(std::nullopt, true);

    RestoreHostState();
    SwitchToNearCode(false);
  }
}
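
// The Compile_l*/Compile_s* helpers below optionally notify PGXP after the memory access. As far as can be inferred
// from the calls in this file, the PGXP hooks take the raw instruction word in RWARG1, the virtual address in
// RWARG2, and the value in RWARG3, which is why the address (and sometimes the value) is kept in a callee-saved
// temporary across the access.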
void CPU::ARM64Recompiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = GenerateLoad(addr, size, sign, use_fastmem, [this, cf]() -> Register {
    if (cf.MipsT() == Reg::zero)
      return RWRET;

    return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                                     EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG,
                                     cf.MipsT()));
  });

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);

    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    armAsm->mov(RWARG3, data);
    EmitCall(s_pgxp_mem_load_functions[static_cast<u32>(size)][static_cast<u32>(sign)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
  FlushForLoadStore(address, false, use_fastmem);

  // TODO: if address is constant, this can be simplified..

  // If we're coming from another block, just flush the load delay and hope for the best..
  if (m_load_delay_dirty)
    UpdateLoadDelay();

  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  // Do PGXP first, it does its own load.
  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  if (inst->r.rt == Reg::zero)
  {
    FreeHostReg(addr.GetCode());
    return;
  }

  // lwl/lwr from a load-delayed value takes the new value, but is itself load-delayed, so the original value is
  // never written back. NOTE: can't trust T in cf because of the flush
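  // Worked example (illustration only, not from the original source): for lwl with (addr & 3) == 1, shift = 8, so
  // the mask is 0x00FFFFFF >> 8 = 0x0000FFFF; the register keeps its low 16 bits and the loaded word is shifted
  // left by 24 - 8 = 16 to fill the upper bytes. lwr mirrors this with the 0xFFFFFF00 mask, and the swl/swr path
  // further down applies the complementary masks to the memory word instead.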
  const Reg rt = inst->r.rt;
  Register value;
  if (m_load_delay_register == rt)
  {
    const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ?
                                 AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) :
                                 m_load_delay_value_register;
    RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt);
    value = WRegister(existing_ld_rt);
  }
  else
  {
    if constexpr (EMULATE_LOAD_DELAYS)
    {
      value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt));
      if (const std::optional<u32> rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value())
        armAsm->mov(value, WRegister(rtreg.value()));
      else if (HasConstantReg(rt))
        EmitMov(value, GetConstantRegU32(rt));
      else
        armAsm->ldr(value, MipsPtr(rt));
    }
    else
    {
      value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt));
    }
  }

  DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
  armAsm->and_(RWARG2, addr, 3);
  armAsm->lsl(RWARG2, RWARG2, 3); // *8
  EmitMov(RWARG3, 24);
  armAsm->sub(RWARG3, RWARG3, RWARG2);

  if (inst->op == InstructionOp::lwl)
  {
    // const u32 mask = UINT32_C(0x00FFFFFF) >> shift;
    // new_value = (value & mask) | (RWRET << (24 - shift));
    EmitMov(RWSCRATCH, 0xFFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG2);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->lslv(RWRET, RWRET, RWARG3);
    armAsm->orr(value, value, RWRET);
  }
  else
  {
    // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift);
    // new_value = (value & mask) | (RWRET >> shift);
    armAsm->lsrv(RWRET, RWRET, RWARG2);
    EmitMov(RWSCRATCH, 0xFFFFFF00u);
    armAsm->lslv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(value, value, RWSCRATCH);
    armAsm->orr(value, value, RWRET);
  }

  FreeHostReg(addr.GetCode());
}
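
// The CP2 load/store paths below dispatch on the action returned by GetGTERegisterPointer(): Direct reads/writes
// the backing field, SignExtend16/ZeroExtend16 widen halfword registers, CallHandler falls back to
// GTE::ReadRegister/GTE::WriteRegister, and PushFIFO rotates the SXY0/SXY1/SXY2 screen-coordinate FIFO. This
// summary is inferred from the cases handled here rather than taken from the original comments.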
void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, false, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register value = GenerateLoad(addr, MemoryAccessSize::Word, false, use_fastmem, [this, action = action]() {
    return (action == GTERegisterAccessAction::CallHandler && g_settings.gpu_pgxp_enable) ?
             WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
             RWRET;
  });

  switch (action)
  {
    case GTERegisterAccessAction::Ignore:
    {
      break;
    }

    case GTERegisterAccessAction::Direct:
    {
      armAsm->str(value, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::SignExtend16:
    {
      armAsm->sxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::ZeroExtend16:
    {
      armAsm->uxth(RWARG3, value);
      armAsm->str(RWARG3, PTR(ptr));
      break;
    }

    case GTERegisterAccessAction::CallHandler:
    {
      Flush(FLUSH_FOR_C_CALL);
      armAsm->mov(RWARG2, value);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
      break;
    }

    case GTERegisterAccessAction::PushFIFO:
    {
      // SXY0 <- SXY1
      // SXY1 <- SXY2
      // SXY2 <- SXYP
      DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode());
      armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
      armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
      armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
      armAsm->str(value, PTR(&g_state.gte_regs.SXY2[0]));
      break;
    }

    default:
    {
      Panic("Unknown action");
      return;
    }
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, value);
    if (value.GetCode() != RWRET.GetCode())
      FreeHostReg(value.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr_reg.value().GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWC2));
  }
}

void CPU::ARM64Recompiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  AssertRegOrConstS(cf);
  AssertRegOrConstT(cf);

  const std::optional<WRegister> addr_reg =
    g_settings.gpu_pgxp_enable ? std::optional<WRegister>(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) :
                                 std::optional<WRegister>();
  FlushForLoadStore(address, true, use_fastmem);
  const Register addr = ComputeLoadStoreAddressArg(cf, address, addr_reg);
  const Register data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2;
  if (!cf.valid_host_t)
    MoveTToReg(RWARG2, cf);

  GenerateStore(addr, data, size, use_fastmem);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    MoveMIPSRegToReg(RWARG3, cf.MipsT());
    armAsm->mov(RWARG2, addr);
    EmitMov(RWARG1, inst->bits);
    EmitCall(s_pgxp_mem_store_functions[static_cast<u32>(size)]);
    FreeHostReg(addr_reg.value().GetCode());
  }
}

void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                       const std::optional<VirtualMemoryAddress>& address)
{
  DebugAssert(size == MemoryAccessSize::Word && !sign);

  // TODO: this can take over rt's value if it's no longer needed
  // NOTE: can't trust T in cf because of the alloc
  const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));

  FlushForLoadStore(address, true, use_fastmem);

  // TODO: if address is constant, this can be simplified..
  // We'd need to be careful here if we weren't overwriting it..
  ComputeLoadStoreAddressArg(cf, address, addr);

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, addr);
    MoveMIPSRegToReg(RWARG3, inst->r.rt);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
  }

  armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
  GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });

  armAsm->and_(RWSCRATCH, addr, 3);
  armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
  armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));

  MoveMIPSRegToReg(RWARG2, inst->r.rt);

  if (inst->op == InstructionOp::swl)
  {
    // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift;
    // new_value = (RWRET & mem_mask) | (value >> (24 - shift));
    EmitMov(RWARG3, 0xFFFFFF00u);
    armAsm->lslv(RWARG3, RWARG3, RWSCRATCH);
    armAsm->and_(RWRET, RWRET, RWARG3);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    armAsm->lsrv(RWARG2, RWARG2, RWARG3);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }
  else
  {
    // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
    // new_value = (RWRET & mem_mask) | (value << shift);
    armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);

    EmitMov(RWARG3, 24);
    armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
    EmitMov(RWSCRATCH, 0x00FFFFFFu);
    armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
    armAsm->and_(RWRET, RWRET, RWSCRATCH);
    armAsm->orr(RWARG2, RWARG2, RWRET);
  }

  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
  FreeHostReg(addr.GetCode());
}

void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
                                        const std::optional<VirtualMemoryAddress>& address)
{
  const u32 index = static_cast<u32>(inst->r.rt.GetValue());
  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  const Register addr = (g_settings.gpu_pgxp_enable || action == GTERegisterAccessAction::CallHandler) ?
                          WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) :
                          RWARG1;
  const Register data = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
  FlushForLoadStore(address, true, use_fastmem);
  ComputeLoadStoreAddressArg(cf, address, addr);

  switch (action)
  {
    case GTERegisterAccessAction::Direct:
    {
      armAsm->ldr(data, PTR(ptr));
    }
    break;

    case GTERegisterAccessAction::CallHandler:
    {
      // should already be flushed.. except in fastmem case
      Flush(FLUSH_FOR_C_CALL);
      EmitMov(RWARG1, index);
      EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));
      armAsm->mov(data, RWRET);
    }
    break;

    default:
    {
      Panic("Unknown action");
    }
    break;
  }

  GenerateStore(addr, data, size, use_fastmem);
  if (!g_settings.gpu_pgxp_enable)
  {
    if (addr.GetCode() != RWARG1.GetCode())
      FreeHostReg(addr.GetCode());
  }
  else
  {
    // TODO: This can be simplified because we don't need to validate in PGXP..
    Flush(FLUSH_FOR_C_CALL);
    armAsm->mov(RWARG3, data);
    FreeHostReg(data.GetCode());
    armAsm->mov(RWARG2, addr);
    FreeHostReg(addr.GetCode());
    EmitMov(RWARG1, inst->bits);
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc0(CompileFlags cf)
{
  // TODO: we need better constant setting here.. which will need backprop
  AssertRegOrConstT(cf);

  const Cop0Reg reg = static_cast<Cop0Reg>(MipsD());
  const u32* ptr = GetCop0RegPtr(reg);
  const u32 mask = GetCop0RegWriteMask(reg);
  if (!ptr)
  {
    Compile_Fallback();
    return;
  }

  if (mask == 0)
  {
    // if it's a read-only register, ignore
    DEBUG_LOG("Ignoring write to read-only cop0 reg {}", static_cast<u32>(reg));
    return;
  }

  // for some registers, we need to test certain bits
  const bool needs_bit_test = (reg == Cop0Reg::SR);
  const Register new_value = RWARG1;
  const Register old_value = RWARG2;
  const Register changed_bits = RWARG3;
  const Register mask_reg = RWSCRATCH;

  // Load old value
  armAsm->ldr(old_value, PTR(ptr));

  // No way we fit this in an immediate..
  EmitMov(mask_reg, mask);

  // update value
  if (cf.valid_host_t)
    armAsm->and_(new_value, CFGetRegT(cf), mask_reg);
  else
    EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask);

  if (needs_bit_test)
    armAsm->eor(changed_bits, old_value, new_value);
  armAsm->bic(old_value, old_value, mask_reg);
  armAsm->orr(new_value, old_value, new_value);
  armAsm->str(new_value, PTR(ptr));

  if (reg == Cop0Reg::SR)
  {
    // TODO: replace with register backup
    // We could just inline the whole thing..
    Flush(FLUSH_FOR_C_CALL);

    Label caches_unchanged;
    armAsm->tbz(changed_bits, 16, &caches_unchanged);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateMemoryPointers));
    armAsm->ldr(RWARG1, PTR(ptr)); // reload value for interrupt test below
    if (CodeCache::IsUsingFastmem())
      armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base));
    armAsm->bind(&caches_unchanged);

    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::CAUSE)
  {
    armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
    TestInterrupts(RWARG1);
  }
  else if (reg == Cop0Reg::DCIC || reg == Cop0Reg::BPCM)
  {
    // need to check whether we're switching to debug mode
    Flush(FLUSH_FOR_C_CALL);
    EmitCall(reinterpret_cast<const void*>(&CPU::UpdateDebugDispatcherFlag));
    SwitchToFarCodeIfRegZeroOrNonZero(RWRET, true);
    BackupHostState();
    Flush(FLUSH_FOR_EARLY_BLOCK_EXIT);
    EmitCall(reinterpret_cast<const void*>(&CPU::ExitExecution)); // does not return
    RestoreHostState();
    SwitchToNearCode(false);
  }
}

void CPU::ARM64Recompiler::Compile_rfe(CompileFlags cf)
{
  // shift mode bits right two, preserving upper bits
  armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
  armAsm->bfxil(RWARG1, RWARG1, 2, 4);
  armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits));
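  // BFXIL wd, wn, #2, #4 extracts SR bits [5:2] and inserts them into bits [3:0] while leaving bits [31:4]
  // untouched, popping the KU/IE mode stack in a single instruction. Worked example (illustration only):
  // SR = 0x2D (0b10'1101) becomes 0x2B (0b10'1011).
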
  TestInterrupts(RWARG1);
}

void CPU::ARM64Recompiler::TestInterrupts(const vixl::aarch64::Register& sr)
{
  DebugAssert(sr.IsW());

  // if Iec == 0 then goto no_interrupt
  Label no_interrupt;
  armAsm->tbz(sr, 0, &no_interrupt);

  // sr & cause
  armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits));
  armAsm->and_(sr, sr, RWSCRATCH);

  // ((sr & cause) & 0xff00) == 0 goto no_interrupt
  armAsm->tst(sr, 0xFF00);

  SwitchToFarCode(true, ne);
  BackupHostState();

  // Update load delay, this normally happens at the end of an instruction, but we're finishing it early.
  UpdateLoadDelay();

  Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL);

  // Can't use EndBlockWithException() here, because it'll use the wrong PC.
  // Can't use RaiseException() on the fast path if we're the last instruction, because the next PC is unknown.
  if (!iinfo->is_last_instruction)
  {
    EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(Exception::INT, iinfo->is_branch_instruction, false,
                                                                (inst + 1)->cop.cop_n));
    EmitMov(RWARG2, m_compiler_pc);
    EmitCall(reinterpret_cast<const void*>(static_cast<void (*)(u32, u32)>(&CPU::RaiseException)));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, true, false);
  }
  else
  {
    if (m_dirty_pc)
      EmitMov(RWARG1, m_compiler_pc);
    armAsm->str(wzr, PTR(&g_state.downcount));
    if (m_dirty_pc)
      armAsm->str(RWARG1, PTR(&g_state.pc));
    m_dirty_pc = false;
    EndAndLinkBlock(std::nullopt, false, true);
  }

  RestoreHostState();
  SwitchToNearCode(false);

  armAsm->bind(&no_interrupt);
}

void CPU::ARM64Recompiler::Compile_mfc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const Reg rt = inst->r.rt;

  const auto [ptr, action] = GetGTERegisterPointer(index, false);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  u32 hreg;
  if (action == GTERegisterAccessAction::Direct)
  {
    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->ldr(WRegister(hreg), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    EmitCall(reinterpret_cast<const void*>(&GTE::ReadRegister));

    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
    armAsm->mov(WRegister(hreg), RWRET);
  }
  else
  {
    Panic("Unknown action");
    return;
  }

  if (g_settings.gpu_pgxp_enable)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, inst->bits);
    armAsm->mov(RWARG2, WRegister(hreg));
    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
  }
}

void CPU::ARM64Recompiler::Compile_mtc2(CompileFlags cf)
{
  const u32 index = inst->cop.Cop2Index();
  const auto [ptr, action] = GetGTERegisterPointer(index, true);
  if (action == GTERegisterAccessAction::Ignore)
    return;

  if (action == GTERegisterAccessAction::Direct)
  {
    if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr);
    else
      armAsm->str(CFGetRegT(cf), PTR(ptr));
  }
  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
  {
    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
    if (cf.valid_host_t)
    {
      sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf));
      armAsm->str(RWARG1, PTR(ptr));
    }
    else if (cf.const_t)
    {
      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
      StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr);
    }
    else
    {
      Panic("Unsupported setup");
    }
  }
  else if (action == GTERegisterAccessAction::CallHandler)
  {
    Flush(FLUSH_FOR_C_CALL);
    EmitMov(RWARG1, index);
    MoveTToReg(RWARG2, cf);
    EmitCall(reinterpret_cast<const void*>(&GTE::WriteRegister));
  }
  else if (action == GTERegisterAccessAction::PushFIFO)
  {
    // SXY0 <- SXY1
    // SXY1 <- SXY2
    // SXY2 <- SXYP
    DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode());
    armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0]));
    armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0]));
    armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0]));
    armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0]));
    if (cf.valid_host_t)
      armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0]));
    else if (cf.const_t)
      StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]);
    else
      Panic("Unsupported setup");
  }
  else
  {
    Panic("Unknown action");
  }
}

void CPU::ARM64Recompiler::Compile_cop2(CompileFlags cf)
{
  TickCount func_ticks;
  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);

  Flush(FLUSH_FOR_C_CALL);
  EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
  EmitCall(reinterpret_cast<const void*>(func));

  AddGTETicks(func_ticks);
}

u32 CPU::Recompiler::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
                                           TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
                                           u8 address_register, u8 data_register, MemoryAccessSize size,
                                           bool is_signed, bool is_load)
{
  Assembler arm_asm(static_cast<u8*>(thunk_code), thunk_space);
  Assembler* armAsm = &arm_asm;

#ifdef VIXL_DEBUG
  vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
#endif

  static constexpr u32 GPR_SIZE = 8;

  // save regs
  u32 num_gprs = 0;

  for (u32 i = 0; i < NUM_HOST_REGS; i++)
  {
    if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      num_gprs++;
  }

  const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE);

  // TODO: use stp+ldp, vixl helper?

  if (stack_size > 0)
  {
    armAsm->sub(sp, sp, stack_size);

    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->str(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }
  }

  if (cycles_to_add != 0)
  {
    // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles
    Assert(Assembler::IsImmAddSub(cycles_to_add));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  if (address_register != static_cast<u8>(RWARG1.GetCode()))
    armAsm->mov(RWARG1, WRegister(address_register));

  if (!is_load)
  {
    if (data_register != static_cast<u8>(RWARG2.GetCode()))
      armAsm->mov(RWARG2, WRegister(data_register));
  }
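
  // This thunk performs the access through the unchecked handlers, with the address in RWARG1 and, for stores, the
  // value in RWARG2. It is presumably reached when a fastmem access faults and the original code is backpatched to
  // call here; that trigger is an assumption based on how AddLoadStoreInfo() records the fastmem accesses above.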
  switch (size)
  {
    case MemoryAccessSize::Byte:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryByte) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryByte),
                  false);
    }
    break;
    case MemoryAccessSize::HalfWord:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryHalfWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryHalfWord),
                  false);
    }
    break;
    case MemoryAccessSize::Word:
    {
      armEmitCall(armAsm,
                  is_load ? reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedReadMemoryWord) :
                            reinterpret_cast<const void*>(&CPU::RecompilerThunks::UncheckedWriteMemoryWord),
                  false);
    }
    break;
  }

  if (is_load)
  {
    const WRegister dst = WRegister(data_register);
    switch (size)
    {
      case MemoryAccessSize::Byte:
      {
        is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET);
      }
      break;
      case MemoryAccessSize::HalfWord:
      {
        is_signed ? armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET);
      }
      break;
      case MemoryAccessSize::Word:
      {
        if (dst.GetCode() != RWRET.GetCode())
          armAsm->mov(dst, RWRET);
      }
      break;
    }
  }

  if (cycles_to_remove != 0)
  {
    Assert(Assembler::IsImmAddSub(cycles_to_remove));
    armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks));
    armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove);
    armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks));
  }

  // restore regs
  if (stack_size > 0)
  {
    u32 stack_offset = 0;
    for (u32 i = 0; i < NUM_HOST_REGS; i++)
    {
      if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i))
      {
        armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset));
        stack_offset += GPR_SIZE;
      }
    }

    armAsm->add(sp, sp, stack_size);
  }

  armEmitJmp(armAsm, static_cast<const u8*>(code_address) + code_size, true);
  armAsm->FinalizeCode();

  return static_cast<u32>(armAsm->GetCursorOffset());
}

#endif // CPU_ARCH_ARM64